------------------------------
#### Cross-encoder re-ranking¶
--------------------------------

In [1]:
import numpy as np

from helper_utils import load_chroma, word_wrap, project_embeddings
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

In [3]:
embedding_function = SentenceTransformerEmbeddingFunction()

chroma_collection = load_chroma(filename          = r'./data/microsoft_annual_report_2022.pdf', 
                                collection_name   = 'microsoft_annual_report_2022', 
                                embedding_function= embedding_function)
chroma_collection.count()

349

In [4]:
query = "What has been the investment in research and development?"

In [5]:
# more number of retrieved documents
results = chroma_collection.query(query_texts= query, 
                                  n_results  = 10, 
                                  include    = ['documents', 'embeddings'])

In [6]:
retrieved_documents = results['documents'][0]

for document in results['documents'][0]:
    print(word_wrap(document))
    print('')

• operating expenses increased $ 1. 5 billion or 14 % driven by
investments in gaming, search and news advertising, and windows
marketing. operating expenses research and development ( in millions,
except percentages ) 2022 2021 percentage change research and
development $ 24, 512 $ 20, 716 18 % as a percent of revenue 12 % 12 %
0ppt research and development expenses include payroll, employee
benefits, stock - based compensation expense, and other headcount -
related expenses associated with product development. research and
development expenses also include third - party development and
programming costs, localization costs incurred to translate software
for international markets, and the amortization of purchased software
code and services content. research and development expenses increased
$ 3. 8 billion or 18 % driven by investments in cloud engineering,
gaming, and linkedin. sales and marketing

competitive in local markets and enables us to continue to attract top
talent from ac

In [7]:
from sentence_transformers import CrossEncoder

In [8]:
cross_encoder = CrossEncoder(model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2',
                             cache_dir  = r'D:\AI-DATASETS\07-Hugging-Face-Data')

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [10]:
pairs = [[query, doc] for doc in retrieved_documents]

In [11]:
scores = cross_encoder.predict(pairs)

In [12]:
print("Scores:")
for score in scores:
    print(score)

Scores:
0.9869348
2.6445777
-0.26803175
-10.731592
-7.7066073
-5.646998
-4.2970343
-10.933232
-7.0384297
-7.324692


In [13]:
print("New Ordering:")
for o in np.argsort(scores)[::-1]:
    print(o+1)

New Ordering:
2
1
3
7
6
9
10
5
4
8


#### Re-ranking with Query Expansion

In [14]:
original_query = "What were the most important factors that contributed to increases in revenue?"

In [15]:
generated_queries = [
    "What were the major drivers of revenue growth?",
    "Were there any new product launches that contributed to the increase in revenue?",
    "Did any changes in pricing or promotions impact the revenue growth?",
    "What were the key market trends that facilitated the increase in revenue?",
    "Did any acquisitions or partnerships contribute to the revenue growth?"
]

In [16]:
queries = [original_query] + generated_queries

In [17]:
results = chroma_collection.query(query_texts=queries, n_results=10, include=['documents', 'embeddings'])

retrieved_documents = results['documents']

In [18]:
# Deduplicate the retrieved documents
unique_documents = set()
for documents in retrieved_documents:
    for document in documents:
        unique_documents.add(document)

unique_documents = list(unique_documents)

In [19]:
pairs = []
for doc in unique_documents:
    pairs.append([original_query, doc])

In [20]:
scores = cross_encoder.predict(pairs)


In [21]:
print("Scores:")
for score in scores:
    print(score)

Scores:
-4.8184824
-11.0792675
-3.794863
-6.902089
-10.0839405
-10.042842
-5.141833
-10.000137
-4.6518893
-10.711211
-7.490653
-7.754099
-4.341766
-9.9184265
-9.807878
-5.274749
-7.917178
-3.7681513
-8.505104
-10.148885
-1.1369991
-9.357724


In [22]:
print("New Ordering:")
for o in np.argsort(scores)[::-1]:
    print(o)

New Ordering:
20
17
2
12
8
0
6
15
3
10
11
16
18
21
14
13
7
5
4
19
9
1


| Column 1 | Column 2 |
|----------|----------|
|    2     |    20    |
|    1     |    17    |
|    3     |    2     |
|    7     |    12    |
|    6     |    8     |
|    9     |    0     |
|   10     |    6     |
|    5     |    15    |
|    4     |    3     |
|    8     |    10    |
|          |    11    |
|          |    16    |
|          |    18    |
|          |    21    |
|          |    14    |
|          |    13    |
|          |    7     |
|          |    5     |
|          |    4     |
|          |    19    |
|          |    9     |
|          |    1     |
