## Lab 4 - Cross-encoder re-ranking

In [2]:
from helper_utils import word_wrap, project_embeddings
import numpy as np
import chromadb
import os
from helper_utils import word_wrap
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

In [3]:
openai_api_key = os.environ['OPENAI_API_KEY']
embedding_function = OpenAIEmbeddingFunction(api_key=openai_api_key, model_name="text-embedding-3-small")

chroma_collection = chromadb.PersistentClient('microsoft_annual_report_2022').create_collection(
    "microsoft_annual_report_2022", 
    embedding_function=embedding_function,
    get_or_create=True)

chroma_collection.count()

307

# Re-ranking the long tail

In [4]:
query = "What has been the investment in research and development?"
results = chroma_collection.query(query_texts=query, n_results=10, include=['documents', 'embeddings'])

retrieved_documents = results['documents'][0]

for document in results['documents'][0]:
    print(word_wrap(document))
    print('')

Research and Development Research and development expenses include
payroll, employee benefits, stock -based compensation expense, and
other headcount-related expenses associated with product development.
Research and development expenses also include third-party development
and programming costs, localization costs incurred to translate
software for international markets, and the amortization of purchased
software code and services content. Such costs related to software
development are included in research and development expense until the
point that technological feasibility is reached, which for our software
products, is generally shortly before the products are released to
production. Once technological feasibility is reached, such costs are
capitalized and amortized to cost of revenue over the estimated lives
of the products

We believe our continuing research and product development are not
materially dependent on any single license or other agreement with a
third party relating 

In [6]:
def _cosine_similarity(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def embedding_rerank(query, documents):
    query_embedding = embedding_function([query])[0]
    doc_embeddings = embedding_function(documents)
    return [_cosine_similarity(query_embedding, doc_embedding) for doc_embedding in doc_embeddings]

In [7]:
scores = embedding_rerank(query, retrieved_documents)
print("Scores:")
for score in scores:
    print(score)

Scores:
0.5758931636810303
0.526953399181366
0.5089221596717834
0.5032009482383728
0.4815691113471985
0.47638198733329773
0.4717883765697479
0.44306454062461853
0.43757501244544983
0.43075031042099


In [8]:
print("New Ordering:")
for o in np.argsort(scores)[::-1]:
    print(o+1)

New Ordering:
1
2
3
4
5
6
7
8
9
10


# Re-ranking with Query Expansion

In [9]:
original_query = "What were the most important factors that contributed to increases in revenue?"
generated_queries = [
    "What were the major drivers of revenue growth?",
    "Were there any new product launches that contributed to the increase in revenue?",
    "Did any changes in pricing or promotions impact the revenue growth?",
    "What were the key market trends that facilitated the increase in revenue?",
    "Did any acquisitions or partnerships contribute to the revenue growth?"
]

In [10]:
queries = [original_query] + generated_queries

results = chroma_collection.query(query_texts=queries, n_results=10, include=['documents', 'embeddings'])
retrieved_documents = results['documents']

In [11]:
# Deduplicate the retrieved documents
unique_documents = set()
for documents in retrieved_documents:
    for document in documents:
        unique_documents.add(document)

unique_documents = list(unique_documents)

In [12]:
# Unique documents prepared above; embedding rerank runs directly on them.

In [13]:
scores = embedding_rerank(original_query, unique_documents)

In [14]:
print("Scores:")
for score in scores:
    print(score)

Scores:
0.40258246660232544
0.4279424846172333
0.41656142473220825
0.4491081237792969
0.40405285358428955
0.4580407440662384
0.39477717876434326
0.42988112568855286
0.37401607632637024
0.4755610525608063
0.4336984157562256
0.40853530168533325
0.37585631012916565
0.43725767731666565
0.3604590594768524
0.3891030550003052
0.4317931830883026
0.4182521104812622


In [15]:
print("New Ordering:")
for o in np.argsort(scores)[::-1]:
    print(o)

New Ordering:
9
5
3
13
10
16
7
1
17
2
11
4
0
6
15
12
8
14
