In [1]:
# https://python.langchain.com/docs/modules/data_connection/vectorstores/
# https://python.langchain.com/docs/integrations/vectorstores/annoy

In [2]:
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Annoy, FAISS
from langchain.vectorstores.utils import DistanceStrategy
from tqdm import tqdm

import numpy as np
import os
import pickle

In [3]:
# Load environment configuration first (load_dotenv is expected to populate
# os.environ from a local .env file), then read the OpenAI key from the
# environment. The value is None when the variable is not set.
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [4]:
def _read_pickle(path):
    """Unpickle and return the single object stored in the file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only point this at files
    # produced by a trusted run of the embedding pipeline.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Precomputed document embeddings.
docs_file_path = './backups/doc_embeddings.pkl'
loaded_docs = _read_pickle(docs_file_path)
print("Document embeddings loaded successfully.")

# Precomputed query embeddings.
query_file_path = './backups/query_embeddings.pkl'
loaded_queries = _read_pickle(query_file_path)
print("Query embeddings loaded successfully.")

Document embeddings loaded successfully.
Query embeddings loaded successfully.


In [5]:
# def get_text(doc):
#     combined_text = doc["title"] + " " + doc["body"]
#     combined_text = combined_text.replace("\n", " ")
#     return combined_text

### Metrics

In [6]:
def precision_at_k(ranked_docs, relevant_docs, k=10):
    """Return the fraction of the first ``k`` ranked documents that are relevant.

    Args:
        ranked_docs: Sequence of ``(doc_id, score)`` pairs in ranked order.
        relevant_docs: Container of relevant document ids.
        k: Cut-off rank. It is also the denominator, even when fewer than
            ``k`` results are present in ``ranked_docs``.

    Returns:
        Number of relevant ids among the first ``k`` entries, divided by ``k``.
    """
    top_ids = [doc_id for doc_id, _ in ranked_docs[:k]]
    hits = sum(doc_id in relevant_docs for doc_id in top_ids)
    return hits / k

def recall_at_k(ranked_docs, relevant_docs, k=10):
    """Return the share of relevant documents found in the first ``k`` results.

    Args:
        ranked_docs: Sequence of ``(doc_id, score)`` pairs in ranked order.
        relevant_docs: Container of relevant document ids.
        k: Cut-off rank.

    Returns:
        Relevant hits in the top ``k`` divided by ``len(relevant_docs)``,
        or ``0`` when ``relevant_docs`` is empty.
    """
    found = 0
    for doc_id, _ in ranked_docs[:k]:
        if doc_id in relevant_docs:
            found += 1

    if not relevant_docs:
        return 0
    return found / len(relevant_docs)
    
def dcg_at_k(scores, k=10):
    """Return the discounted cumulative gain of the first ``k`` relevance scores.

    Args:
        scores: Relevance values in ranked order.
        k: Cut-off rank.

    Returns:
        Sum over the first ``k`` scores of ``score / log2(idx + 2)``, where
        ``idx`` is the zero-based rank (so the top result is undiscounted).
    """
    return sum(score / np.log2(idx + 2) for idx, score in enumerate(scores[:k]))


def ndcg_at_k(ranked_docs, relevant_docs, k=5):
    """Return binary-relevance NDCG@k for one ranked result list.

    The ideal DCG is that of a ranking that places every relevant document
    first, i.e. ``min(len(relevant_docs), k)`` hits at the top. Deriving the
    ideal ordering from ``ranked_docs`` in its retrieved order would make the
    ideal and actual gain vectors identical and pin the metric to 1.0
    whenever at least one relevant document appears in the top ``k``.

    Args:
        ranked_docs: Sequence of ``(doc_id, score)`` pairs in ranked order.
        relevant_docs: Iterable of relevant document ids (hashable).
        k: Cut-off rank.

    Returns:
        ``DCG@k / IDCG@k``, or ``0`` when there are no relevant documents.
    """
    relevant = set(relevant_docs or ())
    actual_scores = [1 if doc_id in relevant else 0 for doc_id, _ in ranked_docs[:k]]
    # Best achievable ranking: all relevant documents first, capped at k.
    ideal_scores = [1] * min(len(relevant), k)
    idcg = dcg_at_k(ideal_scores, k)
    dcg = dcg_at_k(actual_scores, k)
    return dcg / idcg if idcg > 0 else 0

In [7]:
def get_metrics(similarity_scores_for_query_id, relevant_docs_for_query, k=10):
    """Compute precision, recall and NDCG at cut-off ``k`` for one query.

    Args:
        similarity_scores_for_query_id: Ranked ``(doc_id, score)`` pairs for
            the query.
        relevant_docs_for_query: Ids of the documents relevant to the query.
        k: Cut-off rank forwarded to all three metrics.

    Returns:
        Tuple ``(precision, recall, ndcg)``.
    """
    ranked, relevant = similarity_scores_for_query_id, relevant_docs_for_query
    return (
        precision_at_k(ranked, relevant, k),
        recall_at_k(ranked, relevant, k),
        ndcg_at_k(ranked, relevant, k),
    )

### Annoy

In [8]:
# (text, vector) pairs handed to the vector stores' from_embeddings below.
# The document id is used as the stored "text" so it can be read back from
# each search hit.
annoy_data = [(doc["id"], doc["embedding"]) for doc in loaded_docs]

In [9]:
# Build the Annoy index directly from the precomputed vectors, requesting the
# "dot" metric. Searches later in the notebook go through the *_by_vector
# method and pass precomputed query vectors.
annoy_vs = Annoy.from_embeddings(
    metric="dot",
    embedding=OpenAIEmbeddings(),
    text_embeddings=annoy_data,
)

In [10]:
# Persist the Annoy store to disk. If the folder is missing, it can be created
# beforehand with: !mkdir ./backups/annoy/
annoy_index_dir = "./backups/annoy/"
annoy_vs.save_local(annoy_index_dir)

In [11]:
# For every query, fetch its 100 nearest documents from the Annoy index and
# keep them as (doc_id, score) pairs ordered by descending score.
annoy_similarity_scores = {}
for query in tqdm(loaded_queries, desc="Computing similarity scores"):
    hits = annoy_vs.similarity_search_with_score_by_vector(query["embedding"], k=100)
    # page_content carries the document id that was indexed as the "text".
    id_score_pairs = [(int(hit_doc.page_content), hit_score) for hit_doc, hit_score in hits]
    id_score_pairs.sort(key=lambda pair: pair[1], reverse=True)
    annoy_similarity_scores[query["id"]] = id_score_pairs

Computing similarity scores: 100%|██████████████████████| 1460/1460 [00:03<00:00, 415.04it/s]


In [12]:
# Cache the Annoy rankings next to the saved index so they can be reused
# without re-running the search loop.
with open("./backups/annoy/annoy_similarity_scores.pkl", "wb") as out_file:
    pickle.dump(annoy_similarity_scores, out_file)

In [13]:
# Spot check: metrics at cut-off 10 for query id 1, against a hand-picked
# list of relevant document ids.
annoy_metrics = get_metrics(annoy_similarity_scores[1], [1, 898, 361], k=10)
p_at_k, r_at_k, n_at_k = annoy_metrics
print(f"Precision@10: {p_at_k}, Recall@10: {r_at_k}, NDCG@10: {n_at_k}")

Precision@10: 0.3, Recall@10: 1.0, NDCG@10: 1.0


### FAISS

In [14]:
# Build the FAISS store from the same (id, vector) pairs used for Annoy.
# NOTE(review): LangChain's FAISS wrapper appears to create an inner-product
# index only for DistanceStrategy.MAX_INNER_PRODUCT and to fall back to an L2
# index for other strategies — confirm against the installed version which
# index DOT_PRODUCT actually produces, since the score handling in the search
# cell depends on it.
faiss_vs = FAISS.from_embeddings(
    distance_strategy=DistanceStrategy.DOT_PRODUCT,
    embedding=OpenAIEmbeddings(),
    text_embeddings=annoy_data,
)

In [15]:
# Persist the FAISS store to disk. If the folder is missing, it can be created
# beforehand with: !mkdir ./backups/faiss/
faiss_index_dir = "./backups/faiss/"
faiss_vs.save_local(faiss_index_dir)

In [16]:
# For every query, fetch its 100 nearest documents from the FAISS store and
# keep them as (doc_id, 1 - raw_score) pairs ordered by descending value.
# NOTE(review): `1 - raw_score` followed by a descending sort puts the
# smallest raw score first, which is only the right order if the store
# returns distances (smaller = closer) — confirm for the configured strategy.
faiss_similarity_scores = {}
for query in tqdm(loaded_queries, desc="Computing similarity scores"):
    hits = faiss_vs.similarity_search_with_score_by_vector(query["embedding"], k=100)
    # page_content carries the document id that was indexed as the "text".
    id_score_pairs = [(int(hit_doc.page_content), 1 - raw_score) for hit_doc, raw_score in hits]
    id_score_pairs.sort(key=lambda pair: pair[1], reverse=True)
    faiss_similarity_scores[query["id"]] = id_score_pairs

Computing similarity scores: 100%|█████████████████████| 1460/1460 [00:00<00:00, 2335.64it/s]


In [17]:
# Cache the FAISS rankings next to the saved index so they can be reused
# without re-running the search loop.
with open("./backups/faiss/faiss_similarity_scores.pkl", "wb") as out_file:
    pickle.dump(faiss_similarity_scores, out_file)

In [18]:
# Spot check: metrics at cut-off 10 for query id 1, against a hand-picked
# list of relevant document ids.
faiss_metrics = get_metrics(faiss_similarity_scores[1], [1, 898, 361], k=10)
p_at_k, r_at_k, n_at_k = faiss_metrics
print(f"Precision@10: {p_at_k}, Recall@10: {r_at_k}, NDCG@10: {n_at_k}")

Precision@10: 0.3, Recall@10: 1.0, NDCG@10: 1.0
