In [1]:
# Imports
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Annoy, FAISS
from langchain.vectorstores.utils import DistanceStrategy
from tqdm import tqdm

import numpy as np
import os
import pickle
import time

### Exact Search (LSI)

In [3]:
# Read the pickled LSI embeddings for the document collection
with open("../embeddings/lsi/documents.pkl", "rb") as documents_file:
    documents_lsi = pickle.load(documents_file)

In [4]:
# Read the pickled LSI embeddings for the query set
with open("../embeddings/lsi/queries.pkl", "rb") as queries_file:
    queries_lsi = pickle.load(queries_file)

In [2]:
# Find cosine similarity between two vectors a and b
def cosine_similarity(a, b):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        a: First vector (any array-like accepted by ``np.dot``).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm the
        cosine is undefined; ``0.0`` is returned instead of NaN so the result
        can still be sorted and compared.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard against 0/0: a NaN score would make any later ranking sort unreliable.
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

In [5]:
# Score every document against every query and keep the documents ranked best-first
similarity_scores = {}
for query_id, query in tqdm(queries_lsi.items(), desc="Computing similarity scores"):
    ranked = [
        (doc_id, cosine_similarity(query["embedding"], doc["embedding"]))
        for doc_id, doc in documents_lsi.items()
    ]
    # Stable descending sort on the similarity component of each (doc_id, score) pair
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    similarity_scores[query_id] = ranked

Computing similarity scores: 100%|████████████████████████| 112/112 [00:00<00:00, 198.37it/s]


In [6]:
# Example usage
# print(similarity_scores['some_query_id'])  # Replace 'some_query_id' with an actual query id

In [7]:
# Keep only the ranked document IDs for each query, dropping the similarity values
predictions = {
    query_id: [doc_id for doc_id, _ in ranked_scores]
    for query_id, ranked_scores in similarity_scores.items()
}

In [8]:
# Example usage
# print(predictions['some_query_id'])  # Replace 'some_query_id' with an actual query id

In [9]:
# Save each query's ranked list of document IDs
exact_search_index_path = "../ir_techniques/exact_search/index.pkl"
# open(..., "wb") does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(os.path.dirname(exact_search_index_path), exist_ok=True)
with open(exact_search_index_path, "wb") as f:
    pickle.dump(predictions, f)

### VectorDBs

In [10]:
# Load environment variables via python-dotenv, then read OpenAI's API key from the environment
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [11]:
# Load the documents' ADA embeddings from the pickle file
with open("../embeddings/text-embedding-ada-002-v2/documents.pkl", "rb") as ada_file:
    documents_ada = pickle.load(ada_file)

In [12]:
# Build a list of (key, embedding) tuples, one per entry of documents_ada.
# This list is what gets passed as `text_embeddings` to the langchain vector stores.
data = [(doc_key, entry["embedding"]) for doc_key, entry in documents_ada.items()]

#### ANNOY

In [13]:
%time

# Build ANNOY vector store
annoy_vs = Annoy.from_embeddings(
    text_embeddings=data, 
    embedding=OpenAIEmbeddings(), 
    metric="dot")

CPU times: user 1 µs, sys: 1e+03 ns, total: 2 µs
Wall time: 1.91 µs


In [14]:
# Persist the ANNOY vector store to disk
annoy_output_dir = "../ir_techniques/annoy/"
annoy_vs.save_local(annoy_output_dir)

#### FAISS

In [15]:
%time

# Build FAISS vector store
faiss_vs = FAISS.from_embeddings(
    text_embeddings=data, 
    embedding=OpenAIEmbeddings(),
    distance_strategy=DistanceStrategy.DOT_PRODUCT)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 2.15 µs


In [16]:
# Persist the FAISS vector store to disk
faiss_output_dir = "../ir_techniques/faiss/"
faiss_vs.save_local(faiss_output_dir)