In [1]:
# Imports
from dotenv import load_dotenv
from openai import OpenAI
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

import os
import pickle

In [2]:
# Deserialize the document collection from its pickle file.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open("../dataset/documents.pkl", "rb") as documents_file:
    docs = pickle.load(documents_file)

In [3]:
# Deserialize the query collection from its pickle file.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open("../dataset/queries.pkl", "rb") as queries_file:
    queries = pickle.load(queries_file)

### LSI

In [4]:
# Split the documents mapping into two parallel lists: ids and cleaned texts.
# Both lists follow the mapping's iteration order, so index k in one list
# corresponds to index k in the other.
document_ids = list(docs)
clean_documents = [entry["clean_text"] for entry in docs.values()]

In [5]:
# Split the queries mapping into two parallel lists: ids and cleaned texts.
# Both lists follow the mapping's iteration order, so index k in one list
# corresponds to index k in the other.
query_ids = list(queries)
clean_queries = [entry["clean_text"] for entry in queries.values()]

#### Tf-idf

In [7]:
# Learn one shared TF-IDF vocabulary/IDF weighting from the cleaned documents
# and the cleaned queries together
fit_corpus = clean_documents + clean_queries
vectorizer = TfidfVectorizer()
vectorizer.fit(fit_corpus)

In [8]:
# Vectorize every cleaned document with the fitted TF-IDF model
documents_tfidf_vectors = vectorizer.transform(raw_documents=clean_documents)

In [9]:
# Vectorize every cleaned query with the same fitted TF-IDF model
queries_tfidf_vectors = vectorizer.transform(raw_documents=clean_queries)

#### SVD

In [11]:
# LSI configuration.
# TruncatedSVD defaults to a randomized solver, so without a fixed seed the
# resulting embeddings change from run to run. Pin the seed so that
# "Restart & Run All" reproduces the same saved vectors.
num_topics = 100  # Number of dimensions
svd_random_state = 42  # Seed for the randomized SVD solver
svd = TruncatedSVD(n_components=num_topics, random_state=svd_random_state)

In [12]:
# Get embedding for each document: fit the truncated SVD on the document
# TF-IDF matrix and project the documents into the reduced space in one call
documents_lsi_vectors = svd.fit_transform(documents_tfidf_vectors)

In [None]:
# Persist the document embeddings as {document id: {"embedding": vector}}
documents_lsi = {
    doc_id: {"embedding": lsi_vector}
    for doc_id, lsi_vector in zip(document_ids, documents_lsi_vectors)
}
with open("../embeddings/lsi/documents.pkl", "wb") as out_file:
    pickle.dump(documents_lsi, out_file)

In [13]:
# Get embedding for each query by projecting it into the LSI space that was
# already fitted on the documents.
# Use transform(), not fit_transform(): refitting the SVD on the queries would
# learn a different set of components, so query and document vectors would no
# longer live in the same space and could not be compared.
queries_lsi_vectors = svd.transform(queries_tfidf_vectors)

In [None]:
# Persist the query embeddings as {query id: {"embedding": vector}}
queries_lsi = {
    query_id: {"embedding": lsi_vector}
    for query_id, lsi_vector in zip(query_ids, queries_lsi_vectors)
}
with open("../embeddings/lsi/queries.pkl", "wb") as out_file:
    pickle.dump(queries_lsi, out_file)

### ADA (text-embedding-ada-002-v2)

In [19]:
# Populate the process environment from a .env file, then read OpenAI's API
# key from it (the value is None when the variable is not set)
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [20]:
# Initialize OpenAI's client with the API key read from the environment;
# get_embedding() below sends its requests through this client
client = OpenAI(api_key=OPENAI_API_KEY)

In [21]:
# Use OpenAI's text-embedding-ada-002 model to get a 1536-dimensional
#  embedding for the given input text
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` produced by the given OpenAI model.

    Newlines in ``text`` are replaced with spaces before the request is sent
    through the module-level ``client``.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` of the first item in the response data, or ``None``
        (after printing a message) when the response is falsy or carries no
        data.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)

    # Treat a falsy response, a missing `data` attribute and empty `data`
    # all the same way: nothing usable came back.
    items = getattr(response, "data", None) if response else None
    if not items:
        print("Invalid response or no embedding data received.")
        return None

    return items[0].embedding

In [None]:
#### API CALL WARNING ####

# Request one embedding per document from the API (one call each, using the
# document's raw "text" field) and key the results by document id
documents_ada = {}
document_progress = tqdm(docs.items(), desc="Generating Document Embeddings")
for doc_id, document in document_progress:
    documents_ada[doc_id] = {"embedding": get_embedding(document["text"])}

In [None]:
# Persist the ADA document embeddings to disk as a pickle
with open("../embeddings/text-embedding-ada-002-v2/documents.pkl", "wb") as out_file:
    pickle.dump(documents_ada, out_file)

In [24]:
#### API CALL WARNING ####

# Request one embedding per query from the API (one call each, using the
# query's raw "text" field) and key the results by query id
queries_ada = {}
query_progress = tqdm(queries.items(), desc="Generating Query Embeddings")
for query_id, query_entry in query_progress:
    queries_ada[query_id] = {"embedding": get_embedding(query_entry["text"])}

Generating Query Embeddings: 100%|█████████████████████████| 112/112 [00:23<00:00,  4.85it/s]


In [25]:
# Persist the ADA query embeddings to disk as a pickle
with open("../embeddings/text-embedding-ada-002-v2/queries.pkl", "wb") as out_file:
    pickle.dump(queries_ada, out_file)