In [35]:
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.vectorstores.utils import DistanceStrategy
from tqdm import tqdm 

import ir_datasets
import openai
import os
import pickle

In [None]:
# Get Dataset

In [3]:
# Resolve the dataset handle by its ir_datasets identifier.
dataset_id = "beir/nfcorpus/test"
dataset = ir_datasets.load(dataset_id)

In [6]:
# Map every query id to a small record holding its text.
queries = {
    entry.query_id: {"text": entry.text}
    for entry in dataset.queries_iter()
}

In [11]:
# Persist the query mapping. The target directory is created first so the
# write does not fail with FileNotFoundError on a fresh checkout.
queries_pickle_path = "./dataset/nfcorpus/queries.pkl"
os.makedirs(os.path.dirname(queries_pickle_path), exist_ok=True)
with open(queries_pickle_path, "wb") as f:
    pickle.dump(queries, f)

In [7]:
# Map every document id to a record holding its text; `count` ends up as the
# number of documents yielded by the iterator (0 when it yields nothing).
docs = {}
count = 0
for count, entry in enumerate(dataset.docs_iter(), start=1):
    docs[entry.doc_id] = {"text": entry.text}

In [12]:
# Persist the document mapping. The target directory is created first so the
# write does not fail with FileNotFoundError on a fresh checkout.
documents_pickle_path = "./dataset/nfcorpus/documents.pkl"
os.makedirs(os.path.dirname(documents_pickle_path), exist_ok=True)
with open(documents_pickle_path, "wb") as f:
    pickle.dump(docs, f)

In [8]:
# Collect, per query id, the ids of documents judged with relevance > 0.
# Every judged query gets an entry, even when none of its documents qualify.
rel_set = {}
for judgment in dataset.qrels_iter():
    relevant_ids = rel_set.setdefault(judgment.query_id, [])
    if judgment.relevance > 0:
        relevant_ids.append(judgment.doc_id)

In [13]:
# Persist the relevance mapping. The target directory is created first so the
# write does not fail with FileNotFoundError on a fresh checkout.
relevance_pickle_path = "./dataset/nfcorpus/relevance_set.pkl"
os.makedirs(os.path.dirname(relevance_pickle_path), exist_ok=True)
with open(relevance_pickle_path, "wb") as f:
    pickle.dump(rel_set, f)

In [None]:
# Get OpenAI Embeddings

In [21]:
# Presumably reads a local .env file so that OPENAI_API_KEY is visible to the
# os.getenv call in the client cell below — TODO confirm the file location.
load_dotenv()

True

In [22]:
# OpenAI client used by get_embedding; the key is read from the environment
# rather than being written into the notebook.
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [23]:
#### API CALL WARNING ####

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``text`` from the embeddings endpoint.

    Newlines in ``text`` are replaced by spaces, then a single request is
    sent through the module-level ``client``. Every call issues one API
    request.

    Args:
        text: String to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` of the first item in the response, or ``None``
        (after printing a message) when the response is falsy or carries no
        data.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)

    # Guard clause: bail out on a falsy response or a missing/empty `data`.
    if not response or not getattr(response, "data", None):
        print("Invalid response or no embedding data received.")
        return None

    return response.data[0].embedding

In [28]:
# get_embedding(queries["PLAIN-2"]["text"])

In [None]:
# Replace each query record with one that also carries its embedding.
# Only existing keys are reassigned, so iterating over items() stays valid.
for query_id, record in tqdm(queries.items(), desc='Generating Query Embeddings'):
    text = record['text']
    queries[query_id] = {'text': text, 'embedding': get_embedding(text)}

In [None]:
# Attach an embedding to every document record in place.
for doc_id, doc_record in tqdm(docs.items(), desc='Generating Documents Embeddings'):
    doc_record['embedding'] = get_embedding(doc_record['text'])

In [30]:
# Pickle locations for the embedding-augmented queries and documents.
embeddings_dir = './openai_embeddings/nfcorpus'
query_file_path = f'{embeddings_dir}/query_embeddings.pkl'
docs_file_path = f'{embeddings_dir}/doc_embeddings.pkl'

In [None]:
# Persist the queries together with their embeddings. The parent directory is
# created first so the write does not fail with FileNotFoundError; `or "."`
# covers a path that has no directory component.
os.makedirs(os.path.dirname(query_file_path) or ".", exist_ok=True)
with open(query_file_path, "wb") as f:
    pickle.dump(queries, f)

In [None]:
# Persist the documents together with their embeddings. The parent directory
# is created first so the write does not fail with FileNotFoundError; `or "."`
# covers a path that has no directory component.
os.makedirs(os.path.dirname(docs_file_path) or ".", exist_ok=True)
with open(docs_file_path, "wb") as f:
    pickle.dump(docs, f)

In [None]:
# Create VectorDB Index

In [32]:
# Read the document records (with embeddings) back from the pickle on disk.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook.
with open(docs_file_path, mode='rb') as doc_pickle:
    loaded_docs = pickle.load(doc_pickle)

print("Document embeddings loaded successfully.")

Document embeddings loaded successfully.


In [33]:
# Pair each key of loaded_docs with its stored embedding.
data = [
    (doc_key, doc_record["embedding"])
    for doc_key, doc_record in loaded_docs.items()
]

In [36]:
# Build the FAISS store from the precomputed pairs in `data`.
# NOTE(review): confirm that the installed langchain FAISS wrapper honours
# DistanceStrategy.DOT_PRODUCT; some versions appear to build an inner-product
# index only for MAX_INNER_PRODUCT and otherwise fall back to L2 — TODO verify.
embedding_model = OpenAIEmbeddings()
faiss_vs = FAISS.from_embeddings(
    distance_strategy=DistanceStrategy.DOT_PRODUCT,
    embedding=embedding_model,
    text_embeddings=data,
)

In [39]:
# Persist the FAISS store under the local vectordb folder.
faiss_index_dir = "./vectordb/faiss/nfcorpus/"
faiss_vs.save_local(faiss_index_dir)