In [27]:
import os
import openai
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import pickle
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from operator import itemgetter
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema.runnable import RunnablePassthrough
from langchain.vectorstores import FAISS

In [28]:
# Load variables from a local .env file into the process environment, then
# build the LangChain OpenAI LLM wrapper with the key read from it.
load_dotenv()
llm = OpenAI(openai_api_key=os.environ.get("OPENAI_API_KEY"))

In [29]:
def tokenizer(text):
    """Lower-case ``text`` and return the tokens produced by ``word_tokenize``."""
    lowered = text.lower()
    return word_tokenize(lowered)
# Parse the CISI query file into {query id (int): query text (str)}.
#
# A record starts with a line of the form ".I <id>". Every other line that
# starts with "." is treated as a field marker and skipped; each remaining
# non-empty line is appended to the current query's text, prefixed by a space.
queries = {}

idx = None  # id of the record being read; stays None until the first ".I" line
with open("../data/cisi/CISI.QRY") as f:
    for line in f:
        line = line.rstrip("\n")
        if not line:
            continue
        if line.startswith(".I") and line[2:].strip().isdigit():
            # Recognise the record marker only at the start of a line, so a
            # ".I" occurring inside the text (e.g. initials) cannot split a
            # record in two and break the id conversion.
            idx = int(line[2:])
            queries[idx] = ""
        elif line.startswith("."):
            # Any other dot-prefixed line is a field marker, not query text.
            continue
        elif idx is None:
            raise ValueError("CISI.QRY: text found before the first '.I' record marker")
        else:
            queries[idx] += " " + line

In [30]:
# queries

In [31]:
# Restore the previously saved document records from the local backup.
# NOTE: unpickling can execute arbitrary code, so only load files produced by this project.
docs_file_path = './backups/openai_embeddings/doc_embeddings.pkl'

with open(docs_file_path, mode='rb') as docs_pickle:
    docs = pickle.load(docs_pickle)

print("Document embeddings loaded successfully.")

Document embeddings loaded successfully.


In [32]:
# Spot-check one loaded record: display the 'body' text stored under index/key 1 of `docs`.
docs[1]['body']

'.W\nThis report is an analysis of 6300 acts of use\nin 104 technical libraries in the United Kingdom.\nLibrary use is only one aspect of the wider pattern of\ninformation use.  Information transfer in libraries is\nrestricted to the use of documents.  It takes no\naccount of documents used outside the library, still\nless of information transferred orally from person\nto person.  The library acts as a channel in only a\nproportion of the situations in which information is\ntransferred.\nTaking technical information transfer as a whole,\nthere is no doubt that this proportion is not the\nmajor one.  There are users of technical information -\nparticularly in technology rather than science -\nwho visit libraries rarely if at all, relying on desk\ncollections of handbooks, current periodicals and personal\ncontact with their colleagues and with people in other\norganizations.  Even regular library users also receive\ninformation in other ways.\n'

In [33]:
# Build {query id (str): [document id (str), ...]} from the CISI.REL file.
# Only the text before the first tab of each line is used: its first
# space-separated token is the query id and its last one the document id.
rel_set = {}
rel_path = os.path.join("../data/cisi/", 'CISI.REL')
with open(rel_path) as rel_file:
    for raw_line in rel_file:
        id_field = raw_line.lstrip(" ").strip("\n").split("\t")[0]
        tokens = id_field.split(" ")
        qry_id, doc_id = tokens[0], tokens[-1]
        # Start a new list the first time a query id is seen, then append.
        rel_set.setdefault(qry_id, []).append(doc_id)

In [9]:
#### DUMP OVERWRITE WARNING ####
# Running this cell rewrites the pickle at `openai_answers_file_path`.

# Collect the document texts that will populate the RAG vector store.

K = 10 # Hyperparameter: how many of each query's listed documents (the first K in rel_set) are used for RAG IR

# The same document can be listed under several queries. Keep each id once
# (first-seen order) so the vector store is not filled with duplicate texts,
# which would be embedded repeatedly and could crowd the retriever's results
# with copies of a single passage.
unique_doc_ids = list(dict.fromkeys(
    doc_id for doc_ids in rel_set.values() for doc_id in doc_ids[:K]
))

# Ids are shifted by one when indexing `docs` — presumably 1-based CISI ids
# against a 0-based container; confirm against how `docs` was built.
doc_texts = [docs[int(doc_id) - 1]['body'] for doc_id in unique_doc_ids]

# NOTE(review): the variable name is a holdover; this path stores the RAG
# context texts. The name is kept so any cell that refers to it still works.
openai_answers_file_path = './backups/rag_contexts/cisi/doc_texts.pkl'

# Create the target folder if needed so the dump does not fail on a fresh checkout.
os.makedirs(os.path.dirname(openai_answers_file_path), exist_ok=True)
with open(openai_answers_file_path, 'wb') as file:
    pickle.dump(doc_texts, file)

In [36]:
# Reload the saved RAG context texts so the vector store can be rebuilt
# without recomputing them. Note that `docs_file_path` is reused here for a
# different file than the document backup loaded earlier.
docs_file_path = './backups/rag_contexts/cisi/doc_texts.pkl'

with open(docs_file_path, mode='rb') as texts_pickle:
    doc_texts = pickle.load(texts_pickle)

print("Document texts loaded successfully.")

Document texts loaded successfully.


In [37]:
# Embed every context passage with OpenAI embeddings and index the vectors in
# FAISS; the retriever view of that index supplies the prompt's context.
embedding_model = OpenAIEmbeddings()
vectorstore = FAISS.from_texts(doc_texts, embedding=embedding_model)
retriever = vectorstore.as_retriever()

# Prompt skeleton for the RAG pipeline: retrieved context first, then the question.
template = """Answer the question based only on the following context:
              {context}
              Question: {question}
            """
prompt = ChatPromptTemplate.from_template(template)

In [12]:
# Assemble the RAG chain: the incoming question is passed through unchanged and
# also sent to the retriever for context; both fill the prompt, which goes to
# the LLM, whose output is parsed into a plain string.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | llm | StrOutputParser()

In [14]:
#### API CALL WARNING #####
# Each iteration invokes the full RAG chain once for one query.

rag_responses = {}

# Ask every query that has an entry in rel_set. rel_set keys are strings,
# while `queries` is keyed by int, hence the conversion on lookup.
for query_id in tqdm(rel_set.keys(), desc = 'Asking Queries to ChatGPT with RAG'):
    rag_responses[query_id] = chain.invoke(queries[int(query_id)])

# Persist all answers in one pickle so later cells can reload them.
rag_responses_file_path = './backups/openai_with_rag_responses.pkl'
with open(rag_responses_file_path, mode='wb') as responses_pickle:
    pickle.dump(rag_responses, responses_pickle)

print(f"RAG responses saved to {rag_responses_file_path}")

RAG responses saved to ./backups/openai_with_rag_responses.pkl


In [None]:
# Reload the saved RAG answers from disk instead of calling the chain again.
openai_with_rag_responses_file_path = './backups/openai_with_rag_responses.pkl'
with open(openai_with_rag_responses_file_path, mode='rb') as responses_pickle:
    rag_responses = pickle.load(responses_pickle)

In [26]:
 # Sanity check: display the stored RAG response for query id '30' (keys are strings).
rag_responses['30']

'\nAnswer: Information dissemination by journals and periodicals is an important part of scientific libraries, as it provides access to a wealth of experimental data. In order to make the most efficient use of funds, libraries must adopt a highly selective purchasing plan to ensure that the needs of readers are met. However, the development of a national information system is beginning to encroach on the domain of the primary publication system, which may endanger orderly communication through research journals. Additionally, increased distribution of unedited, unreferred and unproofed preprints could further disrupt journals or transform them into depositories.'