In [1]:
import os
import openai
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import pickle
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from operator import itemgetter
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema.runnable import RunnablePassthrough
from langchain.vectorstores import FAISS
import numpy as np

In [17]:
load_dotenv()

# Uncomment to use the default DaVinci Model 
# llm = OpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"))
# model = 'text-davinci-003'

# Uncomment to use the gpt-3.5-turbo-instruct model 
llm = OpenAI(model_name='gpt-3.5-turbo-instruct', openai_api_key = os.getenv("OPENAI_API_KEY"))
model = 'gpt-3.5-turbo-instruct'

In [18]:
def tokenizer(text):
    return word_tokenize(text.lower())
queries = {}

idx = None
with open("../data/cisi/CISI.QRY") as f:
    for query in f.read().split(".I"):
        for i, line in enumerate(query.split("\n")):
            if not line:
                continue
            elif i == 0:
                idx = int(line)
                queries[idx] = ""
            elif not line.startswith("."):
                queries[idx] += " "+line

In [19]:
docs_file_path = './backups/openai_embeddings/doc_embeddings.pkl'

with open(docs_file_path, 'rb') as file:
    docs = pickle.load(file)

print("Document embeddings loaded successfully.")

Document embeddings loaded successfully.


In [20]:
docs[0]['body']

".W\n   The present study is a history of the DEWEY Decimal\nClassification.  The first edition of the DDC was published\nin 1876, the eighteenth edition in 1971, and future editions\nwill continue to appear as needed.  In spite of the DDC's\nlong and healthy life, however, its full story has never\nbeen told.  There have been biographies of Dewey\nthat briefly describe his system, but this is the first\nattempt to provide a detailed history of the work that\nmore than any other has spurred the growth of\nlibrarianship in this country and abroad.\n"

In [21]:
rel_set = {}
with open(os.path.join("../data/cisi/", 'CISI.REL')) as f:
    for l in f.readlines():
        qry_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[0]
        doc_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[-1]

        if qry_id in rel_set:
            rel_set[qry_id].append(doc_id)
        else:
            rel_set[qry_id] = []
            rel_set[qry_id].append(doc_id)

In [22]:
lsi_reduced_docs_file_path = './backups/documents_lsi.pkl'
with open(lsi_reduced_docs_file_path, 'rb') as file:
    documents_reduced = pickle.load(file)

In [23]:
lsi_reduced_queries_file_path = './backups/queries_lsi.pkl'
with open(lsi_reduced_queries_file_path, 'rb') as file:
    queries_reduced = pickle.load(file)

In [24]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Calculate cosine similarity for each query-document pair
similarity_scores = {}
for query_id, query in tqdm(enumerate(queries_reduced), desc = 'Computing similarity scores'):
    scores = []
    for doc_id, doc in enumerate(documents_reduced):
        sim_score = cosine_similarity(query, doc)
        scores.append((doc_id, sim_score))
    similarity_scores[query_id] = sorted(scores, key=lambda x: x[1], reverse=True)

Computing similarity scores: 112it [00:00, 232.88it/s]


In [25]:
predictions = [0]*len(similarity_scores)
for idx, scores in similarity_scores.items():
    scores_flattened = [doc for doc,score in scores]
    predictions[idx] = scores_flattened

In [26]:
# # Load index from file
# loaded_faiss_vs = FAISS.load_local(
#     folder_path="./backups/faiss/",
#     embeddings=OpenAIEmbeddings())

# retriever = loaded_faiss_vs.as_retriever(search_kwargs={'k': 10})

# Define the RAG pipeline
template = """
Answer the question or Explain the topic given this additional context: {context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

In [27]:
def format_docs(_docs):
    ls = []
    for doc in _docs:
        if int(doc.page_content) in docs:
            ls.append(docs[int(doc.page_content)]["body"])
    return ls
    # return [docs[int(doc.page_content)]["body"] for doc in _docs]

In [28]:
chain = ({"context": RunnablePassthrough(), "question": RunnablePassthrough()} 
         | prompt 
         | llm 
         | StrOutputParser())

In [32]:
mquery_id = 2
# print(queries[mquery_id])
context_str = '\n\n\n'.join([docs[doc_id]['body'] for doc_id in predictions[mquery_id][:8]])
input_data = {"context": context_str, "query": queries[mquery_id]}
chain.invoke(input_data)

'Answer: The use of electronic data processing equipment has been suggested as a potential solution to the mounting arrearages and other processing difficulties faced by research libraries, such as the Library of Congress. Representatives of computer firms have conducted studies indicating that certain areas of library operations could benefit from automation.'

In [36]:
#### API CALL WARNING #####

rag_responses = {}
loq = []
# Run RAG pipeline for every question
for query_id in tqdm(rel_set.keys(), desc = 'Asking Queries to ChatGPT with RAG'):
    query_text = queries[int(query_id)]
    context_str = '\n\n\n'.join([docs[doc_id]['body'] for doc_id in predictions[int(query_id)]])[:4096]
    input_data = {"context": context_str, "query": queries[int(query_id)]}
    response = chain.invoke(input_data)
    rag_responses[query_id] = response

Asking Queries to ChatGPT with RAG: 100%|██████████████████████████████| 76/76 [05:03<00:00,  3.99s/it]


In [37]:
print(queries[61])

 Searching Biases in Large Interactive Document Retrieval Systems Blair, D.C.     The way that individuals construct and modify search queries on a large interactive document retrieval system is subject to systematic biases similar to those that have been demonstrated in experiments on judgements under uncertainty.  These biases are shared by both naive and sophisticated subjects and cause the inquirer searching for documents on a large interactive system to construct and modify queries inefficiently.  A searching algorithm is suggested that helps the inquirer to avoid the effect of these biases. (JASIS, Vol. 31, No. 4, July 1980, pp. 271-277)


In [38]:
#### DUMP OVERWRITE WARNING ####

rag_responses_file_path = './backups/lsi_openai_with_rag_responses_' + model + '.pkl'
with open(rag_responses_file_path, 'wb') as file:
    pickle.dump(rag_responses, file)

print(f"RAG responses saved to {rag_responses_file_path}")

RAG responses saved to ./backups/lsi_openai_with_rag_responses_gpt-3.5-turbo-instruct.pkl


In [39]:
openai_with_rag_responses_file_path = './backups/lsi_openai_with_rag_responses_' + model + '.pkl'
with open(openai_with_rag_responses_file_path, 'rb') as file:
    rag_responses = pickle.load(file)

In [41]:
 # Sanity
rag_responses['1']

"Answer: It can be a challenging task to create descriptive titles for articles, as they must accurately represent the content of the article while also being concise enough to catch the reader's attention. One of the main difficulties in automatically retrieving articles from approximate titles is that the titles may not always accurately reflect the content of the article. This can lead to irrelevant or incorrect results being retrieved. The usual relevance of the content of articles to their titles can vary, as some titles may accurately represent the content while others may only provide a general idea. This is why it is important for titles to be carefully chosen to accurately represent the content of the article."

In [42]:
docs[0]['body']

".W\n   The present study is a history of the DEWEY Decimal\nClassification.  The first edition of the DDC was published\nin 1876, the eighteenth edition in 1971, and future editions\nwill continue to appear as needed.  In spite of the DDC's\nlong and healthy life, however, its full story has never\nbeen told.  There have been biographies of Dewey\nthat briefly describe his system, but this is the first\nattempt to provide a detailed history of the work that\nmore than any other has spurred the growth of\nlibrarianship in this country and abroad.\n"

In [43]:
# Implement BLEU evaluation function
def compute_bleu(references, candidate):
    smoothing = SmoothingFunction().method0
    return sentence_bleu(references, candidate, smoothing_function=smoothing)

# Implement ROUGE evaluation function
def compute_rouge(references, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    total_score = 0

    # Compute ROUGE for each reference
    for reference in references:
        scores = scorer.score(reference, candidate)
        total_score += scores['rouge1'].fmeasure

    # Calculate average score
    average_score = total_score / len(references)
    return average_score

# Evaluate BLEU and ROUGE for each query

K = 30 # Number of most relevant docs to consider for scoring performance
total_bleu_score = 0.0
total_rouge_score = 0.0
num_queries = 0

for query_id, relevant_docs in rel_set.items():
    query_text = queries[int(query_id)]
    response = rag_responses[query_id]

    # print(query_id, query_text, "\n\nResponse:\n", response, "\nTopmost relevant Doc:\n", docs[int(relevant_docs[0])], "\n======\n")
    
    # Evaluate using BLEU
    bleu_score = compute_bleu([docs[int(id)]['body'] for id in relevant_docs[:K]], response)
    total_bleu_score += bleu_score

    # Evaluate using ROUGE
    rouge_score = compute_rouge([docs[int(id)]['body'] for id in relevant_docs[:K]], response)
    total_rouge_score += rouge_score

    num_queries += 1
    # if num_queries == 10:
    #     break

# Calculate mean scores
mean_bleu_score = total_bleu_score / num_queries
mean_rouge_score = total_rouge_score / num_queries

print(f"Mean BLEU Score: {mean_bleu_score:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score:.4f}")

Mean BLEU Score: 0.7441
Mean ROUGE Score: 0.2167


In [18]:
# da-vinci
# Mean BLEU Score: 0.8224
# Mean ROUGE Score: 0.2105

# Mean BLEU Score: 0.8377
# Mean ROUGE Score: 0.2226

# gpt-3.5-turbo-instruct

# Mean BLEU Score: 0.7869
# Mean ROUGE Score: 0.2407

# Mean BLEU Score: 0.7937
# Mean ROUGE Score: 0.2425
