In [1]:
import os
import openai
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import pickle
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from operator import itemgetter
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema.runnable import RunnablePassthrough
from langchain.vectorstores import FAISS
import numpy as np

In [2]:
load_dotenv()

# Uncomment to use the default DaVinci Model 
# llm = OpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"))
# model = 'text-davinci-003'

# Uncomment to use the gpt-3.5-turbo-instruct model 
llm = OpenAI(model_name='gpt-3.5-turbo-instruct', openai_api_key = os.getenv("OPENAI_API_KEY"))
model = 'gpt-3.5-turbo-instruct'

In [3]:
import ir_datasets


dataset = ir_datasets.load("beir/nfcorpus/test")


queries = {}
for query in dataset.queries_iter():
    queries[query.query_id] = {"text":query.text}

docs = {}
count = 0
for doc in dataset.docs_iter():
    docs[doc.doc_id] = {"text": doc.text}
    count += 1

rel_set = {}
for qrel in dataset.qrels_iter():
    if qrel.query_id not in rel_set:
        rel_set[qrel.query_id] = []
    if qrel.relevance > 0: 
        rel_set[qrel.query_id].append(qrel.doc_id)

[INFO] [starting] opening zip file
[INFO] [finished] opening zip file [14ms]


In [4]:
docs_file_path = './backups/openai_embeddings/doc_embeddings_d2.pkl'

with open(docs_file_path, 'rb') as file:
    docs = pickle.load(file)

print("Document embeddings loaded successfully.")

Document embeddings loaded successfully.


In [5]:
queries_file_path = './backups/openai_embeddings/query_embeddings_d2.pkl'

with open(queries_file_path, 'rb') as file:
    queries = pickle.load(file)

print("Queries embeddings loaded successfully.")

Queries embeddings loaded successfully.


In [7]:
docs['MED-10']['text']

'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years) 6,011 participants died, of which 3,619 (60.2%) was due to breast cancer. After adjustment for age, tumor characteri

In [8]:
lsi_reduced_docs_file_path = './backups/documents_d2_lsi.pkl'
with open(lsi_reduced_docs_file_path, 'rb') as file:
    documents_reduced = pickle.load(file)

In [9]:
# documents_reduced

In [10]:
lsi_reduced_queries_file_path = './backups/queries_d2_lsi.pkl'
with open(lsi_reduced_queries_file_path, 'rb') as file:
    queries_reduced = pickle.load(file)

In [23]:
# queries_reduced

In [11]:
import numpy as np
from tqdm import tqdm

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Calculate cosine similarity for each query-document pair
similarity_scores = {}
for query_id, query_vector in tqdm(queries_reduced.items(), desc='Computing similarity scores'):
    scores = []
    for doc_id, doc_vector in documents_reduced.items():
        sim_score = cosine_similarity(query_vector, doc_vector)
        scores.append((doc_id, sim_score))
    similarity_scores[query_id] = sorted(scores, key=lambda x: x[1], reverse=True)

# Example usage
# print(similarity_scores['some_query_id'])  # Replace 'some_query_id' with an actual query id

Computing similarity scores: 100%|███████████████████████████████████| 323/323 [00:03<00:00, 99.15it/s]


In [12]:
# Initialize predictions as a dictionary
predictions = {}

# Iterate over similarity scores
for query_id, scores in similarity_scores.items():
    # Flatten the scores to get only document IDs
    scores_flattened = [doc_id for doc_id, _ in scores]
    predictions[query_id] = scores_flattened

# Example usage
# print(predictions['some_query_id'])  # Replace 'some_query_id' with an actual query id


In [15]:
# # Load index from file
# loaded_faiss_vs = FAISS.load_local(
#     folder_path="./backups/faiss/",
#     embeddings=OpenAIEmbeddings())

# retriever = loaded_faiss_vs.as_retriever(search_kwargs={'k': 10})

# Define the RAG pipeline
template = """
Answer the question or Explain the topic given this additional context: {context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

In [16]:
def format_docs(_docs):
    ls = []
    for doc in _docs:
        if doc.page_content in docs:
            ls.append(docs[doc.page_content]["text"])
    return ls
    # return [docs[int(doc.page_content)]["body"] for doc in _docs]

In [17]:
chain = ({"context": RunnablePassthrough(), "question": RunnablePassthrough()} 
         | prompt 
         | llm 
         | StrOutputParser())

In [18]:
# predictions

In [22]:
mquery_id = 'PLAIN-2'
context_str = '\n\n\n'.join([docs[doc_id]['text'] for doc_id in predictions[mquery_id]])[:4096]
input_data = {"context": context_str, "query": queries[mquery_id]['text']}
print(context_str)
chain.invoke(input_data)

The relation between diet, lifestyle, and acute myeloid leukemia was assessed in a US cohort of 491,163 persons from the NIH–AARP Diet and Health Study (1995–2003). A total of 338 incident cases of acute myeloid leukemia were ascertained. Multivariate Cox models were utilized to estimate hazard ratios and 95% confidence intervals. Compared with those for never smokers, hazard ratios were 1.29 (95% confidence interval: 0.95, 1.75), 1.79 (95% confidence interval: 1.32, 2.42), 2.42 (95% confidence interval: 1.63, 3.57), and 2.29 (85% confidence interval: 1.38, 3.79) for former smokers who smoked ≤1 or >1 pack/day and for current smokers who smoked ≤1 or >1 pack/day, respectively. Higher meat intake was associated with an increased risk of acute myeloid leukemia (hazard ratio = 1.45, 95% confidence interval: 1.02, 2.07 for the fifth vs. first quintile; P for trend = 0.06); however, there were no clear effects of meat-cooking method or doneness level. Individuals who did not drink coffee ap

'Answer: No, the study did not find any association between cholesterol statin drugs and breast cancer. The study focused on the relationship between diet and various types of cancer, including acute myeloid leukemia and kidney stones. The study did not mention cholesterol statin drugs in relation to breast cancer.'

In [23]:
#### API CALL WARNING #####

rag_responses = {}
loq = []
count = 0
# Run RAG pipeline for every question
for query_id in tqdm(rel_set.keys(), desc = 'Asking Queries to ChatGPT with RAG'):
    query_text = queries[query_id]['text']
    context_str = '\n\n\n'.join([docs[doc_id]['text'] for doc_id in predictions[query_id]])[:4000]
    input_data = {"context": context_str, "query": query_text}
    response = chain.invoke(input_data)
    rag_responses[query_id] = response

Asking Queries to ChatGPT with RAG: 100%|████████████████████████████| 323/323 [07:03<00:00,  1.31s/it]


In [24]:
print(queries['PLAIN-2'])

{'text': 'Do Cholesterol Statin Drugs Cause Breast Cancer?', 'embedding': [-0.008303596638143063, -0.020086539909243584, 0.017048802226781845, -0.025426000356674194, -0.02621554397046566, 0.015536624938249588, -0.020153451710939407, 0.013576146215200424, -0.021116962656378746, -0.0008715096046216786, 0.012746455147862434, 0.029333574697375298, 0.0013909025583416224, 0.025894373655319214, -0.00315650412812829, 0.03990544006228447, 0.037978414446115494, 0.005416407249867916, 0.0188018586486578, -0.014412527903914452, -0.01734320819377899, 0.023887058719992638, 0.019015971571207047, 0.020595060661435127, 0.022241059690713882, -0.010886343196034431, 0.02427513897418976, -0.03080560639500618, -0.020648589357733727, 0.011240968480706215, 0.028450356796383858, 0.017691142857074738, -0.016339551657438278, -0.014305470511317253, -0.01283343881368637, 0.01117405854165554, 0.012739764526486397, -0.008999465964734554, 0.0014870865270495415, -0.0008401453378610313, -0.0032936707139015198, 0.0054699

In [25]:
#### DUMP OVERWRITE WARNING ####

rag_responses_file_path = './backups/lsi_openai_with_rag_responses_d2_' + model + '.pkl'
with open(rag_responses_file_path, 'wb') as file:
    pickle.dump(rag_responses, file)

print(f"RAG responses saved to {rag_responses_file_path}")

RAG responses saved to ./backups/lsi_openai_with_rag_responses_d2_gpt-3.5-turbo-instruct.pkl


In [26]:
openai_with_rag_responses_file_path = './backups/lsi_openai_with_rag_responses_d2_' + model + '.pkl'
with open(openai_with_rag_responses_file_path, 'rb') as file:
    rag_responses = pickle.load(file)

In [27]:
 # Sanity
rag_responses['PLAIN-2']

'Answer: No, the study does not mention anything about the relation between cholesterol statin drugs and breast cancer. The study only mentions the association between diet and lifestyle factors, such as smoking and meat intake, and the risk of acute myeloid leukemia, kidney stones, and primary open-angle glaucoma. '

In [28]:
docs['MED-10']['text']

'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years) 6,011 participants died, of which 3,619 (60.2%) was due to breast cancer. After adjustment for age, tumor characteri

In [29]:
# Implement BLEU evaluation function
def compute_bleu(references, candidate):
    smoothing = SmoothingFunction().method5
    return sentence_bleu(references, candidate, smoothing_function=smoothing)

# Implement ROUGE evaluation function
def compute_rouge(references, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    total_score = 0

    # Compute ROUGE for each reference
    for reference in references:
        scores = scorer.score(reference, candidate)
        total_score += scores['rouge1'].fmeasure

    # Calculate average score
    average_score = total_score / len(references)
    return average_score

# Evaluate BLEU and ROUGE for each query

K = 15 # Number of most relevant docs to consider for scoring performance
total_bleu_score = 0.0
total_rouge_score = 0.0
num_queries = 0

for query_id, relevant_docs in rel_set.items():
    query_text = queries[query_id]['text']
    response = rag_responses[query_id]

    # print(query_id, "\n\n", query_text, "\n\nResponse:\n", response, "\nTopmost relevant Doc:\n", docs[relevant_docs[0]]['text'], "\n======\n")
    
    # Evaluate using BLEU
    bleu_score = compute_bleu([docs[id]['text'] for id in relevant_docs[:K]], response)
    total_bleu_score += bleu_score

    # Evaluate using ROUGE
    rouge_score = compute_rouge([docs[id]['text'] for id in relevant_docs[:K]], response)
    total_rouge_score += rouge_score

    num_queries += 1
    if num_queries == 101:
        break

# Calculate mean scores
mean_bleu_score = total_bleu_score / num_queries
mean_rouge_score = total_rouge_score / num_queries

print(f"Mean BLEU Score: {mean_bleu_score:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score:.4f}")

Mean BLEU Score: 0.4709
Mean ROUGE Score: 0.1713


In [18]:
# da-vinci
# Mean BLEU Score: 0.8224
# Mean ROUGE Score: 0.2105

# Mean BLEU Score: 0.8377
# Mean ROUGE Score: 0.2226

# gpt-3.5-turbo-instruct

# Mean BLEU Score: 0.7869
# Mean ROUGE Score: 0.2407

# Mean BLEU Score: 0.7937
# Mean ROUGE Score: 0.2425
