In [69]:
import os
import openai
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import pickle
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from operator import itemgetter
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema.runnable import RunnablePassthrough
from langchain.vectorstores import FAISS
import numpy as np
import time
from langchain.vectorstores import Annoy, FAISS

In [71]:
load_dotenv()

True

In [26]:
# Load CISI documents & queries

with open("./cisi/dataset/documents.pkl", "rb") as f:
    documents = pickle.load(f)

print("Documents loaded succesfully.")

with open("./cisi/dataset/queries.pkl", "rb") as f:
    queries = pickle.load(f)

print("Queries loaded succesfully.")

# Load CISI ground truth
with open("./cisi/dataset/rel_set.pkl", "rb") as f:
    ground_truth = pickle.load(f)

print("Ground truth loaded succesfully.")

Ground truth loaded succesfully.


In [33]:
ground_truth = list(ground_truth.values()) 

In [27]:
# Load LSI predictions
with open("./cisi/ir_techniques/exact_search/index.pkl", "rb") as f:
    lsi_predictions = pickle.load(f)

print("LSI Predictions loaded succesfully.")

LSI Predictions loaded succesfully.


In [28]:
lsi_predictions = list(lsi_predictions.values())

## Information Retrieval - Experimental Results

#### LSI

In [29]:
def precision_at_k(ranked_docs, relevant_docs, k=10):
    retrieved_relevant = 0
    for doc_id in ranked_docs[:k]:
        if doc_id in relevant_docs:
            retrieved_relevant += 1
    return retrieved_relevant / k

def recall_at_k(ranked_docs, relevant_docs, k=10):
    retrieved_relevant = sum(1 for doc_id in ranked_docs[:k] if doc_id in relevant_docs)
    return retrieved_relevant / len(relevant_docs) if relevant_docs else 0

In [36]:
mean_precision_at_k = np.mean([precision_at_k(preds,label) for preds,label in zip(lsi_predictions,ground_truth)])
mean_recall_at_k = np.mean([recall_at_k(preds,label) for preds,label in zip(lsi_predictions,ground_truth)])
mean_sps = np.mean([precision_at_k(preds,label,1) for preds,label in zip(lsi_predictions,ground_truth)])

In [37]:
mean_precision_at_k, mean_recall_at_k, mean_sps

(0.014473684210526317, 0.005305212696304631, 0.013157894736842105)

#### OpenAI Embeddings

In [46]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [47]:
# Load OpenAI embeddings
with open("./cisi/embeddings/text-embedding-ada-002-v2/documents.pkl", "rb") as f:
    documents_openai = pickle.load(f)

with open("./cisi/embeddings/text-embedding-ada-002-v2/queries.pkl", "rb") as f:
    queries_openai = pickle.load(f)

In [66]:
# queries_openai

In [55]:
# Calculate cosine similarity for each query-document pair
similarity_scores = {}
execution_times = [] 
for query_id, query in tqdm(queries_openai.items(), desc = 'Computing similarity scores'):
    query_embedding = query['embedding']
    scores = []
    start_time = time.time()
    for doc in documents_openai:
        doc_embedding = documents_openai[doc]['embedding']
        sim_score = cosine_similarity(query_embedding, doc_embedding)
        scores.append((doc, sim_score))
    
    end_time = time.time()  # Record end time
    execution_time = (end_time - start_time) * 1000  # Convert to milliseconds
    execution_times.append(execution_time)
    similarity_scores[query_id] = sorted(scores, key=lambda x: x[1], reverse=True)

mean_execution_time = sum(execution_times) / len(execution_times)
print(f"Mean execution time for all queries: {mean_execution_time:.2f} ms")

Computing similarity scores: 100%|████████████| 112/112 [00:28<00:00,  3.95it/s]

Mean execution time for all queries: 252.24 ms





In [60]:
openai_predictions = {}
for query_id, scores in similarity_scores.items():
    # Flatten the scores to get only document IDs
    scores_flattened = [doc_id for doc_id, _ in scores]
    openai_predictions[query_id] = scores_flattened

In [64]:
openai_predictions = list(openai_predictions.values())

In [65]:
mean_precision_at_k = np.mean([precision_at_k(preds,label) for preds,label in zip(openai_predictions,ground_truth)])
mean_recall_at_k = np.mean([recall_at_k(preds,label) for preds,label in zip(openai_predictions,ground_truth)])
mean_sps = np.mean([precision_at_k(preds,label,1) for preds,label in zip(openai_predictions,ground_truth)])

In [67]:
mean_precision_at_k, mean_recall_at_k, mean_sps

(0.1828947368421053, 0.047888229233635914, 0.19736842105263158)

#### VectorDB - FAISS

In [102]:
# Load index from file
loaded_faiss_vs = FAISS.load_local(
    folder_path="./cisi/ir_techniques/faiss/",
    embeddings=OpenAIEmbeddings())

In [103]:
# Calculate cosine similarity for each query-document pair
faiss_similarity_scores = {}
execution_times = [] 
for query_id, query in tqdm(queries_openai.items(), desc="Computing similarity scores"):
    start_time = time.time()
    scores = loaded_faiss_vs.similarity_search_with_score_by_vector(query["embedding"], k=100)
    end_time = time.time()  # Record end time
    execution_time = (end_time - start_time) * 1000  # Convert to milliseconds
    execution_times.append(execution_time)

    new_scores = []
    for score in scores:
        new_scores.append((int(score[0].page_content), 1 - score[1]))
    faiss_similarity_scores[query_id] = sorted(new_scores, key=lambda x: x[1], reverse=True)

mean_execution_time = sum(execution_times) / len(execution_times)
print(f"Mean execution time for all queries: {mean_execution_time:.2f} ms")

Computing similarity scores: 100%|███████████| 112/112 [00:00<00:00, 646.63it/s]

Mean execution time for all queries: 1.40 ms





In [104]:
faiss_predictions = [0]*(len(faiss_similarity_scores)+1)
for idx, scores in faiss_similarity_scores.items():
    scores_flattened = [doc for doc, score in scores]
    faiss_predictions[idx] = scores_flattened

In [106]:
faiss_mean_precision_at_k = np.mean(
    [precision_at_k(preds, label) for preds, label in zip(faiss_predictions[1:], ground_truth)])
faiss_mean_recall_at_k = np.mean(
    [recall_at_k(preds, label) for preds, label in zip(faiss_predictions[1:], ground_truth)])
faiss_mean_sps = np.mean(
    [precision_at_k(preds, label, 1) for preds, label in zip(faiss_predictions[1:], ground_truth)])

In [107]:
faiss_mean_precision_at_k, faiss_mean_recall_at_k, faiss_mean_sps

(0.1828947368421053, 0.047888229233635914, 0.19736842105263158)

#### VectorDB - ANNOY

In [72]:
# Load ANNOY index from file
loaded_annoy_vs = Annoy.load_local(
    folder_path="./cisi/ir_techniques/annoy/", 
    embeddings=OpenAIEmbeddings())

In [87]:
# Calculate cosine similarity for each query-document pair
annoy_similarity_scores = {}
execution_times = [] 
for query_id, query in tqdm(queries_openai.items(), desc="Computing similarity scores"):
    start_time = time.time()
    scores = loaded_annoy_vs.similarity_search_with_score_by_vector(query["embedding"], k=100)
    end_time = time.time()  # Record end time
    execution_time = (end_time - start_time) * 1000  # Convert to milliseconds
    execution_times.append(execution_time)

    new_scores = []
    for score in scores:
        new_scores.append((int(score[0].page_content), score[1]))
    annoy_similarity_scores[query_id] = sorted(new_scores, key=lambda x: x[1], reverse=True)

mean_execution_time = sum(execution_times) / len(execution_times)
print(f"Mean execution time for all queries: {mean_execution_time:.2f} ms")

Computing similarity scores: 100%|███████████| 112/112 [00:00<00:00, 323.47it/s]

Mean execution time for all queries: 3.03 ms





In [97]:
annoy_predictions = [0]*(len(annoy_similarity_scores)+1)
for idx, scores in annoy_similarity_scores.items():
    scores_flattened = [doc for doc, score in scores]
    annoy_predictions[idx] = scores_flattened

In [99]:
annoy_mean_precision_at_k = np.mean(
    [precision_at_k(preds, label) for preds, label in zip(annoy_predictions[1:], ground_truth)])
annoy_mean_recall_at_k = np.mean(
    [recall_at_k(preds, label) for preds, label in zip(annoy_predictions[1:], ground_truth)])
annoy_mean_sps = np.mean(
    [precision_at_k(preds, label, 1) for preds, label in zip(annoy_predictions[1:], ground_truth)])

In [100]:
annoy_mean_precision_at_k, annoy_mean_recall_at_k, annoy_mean_sps

(0.1828947368421053, 0.047888229233635914, 0.19736842105263158)

## RAG Evaluation

### CISI Dataset

In [141]:
# Load CISI ground truth
with open("./cisi/dataset/rel_set.pkl", "rb") as f:
    ground_truth = pickle.load(f)

print("Ground truth loaded succesfully.")

Ground truth loaded succesfully.


In [113]:
# Load davinci-0.0.3 responses
with open("./cisi/responses/da-vinci-0.0.3/llm_w_rag_faiss.pkl", "rb") as f:
    da_vinci_llm_w_rag = pickle.load(f)

with open("./cisi/responses/da-vinci-0.0.3/llm_wo_rag.pkl", "rb") as f:
    da_vinci_llm_wo_rag = pickle.load(f)

print("da-vinci-0.0.3 responses with & without RAG loaded succesfully.")

da-vinci-0.0.3 responses with & without RAG loaded succesfully.


In [114]:
# Load gpt-3.5-turbo-instruct responses
with open("./cisi/responses/gpt-3.5-turbo-instruct/llm_w_rag_faiss.pkl", "rb") as f:
    gpt_llm_w_rag_faiss = pickle.load(f)

with open("./cisi/responses/gpt-3.5-turbo-instruct/llm_w_rag_exact_search.pkl", "rb") as f:
    gpt_llm_w_rag_exact_search = pickle.load(f)
    
with open("./cisi/responses/gpt-3.5-turbo-instruct/llm_wo_rag.pkl", "rb") as f:
    gpt_llm_wo_rag = pickle.load(f)

print("gpt-3.5-turbo-instruct responses with & without RAG loaded succesfully.")

gpt-3.5-turbo-instruct responses with & without RAG loaded succesfully.


In [115]:
# Load llama-7b responses
with open("./cisi/responses/llama-7b/llm_w_rag_faiss.pkl", "rb") as f:
    llama_7b_llm_w_rag = pickle.load(f)

with open("./cisi/responses/llama-7b/llm_wo_rag.pkl", "rb") as f:
    llama_7b_llm_wo_rag = pickle.load(f)

print("llama-7b responses with & without RAG loaded succesfully.")

llama-7b responses with & without RAG loaded succesfully.


In [187]:
# Implement BLEU evaluation function
def compute_bleu(references, candidate):
    smoothing = SmoothingFunction().method5
    return sentence_bleu(references, candidate, smoothing_function=smoothing)

# Implement ROUGE evaluation function
def compute_rouge(references, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    total_score = 0

    # Compute ROUGE for each reference
    for reference in references:
        scores = scorer.score(reference, candidate)
        total_score += scores['rouge1'].fmeasure

    # Calculate average score
    if references:
        average_score = total_score / len(references)
    else:
        average_score = 0
    return average_score



def compute_mean_bleu_score(rag_responses, relevant_docs, K = 10):
    total_bleu_score = 0.0
    num_queries = 0
    for query_id, relevant_docs in ground_truth.items():
        query_text = queries[query_id]['text']
        response = rag_responses[query_id]['response']
        if relevant_docs:
            bleu_score = compute_bleu([documents[id]['text'] for id in relevant_docs[:K]], response)
        else:
            bleu_score = 0
        total_bleu_score += bleu_score
        num_queries += 1
        if num_queries == 101:
            break
    mean_bleu_score = total_bleu_score / num_queries
    return mean_bleu_score

def compute_mean_rouge_score(rag_responses, relevant_docs, K = 10):
    total_rouge_score = 0.0
    num_queries = 0
    for query_id, relevant_docs in ground_truth.items():
        query_text = queries[query_id]['text']
        response = rag_responses[query_id]['response']
        rouge_score = compute_rouge([documents[id]['text'] for id in relevant_docs[:K]], response)
        total_rouge_score += rouge_score
        num_queries += 1
        if num_queries == 101:
            break
    mean_rouge_score = total_rouge_score / num_queries
    return mean_rouge_score

#### da-vinci-0.0.3

In [192]:
# Compute for davinci with without RAG

mean_bleu_score_davinci_wo_rag = compute_mean_bleu_score(da_vinci_llm_wo_rag, ground_truth)
mean_bleu_score_davinci_w_rag = compute_mean_bleu_score(da_vinci_llm_w_rag, ground_truth)
mean_rouge_score_davinci_wo_rag = compute_mean_rouge_score(da_vinci_llm_wo_rag, ground_truth)
mean_rouge_score_davinci_w_rag = compute_mean_rouge_score(da_vinci_llm_w_rag, ground_truth)

In [201]:
print("Without RAG\n============")
print(f"Mean BLEU Score: {mean_bleu_score_davinci_wo_rag:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score_davinci_wo_rag:.4f}")

print("\nWith RAG:\n============")
print(f"Mean BLEU Score: {mean_bleu_score_davinci_w_rag:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score_davinci_w_rag:.4f}")

Without RAG
Mean BLEU Score: 0.6076
Mean ROUGE Score: 0.1759
With RAG:
Mean BLEU Score: 0.8368
Mean ROUGE Score: 0.2370


#### gpt-3.5-turbo-instruct

In [194]:
# Compute for gpt without RAG

mean_bleu_score_gpt_wo_rag = compute_mean_bleu_score(gpt_llm_wo_rag, ground_truth)
mean_rouge_score_gpt_wo_rag = compute_mean_rouge_score(gpt_llm_wo_rag, ground_truth)

##### NLA Method (LSI + Truncated SVD) for RAG

In [195]:
mean_bleu_score_gpt_w_lsi_rag = compute_mean_bleu_score(gpt_llm_w_rag_exact_search, ground_truth)
mean_rouge_score_gpt_w_lsi_rag = compute_mean_rouge_score(gpt_llm_w_rag_exact_search, ground_truth)

##### SOTA Method (VectorDB i.e. FAISS) for RAG

In [196]:
mean_bleu_score_gpt_w_faiss_rag = compute_mean_bleu_score(gpt_llm_w_rag_faiss, ground_truth)
mean_rouge_score_gpt_w_faiss_rag = compute_mean_rouge_score(gpt_llm_w_rag_faiss, ground_truth)

In [199]:
print("Without RAG:\n============")
print(f"Mean BLEU Score: {mean_bleu_score_gpt_wo_rag:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score_gpt_wo_rag:.4f}")

print("\nWith LSI RAG:\n============")
print(f"Mean BLEU Score: {mean_bleu_score_gpt_w_lsi_rag:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score_gpt_w_lsi_rag:.4f}")

print("\nWith VectorDB RAG:\n============")
print(f"Mean BLEU Score: {mean_bleu_score_gpt_w_faiss_rag:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score_gpt_w_faiss_rag:.4f}")

Without RAG:
Mean BLEU Score: 0.8429
Mean ROUGE Score: 0.2724

With LSI RAG
Mean BLEU Score: 0.6967
Mean ROUGE Score: 0.2222

With VectorDB RAG
Mean BLEU Score: 0.8273
Mean ROUGE Score: 0.2675


#### llama-7b

In [203]:
# Compute for llama-7b with without RAG

mean_bleu_score_llama_7b_wo_rag = compute_mean_bleu_score(llama_7b_llm_wo_rag, ground_truth)
mean_bleu_score_llama_7b_w_rag = compute_mean_bleu_score(llama_7b_llm_w_rag, ground_truth)
mean_rouge_score_llama_7b_wo_rag = compute_mean_rouge_score(llama_7b_llm_wo_rag, ground_truth)
mean_rouge_score_llama_7b_w_rag = compute_mean_rouge_score(llama_7b_llm_w_rag, ground_truth)

In [204]:
print("Without RAG\n============")
print(f"Mean BLEU Score: {mean_bleu_score_llama_7b_wo_rag:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score_llama_7b_wo_rag:.4f}")

print("\nWith RAG:\n============")
print(f"Mean BLEU Score: {mean_bleu_score_llama_7b_w_rag:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score_llama_7b_w_rag:.4f}")

Without RAG
Mean BLEU Score: 0.6943
Mean ROUGE Score: 0.2218

With RAG:
Mean BLEU Score: 0.8008
Mean ROUGE Score: 0.2613


### arguAna Dataset

In [210]:
# Load arguAna documents & queries

with open("./arguana/dataset/documents.pkl", "rb") as f:
    documents = pickle.load(f)

print("Documents loaded succesfully.")

with open("./arguana/dataset/queries.pkl", "rb") as f:
    queries = pickle.load(f)

print("Queries loaded succesfully.")

# Load CISI ground truth
with open("./arguana/dataset/rel_set.pkl", "rb") as f:
    ground_truth = pickle.load(f)

print("Ground truth loaded succesfully.")

Documents loaded succesfully.
Queries loaded succesfully.
Ground truth loaded succesfully.


In [211]:
# Load gpt-3.5-turbo-instruct responses
with open("./arguana/responses/gpt-3.5-turbo-instruct/llm_w_rag_faiss.pkl", "rb") as f:
    gpt_llm_w_rag_faiss = pickle.load(f)

with open("./arguana/responses/gpt-3.5-turbo-instruct/llm_w_rag_exact_search.pkl", "rb") as f:
    gpt_llm_w_rag_exact_search = pickle.load(f)
    
print("gpt-3.5-turbo-instruct responses with & without RAG loaded succesfully.")

gpt-3.5-turbo-instruct responses with & without RAG loaded succesfully.


#### gpt-3.5-turbo-instruct

##### NLA Method (LSI + Truncated SVD) for RAG

In [212]:
mean_bleu_score_gpt_w_lsi_rag = compute_mean_bleu_score(gpt_llm_w_rag_exact_search, ground_truth)
mean_rouge_score_gpt_w_lsi_rag = compute_mean_rouge_score(gpt_llm_w_rag_exact_search, ground_truth)

##### SOTA Method (VectorDB i.e. FAISS) for RAG

In [213]:
mean_bleu_score_gpt_w_faiss_rag = compute_mean_bleu_score(gpt_llm_w_rag_faiss, ground_truth)
mean_rouge_score_gpt_w_faiss_rag = compute_mean_rouge_score(gpt_llm_w_rag_faiss, ground_truth)

In [215]:
print("\nWith LSI RAG:\n============")
print(f"Mean BLEU Score: {mean_bleu_score_gpt_w_lsi_rag:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score_gpt_w_lsi_rag:.4f}")

print("\nWith VectorDB RAG:\n============")
print(f"Mean BLEU Score: {mean_bleu_score_gpt_w_faiss_rag:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score_gpt_w_faiss_rag:.4f}")


With LSI RAG:
Mean BLEU Score: 0.2721
Mean ROUGE Score: 0.2289

With VectorDB RAG:
Mean BLEU Score: 0.2906
Mean ROUGE Score: 0.2544


### NFCorpus Dataset

In [216]:
# Load arguAna documents & queries

with open("./nfcorpus/dataset/documents.pkl", "rb") as f:
    documents = pickle.load(f)

print("Documents loaded succesfully.")

with open("./nfcorpus/dataset/queries.pkl", "rb") as f:
    queries = pickle.load(f)

print("Queries loaded succesfully.")

# Load CISI ground truth
with open("./nfcorpus/dataset/rel_set.pkl", "rb") as f:
    ground_truth = pickle.load(f)

print("Ground truth loaded succesfully.")

Documents loaded succesfully.
Queries loaded succesfully.
Ground truth loaded succesfully.


In [217]:
# Load gpt-3.5-turbo-instruct responses
with open("./nfcorpus/responses/gpt-3.5-turbo-instruct/llm_w_rag_faiss.pkl", "rb") as f:
    gpt_llm_w_rag_faiss = pickle.load(f)

with open("./nfcorpus/responses/gpt-3.5-turbo-instruct/llm_w_rag_exact_search.pkl", "rb") as f:
    gpt_llm_w_rag_exact_search = pickle.load(f)
    
print("gpt-3.5-turbo-instruct responses with & without RAG loaded succesfully.")

gpt-3.5-turbo-instruct responses with & without RAG loaded succesfully.


#### gpt-3.5-turbo-instruct

##### NLA Method (LSI + Truncated SVD) for RAG

In [None]:
mean_bleu_score_gpt_w_lsi_rag = compute_mean_bleu_score(gpt_llm_w_rag_exact_search, ground_truth)
mean_rouge_score_gpt_w_lsi_rag = compute_mean_rouge_score(gpt_llm_w_rag_exact_search, ground_truth)

##### SOTA Method (VectorDB i.e. FAISS) for RAG

In [213]:
mean_bleu_score_gpt_w_faiss_rag = compute_mean_bleu_score(gpt_llm_w_rag_faiss, ground_truth)
mean_rouge_score_gpt_w_faiss_rag = compute_mean_rouge_score(gpt_llm_w_rag_faiss, ground_truth)

In [215]:
print("\nWith LSI RAG:\n============")
print(f"Mean BLEU Score: {mean_bleu_score_gpt_w_lsi_rag:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score_gpt_w_lsi_rag:.4f}")

print("\nWith VectorDB RAG:\n============")
print(f"Mean BLEU Score: {mean_bleu_score_gpt_w_faiss_rag:.4f}")
print(f"Mean ROUGE Score: {mean_rouge_score_gpt_w_faiss_rag:.4f}")


With LSI RAG:
Mean BLEU Score: 0.2721
Mean ROUGE Score: 0.2289

With VectorDB RAG:
Mean BLEU Score: 0.2906
Mean ROUGE Score: 0.2544
