In [1]:
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Annoy, FAISS
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from tqdm import tqdm

import numpy as np
import os
import pickle
import time

In [2]:
# Load env variables
load_dotenv()

True

## Information Retrieval - Metrics Calculation

In [3]:
# Load CISI documents & queries

with open("./cisi/embeddings/lsi/documents.pkl", "rb") as f:
    documents = pickle.load(f)

print("Documents loaded succesfully.")

with open("./cisi/embeddings/lsi/queries.pkl", "rb") as f:
    queries = pickle.load(f)

print("Queries loaded succesfully.")

# Load CISI ground truth
with open("./cisi/dataset/rel_set.pkl", "rb") as f:
    ground_truth = pickle.load(f)

print("Ground truth loaded succesfully.")

Documents loaded succesfully.
Queries loaded succesfully.
Ground truth loaded succesfully.


In [4]:
ground_truth = list(ground_truth.values()) 

In [12]:
def precision_at_k(ranked_docs, relevant_docs, k=10):
    retrieved_relevant = 0
    for doc_id in ranked_docs[:k]:
        if doc_id in relevant_docs:
            retrieved_relevant += 1
    return retrieved_relevant / k

def recall_at_k(ranked_docs, relevant_docs, k=10):
    retrieved_relevant = sum(1 for doc_id in ranked_docs[:k] if doc_id in relevant_docs)
    return retrieved_relevant / len(relevant_docs) if relevant_docs else 0

In [13]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

#### LSI

In [14]:
# Load LSI predictions for CISI
with open("./cisi/ir_techniques/exact_search/index.pkl", "rb") as f:
    lsi_predictions = pickle.load(f)

print("LSI Predictions loaded succesfully.")

LSI Predictions loaded succesfully.


In [15]:
lsi_predictions = list(lsi_predictions.values())

In [16]:
# print(lsi_predictions[1], ground_truth[1])

In [17]:
mean_precision_at_k_lsi = np.mean([precision_at_k(preds,label) for preds,label in zip(lsi_predictions,ground_truth)])
mean_recall_at_k_lsi = np.mean([recall_at_k(preds,label) for preds,label in zip(lsi_predictions,ground_truth)])
mean_sps_lsi = np.mean([precision_at_k(preds,label,1) for preds,label in zip(lsi_predictions,ground_truth)])

#### OpenAI Embeddings

In [19]:
# Load OpenAI embeddings
with open("./cisi/embeddings/text-embedding-ada-002-v2/documents.pkl", "rb") as f:
    documents_openai = pickle.load(f)

with open("./cisi/embeddings/text-embedding-ada-002-v2/queries.pkl", "rb") as f:
    queries_openai = pickle.load(f)

In [20]:
# queries_openai

In [21]:
# Calculate cosine similarity for each query-document pair
similarity_scores = {}
execution_times = [] 
for query_id, query in tqdm(queries_openai.items(), desc = 'Computing similarity scores'):
    query_embedding = query['embedding']
    scores = []
    start_time = time.time()
    for doc in documents_openai:
        doc_embedding = documents_openai[doc]['embedding']
        sim_score = cosine_similarity(query_embedding, doc_embedding)
        scores.append((doc, sim_score))
    
    end_time = time.time()  # Record end time
    execution_time = (end_time - start_time) * 1000  # Convert to milliseconds
    execution_times.append(execution_time)
    similarity_scores[query_id] = sorted(scores, key=lambda x: x[1], reverse=True)

mean_execution_time_openai = sum(execution_times) / len(execution_times)
print(f"Mean execution time for all queries: {mean_execution_time:.2f} ms")

Computing similarity scores: 100%|████████████| 112/112 [00:29<00:00,  3.82it/s]

Mean execution time for all queries: 260.63 ms





In [22]:
openai_predictions = {}
for query_id, scores in similarity_scores.items():
    # Flatten the scores to get only document IDs
    scores_flattened = [doc_id for doc_id, _ in scores]
    openai_predictions[query_id] = scores_flattened

In [23]:
openai_predictions = list(openai_predictions.values())

In [24]:
len(openai_predictions[0]), len(ground_truth[0])

(1460, 3114)

In [25]:
mean_precision_at_k_openai = np.mean([precision_at_k(preds,label) for preds,label in zip(openai_predictions,ground_truth)])
mean_recall_at_k_openai = np.mean([recall_at_k(preds,label) for preds,label in zip(openai_predictions,ground_truth)])
mean_sps_openai = np.mean([precision_at_k(preds,label,1) for preds,label in zip(openai_predictions,ground_truth)])

#### VectorDB - FAISS

In [27]:
# Load index from file
loaded_faiss_vs = FAISS.load_local(
    folder_path="./cisi/ir_techniques/faiss/",
    embeddings=OpenAIEmbeddings())

In [28]:
# Calculate cosine similarity for each query-document pair
faiss_similarity_scores = {}
execution_times = [] 
for query_id, query in tqdm(queries_openai.items(), desc="Computing similarity scores"):
    start_time = time.time()
    scores = loaded_faiss_vs.similarity_search_with_score_by_vector(query["embedding"], k=100)
    end_time = time.time()  # Record end time
    execution_time = (end_time - start_time) * 1000  # Convert to milliseconds
    execution_times.append(execution_time)

    new_scores = []
    for score in scores:
        new_scores.append((int(score[0].page_content), 1 - score[1]))
    faiss_similarity_scores[query_id] = sorted(new_scores, key=lambda x: x[1], reverse=True)

mean_execution_time_faiss = sum(execution_times) / len(execution_times)
print(f"Mean execution time for all queries: {mean_execution_time:.2f} ms")

Computing similarity scores: 100%|██████████| 112/112 [00:00<00:00, 1668.90it/s]

Mean execution time for all queries: 0.47 ms





In [29]:
faiss_predictions = [0]*(len(faiss_similarity_scores)+1)
for idx, scores in faiss_similarity_scores.items():
    scores_flattened = [doc for doc, score in scores]
    faiss_predictions[idx] = scores_flattened

In [30]:
faiss_mean_precision_at_k = np.mean(
    [precision_at_k(preds, label) for preds, label in zip(faiss_predictions[1:], ground_truth)])
faiss_mean_recall_at_k = np.mean(
    [recall_at_k(preds, label) for preds, label in zip(faiss_predictions[1:], ground_truth)])
faiss_mean_sps = np.mean(
    [precision_at_k(preds, label, 1) for preds, label in zip(faiss_predictions[1:], ground_truth)])

#### VectorDB - ANNOY

In [32]:
# Load ANNOY index from file
loaded_annoy_vs = Annoy.load_local(
    folder_path="./cisi/ir_techniques/annoy/", 
    embeddings=OpenAIEmbeddings())

In [33]:
# Calculate cosine similarity for each query-document pair
annoy_similarity_scores = {}
execution_times = [] 
for query_id, query in tqdm(queries_openai.items(), desc="Computing similarity scores"):
    start_time = time.time()
    scores = loaded_annoy_vs.similarity_search_with_score_by_vector(query["embedding"], k=100)
    end_time = time.time()  # Record end time
    execution_time = (end_time - start_time) * 1000  # Convert to milliseconds
    execution_times.append(execution_time)

    new_scores = []
    for score in scores:
        new_scores.append((int(score[0].page_content), score[1]))
    annoy_similarity_scores[query_id] = sorted(new_scores, key=lambda x: x[1], reverse=True)

mean_execution_time_annoy = sum(execution_times) / len(execution_times)
print(f"Mean execution time for all queries: {mean_execution_time:.2f} ms")

Computing similarity scores: 100%|███████████| 112/112 [00:00<00:00, 374.41it/s]

Mean execution time for all queries: 2.63 ms





In [34]:
annoy_predictions = [0]*(len(annoy_similarity_scores)+1)
for idx, scores in annoy_similarity_scores.items():
    scores_flattened = [doc for doc, score in scores]
    annoy_predictions[idx] = scores_flattened

In [35]:
annoy_mean_precision_at_k = np.mean(
    [precision_at_k(preds, label) for preds, label in zip(annoy_predictions[1:], ground_truth)])
annoy_mean_recall_at_k = np.mean(
    [recall_at_k(preds, label) for preds, label in zip(annoy_predictions[1:], ground_truth)])
annoy_mean_sps = np.mean(
    [precision_at_k(preds, label, 1) for preds, label in zip(annoy_predictions[1:], ground_truth)])

## RAG Evaluation - Metrics Calculation

### CISI Dataset

#### Load data

In [37]:
# Load arguAna documents & queries

with open("./cisi/dataset/documents.pkl", "rb") as f:
    documents = pickle.load(f)

print("Documents loaded succesfully.")

with open("./cisi/dataset/queries.pkl", "rb") as f:
    queries = pickle.load(f)

print("Queries loaded succesfully.")

# Load CISI ground truth
with open("./cisi/dataset/rel_set.pkl", "rb") as f:
    ground_truth = pickle.load(f)

print("Ground truth loaded succesfully.")

Documents loaded succesfully.
Queries loaded succesfully.
Ground truth loaded succesfully.


In [38]:
# Load davinci-0.0.3 responses
with open("./cisi/responses/da-vinci-0.0.3/llm_w_rag_faiss.pkl", "rb") as f:
    da_vinci_llm_w_rag = pickle.load(f)

with open("./cisi/responses/da-vinci-0.0.3/llm_wo_rag.pkl", "rb") as f:
    da_vinci_llm_wo_rag = pickle.load(f)

print("da-vinci-0.0.3 responses with & without RAG loaded succesfully.")

da-vinci-0.0.3 responses with & without RAG loaded succesfully.


In [39]:
# Load gpt-3.5-turbo-instruct responses
with open("./cisi/responses/gpt-3.5-turbo-instruct/llm_w_rag_faiss.pkl", "rb") as f:
    gpt_llm_w_rag_faiss = pickle.load(f)

with open("./cisi/responses/gpt-3.5-turbo-instruct/llm_w_rag_exact_search.pkl", "rb") as f:
    gpt_llm_w_rag_exact_search = pickle.load(f)
    
with open("./cisi/responses/gpt-3.5-turbo-instruct/llm_wo_rag.pkl", "rb") as f:
    gpt_llm_wo_rag = pickle.load(f)

print("gpt-3.5-turbo-instruct responses with & without RAG loaded succesfully.")

gpt-3.5-turbo-instruct responses with & without RAG loaded succesfully.


In [40]:
# Load llama-7b responses
with open("./cisi/responses/llama-7b/llm_w_rag_faiss.pkl", "rb") as f:
    llama_7b_llm_w_rag = pickle.load(f)

with open("./cisi/responses/llama-7b/llm_wo_rag.pkl", "rb") as f:
    llama_7b_llm_wo_rag = pickle.load(f)

print("llama-7b responses with & without RAG loaded succesfully.")

llama-7b responses with & without RAG loaded succesfully.


In [49]:
# Implement BLEU evaluation function
def compute_bleu(references, candidate):
    smoothing = SmoothingFunction().method5
    return sentence_bleu(references, candidate, smoothing_function=smoothing)

# Implement ROUGE evaluation function
def compute_rouge(references, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    total_score = 0

    # Compute ROUGE for each reference
    for reference in references:
        scores = scorer.score(reference, candidate)
        total_score += scores['rouge1'].fmeasure

    # Calculate average score
    if references:
        average_score = total_score / len(references)
    else:
        average_score = 0
    return average_score



def compute_mean_bleu_score(rag_responses, relevant_docs, K = 10):
    total_bleu_score = 0.0
    num_queries = 0
    for query_id, relevant_docs in ground_truth.items():
        if (query_id not in queries) or (query_id not in rag_responses):
            continue
        query_text = queries[query_id]['text']
        response = rag_responses[query_id]['response']
        if relevant_docs:
            bleu_score = compute_bleu([documents[id]['text'] for id in relevant_docs[:K]], response)
        else:
            bleu_score = 0
        total_bleu_score += bleu_score
        num_queries += 1
        if num_queries == 101:
            break
    mean_bleu_score = total_bleu_score / num_queries
    return mean_bleu_score

def compute_mean_rouge_score(rag_responses, relevant_docs, K = 10):
    total_rouge_score = 0.0
    num_queries = 0
    for query_id, relevant_docs in ground_truth.items():
        if (query_id not in queries) or (query_id not in rag_responses):
            continue
        query_text = queries[query_id]['text']
        response = rag_responses[query_id]['response']
        rouge_score = compute_rouge([documents[id]['text'] for id in relevant_docs[:K]], response)
        total_rouge_score += rouge_score
        num_queries += 1
        if num_queries == 101:
            break
    mean_rouge_score = total_rouge_score / num_queries
    return mean_rouge_score

#### da-vinci-0.0.3

In [50]:
# Compute for davinci with without RAG

cisi_mean_bleu_score_davinci_wo_rag = compute_mean_bleu_score(da_vinci_llm_wo_rag, ground_truth)
cisi_mean_bleu_score_davinci_w_rag = compute_mean_bleu_score(da_vinci_llm_w_rag, ground_truth)
cisi_mean_rouge_score_davinci_wo_rag = compute_mean_rouge_score(da_vinci_llm_wo_rag, ground_truth)
cisi_mean_rouge_score_davinci_w_rag = compute_mean_rouge_score(da_vinci_llm_w_rag, ground_truth)

#### gpt-3.5-turbo-instruct

In [52]:
# Compute for gpt without RAG

cisi_mean_bleu_score_gpt_wo_rag = compute_mean_bleu_score(gpt_llm_wo_rag, ground_truth)
cisi_mean_rouge_score_gpt_wo_rag = compute_mean_rouge_score(gpt_llm_wo_rag, ground_truth)

##### NLA Method (LSI + Truncated SVD) for RAG

In [53]:
cisi_mean_bleu_score_gpt_w_lsi_rag = compute_mean_bleu_score(gpt_llm_w_rag_exact_search, ground_truth)
cisi_mean_rouge_score_gpt_w_lsi_rag = compute_mean_rouge_score(gpt_llm_w_rag_exact_search, ground_truth)

##### SOTA Method (VectorDB i.e. FAISS) for RAG

In [54]:
cisi_mean_bleu_score_gpt_w_faiss_rag = compute_mean_bleu_score(gpt_llm_w_rag_faiss, ground_truth)
cisi_mean_rouge_score_gpt_w_faiss_rag = compute_mean_rouge_score(gpt_llm_w_rag_faiss, ground_truth)

#### llama-7b

In [56]:
# Compute for llama-7b with without RAG

cisi_mean_bleu_score_llama_7b_wo_rag = compute_mean_bleu_score(llama_7b_llm_wo_rag, ground_truth)
cisi_mean_bleu_score_llama_7b_w_rag = compute_mean_bleu_score(llama_7b_llm_w_rag, ground_truth)
cisi_mean_rouge_score_llama_7b_wo_rag = compute_mean_rouge_score(llama_7b_llm_wo_rag, ground_truth)
cisi_mean_rouge_score_llama_7b_w_rag = compute_mean_rouge_score(llama_7b_llm_w_rag, ground_truth)

### arguAna Dataset

#### Load Data

In [58]:
# Load arguAna documents & queries

with open("./arguana/dataset/documents.pkl", "rb") as f:
    documents = pickle.load(f)

print("Documents loaded succesfully.")

with open("./arguana/dataset/queries.pkl", "rb") as f:
    queries = pickle.load(f)

print("Queries loaded succesfully.")

# Load CISI ground truth
with open("./arguana/dataset/rel_set.pkl", "rb") as f:
    ground_truth = pickle.load(f)

print("Ground truth loaded succesfully.")

Documents loaded succesfully.
Queries loaded succesfully.
Ground truth loaded succesfully.


In [59]:
# Load gpt-3.5-turbo-instruct responses
with open("./arguana/responses/gpt-3.5-turbo-instruct/llm_w_rag_faiss.pkl", "rb") as f:
    gpt_llm_w_rag_faiss = pickle.load(f)

with open("./arguana/responses/gpt-3.5-turbo-instruct/llm_w_rag_exact_search.pkl", "rb") as f:
    gpt_llm_w_rag_exact_search = pickle.load(f)
    
print("gpt-3.5-turbo-instruct responses with & without RAG loaded succesfully.")

gpt-3.5-turbo-instruct responses with & without RAG loaded succesfully.


#### gpt-3.5-turbo-instruct

##### NLA Method (LSI + Truncated SVD) for RAG

In [60]:
arguana_mean_bleu_score_gpt_w_lsi_rag = compute_mean_bleu_score(gpt_llm_w_rag_exact_search, ground_truth)
arguana_mean_rouge_score_gpt_w_lsi_rag = compute_mean_rouge_score(gpt_llm_w_rag_exact_search, ground_truth)

##### SOTA Method (VectorDB i.e. FAISS) for RAG

In [61]:
arguana_mean_bleu_score_gpt_w_faiss_rag = compute_mean_bleu_score(gpt_llm_w_rag_faiss, ground_truth)
arguana_mean_rouge_score_gpt_w_faiss_rag = compute_mean_rouge_score(gpt_llm_w_rag_faiss, ground_truth)

### NFCorpus Dataset

#### Load Data

In [63]:
# Load arguAna documents & queries

with open("./nfcorpus/dataset/documents.pkl", "rb") as f:
    documents = pickle.load(f)

print("Documents loaded succesfully.")

with open("./nfcorpus/dataset/queries.pkl", "rb") as f:
    queries = pickle.load(f)

print("Queries loaded succesfully.")

# Load CISI ground truth
with open("./nfcorpus/dataset/rel_set.pkl", "rb") as f:
    ground_truth = pickle.load(f)

print("Ground truth loaded succesfully.")

Documents loaded succesfully.
Queries loaded succesfully.
Ground truth loaded succesfully.


In [64]:
# Load gpt-3.5-turbo-instruct responses
with open("./nfcorpus/responses/gpt-3.5-turbo-instruct/llm_w_rag_faiss.pkl", "rb") as f:
    gpt_llm_w_rag_faiss = pickle.load(f)

with open("./nfcorpus/responses/gpt-3.5-turbo-instruct/llm_w_rag_exact_search.pkl", "rb") as f:
    gpt_llm_w_rag_exact_search = pickle.load(f)
    
print("gpt-3.5-turbo-instruct responses with & without RAG loaded succesfully.")

gpt-3.5-turbo-instruct responses with & without RAG loaded succesfully.


#### gpt-3.5-turbo-instruct

##### NLA Method (LSI + Truncated SVD) for RAG

In [65]:
nfcorpus_mean_bleu_score_gpt_w_lsi_rag = compute_mean_bleu_score(gpt_llm_w_rag_exact_search, ground_truth)
nfcorpus_mean_rouge_score_gpt_w_lsi_rag = compute_mean_rouge_score(gpt_llm_w_rag_exact_search, ground_truth)

##### SOTA Method (VectorDB i.e. FAISS) for RAG

In [66]:
nfcorpus_mean_bleu_score_gpt_w_faiss_rag = compute_mean_bleu_score(gpt_llm_w_rag_faiss, ground_truth)
nfcorpus_mean_rouge_score_gpt_w_faiss_rag = compute_mean_rouge_score(gpt_llm_w_rag_faiss, ground_truth)

# Experimental Results

### Main Experiment

In [3]:
from prettytable import PrettyTable


data_cisi = [
    {
        "Dataset": "CISI",
        "Method": "LSI + SVD",
        "Mean BLEU Score": cisi_mean_bleu_score_gpt_w_lsi_rag,
        "Mean ROUGE Score": cisi_mean_rouge_score_gpt_w_lsi_rag,
    },
    {
        "Dataset": "",
        "Method": "FAISS VectorDB",
        "Mean BLEU Score": cisi_mean_bleu_score_gpt_w_faiss_rag,
        "Mean ROUGE Score": cisi_mean_rouge_score_gpt_w_faiss_rag,
    },
]


data_arguAna = [
        {
        "Dataset": "arguAna",
        "Method": "LSI + SVD",
        "Mean BLEU Score": arguana_mean_bleu_score_gpt_w_lsi_rag,
        "Mean ROUGE Score": arguana_mean_rouge_score_gpt_w_lsi_rag,
    },
    {
        "Dataset": "",
        "Method": "FAISS VectorDB",
        "Mean BLEU Score": arguana_mean_bleu_score_gpt_w_faiss_rag,
        "Mean ROUGE Score": arguana_mean_rouge_score_gpt_w_faiss_rag,
    },

]

data_nfcorpus = [
        {
        "Dataset": "nfcorpus",
        "Method": "LSI + SVD",
        "Mean BLEU Score": nfcorpus_mean_bleu_score_gpt_w_lsi_rag,
        "Mean ROUGE Score": nfcorpus_mean_rouge_score_gpt_w_lsi_rag,
    },
    {
        "Dataset": "",
        "Method": "FAISS VectorDB",
        "Mean BLEU Score": nfcorpus_mean_bleu_score_gpt_w_faiss_rag,
        "Mean ROUGE Score": nfcorpus_mean_rouge_score_gpt_w_faiss_rag,
    },

]

combined_data = data_cisi + [{"Dataset": "", "Method": "", "Mean BLEU Score": "", "Mean ROUGE Score": ""}] + data_arguAna + [{"Dataset": "", "Method": "", "Mean BLEU Score": "", "Mean ROUGE Score": ""}] +data_nfcorpus

table = PrettyTable()
table.field_names = ["Dataset", "Method", "Mean BLEU Score", "Mean ROUGE Score"]
table.float_format = "5.5" 

table.title = "RAG Methodology Comparison"

for row in combined_data:
    table.add_row([row["Dataset"], row["Method"], row["Mean BLEU Score"], row["Mean ROUGE Score"]])


print(table)

+----------------------------------------------------------------+
|                   RAG Methodology Comparison                   |
+----------+----------------+-----------------+------------------+
| Dataset  |     Method     | Mean BLEU Score | Mean ROUGE Score |
+----------+----------------+-----------------+------------------+
|   CISI   |   LSI + SVD    |     0.74410     |     0.21670      |
|          | FAISS VectorDB |     0.83770     |     0.22260      |
|          |                |                 |                  |
| arguAna  |   LSI + SVD    |     0.27210     |     0.21890      |
|          | FAISS VectorDB |     0.69370     |     0.27390      |
|          |                |                 |                  |
| nfcorpus |   LSI + SVD    |     0.47090     |     0.17130      |
|          | FAISS VectorDB |     0.60990     |     0.20240      |
+----------+----------------+-----------------+------------------+


### Plain LLMs vs RAG - Sub Experiment 1

In [4]:
from prettytable import PrettyTable

# Data without RAG
data_without_rag = [
    {"Model": "da-vinci", "BLEU Score": cisi_mean_bleu_score_davinci_wo_rag, "ROUGE Score": cisi_mean_rouge_score_davinci_wo_rag},
    {"Model": "gpt-3.5-turbo-instruct", "BLEU Score": cisi_mean_bleu_score_gpt_wo_rag, "ROUGE Score": cisi_mean_rouge_score_gpt_wo_rag},
    {"Model": "LLAMA", "BLEU Score": cisi_mean_bleu_score_llama_7b_wo_rag, "ROUGE Score": cisi_mean_rouge_score_llama_7b_wo_rag},
]

# Data with RAG
data_with_rag = [
    {"Model": "da-vinci", "BLEU Score": cisi_mean_bleu_score_davinci_w_faiss_rag, "ROUGE Score": cisi_mean_rouge_score_davinci_w_faiss_rag},
    {"Model": "gpt-3.5-turbo-instruct", "BLEU Score": cisi_mean_bleu_score_gpt_w_faiss_rag, "ROUGE Score": cisi_mean_rouge_score_gpt_w_faiss_rag},
    {"Model": "LLAMA", "BLEU Score": cisi_mean_bleu_score_llama_7b_w_faiss_rag, "ROUGE Score": cisi_mean_rouge_score_llama_7b_w_faiss_rag},
]

# Create tables
table_without_rag = PrettyTable()
table_without_rag.field_names = ["Model", "Mean BLEU Score", "Mean ROUGE Score"]

for row in data_without_rag:
    table_without_rag.add_row([row["Model"], row["BLEU Score"], row["ROUGE Score"]])

table_with_rag = PrettyTable()
table_with_rag.field_names = ["Model", "Mean BLEU Score", "Mean ROUGE Score"]

for row in data_with_rag:
    table_with_rag.add_row([row["Model"], row["BLEU Score"], row["ROUGE Score"]])

# Print tables
print("Results without RAG:")
print(table_without_rag)

print("\nResults with RAG:")
print(table_with_rag)


Results without RAG:
+------------------------+-----------------+------------------+
|         Model          | Mean BLEU Score | Mean ROUGE Score |
+------------------------+-----------------+------------------+
|        da-vinci        |      0.6094     |      0.1603      |
| gpt-3.5-turbo-instruct |      0.7937     |      0.2425      |
|         LLAMA          |      0.7199     |      0.2061      |
+------------------------+-----------------+------------------+

Results with RAG:
+------------------------+-----------------+------------------+
|         Model          | Mean BLEU Score | Mean ROUGE Score |
+------------------------+-----------------+------------------+
|        da-vinci        |      0.8377     |      0.2226      |
| gpt-3.5-turbo-instruct |      0.8761     |      0.2982      |
|         LLAMA          |      0.7852     |      0.2412      |
+------------------------+-----------------+------------------+


### Information Retrieval - Sub Experiment 2

In [2]:
from prettytable import PrettyTable
import random

data = [
    {
        "Method": "LSI + SVD",
        "mean_precision_at_k": mean_precision_at_k_lsi,
        "mean_recall_at_k": mean_recall_at_k_lsi,
        "mean_sps": mean_sps_lsi,
        "mean_execution_time": mean_execution_time_lsi
    },
    {
        "Method": "IR with openAI embeddings",
        "mean_precision_at_k": mean_precision_at_k_openai,
        "mean_recall_at_k": mean_recall_at_k_lsi_openai,
        "mean_sps": mean_sps_openai,
        "mean_execution_time": mean_execution_time_openai
    },
    {
        "Method": "VectorDB - ANNOY",
        "mean_precision_at_k": annoy_mean_precision_at_k,
        "mean_recall_at_k": annoy_mean_recall_at_k,
        "mean_sps": annoy_mean_sps,
        "mean_execution_time": mean_execution_time_annoy
    },
    {
        "Method": "VectorDB - FAISS",
        "mean_precision_at_k": faiss_mean_precision_at_k,
        "mean_recall_at_k": faiss_mean_recall_at_k,
        "mean_sps": faiss_mean_sps,
        "mean_execution_time": mean_execution_time_faiss
    },
]

table = PrettyTable()
table.field_names = ["Method", "Mean Precision @ k", "Mean Recall @ k", "Mean SPS", "Mean Execution Time (ms)"]
table.float_format = "5.5"  

table.title = "Note: k = 10"

for row in data:
    table.add_row([row["Method"], row["mean_precision_at_k"], row["mean_recall_at_k"], row["mean_sps"], row["mean_execution_time"]])

# Print table
print(table)


+--------------------------------------------------------------------------------------------------------+
|                                              Note: k = 10                                              |
+---------------------------+--------------------+-----------------+----------+--------------------------+
|           Method          | Mean Precision @ k | Mean Recall @ k | Mean SPS | Mean Execution Time (ms) |
+---------------------------+--------------------+-----------------+----------+--------------------------+
|         LSI + SVD         |      0.82679       |     0.00266     | 0.84821  |        1467.34520        |
| IR with openAI embeddings |      0.97054       |     0.00312     | 0.98214  |        361.41928         |
|      VectorDB - ANNOY     |      0.97515       |     0.00203     | 0.98268  |         2.15972          |
|      VectorDB - FAISS     |      0.97029       |     0.00354     | 0.97257  |         0.68343          |
+---------------------------+--------