## Evaluate Chroma Information Retrieval (IR)
baseline retriever backed by an in-memory Chroma collection (RAGatouille / ColBERT is not used in this notebook's code)


In [None]:
# Mount Google Drive so the project files stored there are reachable from this
# runtime under /content/drive. Relies on the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root folder of the project on the mounted Drive, and the dataset folder below it.
# Both end with "/" because other cells build file paths by string concatenation.
ProjectRoot = "/content/drive/MyDrive/UMich Capstone/NoteBooks/"
DatasetRoot = f"{ProjectRoot}Dataset/"

#### Dependencies

In [None]:
try:
    import chromadb
except ImportError:
    !pip install chromadb

In [None]:
import chromadb
import json
import regex as re
import numpy as np
import pandas as pd

#### Init IR

In [None]:
# Load the knowledge base: a JSON object whose keys are used as document ids
# and whose values are the document texts.
# The encoding is explicit so decoding does not depend on the platform locale.
with open(DatasetRoot + 'raw_knowledge.json', 'r', encoding='utf-8') as f:
    raw_text_json = json.load(f)

# Parallel lists: dicts preserve insertion order, so keys() and values() stay
# aligned position by position.
raw_text_list = list(raw_text_json.values())
raw_text_ids = list(raw_text_json.keys())

In [None]:
# setup Chroma in-memory
client = chromadb.Client()

# get_or_create_collection (instead of create_collection) keeps this cell
# re-runnable: create_collection raises once "knowledge-store" already exists.
Retriever = client.get_or_create_collection("knowledge-store")

# Add docs to the knowledge store.
# upsert (instead of add) inserts unseen ids and overwrites existing ones, so
# running the cell again does not attempt to insert duplicate ids.
Retriever.upsert(
    documents=raw_text_list,
    ids=raw_text_ids,
)

In [None]:
# Sanity-check the retriever with one ad-hoc question.
# Alternative probe: "what does Data analysis involve?"
query = "what has nate silver said in the past?"

# Ask the collection for the two paragraphs closest to the question.
results = Retriever.query(query_texts=query, n_results=2)

# Display the matched paragraph texts.
results['documents']

[['In 2012, technologists Thomas H. Davenport and DJ Patil declared "Data Scientist: The Sexiest Job of the 21st Century", a catchphrase that was picked up even by major-city newspapers like the New York Times and the Boston Globe. A decade later, they reaffirmed it, stating that "the job is more in demand than ever with employers".',
  'Data science is "a concept to unify statistics, data analysis, informatics, and their related methods" to "understand and analyze actual phenomena" with data. It uses techniques and theories drawn from many fields within the context of mathematics, statistics, computer science, information science, and domain knowledge. However, data science is different from computer science and information science. Turing Award winner Jim Gray imagined data science as a "fourth paradigm" of science (empirical, theoretical, computational, and now data-driven) and asserted that "everything about science is changing because of the impact of information technology" and t

#### Eval Retrieval Precision, Recall, MRR

In [None]:
# load context-question train set which was created by doc2query
# DatasetRoot already ends with "/", so the file name carries no leading slash
# (the previous '/q_a_trainset.csv' produced a "Dataset//q_a_trainset.csv" path).
train_df = pd.read_csv(DatasetRoot + 'q_a_trainset.csv')

In [None]:
def evaluate_retrieval(retriever, eval_dataset, top_n=3):
    """Score a retriever with precision@k, recall@k and MRR, averaged over all questions.

    Each row of ``eval_dataset`` supplies one question and the id of the single
    paragraph the question was generated from; that paragraph is treated as the
    only relevant (ground-truth) document for the question.

    Args:
        retriever: Object exposing ``query(query_texts=..., n_results=...)`` that
            returns a mapping whose ``'ids'`` entry holds one ranked list of ids
            per query. Ids must be convertible with ``int()``.
        eval_dataset: DataFrame with ``'raw_para_id'`` and ``'question'`` columns.
        top_n: Number of documents requested per question (the k in "@k").

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_mrr)``. Precision is computed over
        the distinct ids actually returned, and is 0.0 for a question with no
        results. For an empty ``eval_dataset`` every value is NaN, which is what
        ``np.mean`` yields for an empty list.
    """
    precision_at_k = []
    recall_at_k = []
    mrr = []

    for _, eval_data in eval_dataset.iterrows():
        # Ground truth: the paragraph the question was generated from. Cast to
        # int so it compares equal to the retrieved ids, which get the same cast.
        relevant_docs = {int(eval_data['raw_para_id'])}
        query = eval_data['question']

        # Query the retriever that was passed in, not a notebook-level global,
        # so the function evaluates whichever retriever the caller supplies.
        response = retriever.query(query_texts=query, n_results=top_n)
        doc_indices = [int(idx) for idx in response['ids'][0]]
        retrieved_docs = set(doc_indices)

        hits = relevant_docs & retrieved_docs

        # A retriever may return nothing (e.g. an empty index); count that as
        # zero precision instead of dividing by zero.
        precision = len(hits) / len(retrieved_docs) if retrieved_docs else 0.0
        recall = len(hits) / len(relevant_docs)

        precision_at_k.append(precision)
        recall_at_k.append(recall)

        # Reciprocal of the 1-based rank of the first relevant hit, 0.0 if none.
        reciprocal_rank = next(
            (
                1.0 / rank
                for rank, doc_index in enumerate(doc_indices, start=1)
                if doc_index in relevant_docs
            ),
            0.0,
        )
        mrr.append(reciprocal_rank)

    avg_precision = np.mean(precision_at_k)
    avg_recall = np.mean(recall_at_k)
    avg_mrr = np.mean(mrr)

    return avg_precision, avg_recall, avg_mrr



In [None]:
# Retrieval quality when only the single best match is requested (k = 1).
precision, recall, mrr = evaluate_retrieval(
    retriever=Retriever,
    eval_dataset=train_df[['raw_para_id', 'question']],
    top_n=1,
)

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, MRR: {mrr:.4f}")

Precision: 0.4375, Recall: 0.4375, MRR: 0.4375


In [None]:
# Retrieval quality when the three best matches are requested (k = 3).
precision, recall, mrr = evaluate_retrieval(
    retriever=Retriever,
    eval_dataset=train_df[['raw_para_id', 'question']],
    top_n=3,
)

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, MRR: {mrr:.4f}")

Precision: 0.2396, Recall: 0.7188, MRR: 0.5625


In [None]:
# Retrieval quality when the five best matches are requested (k = 5).
precision, recall, mrr = evaluate_retrieval(
    retriever=Retriever,
    eval_dataset=train_df[['raw_para_id', 'question']],
    top_n=5,
)

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, MRR: {mrr:.4f}")

Precision: 0.1708, Recall: 0.8542, MRR: 0.5943
