In [20]:
from ragatouille import RAGPretrainedModel
import json
from tqdm import tqdm 
import jsonlines
import numpy as np

In [2]:
# Location of the previously built index that the retriever is restored from.
INDEX_PATH = ".ragatouille/colbert/indexes/wiki-knowns-1000-full"
RAG = RAGPretrainedModel.from_index(INDEX_PATH)

In [4]:
# Read the evaluation records. The context manager guarantees the file handle
# is closed (the bare `open()` call previously leaked it), and the explicit
# encoding keeps decoding independent of the platform default.
with open("known_1000.json", encoding="utf-8") as knowns_file:
    knowns = json.load(knowns_file)

# Parallel lists: answers[i] is the expected string for queries[i].
answers = [record['attribute'] for record in knowns]
queries = [record['prompt'] for record in knowns]

In [33]:
def precision_at_K(documents, answer, K=3):
    """Return the share of the top-K documents whose 'content' contains `answer`.

    Args:
        documents: Ranked sequence of dicts, each with a 'content' string.
        answer: String looked up with a case-sensitive substring test.
        K: Cut-off rank. At most K documents are inspected.

    Returns:
        Number of matching documents among the first K, divided by K.
        The denominator is always K, even when fewer than K documents
        are supplied.
    """
    # Never index past the end of a short result list.
    limit = min(len(documents), K)
    # Booleans sum as 0/1, giving the hit count directly.
    hits = sum(answer in documents[rank]['content'] for rank in range(limit))
    return hits / K

In [7]:
def fetch_top_K(query, K=10):
    """Retrieve up to K results for `query` from the loaded index.

    Args:
        query: Query string passed to the retriever.
        K: Maximum number of results to return.

    Returns:
        A list with at most K search results, in the order the retriever
        returned them. Empty when K is not positive.
    """
    if K <= 0:
        # Nothing requested; skip the search entirely.
        return []
    # Forward K to the retriever. Previously the search ran with the library's
    # default result count, so asking for more than that default silently
    # returned fewer documents than requested.
    results = RAG.search(query=query, k=K)
    # Slicing already tolerates a result list shorter than K.
    return results[:K]

In [9]:
# Will hold one {"query", "documents"} record per query once retrieval has run.
retrieved_docs = list()

In [10]:
# Build the list in a single assignment so the cell is idempotent: appending
# to a list created in another cell duplicated every record on re-run and
# broke the positional alignment with `answers`.
retrieved_docs = [
    {"query": q, "documents": fetch_top_K(q)}
    for q in tqdm(queries)
]

100%|██████████| 1209/1209 [00:15<00:00, 78.86it/s]


In [34]:
# Per-query precision scores, keyed by the cut-off K.
precisions = {3: [], 5: [], 10: []}
# answers[i] is the expected string for the i-th retrieved record.
precisions[3] = [
    precision_at_K(r['documents'], answers[i], K=3)
    for i, r in tqdm(enumerate(retrieved_docs))
]

1209it [00:00, 333371.48it/s]


In [36]:
# Assign the list instead of appending, so re-running this cell cannot stack
# a second copy of the K=5 scores. `total` lets tqdm render a real progress
# bar, since enumerate() hides the length.
precisions[5] = [
    precision_at_K(r['documents'], answers[i], K=5)
    for i, r in tqdm(enumerate(retrieved_docs), total=len(retrieved_docs))
]

1209it [00:00, 245078.22it/s]


In [37]:
# Assign the list instead of appending, so re-running this cell cannot stack
# a second copy of the K=10 scores. `total` lets tqdm render a real progress
# bar, since enumerate() hides the length.
precisions[10] = [
    precision_at_K(r['documents'], answers[i], K=10)
    for i, r in tqdm(enumerate(retrieved_docs), total=len(retrieved_docs))
]

1209it [00:00, 150058.10it/s]


In [38]:
# Mean precision@3 over all queries.
np.asarray(precisions[3]).mean()

0.544251447477254

In [39]:
# Mean precision@5 over all queries.
np.asarray(precisions[5]).mean()

0.4855252274607113

In [40]:
# Mean precision@10 over all queries.
np.asarray(precisions[10]).mean()

0.41356492969396197