In [1]:
from constants import Split
from data.datastructures.hyperparameters.dpr import DenseHyperParams
from retriever.Contriever import Contriever
from data.loaders.RetrieverDataset import RetrieverDataset
from metrics.SimilarityMatch import CosineSimilarity as CosScore

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the dataset loader for the DEV split; all arguments are positional,
# in the same order as before.
loader = RetrieverDataset(
    "wikimultihopqa",
    "wiki-musiqueqa-corpus",
    "evaluation/config.ini",
    Split.DEV,
)
# loader.qrels() returns the three collections unpacked here.
queries, qrels, corpus = loader.qrels()

# Print the size of each collection, followed by the first query object.
print("info: ", *(len(collection) for collection in (queries, qrels, corpus)), queries[0])

Loading passages: 100%|██████████| 563424/563424 [00:00<00:00, 1423640.19it/s]
Transforming passage dataset: 100%|██████████| 563424/563424 [00:00<00:00, 632210.41it/s]
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Harley-Davidson Harley-Davidson
KeysView(<Section: Data-Path>)
12576


100%|██████████| 1200/1200 [01:28<00:00, 13.60it/s]


Dataset loaded of length 12000
info:  1200 1200 563424 <data.datastructures.question.Question object at 0x3b84b0c70>


In [None]:
# Configure the dense retriever: the same "facebook/contriever" checkpoint is
# given as both the query encoder and the document encoder.
config_instance = DenseHyperParams(query_encoder_path="facebook/contriever",
                                   document_encoder_path="facebook/contriever",
                                   batch_size=32)
# NOTE(review): despite the `tasb_` prefix, this object is a Contriever instance.
tasb_search = Contriever(config_instance)
similarity_measure = CosScore()
# Run retrieval for all queries over the corpus. The literal 100 is presumably
# the number of documents kept per query — TODO confirm against Contriever.retrieve.
response = tasb_search.retrieve(corpus, queries, 100, similarity_measure)
# print(response)
# Number of entries in `response`; later cells iterate it with .items() as a
# query id -> contexts mapping.
print("indices",len(response))

In [None]:
from metrics.retrieval.RetrievalMetrics import RetrievalMetrics

# Score the retrieval output against the relevance judgements.
# k_values=[1,5,10] are presumably the rank cut-offs reported — TODO confirm.
# NOTE(review): the name `metrics` is rebound to a plain dict in the last cell
# of this notebook.
metrics = RetrievalMetrics(k_values=[1,5,10])
print(metrics.evaluate_retrieval(qrels=qrels, results=response))

In [None]:
# Understand queries
# Each query object exposes its identifier and its text through id() / text().
print(queries[0].id())
print(queries[0].text())

# Qrels
# Take the first (key, value) pair of the qrels mapping and unpack it into the
# query id and the contexts associated with it.
qrel1 = next(iter(qrels.items()))
test_query_id, test_context_ids = qrel1

print(test_query_id)
print(test_context_ids)

# Corpus
# 401226 is a hard-coded index into `corpus`; presumably one of the context ids
# printed above — TODO confirm.
print(corpus[401226].text())

In [None]:
# Peek at the first (query id, retrieved contexts) pair of the retrieval output.
next(iter(response.items()))

In [None]:
import numpy as np

def expand_contexts(top_k=5, n_additional_docs=10, strategy="top_k_plus_hard_negative_docs",
                    retrieved=None, relevance=None, corpus_size=None):
    """Build, per query, the top-k retrieved doc ids plus extra distractor doc ids.

    Args:
        top_k: number of leading retrieved doc ids kept for every query.
        n_additional_docs: number of distractor ids appended after the top-k.
        strategy: how distractors are chosen.
            - "top_k_plus_random_docs": ids drawn uniformly from
              ``range(corpus_size)`` that were not retrieved for the query and
              were not already drawn for it.
            - "top_k_plus_hard_negative_docs": the next retrieved ids *after*
              the top-k that are not in the query's ground truth.
            Any other value yields only the top-k ids.
        retrieved: mapping ``query id -> {doc id: score}``. Defaults to the
            notebook-level ``response``. The iteration order of the inner
            mapping is used as the ranking (assumed best first — TODO confirm
            against the retriever).
        relevance: mapping ``query id -> {doc id: ...}`` holding the ground
            truth. Defaults to the notebook-level ``qrels``. Only read by the
            hard-negative strategy.
        corpus_size: number of documents that can be sampled. Defaults to
            ``len(corpus)``. Only read by the random strategy.

    Returns:
        dict mapping every query id to a list of plain ``int`` doc ids.

    Raises:
        ValueError: if the random strategy cannot find enough unused documents.
    """
    use_random = strategy == "top_k_plus_random_docs"
    use_hard_negatives = strategy == "top_k_plus_hard_negative_docs"

    # Fall back to the notebook globals only for the inputs the strategy needs.
    if retrieved is None:
        retrieved = response
    if use_random and corpus_size is None:
        corpus_size = len(corpus)
    if use_hard_negatives and relevance is None:
        relevance = qrels

    q_contexts = {}
    for q_id, contexts in retrieved.items():
        ranked_ids = list(map(int, contexts.keys()))
        selected = ranked_ids[:top_k]

        if use_random:
            # Track everything that must not be drawn: all retrieved ids and,
            # as sampling proceeds, the random ids already chosen.
            excluded = set(ranked_ids)
            needed = top_k + n_additional_docs - len(selected)
            n_free = corpus_size - sum(1 for doc_id in excluded if 0 <= doc_id < corpus_size)
            if needed > n_free:
                # Without this check the rejection loop below would never end.
                raise ValueError(
                    f"Cannot sample {needed} random docs for query {q_id!r}: "
                    f"only {n_free} unused docs in a corpus of {corpus_size}"
                )
            while len(selected) < top_k + n_additional_docs:
                # int(...) keeps the ids plain Python ints rather than np.int64.
                random_doc = int(np.random.choice(corpus_size, 1)[0])
                if random_doc not in excluded:
                    excluded.add(random_doc)
                    selected.append(random_doc)

        elif use_hard_negatives:
            gt_ids = set(map(int, relevance[q_id].keys()))
            # Start after the top-k so the negatives never duplicate ids that
            # are already part of the selection.
            hard_negatives = [doc_id for doc_id in ranked_ids[top_k:] if doc_id not in gt_ids]
            selected += hard_negatives[:n_additional_docs]

        q_contexts[q_id] = selected

    return q_contexts

In [None]:
import os

# Distractor strategy passed to expand_contexts: either
# "top_k_plus_hard_negative_docs" or "top_k_plus_random_docs".
strategy = "top_k_plus_hard_negative_docs"
n_additional_docs = 10
top_k = 5

q_contexts = expand_contexts(top_k, n_additional_docs, strategy)
# Show the number of queries and one sample entry instead of dumping the id
# lists of every query into the cell output.
print(len(q_contexts), next(iter(q_contexts.items()), None))

# Save q_contexts to a file. np.save pickles the dict into a 0-d object array,
# so it has to be read back with np.load(..., allow_pickle=True).item().
os.makedirs("exp_3_4", exist_ok=True)  # np.save does not create missing directories
np.save("exp_3_4/{}__top_k{}__add_docs_{}.npy".format(strategy, top_k, n_additional_docs), q_contexts)

In [None]:
# Build the hard-negative contexts explicitly instead of aliasing `q_contexts`:
# that variable holds whatever `strategy` was last run, so aliasing it could
# store random-doc contexts under the hard-negative file name.
q_contexts_hn = expand_contexts(top_k, n_additional_docs, "top_k_plus_hard_negative_docs")

np.save("exp_3_4/{}__top_k{}__add_docs_{}.npy".format("top_k_plus_hard_negative_docs", top_k, n_additional_docs), q_contexts_hn)

In [None]:
# expand_contexts draws the random documents through NumPy's global RNG; seed
# it so re-running this cell reproduces the same contexts that get saved below.
np.random.seed(42)
q_contexts_rd = expand_contexts(top_k, n_additional_docs, "top_k_plus_random_docs")

np.save("exp_3_4/{}__top_k{}__add_docs_{}.npy".format("top_k_plus_random_docs", top_k, n_additional_docs), q_contexts_rd)

In [3]:
from ollama import generate, create
import json
import tqdm

# Prompt construction
# Modelfile text: start from llama3:8b and set its `temperature` parameter to 0.5.
modelfile = """
FROM llama3:8b
PARAMETER temperature 0.5
"""

# Register the model under the name "llama3-custom"; eval() below passes this
# name to generate(...).
# NOTE(review): confirm the installed ollama client accepts the `modelfile=` keyword.
create(model="llama3-custom", modelfile=modelfile)


def create_prompt(question, context_ids):
    """Assemble the LLM prompt for a single question.

    The prompt consists of an instruction line, one numbered "Context i: ..."
    line per entry of ``context_ids`` (numbering starts at 1; each text is read
    from the notebook-level ``corpus`` via ``corpus[doc_id].text()``), the
    question line, and a final constraint line.

    Args:
        question: the question text.
        context_ids: ids used to index ``corpus``, listed in the given order.

    Returns:
        The full prompt as one string; every line ends with a newline.
    """
    parts = ["Please answer the given question based on the given contexts below.\n"]
    for position, doc_id in enumerate(context_ids, start=1):
        parts.append(f"Context {position}: " + corpus[doc_id].text() + "\n")
    parts.append("Question: " + question + "\n")
    parts.append("Constraint: Don't give any explanations and use MAX 5 tokens in your response. No yapping.\n")
    return "".join(parts)

def eval(retrived_contexts, test_data):
    """Ask the LLM every question with its contexts and collect the answers.

    NOTE: this function shadows Python's builtin ``eval``; the name is kept
    because other cells call it.

    Args:
        retrived_contexts: mapping ``question id -> list of context ids``; the
            ids are handed to ``create_prompt``.
        test_data: iterable of records, each providing "_id", "question" and
            "answer".

    Returns:
        A list with one dict per evaluated question, holding "q_id", the raw
        "prediction" text and the "ground_truth" answer. Questions whose id is
        missing from ``test_data`` are skipped (and reported) rather than being
        evaluated against another question's text and answer.

    Side effects:
        Calls ``generate`` on the "llama3-custom" model once per question and
        prints a summary dict with the exact-match count and accuracy.
    """
    print("Starting evaluation...")

    # Index the records once instead of rescanning test_data for every question.
    # On duplicate ids the last record wins, matching the former full scan.
    records_by_id = {record["_id"]: record for record in test_data}

    results = []
    exact_matches = 0
    for question_id, context_ids in tqdm.tqdm(retrived_contexts.items()):
        record = records_by_id.get(question_id)
        if record is None:
            # tqdm.write keeps the progress bar intact while reporting the skip.
            tqdm.tqdm.write(f"Skipping question {question_id!r}: id not found in test_data")
            continue

        ground_truth = record["answer"]
        prompt = create_prompt(record["question"], context_ids)
        prediction = generate(model="llama3-custom", prompt=prompt)
        answer = prediction["response"]

        results.append({
            "q_id": question_id,
            "prediction": answer,
            "ground_truth": ground_truth
        })

        # Ignore surrounding whitespace and letter case so that trivially
        # different model output still counts as an exact match.
        if str(answer).strip().casefold() == str(ground_truth).strip().casefold():
            exact_matches += 1

    # Accuracy is relative to the questions actually evaluated, not to the size
    # of test_data (which can contain many more records than were asked).
    accuracy = exact_matches / len(results) if results else 0.0
    results_info = {"exact_matches": exact_matches, "accuracy": accuracy}
    print(results_info)
    return results

In [None]:
# Keep only the first 13 contexts for each query. If the lists were built with
# top_k = 5 (as the file names below state), that is the 5 top-ranked ids plus
# the first 8 additional docs.
# NOTE(review): the `_10` suffix of the variable names does not match the 13 ids kept.
q_contexts_hn_10 = {k: v[:13] for k, v in q_contexts_hn.items()}
q_contexts_rd_10 = {k: v[:13] for k, v in q_contexts_rd.items()}

# Save
np.save("exp_3_4/top_k_plus_hard_negative_docs__top_k5__add_docs_8.npy", q_contexts_hn_10)
np.save("exp_3_4/top_k_plus_random_docs__top_k5__add_docs_8.npy", q_contexts_rd_10)

In [4]:
import numpy as np
import json

# Load the dev-set records; eval() reads "_id", "question" and "answer" from them.
with open("dataset/data/dev.json", "r") as file:
    test_data = json.load(file)

# with open("dataset/wiki_musique_corpus.json", "r") as file:
#     context_data = json.load(file)

# with open("context_indices.json", "r") as file:
#     context_ids = json.load(file)

# Load q_contexts_hn
# .item() unwraps the single Python object stored in the loaded array;
# presumably the dict written with np.save by the cells above — TODO confirm.
# allow_pickle=True unpickles the file content, so only load files you created
# yourself: unpickling an untrusted file can execute arbitrary code.
# NOTE(review): no visible cell writes an `add_docs_2` file — confirm its origin.
q_contexts_hn = np.load("exp_3_4/top_k_plus_hard_negative_docs__top_k5__add_docs_2.npy", allow_pickle=True).item()

# Load q_contexts_rd
q_contexts_rd = np.load("exp_3_4/top_k_plus_random_docs__top_k5__add_docs_2.npy", allow_pickle=True).item()

# Run the LLM over the hard-negative contexts; eval() returns one record per question.
eval_results = eval(q_contexts_hn, test_data)
# eval(context_ids, context_data, test_data, 7)
# print(next(iter(context_ids.items())))

# type(q_contexts_hn)

Starting evaluation...


 12%|█▏        | 146/1200 [03:58<1:04:03,  3.65s/it]

In [None]:
# Show the per-question records returned by eval().
print(eval_results)
# Save eval_results to a file
# NOTE(review): the file name is hard-coded to the hard-negative / add_docs_2
# run; change it when `eval_results` comes from a different configuration.

with open("exp_3_4/eval_results_top_k_plus_hard_negative_docs__top_k5__add_docs_2.json", "w") as f:
    json.dump(eval_results, f)


# Metrics

In [None]:
import evaluate
from collections import Counter
import re

# Load the "exact_match" metric from the `evaluate` library; the aggregation
# cell further down calls exact_match.compute(...) on it.
exact_match = evaluate.load("exact_match")
def calculate_metrics(prediction, ground_truth):
    """Token-overlap precision, recall and F1 of a prediction against its reference.

    Both strings are lower-cased and split into word tokens (runs of regex word
    characters). The overlap is the size of the multiset intersection of the
    two token lists, so a repeated token only counts as often as it occurs in
    both strings.

    Args:
        prediction: predicted answer text.
        ground_truth: reference answer text.

    Returns:
        Tuple ``(precision, recall, f1)``. Precision is 0.0 when the prediction
        has no tokens, recall is 0.0 when the reference has no tokens, and F1
        is 0.0 when precision and recall sum to zero.
    """
    predicted = re.findall(r"\w+", prediction.lower())
    expected = re.findall(r"\w+", ground_truth.lower())

    # Counter & Counter keeps the minimum count of each shared token.
    overlap = sum((Counter(predicted) & Counter(expected)).values())

    precision = overlap / len(predicted) if predicted else 0.0
    recall = overlap / len(expected) if expected else 0.0

    total = precision + recall
    if total == 0:
        return precision, recall, 0.0
    return precision, recall, 2 * (precision * recall) / total

In [None]:
# Load eval_results from exp_3_4 folder
with open("exp_3_4/eval_results_top_k_plus_hard_negative_docs__top_k5__add_docs_10.json", "r") as f:
    eval_results_hn = json.load(f)

# Load eval_results from exp_3_4 folder
with open("exp_3_4/eval_results_top_k_plus_random_docs__top_k5__add_docs_10.json", "r") as f:
    eval_results_rd = json.load(f)

In [None]:
# Make a list of predictions and ground truths
predictions = []
ground_truths = []
for result in eval_results_hn:
    predictions.append(result["prediction"])
    ground_truths.append(result["ground_truth"])

avg_exact_match = exact_match.compute(
    predictions=predictions,
    references=ground_truths,
    ignore_case=True,
    ignore_punctuation=True,
)

precisions, recalls, f1s = [], [], []
for i in range(len(predictions)):
    precision, recall, f1 = calculate_metrics(predictions[i], ground_truths[i])
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)
avg_f1 = sum(f1s) / len(f1s)

In [None]:
# Collect the averaged scores into one mapping; the exact-match value is read
# from the "exact_match" entry of the compute() result.
metrics = dict(
    avg_exact_match=avg_exact_match["exact_match"],
    avg_precision=avg_precision,
    avg_recall=avg_recall,
    avg_f1=avg_f1,
)

# Persist the summary for the hard-negative run next to the other experiment files.
with open("exp_3_4/metrics_top_k_plus_hard_negative_docs__top_k5__add_docs_10.json", "w") as fp:
    json.dump(metrics, fp)

# Leave the mapping as the cell's last expression so the notebook displays it.
metrics