## NLP Assignment: RAGs For Open Domain Complex QA


Henry: stuff for training on colab, do not delete pls :D

In [None]:
# Google colab does not save your files and will delete them when your session terminates, persist models in google drive.
# from google.colab import drive
# import shutil
# drive.mount('/content/drive')

In [None]:
# Google Colab Execution Steps
# STEPS TO RUN ON GOOGLE COLAB
# 1.) Select the ipynb file as your notebook
# 2.) Change your runtime to the free tier GPU
# 3.) Upload a zip file of this project called NLPProject.zip under '/content', which is the working directory of the notebook
# 4.) Run this script
# !rm -R sample_data/
# !unzip NLPProject.zip
# !rm -rf NLPProject/rag_assignment.ipynb
# !mv NLPProject/* .
# !rm -R NLPProject
# !rm -rf NLPProject.zip
# !pip install -e .
# shutil.copy('/content/drive/MyDrive/ColabFiles/index.joblib','/content/indices/adore/corpus')
# shutil.copy('/content/drive/MyDrive/ColabFiles/dev.json','/content/data')
# shutil.copy('/content/drive/MyDrive/ColabFiles/test.json','/content/data')
# shutil.copy('/content/drive/MyDrive/ColabFiles/train.json','/content/data')
# shutil.copy('/content/drive/MyDrive/ColabFiles/wiki_musique_corpus.json','/content/data')


In [None]:
from dexter.data.loaders.RetrieverDataset import RetrieverDataset
from dexter.retriever.dense.ADORERetriever import ADORERetriever
from dexter.retriever.dense.Contriever import Contriever
from dexter.config.constants import Split
from typing import List
from typing import Dict
from dexter.data.datastructures.question import Question
from dexter.data.datastructures.evidence import Evidence
from dexter.utils.metrics.retrieval.RetrievalMetrics import RetrievalMetrics
from dexter.utils.metrics.SimilarityMatch import CosineSimilarity as CosScore, DotScore
from dexter.data.datastructures.hyperparameters.dpr import DenseHyperParams
from dexter.utils.metrics.CoverExactMatch import CoverExactMatch
from tqdm import tqdm
import torch
import random
import csv

In [None]:
# Print the torch version, whether CUDA is available, and torch.version.cuda (one value per line)
print(torch.__version__, torch.cuda.is_available(), torch.version.cuda, sep="\n")


In [None]:
# -----------------------------------------
# Offline alternative: Hugging Face is not reachable from China, so the models were downloaded
# manually and loaded from local folders. Leave this section commented out unless you need it.
# from transformers import AutoTokenizer, AutoModel
# #
# # Contriever loaded from a local copy
# model_path = 'huggingface/contriever'
# tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
# model = AutoModel.from_pretrained(model_path, local_files_only=True)
#
# # Llama loaded from a local copy
# model_name = "huggingface/llama"

# -----------------------------------------
# Default: keep these two lines active.
# model_name is passed to LlamaEngine; model_path is used for both the query and document encoders.
model_name = "meta-llama/Llama-2-7b-chat-hf"
model_path = "facebook/contriever"

### Load the Dataset and set up contriever


In [None]:
# Declared types of the three objects unpacked from the dataset loader below
queries: List[Question]
qrels: Dict[str, Dict[str, int]]  # qrels[question id][evidence id] = 1 if relevant
corpus: List[Evidence]

# Query and document encoder are loaded from the same model path
config_instance = DenseHyperParams(
    query_encoder_path=model_path,
    document_encoder_path=model_path,
    batch_size=32,
)

# Load the dev split of the dataset
loader = RetrieverDataset(
    "wikimultihopqa",
    "wiki_musique_corpus",
    "config.ini",
    Split.DEV,
    tokenizer=None,
)
queries, qrels, corpus = loader.qrels()

# Map every question id to the text of its ground-truth answer
raw_data = loader.base_dataset.raw_data
question_ground_truth_answer_map = dict(
    (sample.question.id(), sample.answer.text()) for sample in raw_data
)

# Build the Contriever retriever from the config above
con = Contriever(config_instance)

### Setup LLM

In [None]:
from dexter.llms.llama_engine import LlamaEngine
# Create the single LlamaEngine instance that query_llm sends every prompt to.
# model_name selects the model; temperature and top_n are forwarded unchanged to the engine.
llm_instance = LlamaEngine(data="", model_name=model_name, temperature=0.3, top_n=1)

# Ask the LLM a question together with its supporting documents
def query_llm(question_text: str, evidences: List[Evidence]):
    """
    Build a prompt from the question and the evidence texts, send it to the
    LLM and extract the final answer from the completion.

    :param question_text: question text
    :param evidences: list of evidences whose texts form the context
    :return: the text after the last "[Final Answer]:" marker, or None when
        the completion contains "not possible" / "unknown" (case-insensitive)
        or has no such marker
    """
    final_answer_marker = "[Final Answer]:"

    context_block = "\n".join(doc.text() for doc in evidences)
    system_prompt = "Follow the given examples and Given the question and context output final answer for the question using information in the context and give answer in form of  [Final Answer]: \n"
    user_prompt = f"Question: {question_text}\nContext: {context_block}\nAnswer:"

    completion = llm_instance.get_llama_completion(system_prompt, user_prompt)

    # A completion that admits it cannot answer counts as "no answer"
    lowered = completion.lower()
    if "not possible" in lowered or "unknown" in lowered:
        return None

    # Without the marker there is nothing to extract
    if final_answer_marker not in completion:
        return None

    # Keep only what follows the last occurrence of the marker
    _, _, tail = completion.rpartition(final_answer_marker)
    return tail.strip()

In [None]:
def evaluate_query(query, non_relevant_doc_ids_of_query, top_k_similar_doc_ids_of_query):
    """
    Evaluates a single query by selecting relevant and non-relevant documents,
    combining them, passing them to the LLM, and comparing the answer to the ground truth.

    Reads the module-level ``corpus``, ``question_ground_truth_answer_map`` and
    ``output_file``; appends one result row to ``output_file``.

    :param query: The query to be evaluated.
    :param non_relevant_doc_ids_of_query: The ids of the non-relevant documents (may be empty).
    :param top_k_similar_doc_ids_of_query: The ids of the similar documents (may be empty).
    :return: A boolean is_match indicating whether the ground-truth answer occurs
             (case-insensitively) in the LLM answer.
    """
    # Sets make each membership test O(1); the non-relevant ids arrive as a list,
    # which made every test during the corpus scan linear in the list length.
    similar_ids = set(top_k_similar_doc_ids_of_query)
    non_relevant_ids = set(non_relevant_doc_ids_of_query)

    # Fetch the actual documents (not just ids); skip the corpus scan for an empty id set
    top_k_similar_docs_of_query = [doc for doc in corpus if doc.id() in similar_ids] if similar_ids else []
    non_relevant_docs = [doc for doc in corpus if doc.id() in non_relevant_ids] if non_relevant_ids else []

    # Combine relevant and selected non-relevant docs; shuffle so their position
    # in the prompt does not reveal which group a document came from
    combined_docs = top_k_similar_docs_of_query + non_relevant_docs
    random.shuffle(combined_docs)

    # Evaluate the LLM answer
    answer = query_llm(query.text(), combined_docs)
    ground_truth_answer = question_ground_truth_answer_map[query.id()]

    # Same convention as evaluate_query_task_2: a missing ground truth becomes the
    # string "None" instead of crashing on .lower() below
    if ground_truth_answer is None:
        ground_truth_answer = "None"

    # Check if the answer matches the ground truth (an absent answer is a mismatch)
    is_match = answer is not None and ground_truth_answer.lower() in answer.lower()

    # Write the result to the CSV file
    relevant_text = "\n".join(doc.text() for doc in top_k_similar_docs_of_query)
    non_relevant_text = "\n".join(doc.text() for doc in non_relevant_docs)
    row = [query.text(), ground_truth_answer, answer, is_match, relevant_text, non_relevant_text]
    with open(output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(row)

    return is_match

In [None]:
def evaluate_with_docs(top_k, top_k_similar_docs_by_query, non_relevant_doc_ids_by_query, num_docs_to_sample):
    """
    General evaluation function for both random and hard negatives. Take relevant and non-relevant docs for each query and feed them to the llm.

    Reads the module-level ``queries`` and ``output_file``. The CSV at ``output_file``
    is started fresh with a header row; evaluate_query then appends one row per query.

    :param top_k: Number of similar docs per query (only used in the progress message).
    :param top_k_similar_docs_by_query: Mapping query id -> {doc id: score} of similar docs.
    :param non_relevant_doc_ids_by_query: Mapping query id -> collection of non-relevant doc ids.
    :param num_docs_to_sample: Number of non-relevant docs per query (only used in the progress message).
    :return: Tuple (matches, mismatches).
    :raises ValueError: if non_relevant_doc_ids_by_query is None, which is what
        compute_all_non_relevant returns when it could not sample enough documents.
    """
    # Fail early with a clear message, before the output file is touched
    if non_relevant_doc_ids_by_query is None:
        raise ValueError("non_relevant_doc_ids_by_query is None: not enough non-relevant documents could be sampled")

    matches = 0
    mismatches = 0

    print(f"There are {top_k} relevant docs and {num_docs_to_sample} non-relevant docs per query")

    # 'w' rather than 'a': re-running an experiment must not stack a second header
    # and duplicate rows onto the results of an earlier run
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        header = ['Question', 'Ground Truth Answer', 'LLM Answer', 'Match', 'Relevant Documents', 'Non-Relevant Documents']
        writer.writerow(header)

    for query in tqdm(queries, desc="Evaluating queries", unit="query"):
        top_k_similar_doc_ids_of_query = set(top_k_similar_docs_by_query[query.id()].keys())
        non_relevant_doc_ids_of_query = non_relevant_doc_ids_by_query[query.id()]
        is_match = evaluate_query(query, non_relevant_doc_ids_of_query, top_k_similar_doc_ids_of_query)
        if is_match:
            matches += 1
        else:
            mismatches += 1
    return matches, mismatches

### Retrieve all "relevant" docs based on similarity

In [None]:
def get_top_k_from_retrieved(retrieved: Dict[str, Dict[str, float]], top_k: int) -> Dict[str, Dict[str, float]]:
    """
    Cut already-retrieved results down to the top_k best-scoring documents per query,
    so that con.retrieve does not have to be called again for every cut-off.

    :param retrieved: mapping query id -> {doc id: score}
    :param top_k: number of documents to keep per query
    :return: mapping query id -> {doc id: score}, documents ordered by descending score
    """
    def best_scoring(scores):
        # Stable sort of the doc ids by score, highest first, then keep the leading slice
        ranked_ids = sorted(scores, key=scores.get, reverse=True)
        return {doc_id: scores[doc_id] for doc_id in ranked_ids[:top_k]}

    return {query_id: best_scoring(scores) for query_id, scores in retrieved.items()}

In [None]:
similarity_measure = CosScore()

# Calculate similarities for all queries and docs.
# Retrieve 100 docs per query once: later cells cut this result down to 1..24 docs
# with get_top_k_from_retrieved and draw hard negatives from the 100 most similar,
# so a smaller cut-off here would silently starve those experiments.
similar_docs_by_query = con.retrieve(corpus, queries, top_k=100, score_function=similarity_measure, chunk=True, chunksize=400)

## Task 1
Use off the shelf retriever (contriever) and extract contexts for each query to be given as input to a generative model. Use Exact Match or cover Exact Match as metric for evaluating generated answers. Experiment with k=1,3,5 for retrieving top-k contexts and report the performance on generating answers.

In [None]:
# Cut-offs for the retrieval metrics; k is the largest of them
k_values = [1, 3, 5]
metrics = RetrievalMetrics(k_values=k_values)
k = max(k_values)


def evaluate_with_top_k(retrieved, top_k: int) -> Dict[str, Dict[str, float]]:
    """
    Trim previously retrieved results to top_k documents per query, print how many
    queries have results plus the retrieval metrics against qrels, and return the
    trimmed results.
    """
    trimmed = get_top_k_from_retrieved(retrieved, top_k)
    print("Indices retrieved:", len(trimmed))
    retrieval_scores = metrics.evaluate_retrieval(qrels=qrels, results=trimmed)
    print(retrieval_scores)
    return trimmed

In [None]:
# Trim the retrieved results to the k best documents per query and print the retrieval metrics
top_k_docs_by_query: Dict[str, Dict[str, float]] = evaluate_with_top_k(similar_docs_by_query, k)

In [None]:
top_k_values = [1, 3, 5]

# Task 1: answer each query from its top_k retrieved documents only (no noise documents)
for top_k in top_k_values:
    # output_file is read as a global by evaluate_with_docs / evaluate_query
    output_file = f'llm_relevant_results-{top_k}-0.csv'
    print(f"Running for top_k={top_k}")

    top_k_similar_docs_by_query = get_top_k_from_retrieved(similar_docs_by_query, top_k)
    # Empty noise list per query. The key must be the id value, query.id(): keying by the
    # bound method query.id makes the query.id() lookup in evaluate_with_docs raise KeyError.
    non_relevant_ignore_for_here = {query.id(): [] for query in queries}
    matches, mismatches = evaluate_with_docs(top_k, top_k_similar_docs_by_query, non_relevant_ignore_for_here, num_docs_to_sample=0)

    print("Matches:", matches)
    print("Mismatches:", mismatches)

### Task 2
Repeat the above experiment without the retriever, using only oracle contexts as input. Oracle
contexts are annotated documents provided for each question in dev.json.

In [None]:
def evaluate_query_task_2(query, orcal_contexts, sim_metric, output_file):
    """
    Answer one query from its oracle contexts, score the answer and log the result.

    :param query: The query to be evaluated.
    :param orcal_contexts: The oracle documents given to the LLM as context.
    :param sim_metric: Metric object whose evaluate(ground_truth, answer) decides the match.
    :param output_file: CSV file to which one result row is appended.
    :return: The metric result, or False when the LLM produced no answer.
    """
    # Evaluate the LLM answer
    answer = query_llm(query.text(), orcal_contexts)
    ground_truth_answer = question_ground_truth_answer_map[query.id()]

    if ground_truth_answer is None:
        ground_truth_answer = "None"

    # query_llm returns None when no final answer could be parsed. Count that as a
    # mismatch (as evaluate_query does) instead of handing None to the metric.
    # NOTE(review): confirm that (ground truth, answer) is the argument order
    # sim_metric.evaluate expects.
    is_match = answer is not None and sim_metric.evaluate(ground_truth_answer, answer)

    # Write the result to the CSV file
    relevant_text = "\n".join(doc.text() for doc in orcal_contexts)
    row = [query.text(), ground_truth_answer, answer, is_match, relevant_text]
    with open(output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(row)

    print("Answer: ", answer)
    print("Ground truth answer: ", ground_truth_answer)
    print("Is match: ", is_match)

    return is_match

def get_oracle_contexts(query, qrels, corpus) -> List[Evidence]:
    """
    Return the corpus entries that qrels lists for the given query.

    :param query: query whose id (converted to str) is looked up in qrels
    :param qrels: mapping query id -> {doc id: relevance}
    :param corpus: sequence of documents, indexed by int(doc id)
    :return: the listed documents, in the order of the qrels entry
    :raises ValueError: if the query id has no entry in qrels
    """
    query_key = str(query.id())
    if query_key not in qrels:
        raise ValueError(f"No relevant documents found for query ID {query_key} in qrels.")

    # NOTE(review): each doc id is used as a position in corpus; this assumes the
    # ids equal the list indices - confirm against the loader.
    positions = map(int, qrels[query_key])
    return [corpus[position] for position in positions]

In [None]:
# Task 2: answer every query from its oracle contexts and log the results to a fresh CSV
output_file = 'llm_task2_orcal_results.csv'
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    header = ['Question', 'Ground Truth Answer', 'LLM Answer', 'Match', 'Relevant Documents']
    writer.writerow(header)

matches = 0
mismatches = 0
sim_metric = CoverExactMatch()
for query in tqdm(queries, desc="Evaluating queries", unit="query"):
    oracle_contexts = get_oracle_contexts(query, qrels, corpus)
    is_match = evaluate_query_task_2(query, oracle_contexts, sim_metric, output_file)
    if is_match:
        matches += 1
    else:
        mismatches += 1

print("Matches:", matches)
print("Mismatches:", mismatches)
# Share of matched queries; report 0.0 instead of dividing by zero when nothing was evaluated
evaluated = matches + mismatches
print("precision:", matches / evaluated if evaluated else 0.0)

### Task 3 & 4
Now randomly sample documents from the collection that are not relevant to the current query during inference on the evaluation set. Combine these documents with the top-k relevant documents and use them as input to the LLM for answering a query. You can decide the ratios to mix the relevant and the random documents that serve as noise. Analyze the performance.

In [None]:
def compute_all_non_relevant(num_docs_to_sample, is_hard_negatives):
    """
    Computes all non-relevant docs (aka random or hard negatives) for a list of queries and a corpus.

    Reads the module-level ``queries``, ``corpus``, ``qrels`` and, for hard negatives,
    ``similar_docs_by_query``.

    :param num_docs_to_sample: Number of non-relevant doc ids to select per query (0 gives empty lists).
    :param is_hard_negatives: If True, select the most similar non-relevant docs among the
        100 most similar retrieved docs with a positive score; if False, draw a uniform
        random sample from all corpus docs that are not listed in qrels for the query.
    :return: Mapping query id -> list of selected doc ids, or None (after printing a
        message) when some query has fewer candidates than num_docs_to_sample.
    """
    all_non_relevant = {}

    # Loop-invariant work, done once instead of once per query
    corpus_doc_ids = set()
    most_similar_docs = {}
    if num_docs_to_sample != 0:
        corpus_doc_ids = {doc.id() for doc in corpus}
        if is_hard_negatives:
            most_similar_docs = get_top_k_from_retrieved(similar_docs_by_query, 100)

    for query in tqdm(queries, desc=f"Retrieving {num_docs_to_sample} non-relevant docs for each query", unit="query"):
        query_id = query.id()

        if num_docs_to_sample == 0:
            all_non_relevant[query_id] = list()
            continue

        # Docs listed in qrels for this query are the relevant ones
        relevant_doc_ids = qrels[query_id]

        if is_hard_negatives:
            # get_top_k_from_retrieved orders docs by descending score, so this list
            # keeps the hardest negatives first; docs with similarity <= 0 are dropped
            candidate_doc_ids = [
                doc_id
                for doc_id, score in most_similar_docs[query_id].items()
                if score > 0 and doc_id in corpus_doc_ids and doc_id not in relevant_doc_ids
            ]
        else:
            candidate_doc_ids = [doc_id for doc_id in corpus_doc_ids if doc_id not in relevant_doc_ids]

        if num_docs_to_sample > len(candidate_doc_ids):
            print("Not enough documents to sample from, please select smaller ratio or reduce the number of similar docs per query to select")
            return None

        if is_hard_negatives:
            all_non_relevant[query_id] = candidate_doc_ids[:num_docs_to_sample]
        else:
            # A real random sample; slicing a set only returns whatever the hash order puts first
            all_non_relevant[query_id] = random.sample(candidate_doc_ids, num_docs_to_sample)

    return all_non_relevant


In [None]:
# Baseline without noise: give the LLM only the 12 / 24 most similar documents
samples_values = [12, 24]
for top_k in samples_values:
    # output_file is read as a global by the evaluation functions
    output_file = f'llm_relevant_results-{top_k}-0.csv'
    print(f"Running for top_k={top_k}, num_other_docs_to_sample=0")

    top_k_similar_docs_by_query = get_top_k_from_retrieved(retrieved=similar_docs_by_query, top_k=top_k)
    other_doc_ids_by_query = compute_all_non_relevant(num_docs_to_sample=0, is_hard_negatives=False)
    matches, mismatches = evaluate_with_docs(
        top_k=top_k,
        top_k_similar_docs_by_query=top_k_similar_docs_by_query,
        non_relevant_doc_ids_by_query=other_doc_ids_by_query,
        num_docs_to_sample=0,
    )

    print("Matches:", matches)
    print("Mismatches:", mismatches)

### Task 3

In [None]:
# Noise only: no retrieved documents, 12 / 24 randomly chosen non-relevant ones
samples_values = [12, 24]
top_k = 0
for num_random_docs_to_sample in samples_values:
    # output_file is read as a global by the evaluation functions
    output_file = f'llm_random_results-{top_k}-{num_random_docs_to_sample}.csv'
    print(f"Running for top_k={top_k}, num_random_docs_to_sample={num_random_docs_to_sample}")

    top_k_similar_docs_by_query = get_top_k_from_retrieved(retrieved=similar_docs_by_query, top_k=top_k)
    random_doc_ids_by_query = compute_all_non_relevant(
        num_docs_to_sample=num_random_docs_to_sample,
        is_hard_negatives=False,
    )
    matches, mismatches = evaluate_with_docs(
        top_k=top_k,
        top_k_similar_docs_by_query=top_k_similar_docs_by_query,
        non_relevant_doc_ids_by_query=random_doc_ids_by_query,
        num_docs_to_sample=num_random_docs_to_sample,
    )

    print("Matches:", matches)
    print("Mismatches:", mismatches)

In [None]:
# Mix retrieved and random documents, 12 per query in total: 3+9, 6+6, 9+3
top_k_values = [3, 6, 9]
samples_values = [9, 6, 3]
for top_k, num_random_docs_to_sample in zip(top_k_values, samples_values):
    # output_file is read as a global by the evaluation functions
    output_file = f'llm_random_results-{top_k}-{num_random_docs_to_sample}.csv'
    print(f"Running for top_k={top_k}, num_random_docs_to_sample={num_random_docs_to_sample}")

    top_k_similar_docs_by_query = get_top_k_from_retrieved(retrieved=similar_docs_by_query, top_k=top_k)
    random_doc_ids_by_query = compute_all_non_relevant(
        num_docs_to_sample=num_random_docs_to_sample,
        is_hard_negatives=False,
    )
    matches, mismatches = evaluate_with_docs(
        top_k=top_k,
        top_k_similar_docs_by_query=top_k_similar_docs_by_query,
        non_relevant_doc_ids_by_query=random_doc_ids_by_query,
        num_docs_to_sample=num_random_docs_to_sample,
    )

    print("Matches:", matches)
    print("Mismatches:", mismatches)

In [None]:
# Mix retrieved and random documents, 24 per query in total: 6+18, 12+12, 18+6
top_k_values = [6, 12, 18]
samples_values = [18, 12, 6]
for top_k, num_random_docs_to_sample in zip(top_k_values, samples_values):
    # output_file is read as a global by the evaluation functions
    output_file = f'llm_random_results-{top_k}-{num_random_docs_to_sample}.csv'
    print(f"Running for top_k={top_k}, num_random_docs_to_sample={num_random_docs_to_sample}")

    top_k_similar_docs_by_query = get_top_k_from_retrieved(retrieved=similar_docs_by_query, top_k=top_k)
    random_doc_ids_by_query = compute_all_non_relevant(
        num_docs_to_sample=num_random_docs_to_sample,
        is_hard_negatives=False,
    )
    matches, mismatches = evaluate_with_docs(
        top_k=top_k,
        top_k_similar_docs_by_query=top_k_similar_docs_by_query,
        non_relevant_doc_ids_by_query=random_doc_ids_by_query,
        num_docs_to_sample=num_random_docs_to_sample,
    )

    print("Matches:", matches)
    print("Mismatches:", mismatches)

### Task 4
In this step, we will adopt a more principled approach to sample negative documents to be used as input to the RAG setup. Using a retrieval model, sample hard negatives from the collection for the
current query instead of random documents to inject as noise. Hard negatives are documents that are related and close to the query in the vector space but do not help answer the question. They can be sampled by retrieving documents that are not in the list of ground-truth documents for a query, as measured by dot product.

In [None]:
# Hard negatives only: no retrieved documents, 12 / 24 hard negatives per query
samples_values = [12, 24]
top_k = 0
for num_hard_neg_docs_to_sample in samples_values:
    # output_file is read as a global by the evaluation functions
    output_file = f'llm_hard_neg_results-{top_k}-{num_hard_neg_docs_to_sample}.csv'
    print(f"Running for top_k={top_k}, num_hard_neg_docs_to_sample={num_hard_neg_docs_to_sample}")

    top_k_similar_docs_by_query = get_top_k_from_retrieved(retrieved=similar_docs_by_query, top_k=top_k)
    hard_negative_doc_ids_by_query = compute_all_non_relevant(
        num_docs_to_sample=num_hard_neg_docs_to_sample,
        is_hard_negatives=True,
    )
    matches, mismatches = evaluate_with_docs(
        top_k=top_k,
        top_k_similar_docs_by_query=top_k_similar_docs_by_query,
        non_relevant_doc_ids_by_query=hard_negative_doc_ids_by_query,
        num_docs_to_sample=num_hard_neg_docs_to_sample,
    )

    print("Matches:", matches)
    print("Mismatches:", mismatches)

In [None]:
# Hard negatives are retrieved (hence similar) documents that are not actually relevant.
# Mix retrieved documents and hard negatives, 12 per query in total: 3+9, 6+6, 9+3
top_k_values = [3, 6, 9]
samples_values = [9, 6, 3]

for top_k, num_hard_negatives_to_sample in zip(top_k_values, samples_values):
    # output_file is read as a global by the evaluation functions
    output_file = f'llm_hard_neg_results-{top_k}-{num_hard_negatives_to_sample}.csv'
    print(f"Running for top_k={top_k}, num_hard_neg_docs_to_sample={num_hard_negatives_to_sample}")

    top_k_similar_docs_by_query = get_top_k_from_retrieved(retrieved=similar_docs_by_query, top_k=top_k)
    hard_negative_doc_ids_by_query = compute_all_non_relevant(
        num_docs_to_sample=num_hard_negatives_to_sample,
        is_hard_negatives=True,
    )
    matches, mismatches = evaluate_with_docs(
        top_k=top_k,
        top_k_similar_docs_by_query=top_k_similar_docs_by_query,
        non_relevant_doc_ids_by_query=hard_negative_doc_ids_by_query,
        num_docs_to_sample=num_hard_negatives_to_sample,
    )

    print("Matches:", matches)
    print("Mismatches:", mismatches)

In [None]:
# Mix retrieved documents and hard negatives, 24 per query in total: 6+18, 12+12, 18+6
top_k_values = [6, 12, 18]
samples_values = [18, 12, 6]
for top_k, num_hard_negatives_to_sample in zip(top_k_values, samples_values):
    # output_file is read as a global by the evaluation functions
    output_file = f'llm_hard_neg_results-{top_k}-{num_hard_negatives_to_sample}.csv'
    print(f"Running for top_k={top_k}, num_hard_neg_docs_to_sample={num_hard_negatives_to_sample}")

    top_k_similar_docs_by_query = get_top_k_from_retrieved(retrieved=similar_docs_by_query, top_k=top_k)
    hard_negative_doc_ids_by_query = compute_all_non_relevant(
        num_docs_to_sample=num_hard_negatives_to_sample,
        is_hard_negatives=True,
    )
    matches, mismatches = evaluate_with_docs(
        top_k=top_k,
        top_k_similar_docs_by_query=top_k_similar_docs_by_query,
        non_relevant_doc_ids_by_query=hard_negative_doc_ids_by_query,
        num_docs_to_sample=num_hard_negatives_to_sample,
    )

    print("Matches:", matches)
    print("Mismatches:", mismatches)

### Task 5
Train a retrieval model using ADORE [14]. ADORE is optimized with hard negatives in a dense retrieval setup. Hence, it may be able to discern more relevant documents from large collections and lead to improved downstream answer generation performance. Using this retriever, retrieve relevant contexts followed by answer generation using LLMs. Compare it to the baseline performance of contriever based LLM QA mentioned in step 2 above.

In [None]:
# Hyperparameters for ADORE: both encoders start from model_path
adore_config = DenseHyperParams(
    query_encoder_path=model_path,
    document_encoder_path=model_path,
    batch_size=32,
    learning_rate=5e-6,
)
# Set up the ADORE retriever
adore_retriever = ADORERetriever(adore_config)

In [None]:
# Train the ADORE retriever on the loaded queries, corpus and relevance judgements.
# NOTE(review): these queries come from Split.DEV and are also the ones evaluated
# afterwards - confirm that training and evaluating on the same split is intended.
adore_retriever.train(queries=queries, corpus=corpus, qrels=qrels, top_k=50, n_epochs=50)

In [None]:
# Save the trained query encoder under "results"; a later cell loads it again with load_query_encoder("results")
adore_retriever.save_query_encoder("results")

In [None]:
# THIS MUST BE THE SAME AS THE SIMILARITY METRIC BEING USED IN ADORERETRIEVER.PY
similarity_measure = CosScore()
adore_retriever.load_query_encoder("results")

# Reuse the Contriever retrieval code with the ADORE-trained query encoder.
# NOTE: this replaces the encoder on the shared `con` object, so every retrieval
# run with `con` after this cell uses the ADORE encoder.
con.question_encoder = adore_retriever.question_encoder
# Use the GPU when there is one; a hard-coded 'cuda' fails on a CPU-only runtime
con.question_encoder.to('cuda' if torch.cuda.is_available() else 'cpu')
similar_docs_by_query_adore = con.retrieve(corpus, queries, top_k=100, score_function=similarity_measure, chunk=True, chunksize=400000)

In [None]:
# Trim the ADORE retrieval results to the k best documents per query and print the retrieval metrics
top_k_docs_by_query_adore = evaluate_with_top_k(similar_docs_by_query_adore, k)

In [None]:
# Task 5: answer each query from the top_k documents retrieved with the ADORE query encoder
top_k_values = [1, 3, 5]

for top_k in top_k_values:
    # output_file is read as a global by the evaluation functions
    output_file = f'llm_adore_results-{top_k}-0.csv'
    print(f"Running for top_k={top_k}")

    top_k_similar_docs_by_query = get_top_k_from_retrieved(retrieved=similar_docs_by_query_adore, top_k=top_k)
    # No noise documents here: one empty list per query
    non_relevant_ignore_for_here = compute_all_non_relevant(num_docs_to_sample=0, is_hard_negatives=False)
    matches, mismatches = evaluate_with_docs(
        top_k=top_k,
        top_k_similar_docs_by_query=top_k_similar_docs_by_query,
        non_relevant_doc_ids_by_query=non_relevant_ignore_for_here,
        num_docs_to_sample=0,
    )

    print("Matches:", matches)
    print("Mismatches:", mismatches)