## NLP Assignment: RAGs For Open Domain Complex QA


In [2]:
from dexter.data.loaders.RetrieverDataset import RetrieverDataset
from dexter.retriever.dense.Contriever import Contriever
from dexter.config.constants import Split
from typing import List
from typing import Dict
from dexter.data.datastructures.question import Question
from dexter.data.datastructures.evidence import Evidence
from dexter.utils.metrics.retrieval.RetrievalMetrics import RetrievalMetrics
from dexter.utils.metrics.SimilarityMatch import CosineSimilarity as CosScore
from dexter.data.datastructures.hyperparameters.dpr import DenseHyperParams
from tqdm import tqdm
import torch
import random
import csv



In [3]:
# Check torch version and make sure cuda is enabled and available
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.version.cuda)

2.5.1
True
12.4


In [4]:
# -----------------------------------------
# Since china doesn't have access to huggingface, I have manually downloaded the model, feel free to comment this.
from transformers import AutoTokenizer, AutoModel

# Contriever taken from huggingface
model_path = 'huggingface/contriever'
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)

# Llama taken from huggingface
model_name = "huggingface/llama"

# -----------------------------------------
# But uncomment these two
# model_name = "meta-llama/Llama-2-7b-chat-hf"
# model_path = "facebook/contriever"

### Load the Dataset and set up contriever


In [5]:
from dexter.llms.llama_engine import LlamaEngine

queries: List[Question]
qrels: Dict[str, Dict[str, int]] # qrels[question id][evidence id] = 1/0 true false if relevant
corpus: List[Evidence]

config_instance = DenseHyperParams(query_encoder_path=model_path,
                                    document_encoder_path=model_path
                                    ,batch_size=32)

# Load dataset with dev set
loader = RetrieverDataset("wikimultihopqa","wiki_musique_corpus","config.ini",Split.DEV,tokenizer=None)
queries, qrels, corpus = loader.qrels()

# Extract ground truth answers for the questions
raw_data = loader.base_dataset.raw_data
question_ground_truth_answer_map = {sample.question.id(): sample.answer.text() for sample in raw_data}

# Setup contriever
con = Contriever(config_instance)


Loading passages: 100%|██████████| 563424/563424 [00:00<00:00, 1762967.49it/s]
Transforming passage dataset: 100%|██████████| 563424/563424 [00:00<00:00, 661839.72it/s]


Harley-Davidson Harley-Davidson
KeysView(<Section: Data-Path>)
12576


100%|██████████| 1200/1200 [01:31<00:00, 13.12it/s]


### Setup LLM

In [6]:
# Setup LLM
llm_instance = LlamaEngine(data="", model_name=model_name, temperature=0.3, top_n=1)

# Code to query llm
def query_llm(question_text: str, evidences: List[Evidence]):
    """
    :param question_text: question text
    :param evidences: list of evidences
    :return: the answer or None if no answer
    """
    evidence_text = "\n".join(doc.text() for doc in evidences)
    system_prompt = "Follow the given examples and Given the question and context output final answer for the question using information in the context and give answer in form of  [Final Answer]: \n"
    user_prompt = f"Question: {question_text}\nContext: {evidence_text}\nAnswer:"

    chain_answer = llm_instance.get_llama_completion(system_prompt, user_prompt)
    
    # Parse the response
    if "not possible" in chain_answer.lower() or "unknown" in chain_answer.lower():
        return None
    elif "[Final Answer]:" in chain_answer:
        answer = chain_answer.split("[Final Answer]:")[-1].strip()
        return answer
        
    return None

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


### Retrieve all "relevant" docs based on similarity

In [7]:
def get_top_k_from_retrieved(retrieved: Dict[str, Dict[str, float]], top_k: int) -> Dict[str, Dict[str, float]]:
    """
    Extract the top-k documents for each query from the retrieved results.
    This is to avoid calling con.retrieve multiple times.
    """
    top_k_results = {}
    for query_id, docs_scores in retrieved.items():
        sorted_docs = sorted(docs_scores.items(), key=lambda x: x[1], reverse=True)
        top_k_results[query_id] = {doc_id: score for doc_id, score in sorted_docs[:top_k]}
    return top_k_results

In [8]:
similarity_measure = CosScore()

# Calculate similarities for all queries and docs
similar_docs_by_query = con.retrieve(corpus, queries, top_k=100, score_function=similarity_measure, chunk=True, chunksize=400000)

100%|██████████| 400000/400000 [14:07<00:00, 472.23it/s]
100%|██████████| 163424/163424 [05:46<00:00, 471.79it/s]


### Task 1
Use off the shelf retriever (contriever) and extract contexts for each query to be given as input to a generative model. Use Exact Match or cover Exact Match as metric for evaluating generated answers. Experiment with k=1,3,5 for retrieving top-k contexts and report the performance on generating answers.

In [9]:
k_values = [1,2,3]
metrics = RetrievalMetrics(k_values=k_values)

# Evaluate top_k contexts with Exact Match, use previously retrieved results
def evaluate_with_top_k(retrieved, top_k: int) -> Dict[str, Dict[str, float]]:
    response = get_top_k_from_retrieved(retrieved, top_k)
    print("Indices retrieved:", len(response))
    print(metrics.evaluate_retrieval(qrels=qrels, results=response))
    return response

k = max(k_values)
top_k_docs_by_query: Dict[str, Dict[str, float]] = evaluate_with_top_k(similar_docs_by_query, k)

Indices retrieved: 1200
({'NDCG@1': 0.42417, 'NDCG@2': 0.3858, 'NDCG@3': 0.34182}, {'MAP@1': 0.04266, 'MAP@2': 0.0646, 'MAP@3': 0.0762}, {'Recall@1': 0.04266, 'Recall@2': 0.07536, 'Recall@3': 0.09527}, {'P@1': 0.42417, 'P@2': 0.37458, 'P@3': 0.31583})


### Task 2
Repeat the above experiment without the retriever, using only oracle contexts as input. Oracle
contexts are annotated documents provided for each question in dev.json.

### Task 3 & 4
Now randomly sample documents from the collection that are not relevant to the current query during inference on the evaluation set. Combine these documents with the top-k relevant documents and use them as input to the LLM for answering a query. You can decide the ratios to mix the relevant and the random documents that serve as noise. Analyze the performance.

In [10]:
def evaluate_query(query, non_relevant_doc_ids_of_query, top_k_similar_doc_ids_of_query):
    """
    Evaluates a single query by selecting relevant and non-relevant documents,
    combining them, passing them to the LLM, and comparing the answer to the ground truth.
    
    :param query: The query to be evaluated.
    :param non_relevant_doc_ids_of_query: The ids of the non-relevant documents.
    :param top_k_similar_doc_ids_of_query: The ids of the similar documents.
    :return: A boolean ismatch indicating whether the evaluation is correct.
    """
    # Fetch the actual similar documents (not just ids)
    top_k_similar_docs_of_query = [doc for doc in corpus if doc.id() in top_k_similar_doc_ids_of_query]
    
    # Convert selected doc IDs to actual documents
    non_relevant_docs = [doc for doc in corpus if doc.id() in non_relevant_doc_ids_of_query]
    
    # Combine relevant and selected non-relevant docs
    combined_docs = top_k_similar_docs_of_query + non_relevant_docs
    random.shuffle(combined_docs)
    
    # Evaluate the LLM answer
    answer = query_llm(query.text(), combined_docs)
    ground_truth_answer = question_ground_truth_answer_map[query.id()]
    
    # Check if the answer matches the ground truth
    is_match = answer is not None and ground_truth_answer.lower() in answer.lower()

    # Write the result to the CSV file
    relevant_text = "\n".join(doc.text() for doc in top_k_similar_docs_of_query)
    non_relevant_text = "\n".join(doc.text() for doc in non_relevant_docs)
    row = [query.text(), ground_truth_answer, answer, is_match, relevant_text, non_relevant_text]
    with open("llm_results.csv", mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(row)
    
    # print("Answer: ", answer)
    # print("Ground truth answer: ", ground_truth_answer)
    # print("Is match: ", is_match)
    
    return is_match

In [11]:
def compute_all_non_relevant(num_docs_to_sample, is_hard_negatives):
    """
    Computes all non-relevant docs (aka random or hard negatives) for a list of queries and a corpus.
    """
    all_non_relevant = {}

    for query in tqdm(queries, desc=f"Retrieving {num_docs_to_sample} non-relevant docs for each query", unit="query"):
        query_id = query.id()

        # Extract non-relevant document IDs based on qrels for this query
        non_relevant_doc_ids = {doc.id() for doc in corpus if not (doc.id() in qrels[query_id])}
        
        if is_hard_negatives:
            most_similar_docs = get_top_k_from_retrieved(similar_docs_by_query, 100)
            # Get similar and nonrelevant docs, remove the ones with similarity 0
            similar_and_non_relevant_doc_ids = {doc_id for doc_id in most_similar_docs[query_id] if most_similar_docs[query_id][doc_id] > 0 and doc_id in non_relevant_doc_ids}
            non_relevant_doc_ids = similar_and_non_relevant_doc_ids
            # sampled_non_relevant_ids = random.sample(non_relevant_doc_ids, num_docs_to_sample*5)  # This is a bit random
            # sampled_docs = [doc for doc in corpus if doc.id() in sampled_non_relevant_ids]
            # 
            # # Compute embeddings and cosine similarity for the sampled documents
            # query_embedding = con.encode_queries([query], batch_size=con.batch_size)  # Shape: [1, D] - encode query
            # non_relevant_embeddings = con.encode_corpus(sampled_docs)
            # cos_scores = similarity_measure.evaluate(query_embedding, non_relevant_embeddings)  # Shape: [1, sampled_size*5]
            # 
            # # Select documents with similarity > 0 (hard negatives: non-relevant but do have some similarity)
            # non_relevant_doc_ids = [doc.id() for doc, score in zip(sampled_docs, cos_scores[0].tolist()) if score > 0]

        if num_docs_to_sample > len(non_relevant_doc_ids):
            print("Not enough documents to sample from, please select smaller ratio or reduce the number of similar docs per query to select")
            return None
        all_non_relevant[query_id] = list(non_relevant_doc_ids)[:num_docs_to_sample]

    return all_non_relevant


In [12]:
def evaluate_with_docs(top_k, non_relevant_doc_ids_by_query, num_docs_to_sample):
    """
    General evaluation function for both random and hard negatives.
    """
    matches = 0
    mismatches = 0
    
    print(f"There are {top_k} relevant docs and {num_docs_to_sample} non-relative docs per query")
    
    with open(output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        header = ['Question', 'Ground Truth Answer', 'LLM Answer', 'Match', 'Relevant Documents', 'Non-Relevant Documents']
        writer.writerow(header)
    
    for query in tqdm(queries, desc="Evaluating queries", unit="query"):
        top_k_similar_doc_ids_of_query = set(top_k_similar_docs_by_query[query.id()].keys())
        non_relevant_doc_ids_of_query = non_relevant_doc_ids_by_query[query.id()]
        is_match = evaluate_query(query, non_relevant_doc_ids_of_query, top_k_similar_doc_ids_of_query)
        if is_match:
            matches += 1
        else:
            mismatches += 1
    return matches, mismatches

In [13]:
# Pairs: 3+9, 6+6, 9+3 or 6+18, 12+12, 18+6
top_k_values = [3, 6, 9, 6, 12, 18]
samples_values = [9, 6, 3, 18, 12, 6]

### Task 3

In [14]:
# Evaluate with random context
for top_k, num_random_docs_to_sample in zip(top_k_values, samples_values):
    output_file = f'llm_random_results-{top_k}-{num_random_docs_to_sample}.csv'
    print(f"Running for top_k={top_k}, num_random_docs_to_sample={num_random_docs_to_sample}")
    
    top_k_similar_docs_by_query = get_top_k_from_retrieved(similar_docs_by_query, top_k)
    random_doc_ids_by_query = compute_all_non_relevant(num_random_docs_to_sample, False)
    matches, mismatches = evaluate_with_docs(top_k, random_doc_ids_by_query, num_docs_to_sample=num_random_docs_to_sample)
    
    print("Matches:", matches)
    print("Mismatches:", mismatches)

Running for top_k=3, num_random_docs_to_sample=9


Retrieving 9 non-relevant docs for each query: 100%|██████████| 1200/1200 [02:21<00:00,  8.46query/s]


There are 3 relevant docs and 9 non-relative docs per query


Evaluating queries:   1%|          | 10/1200 [00:20<36:25,  1.84s/query]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Evaluating queries: 100%|██████████| 1200/1200 [53:13<00:00,  2.66s/query] 


Matches: 627
Mismatches: 573
Running for top_k=6, num_random_docs_to_sample=6


Retrieving 6 non-relevant docs for each query: 100%|██████████| 1200/1200 [02:18<00:00,  8.68query/s]


There are 6 relevant docs and 6 non-relative docs per query


Evaluating queries: 100%|██████████| 1200/1200 [50:35<00:00,  2.53s/query] 


Matches: 652
Mismatches: 548
Running for top_k=9, num_random_docs_to_sample=3


Retrieving 3 non-relevant docs for each query: 100%|██████████| 1200/1200 [02:13<00:00,  8.97query/s]


There are 9 relevant docs and 3 non-relative docs per query


Evaluating queries: 100%|██████████| 1200/1200 [45:47<00:00,  2.29s/query] 


Matches: 682
Mismatches: 518
Running for top_k=6, num_random_docs_to_sample=18


Retrieving 18 non-relevant docs for each query: 100%|██████████| 1200/1200 [02:18<00:00,  8.69query/s]


There are 6 relevant docs and 18 non-relative docs per query


Evaluating queries: 100%|██████████| 1200/1200 [1:04:49<00:00,  3.24s/query]


Matches: 680
Mismatches: 520
Running for top_k=12, num_random_docs_to_sample=12


Retrieving 12 non-relevant docs for each query: 100%|██████████| 1200/1200 [02:14<00:00,  8.94query/s]


There are 12 relevant docs and 12 non-relative docs per query


Evaluating queries:  73%|███████▎  | 879/1200 [38:09<10:01,  1.87s/query]  This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Evaluating queries: 100%|██████████| 1200/1200 [52:20<00:00,  2.62s/query]


Matches: 708
Mismatches: 492
Running for top_k=18, num_random_docs_to_sample=6


Retrieving 6 non-relevant docs for each query: 100%|██████████| 1200/1200 [02:14<00:00,  8.93query/s]


There are 18 relevant docs and 6 non-relative docs per query


Evaluating queries: 100%|██████████| 1200/1200 [52:08<00:00,  2.61s/query] 

Matches: 706
Mismatches: 494





In [None]:
# Try with only contexts
samples_values = [12, 24, 40]
top_k = 0
for num_random_docs_to_sample in samples_values:
    output_file = f'llm_random_results-{top_k}-{num_random_docs_to_sample}.csv'
    print(f"Running for top_k={top_k}, num_random_docs_to_sample={num_random_docs_to_sample}")
    
    top_k_similar_docs_by_query = get_top_k_from_retrieved(similar_docs_by_query, top_k)
    random_doc_ids_by_query = compute_all_non_relevant(num_random_docs_to_sample, False)
    matches, mismatches = evaluate_with_docs(top_k, random_doc_ids_by_query, num_docs_to_sample=num_random_docs_to_sample)
    
    print("Matches:", matches)
    print("Mismatches:", mismatches)

In [None]:
# Try with total objects=40 and more ratios
top_k_values = [4, 8, 12, 16, 20, 24, 28, 32, 36]
samples_values = [36, 32, 28, 24, 20, 16, 12, 8]
for top_k, num_random_docs_to_sample in zip(top_k_values, samples_values):
    output_file = f'llm_random_results-{top_k}-{num_random_docs_to_sample}.csv'
    print(f"Running for top_k={top_k}, num_random_docs_to_sample={num_random_docs_to_sample}")
    
    top_k_similar_docs_by_query = get_top_k_from_retrieved(similar_docs_by_query, top_k)
    random_doc_ids_by_query = compute_all_non_relevant(num_random_docs_to_sample, False)
    matches, mismatches = evaluate_with_docs(top_k, random_doc_ids_by_query, num_docs_to_sample=num_random_docs_to_sample)
    
    print("Matches:", matches)
    print("Mismatches:", mismatches)

### Task 4
 In this step, we will adopt a more principled approach to sample negative documents to be used as input to the RAG setup. Using a retrieval model, sample hard negatives from the collection for the
current query instead of random documents to inject as noise. hard negatives are documents that are related and close to the query in the vector space but do not help answer the question. This can be sampled by retrieving documents not in the list of ground truth documents for a query as measure by dot product.

In [15]:
# Hard negatives are the documents both in retrieved docs (they are similar) and non-relevant docs (since they are not actually relevant)
# Evaluate with hard negative contexts
# Pairs: 3+9, 6+6, 9+3 or 6+18, 12+12, 18+6
top_k_values = [3, 6, 9, 6, 12, 18]
samples_values = [9, 6, 3, 18, 12, 6]

for top_k, num_hard_negatives_to_sample in zip(top_k_values, samples_values):
    output_file = f'llm_hard_neg_results-{top_k}-{num_hard_negatives_to_sample}.csv'
    print(f"Running for top_k={top_k}, num_hard_neg_docs_to_sample={num_hard_negatives_to_sample}")
    
    top_k_similar_docs_by_query = get_top_k_from_retrieved(similar_docs_by_query, top_k)
    hard_negative_doc_ids_by_query = compute_all_non_relevant(num_hard_negatives_to_sample, True)
    matches, mismatches = evaluate_with_docs(top_k, hard_negative_doc_ids_by_query, num_docs_to_sample=num_hard_negatives_to_sample)
    
    print("Matches:", matches)
    print("Mismatches:", mismatches)

Running for top_k=3, num_hard_neg_docs_to_sample=9


Retrieving 9 non-relevant docs for each query: 100%|██████████| 1200/1200 [02:35<00:00,  7.72query/s]


There are 3 relevant docs and 9 non-relative docs per query


Evaluating queries: 100%|██████████| 1200/1200 [43:27<00:00,  2.17s/query] 


Matches: 639
Mismatches: 561
Running for top_k=6, num_hard_neg_docs_to_sample=6


Retrieving 6 non-relevant docs for each query: 100%|██████████| 1200/1200 [02:35<00:00,  7.71query/s]


There are 6 relevant docs and 6 non-relative docs per query


Evaluating queries: 100%|██████████| 1200/1200 [42:59<00:00,  2.15s/query] 


Matches: 663
Mismatches: 537
Running for top_k=9, num_hard_neg_docs_to_sample=3


Retrieving 3 non-relevant docs for each query: 100%|██████████| 1200/1200 [02:35<00:00,  7.72query/s]


There are 9 relevant docs and 3 non-relative docs per query


Evaluating queries: 100%|██████████| 1200/1200 [41:51<00:00,  2.09s/query] 


Matches: 679
Mismatches: 521
Running for top_k=6, num_hard_neg_docs_to_sample=18


Retrieving 18 non-relevant docs for each query: 100%|██████████| 1200/1200 [02:35<00:00,  7.71query/s]


There are 6 relevant docs and 18 non-relative docs per query


Evaluating queries: 100%|██████████| 1200/1200 [48:03<00:00,  2.40s/query] 


Matches: 641
Mismatches: 559
Running for top_k=12, num_hard_neg_docs_to_sample=12


Retrieving 12 non-relevant docs for each query: 100%|██████████| 1200/1200 [02:34<00:00,  7.75query/s]


There are 12 relevant docs and 12 non-relative docs per query


Evaluating queries: 100%|██████████| 1200/1200 [47:35<00:00,  2.38s/query] 


Matches: 695
Mismatches: 505
Running for top_k=18, num_hard_neg_docs_to_sample=6


Retrieving 6 non-relevant docs for each query: 100%|██████████| 1200/1200 [02:35<00:00,  7.73query/s]


There are 18 relevant docs and 6 non-relative docs per query


Evaluating queries: 100%|██████████| 1200/1200 [47:21<00:00,  2.37s/query] 

Matches: 703
Mismatches: 497





In [None]:
# Add only hard negatives
# Try with only contexts
samples_values = [12, 24, 40]
top_k = 0
for num_hard_neg_docs_to_sample in samples_values:
    output_file = f'llm_hard_neg_results-{top_k}-{num_hard_neg_docs_to_sample}.csv'
    print(f"Running for top_k={top_k}, num_hard_neg_docs_to_sample={num_hard_neg_docs_to_sample}")
    
    top_k_similar_docs_by_query = get_top_k_from_retrieved(similar_docs_by_query, top_k)
    hard_negative_doc_ids_by_query = compute_all_non_relevant(num_hard_neg_docs_to_sample, True)
    matches, mismatches = evaluate_with_docs(top_k, hard_negative_doc_ids_by_query, num_docs_to_sample=num_hard_neg_docs_to_sample)
    
    print("Matches:", matches)
    print("Mismatches:", mismatches)

### Task 5
Train a retrieval model using ADORE [14]. ADORE is optimized with hard negatives in a dense retrieval setup. Hence, it may be able to discern more relevant documents from large collections and lead to improved downstream answer generation performance. Using this retriever, retrieve relevant contexts followed by answer generation using LLMs. Compare it to the baseline performance of contriever based LLM QA mentioned in step 2 above.