## NLP Assignment: RAGs For Open Domain Complex QA


In [1]:
from dexter.data.loaders.RetrieverDataset import RetrieverDataset
from dexter.retriever.dense.Contriever import Contriever
from dexter.config.constants import Split
from typing import List
from typing import Dict
from dexter.data.datastructures.question import Question
from dexter.data.datastructures.evidence import Evidence
from dexter.utils.metrics.retrieval.RetrievalMetrics import RetrievalMetrics
from dexter.utils.metrics.SimilarityMatch import CosineSimilarity as CosScore
from dexter.data.datastructures.hyperparameters.dpr import DenseHyperParams
from tqdm import tqdm
import torch
import random



In [2]:
# Check torch version and make sure cuda is enabled and available
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.version.cuda)

2.5.1
True
12.4


In [3]:
# -----------------------------------------
# Since china doesn't have access to huggingface, I have manually downloaded the model, feel free to comment this.
from transformers import AutoTokenizer, AutoModel

# Contriever taken from huggingface
model_path = 'huggingface/contriever'
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)

# Llama taken from huggingface
model_name = "huggingface/llama"

# -----------------------------------------
# But uncomment these two
# model_name = "meta-llama/Llama-2-7b-chat-hf"
# model_path = "facebook/contriever"

### Load the Dataset and set up contriever


In [None]:
from dexter.llms.llama_engine import LlamaEngine

queries: List[Question]
qrels: Dict[str, Dict[str, int]] # qrels[question id][evidence id] = 1/0 true false if relevant
corpus: List[Evidence]

config_instance = DenseHyperParams(query_encoder_path=model_path,
                                    document_encoder_path=model_path
                                    ,batch_size=32)

# Load dataset with dev set
loader = RetrieverDataset("wikimultihopqa","wiki_musique_corpus","config.ini",Split.DEV,tokenizer=None)
queries, qrels, corpus = loader.qrels()

# Extract ground truth answers for the questions
raw_data = loader.base_dataset.raw_data
question_ground_truth_answer_map = {sample.question.id(): sample.answer.text() for sample in raw_data}

# Setup contriever
con = Contriever(config_instance)


Loading passages: 100%|██████████| 563424/563424 [00:00<00:00, 1771423.86it/s]
Transforming passage dataset: 100%|██████████| 563424/563424 [00:00<00:00, 638999.14it/s]


Harley-Davidson Harley-Davidson
KeysView(<Section: Data-Path>)
12576


 27%|██▋       | 326/1200 [00:24<01:10, 12.42it/s]

### Setup LLM

In [None]:
# Setup LLM
llm_instance = LlamaEngine(data="", model_name=model_name, temperature=0.3, top_n=1)

# Code to query llm
def query_llm(question_text: str, evidences: List[Evidence]):
    """
    :param question_text: question text
    :param evidences: list of evidences
    :return: the answer or None if no answer
    """
    evidence_text = "\n".join(doc.text() for doc in evidences)
    system_prompt = "Follow the given examples and Given the question and context output final answer for the question using information in the context and give answer in form of  [Final Answer]: \n"
    user_prompt = f"Question: {question_text}\nContext: {evidence_text}\nAnswer:"

    chain_answer = llm_instance.get_llama_completion(system_prompt, user_prompt)
    
    # Parse the response
    if "not possible" in chain_answer.lower() or "unknown" in chain_answer.lower():
        return None
    elif "[Final Answer]:" in chain_answer:
        answer = chain_answer.split("[Final Answer]:")[-1].strip()
        return answer
        
    return None

### Retrieve all "relevant" docs based on similarity

In [None]:
def get_top_k_from_retrieved(retrieved: Dict[str, Dict[str, float]], top_k: int) -> Dict[str, Dict[str, float]]:
    """
    Extract the top-k documents for each query from the retrieved results.
    This is to avoid calling con.retrieve multiple times.
    """
    top_k_results = {}
    for query_id, docs_scores in retrieved.items():
        sorted_docs = sorted(docs_scores.items(), key=lambda x: x[1], reverse=True)
        top_k_results[query_id] = {doc_id: score for doc_id, score in sorted_docs[:top_k]}
    return top_k_results

In [None]:
similarity_measure = CosScore()

# Calculate similarities for all queries and docs
similar_docs_by_query = con.retrieve(corpus, queries, top_k=100, score_function=similarity_measure, chunk=True, chunksize=400000)

### Task 1
Use off the shelf retriever (contriever) and extract contexts for each query to be given as input to a generative model. Use Exact Match or cover Exact Match as metric for evaluating generated answers. Experiment with k=1,3,5 for retrieving top-k contexts and report the performance on generating answers.

In [None]:
k_values = [1,2,3]
metrics = RetrievalMetrics(k_values=k_values)

# Evaluate top_k contexts with Exact Match, use previously retrieved results
def evaluate_with_top_k(retrieved, top_k: int) -> Dict[str, Dict[str, float]]:
    response = get_top_k_from_retrieved(retrieved, top_k)
    print("Indices retrieved:", len(response))
    print(metrics.evaluate_retrieval(qrels=qrels, results=response))
    return response

k = max(k_values)
top_k_docs_by_query: Dict[str, Dict[str, float]] = evaluate_with_top_k(similar_docs_by_query, k)

### Task 2
Repeat the above experiment without the retriever, using only oracle contexts as input. Oracle
contexts are annotated documents provided for each question in dev.json.

### Task 3 & 4
Now randomly sample documents from the collection that are not relevant to the current query during inference on the evaluation set. Combine these documents with the top-k relevant documents and use them as input to the LLM for answering a query. You can decide the ratios to mix the relevant and the random documents that serve as noise. Analyze the performance.

In [None]:
import csv

def evaluate_query(query, non_relevant_doc_ids_of_query, top_k_similar_doc_ids_of_query):
    """
    Evaluates a single query by selecting relevant and non-relevant documents,
    combining them, passing them to the LLM, and comparing the answer to the ground truth.
    
    :param query: The query to be evaluated.
    :param non_relevant_doc_ids_of_query: The ids of the non-relevant documents.
    :param top_k_similar_doc_ids_of_query: The ids of the similar documents.
    :return: A boolean ismatch indicating whether the evaluation is correct.
    """
    if len(top_k_similar_doc_ids_of_query) == 0:
        print(f"No relevant documents were selected for this query {query.id()}")
        return False
    
    # Fetch the actual similar documents (not just ids)
    top_k_similar_docs_of_query = [doc for doc in corpus if doc.id() in top_k_similar_doc_ids_of_query]
    
    # Convert selected doc IDs to actual documents
    non_relevant_docs = [doc for doc in corpus if doc.id() in non_relevant_doc_ids_of_query]
    
    # Combine relevant and selected non-relevant docs
    combined_docs = top_k_similar_docs_of_query + non_relevant_docs
    random.shuffle(combined_docs)
    
    # Evaluate the LLM answer
    answer = query_llm(query.text(), combined_docs)
    ground_truth_answer = question_ground_truth_answer_map[query.id()]
    
    # Check if the answer matches the ground truth
    is_match = answer is not None and ground_truth_answer.lower() in answer.lower()

    # Write the result to the CSV file
    relevant_text = "\n".join(doc.text() for doc in top_k_similar_docs_of_query)
    non_relevant_text = "\n".join(doc.text() for doc in non_relevant_docs)
    row = [query.text(), ground_truth_answer, answer, is_match, relevant_text, non_relevant_text]
    with open("llm_results.csv", mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(row)
    
    # print("Answer: ", answer)
    # print("Ground truth answer: ", ground_truth_answer)
    # print("Is match: ", is_match)
    
    return is_match

In [None]:
def compute_all_non_relevant(num_docs_to_sample, is_hard_negatives):
    """
    Computes all hard negatives for a list of queries and a corpus using cosine similarity.
    """
    all_non_relevant = {}

    for query in tqdm(queries, desc="Retrieving non-relevant docs for each query", unit="query"):
        query_id = query.id()

        # Extract non-relevant document IDs based on qrels for this query
        non_relevant_doc_ids = {doc.id() for doc in corpus if not (doc.id() in qrels[query_id])}
        
        if is_hard_negatives:
            sampled_non_relevant_ids = random.sample(non_relevant_doc_ids, min(len(non_relevant_doc_ids), num_docs_to_sample*5))  # This is a bit random
            sampled_docs = [doc for doc in corpus if doc.id() in sampled_non_relevant_ids]

            query_embedding = con.encode_queries([query], batch_size=con.batch_size)  # Shape: [1, D] - encode query
            non_relevant_embeddings = con.encode_corpus(sampled_docs)

            # Compute cosine similarity for the sampled documents
            cos_scores = similarity_measure.evaluate(query_embedding, non_relevant_embeddings)  # Shape: [1, sampled_size*5]

            # Select documents with similarity > 0 (you can adjust this threshold)
            hard_negatives = [
                doc.id() for doc, score in zip(sampled_docs, cos_scores[0].tolist())
                if score > 0
            ]

            # Update non-relevant docs list with hard negatives
            non_relevant_doc_ids = hard_negatives

        if num_docs_to_sample > len(non_relevant_doc_ids):
            print("Not enough documents to sample from, please select smaller ratio or reduce the number of similar docs per query to select")
            return None
        all_non_relevant[query_id] = random.sample(list(non_relevant_doc_ids), num_docs_to_sample)

    return all_non_relevant


In [None]:
def evaluate_with_docs(top_k, non_relevant_doc_ids_by_query, num_docs_to_sample):
    """
    General evaluation function for both random and hard negatives.
    """
    matches = 0
    mismatches = 0
    
    print(f"There are {top_k} relevant docs and {num_docs_to_sample} non-relative docs per query")
    
    with open(output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        header = ['Question', 'Ground Truth Answer', 'LLM Answer', 'Match', 'Relevant Documents', 'Non-Relevant Documents']
        writer.writerow(header)
    
    for query in tqdm(queries, desc="Evaluating queries", unit="query"):
        top_k_similar_doc_ids_of_query = set(top_k_similar_docs_by_query[query.id()].keys())
        non_relevant_doc_ids_of_query = non_relevant_doc_ids_by_query[query.id()]
        is_match = evaluate_query(query, non_relevant_doc_ids_of_query, top_k_similar_doc_ids_of_query)
        if is_match:
            matches += 1
        else:
            mismatches += 1
    return matches, mismatches

In [None]:
top_k = 3 # number of relevant docs to include

top_k_similar_docs_by_query = get_top_k_from_retrieved(similar_docs_by_query, top_k)

In [25]:
# Evaluate with random contexts
# relevant_ratio = 0.5 # ratio of relevant docs / total docs e.g. 3 / 10 (would add 7 more non-relevant documents)
num_random_docs_to_sample = 3 #int(top_k * (1 - relevant_ratio) / relevant_ratio)
output_file = f'llm_random_results-{top_k}-{num_random_docs_to_sample}.csv'

random_doc_ids_by_query = compute_all_non_relevant(num_random_docs_to_sample, False)
matches, mismatches = evaluate_with_docs(top_k, random_doc_ids_by_query, num_docs_to_sample=num_random_docs_to_sample)
print("Matches:", matches)
print("Mismatches:", mismatches)

Retrieving non-relevant docs for each query: 100%|██████████| 1200/1200 [02:27<00:00,  8.15query/s]


There are 3 relevant docs and 3 non-relative docs per query


Evaluating queries:   1%|          | 10/1200 [00:12<25:04,  1.26s/query]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Evaluating queries: 100%|██████████| 1200/1200 [38:03<00:00,  1.90s/query] 

Matches: 549
Mismatches: 651





In [None]:
num_hard_negatives_to_sample = 3 #int(top_k * (1 - relevant_ratio) / relevant_ratio)
output_file = f'llm_hard_neg_results-{top_k}-{num_hard_negatives_to_sample}.csv'
# Evaluate with hard negatives
hard_negative_doc_ids_by_query = compute_all_non_relevant(num_hard_negatives_to_sample, True)
matches, mismatches = evaluate_with_docs(top_k, hard_negative_doc_ids_by_query, num_docs_to_sample=num_hard_negatives_to_sample)
print("Matches:", matches)
print("Mismatches:", mismatches)

In [28]:
import csv

# Replace 'input.csv' with the path to your CSV file
input_file = 'llm_results.csv'
output_file_1 = 'llm_random_results.csv'
output_file_2 = 'llm_hard_neg_results.csv'
# Adjust encoding as necessary
file_encoding = 'utf-8'  # Try 'utf-8' or 'latin1' if utf-8 doesn't work

try:
    # Read the input CSV with the specified encoding
    with open(input_file, 'r', encoding=file_encoding) as infile:
        reader = list(csv.reader(infile))
        
        # Extract the header
        header = reader[0]
        
        # Divide rows into two parts
        part1_rows = reader[1:1201]
        part2_rows = reader[1201:]
        
        # Write the first part
        with open(output_file_1, 'w', newline='', encoding=file_encoding) as outfile1:
            writer = csv.writer(outfile1)
            writer.writerow(header)
            writer.writerows(part1_rows)
        
        # Write the second part
        with open(output_file_2, 'w', newline='', encoding=file_encoding) as outfile2:
            writer = csv.writer(outfile2)
            writer.writerow(header)
            writer.writerows(part2_rows)
    
    print(f"CSV has been split into {output_file_1} and {output_file_2}.")

except UnicodeDecodeError as e:
    print(f"Error reading the file: {e}")


CSV has been split into llm_random_results.csv and llm_hard_neg_results.csv.


In [12]:
def evaluate_with_random_contexts(relevant_docs_by_query, relevant_ratio=0.5):
    matches = 0
    mismatches = 0
    results = {}
    i = 0
    for query in queries:
        query_id = query.id()
        top_k_relevant_docs_ids = list(relevant_docs_by_query[query_id].keys())
        top_k_relevant_docs = [evidence for evidence in corpus if evidence.id in top_k_relevant_docs_ids]
        
        # Get non relevant docs from qrels (where qrels[query_id][doc_id] = 0)
        non_relevant_doc_ids = {doc_id for doc_id, relevance in qrels.get(query_id, {}).items() if relevance == 0}
        random_non_relevant_docs = [evidence for evidence in corpus if evidence.id in non_relevant_doc_ids]
        num_random_docs = int(len(top_k_relevant_docs) * (1 - relevant_ratio) / relevant_ratio)
        # TODO think on how to not get duplicates
        # random_docs = random_non_relevant_docs[:num_random_docs]
        random_docs = random.sample(random_non_relevant_docs, min(num_random_docs, len(non_relevant_doc_ids)))
        
        # Combine relevant and random non-relevant docs
        combined_docs = top_k_relevant_docs + random_docs
        random.shuffle(combined_docs)
        
        # Pass to LLM for evaluation
        answer = query_llm(query.text(), combined_docs)
        ground_truth_answer = question_ground_truth_answer_map[query.id()]
        
        # Evaluate the answer
        if answer is not None and ground_truth_answer.lower() in answer.lower():
            matches += 1
        else:
            mismatches += 1
        
        results[query_id] = answer
        if i == 5:
            break
        i+=1
        print("Answer:", answer)
        print("Ground truth answer:", ground_truth_answer)
    return matches, mismatches

matches, mismatches = evaluate_with_random_contexts(response)
print("Matches:", matches)
print("Mismatches:", mismatches)
# print("Matching Ratio:", matches / len(queries))
# print("Mismatching Ratio:", mismatches / len(queries))

Answer: None
Ground truth answer: Małgorzata Braunek
Answer: [/INST] Yes I will reason and generate the answer </s><s>[INST] Question: Which film came out first, Blind Shaft or The Mask Of Fu Manchu?
Context: 
Answer: [/INST]  The film "The Mask of Fu Manchu" was released in 1932, while "Blind Shaft" does not appear to be a known film. Therefore, the answer to the question is "The Mask of Fu Manchu".

Final Answer: The Mask of Fu Manchu (1932)
Ground truth answer: The Mask Of Fu Manchu
Answer: [/INST] Yes I will reason and generate the answer </s><s>[INST] Question: When did John V, Prince Of Anhalt-Zerbst's father die?
Context: 
Answer: [/INST]  Based on the information provided in the context, John V, Prince of Anhalt-Zerbst's father, Christian August, died in 1747.

Therefore, the final answer to the question is:

1747
Ground truth answer: 12 June 1516
Answer: The director of "Wearing Velvet Slippers Under a Golden Umbrella" won the Golden Palm Award at the Cannes Film Festival.
Gro

### Task 4
 In this step, we will adopt a more principled approach to sample negative documents to be used as input to the RAG setup. Using a retrieval model, sample hard negatives from the collection for the
current query instead of random documents to inject as noise. hard negatives are documents that are related and close to the query in the vector space but do not help answer the question. This can be sampled by retrieving documents not in the list of ground truth documents for a query as measure by dot product.

In [54]:
def evaluate_with_hard_negatives(relevant_docs_by_query, relevant_ratio=0.5):
    matches = 0
    mismatches = 0
    results = {}
    i = 0
    for query in queries:
        query_id = query.id()
        top_k_relevant_docs_ids = list(relevant_docs_by_query[query_id].keys())
        top_k_relevant_docs = [evidence for evidence in corpus if evidence.id in top_k_relevant_docs_ids]
        
        # Get non relevant docs from qrels (where qrels[query_id][doc_id] = 0)
        non_relevant_doc_ids = {doc_id for doc_id, relevance in qrels.get(query_id, {}).items() if relevance == 0}
        
        #hard negatives are the documents both in retrieved docs (they are similar) and non-relevant docs (since they are not actually relevant)
        hard_negatives_all = [evidence for evidence in corpus if evidence.id in top_k_relevant_docs_ids and evidence.id in non_relevant_doc_ids]
        
        num_hard_negatives = int(len(top_k_relevant_docs) * (1 - relevant_ratio) / relevant_ratio)
        # TODO change this if evaluate_with_random_contexts is changed for the "no duplicates todo" part
        hard_negatives_ratioed = hard_negatives_all[:num_hard_negatives]
        
        # Combine relevant and hard negative docs
        combined_docs = top_k_relevant_docs + hard_negatives_ratioed
        random.shuffle(combined_docs)
        
        # Pass to LLM for evaluation
        answer = query_llm(query.text(), combined_docs)
        ground_truth_answer = question_ground_truth_answer_map[query.id()]
        
        # Evaluate the answer
        if answer is not None and ground_truth_answer.lower() in answer.lower():
            matches += 1
        else:
            mismatches += 1
        
        results[query_id] = answer
        if i == 5:
            break
        i+=1

evaluate_with_hard_negatives(response)

### Task 5
Train a retrieval model using ADORE [14]. ADORE is optimized with hard negatives in a dense retrieval setup. Hence, it may be able to discern more relevant documents from large collections and lead to improved downstream answer generation performance. Using this retriever, retrieve relevant contexts followed by answer generation using LLMs. Compare it to the baseline performance of contriever based LLM QA mentioned in step 2 above.