## NLP Assignment: RAGs For Open Domain Complex QA


In [1]:
from dexter.data.loaders.RetrieverDataset import RetrieverDataset
from dexter.retriever.dense.Contriever import Contriever
from dexter.config.constants import Split
from typing import List
from typing import Dict
from dexter.data.datastructures.question import Question
from dexter.data.datastructures.evidence import Evidence
from dexter.utils.metrics.retrieval.RetrievalMetrics import RetrievalMetrics
from dexter.utils.metrics.SimilarityMatch import CosineSimilarity as CosScore
from dexter.data.datastructures.hyperparameters.dpr import DenseHyperParams
import torch
import random

In [2]:
# Sanity-check the environment: print the torch version, whether CUDA is
# available, and torch.version.cuda, one value per line.
print(torch.__version__, torch.cuda.is_available(), torch.version.cuda, sep="\n")

2.5.1
True
12.4


In [3]:
# -----------------------------------------
# Local-model setup. The Hugging Face Hub is not reachable from the author's
# network (China), so the model files were downloaded manually into local
# directories. If you can reach the Hub, comment this section out and use the
# hub identifiers listed at the bottom of this cell instead.
from transformers import AutoTokenizer, AutoModel

# Contriever weights, loaded from a local directory.
# local_files_only=True is passed so that no download from the Hub is attempted.
model_path = 'huggingface/contriever'
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)

# Llama weights, also a local directory; this name is handed to LlamaEngine later.
model_name = "huggingface/llama"

# -----------------------------------------
# Hub identifiers to use instead of the local directories above.
# NOTE(review): uncommenting model_path here does not affect `tokenizer` and
# `model`, which have already been loaded above from the local path.
# model_name = "meta-llama/Llama-2-7b-chat-hf"
# model_path = "facebook/contriever"

  return torch.load(checkpoint_file, map_location="cpu")


### Load the Dataset and setup LLM


In [4]:
from dexter.llms.llama_engine import LlamaEngine
from dexter.llms.llm_engine_orchestrator import LLMEngineOrchestrator

# Type declarations for the three objects unpacked from loader.qrels() below.
queries: List[Question]
qrels: Dict[str, Dict[str, int]] # qrels[question id][evidence id] = 1 if the evidence is relevant to the question, 0 otherwise
corpus: List[Evidence]

# Dense-retriever hyperparameters: the same Contriever checkpoint is used for
# both the query encoder and the document encoder.
config_instance = DenseHyperParams(query_encoder_path=model_path,
                                    document_encoder_path=model_path
                                    ,batch_size=32)

# Load the dev split of the dataset together with its passage corpus.
loader = RetrieverDataset("wikimultihopqa","wiki_musique_corpus","config.ini",Split.DEV,tokenizer=None)
# loader.qrels() returns a (queries, qrels, corpus) triple.
result = loader.qrels()
queries, qrels, corpus = result
raw_data = loader.base_dataset.raw_data

# Map each question id to its ground-truth answer text, for scoring generated answers.
question_ground_truth_answer_map = {sample.question.id(): sample.answer.text() for sample in raw_data}


# Set up the Contriever retriever with the dense hyperparameters defined above.
con = Contriever(config_instance)

# Reference notes (not executed): an alternative way to load the data and reach
# the answers to the questions, kept for later consideration.

# loader = WikiMultihopQADataLoader("wikimultihopqa", config_path="est_config.ini", split=Split.DEV, batch_size=10)
# assert len(loader.raw_data) == len(loader.dataset)
# self.assertTrue(isinstance(loader.dataset, DprDataset))
# self.assertTrue(isinstance(loader.tokenizer, Tokenizer))
# self.assertTrue(isinstance(loader.raw_data[0], Sample))
# self.assertTrue(isinstance(loader.raw_data[0].evidences, Evidence))


Loading passages: 100%|██████████| 563424/563424 [00:00<00:00, 1710333.73it/s]
Transforming passage dataset: 100%|██████████| 563424/563424 [00:00<00:00, 589884.76it/s]


Harley-Davidson Harley-Davidson
KeysView(<Section: Data-Path>)
12576


100%|██████████| 1200/1200 [01:31<00:00, 13.11it/s]


In [6]:
# Setup LLM
# NOTE(review): this rebinds `config_instance`, which previously held the
# DenseHyperParams passed to Contriever; the orchestrator object is not passed
# to LlamaEngine below - confirm whether it is needed at all.
config_instance = LLMEngineOrchestrator()
# Llama engine used by query_llm; `model_name` comes from the model-setup cell.
llm_instance = LlamaEngine(data="", model_name=model_name, temperature=0.3, top_n=1)

# Code to query llm
def _evidence_text(doc) -> str:
    """Return the passage text of ``doc``.

    Works whether ``text`` is exposed as a method (as the question/answer
    objects in this notebook do, e.g. ``query.text()``) or as a plain attribute.
    """
    raw_text = doc.text
    if callable(raw_text):
        raw_text = raw_text()
    return "" if raw_text is None else str(raw_text)


def query_llm(question_text: str, evidences: List[Evidence]):
    """
    Ask the LLM to answer a question given a list of context passages.

    :param question_text: question text
    :param evidences: list of evidences whose texts are concatenated
        (space-separated) to form the context; may be empty
    :return: the text following the last "[Final Answer]:" marker in the
        completion, or None if the completion is empty, contains
        "not possible" / "unknown" (case-insensitive), or has no marker
    """
    # Joining `doc.text` directly fails with a TypeError when `text` is a
    # bound method, so go through the accessor helper.
    evidence_text = " ".join(_evidence_text(doc) for doc in evidences)
    system_prompt = "Follow the given examples and Given the question and context output final answer for the question using information in the context and give answer in form of  [Final Answer]: \n"
    user_prompt = f"Question: {question_text}\nContext: {evidence_text}\nAnswer:"

    chain_answer = llm_instance.get_llama_completion(system_prompt, user_prompt)

    # An empty or missing completion carries no answer.
    if not chain_answer:
        return None

    # Parse the response: refusals first, then the answer marker.
    lowered = chain_answer.lower()
    if "not possible" in lowered or "unknown" in lowered:
        return None

    marker = "[Final Answer]:"
    if marker in chain_answer:
        # Keep only what follows the last marker in case the model repeats it.
        return chain_answer.split(marker)[-1].strip()
    return None

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Task 1
Use an off-the-shelf retriever (Contriever) to extract contexts for each query and give them as input to a generative model. Use Exact Match or Cover Exact Match as the metric for evaluating the generated answers. Experiment with k=1,3,5 for retrieving the top-k contexts and report the answer-generation performance.

In [None]:
similarity_measure = CosScore()
# Task 1 asks for top-k contexts with k = 1, 3 and 5.
k_values = [1, 3, 5]
metrics = RetrievalMetrics(k_values=k_values)


def evaluate_with_top_k(k_values: List[int]) -> Dict[str, Dict[str, float]]:
    """
    Retrieve contexts for every query with Contriever and print retrieval metrics.

    Retrieval runs once with ``top_k = max(k_values)``; the retrieval metrics
    are evaluated at the cutoffs in ``k_values`` against ``qrels`` and printed.
    Answer generation / Exact Match is NOT computed here - this function only
    produces the retrieval results that the generation step consumes.

    :param k_values: cutoffs to evaluate; must be non-empty
    :return: the retriever response (annotated in this notebook as
        Dict[str, Dict[str, float]] - presumably query id -> doc id -> score)
    :raises ValueError: if ``k_values`` is empty
    """
    if not k_values:
        raise ValueError("k_values must contain at least one cutoff")

    # Build the metrics from the argument so the reported cutoffs always match
    # the k_values this call was asked to evaluate.
    retrieval_metrics = RetrievalMetrics(k_values=list(k_values))

    response = con.retrieve(corpus, queries, top_k=max(k_values), score_function=similarity_measure)
    print("Indices retrieved:", len(response))
    print(retrieval_metrics.evaluate_retrieval(qrels=qrels, results=response))
    return response


response: Dict[str, Dict[str, float]] = evaluate_with_top_k(k_values)

token_emb torch.Size([1200, 35, 768])
sentence_emb torch.Size([1200, 768])


  0%|          | 0/563424 [00:00<?, ?it/s]

Starting encoding of contexts....


 11%|█         | 62240/563424 [29:37<3:58:42, 34.99it/s] 

### Task 2
Repeat the above experiment without the retriever, using only oracle contexts as input. Oracle
contexts are annotated documents provided for each question in dev.json.

### Task 3
Now randomly sample documents from the collection that are not relevant to the current query during inference on the evaluation set. Combine these documents with the top-k relevant documents and use them as input to the LLM for answering a query. You can decide the ratios to mix the relevant and the random documents that serve as noise. Analyze the performance.

In [None]:
def _evidence_id(evidence) -> str:
    """Return the id of ``evidence`` as a string, whether ``id`` is a method or an attribute."""
    raw_id = evidence.id
    if callable(raw_id):
        raw_id = raw_id()
    return str(raw_id)


def _sample_noise_docs(pool, excluded_ids, count, rng):
    """Pick up to ``count`` distinct documents from ``pool`` whose ids are not in ``excluded_ids``."""
    if count <= 0 or not pool:
        return []

    # Small pool (relative to what must be avoided): filter once and sample exactly.
    if len(pool) <= 4 * (count + len(excluded_ids)):
        candidates = []
        seen = set(excluded_ids)
        for doc in pool:
            doc_id = _evidence_id(doc)
            if doc_id not in seen:
                seen.add(doc_id)
                candidates.append(doc)
        return rng.sample(candidates, min(count, len(candidates)))

    # Large pool: rejection sampling avoids scanning the whole collection per query.
    picked = []
    taken = set(excluded_ids)
    attempts_left = 50 * count  # safety cap in case ids are heavily duplicated
    while len(picked) < count and attempts_left > 0:
        attempts_left -= 1
        candidate = pool[rng.randrange(len(pool))]
        candidate_id = _evidence_id(candidate)
        if candidate_id in taken:
            continue
        taken.add(candidate_id)
        picked.append(candidate)
    return picked


def evaluate_with_random_contexts(relevant_docs_by_query, relevant_ratio=0.5, max_queries=6, seed=None):
    """
    Answer queries with the LLM using retrieved documents mixed with random noise documents.

    For each query the retrieved documents are combined with documents sampled
    uniformly at random from the collection (excluding the retrieved ones and the
    ones ``qrels`` marks as relevant), shuffled, and passed to ``query_llm``. An
    answer counts as a match when the ground-truth answer occurs in it,
    case-insensitively (cover Exact Match).

    :param relevant_docs_by_query: retriever output, query id -> {doc id: score}
    :param relevant_ratio: fraction of the LLM context that should be retrieved
        documents, in (0, 1]; e.g. 0.5 adds one noise document per retrieved one
    :param max_queries: number of queries to evaluate; defaults to 6, the size of
        the original smoke test. Pass None to evaluate every query.
    :param seed: optional seed for the noise sampling and shuffling, for reproducible runs
    :return: dict with "matches", "mismatches", "cover_exact_match" (matches / evaluated
        queries, 0.0 when nothing was evaluated) and "answers" (query id -> answer or None)
    :raises ValueError: if ``relevant_ratio`` is not in (0, 1]
    """
    if not 0 < relevant_ratio <= 1:
        raise ValueError("relevant_ratio must be in the interval (0, 1]")

    rng = random.Random(seed)

    # Index the corpus once instead of scanning it for every query.
    corpus_by_id = {}
    for evidence in corpus:
        corpus_by_id.setdefault(_evidence_id(evidence), evidence)

    selected_queries = queries if max_queries is None else queries[:max(0, max_queries)]

    matches = 0
    mismatches = 0
    results = {}
    for query in selected_queries:
        query_id = query.id()

        # Retrieved documents for this query, in the order the retriever returned them.
        retrieved_ids = [str(doc_id) for doc_id in relevant_docs_by_query.get(query_id, {})]
        top_k_relevant_docs = [corpus_by_id[doc_id] for doc_id in retrieved_ids if doc_id in corpus_by_id]

        # Noise must be neither a ground-truth document nor already retrieved,
        # which also rules out duplicates in the combined context.
        gold_ids = {str(doc_id) for doc_id, relevance in qrels.get(query_id, {}).items() if relevance > 0}
        excluded_ids = gold_ids.union(retrieved_ids)

        num_random_docs = int(len(top_k_relevant_docs) * (1 - relevant_ratio) / relevant_ratio)
        random_docs = _sample_noise_docs(corpus, excluded_ids, num_random_docs, rng)

        # Combine relevant and random non-relevant docs; shuffle so position carries no signal.
        combined_docs = top_k_relevant_docs + random_docs
        rng.shuffle(combined_docs)

        # Pass to LLM for evaluation
        answer = query_llm(query.text(), combined_docs)
        ground_truth_answer = question_ground_truth_answer_map[query_id]

        # Cover Exact Match: the ground truth must appear inside the generated answer.
        if answer is not None and ground_truth_answer.lower() in answer.lower():
            matches += 1
        else:
            mismatches += 1

        results[query_id] = answer

    evaluated = matches + mismatches
    return {
        "matches": matches,
        "mismatches": mismatches,
        "cover_exact_match": matches / evaluated if evaluated else 0.0,
        "answers": results,
    }

evaluate_with_random_contexts(response)

### Task 4
In this step, we will adopt a more principled approach to sampling the negative documents used as input to the RAG setup. Using a retrieval model, sample hard negatives from the collection for the
current query instead of random documents to inject as noise. Hard negatives are documents that are related and close to the query in the vector space but do not help answer the question. They can be sampled by retrieving documents, ranked by dot product, that are not in the list of ground-truth documents for the query.

### Task 5
Train a retrieval model using ADORE [14]. ADORE is optimized with hard negatives in a dense retrieval setup. Hence, it may be able to discern more relevant documents from large collections and lead to improved downstream answer generation performance. Using this retriever, retrieve relevant contexts followed by answer generation using LLMs. Compare it to the baseline performance of contriever based LLM QA mentioned in step 2 above.