## NLP Assignment: RAGs For Open Domain Complex QA


In [65]:
from dexter.data.loaders.RetrieverDataset import RetrieverDataset
from dexter.retriever.dense.Contriever import Contriever
from dexter.config.constants import Split
from typing import List
from typing import Dict
from dexter.data.datastructures.question import Question
from dexter.data.datastructures.evidence import Evidence
from dexter.utils.metrics.retrieval.RetrievalMetrics import RetrievalMetrics
from dexter.utils.metrics.SimilarityMatch import CosineSimilarity as CosScore
from dexter.utils.metrics.CoverExactMatch import CoverExactMatch
from dexter.data.datastructures.hyperparameters.dpr import DenseHyperParams
import json
import torch
import random

In [41]:
# Check torch version and make sure cuda is enabled and available
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.version.cuda)

2.5.1
True
12.4


In [58]:
# Since china doesn't have access to huggingface, I have manually downloaded the model, feel free to comment this.
from transformers import AutoTokenizer, AutoModel
model_path = '../contriever'
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)

#model_path = "facebook/contriever" <- but uncomment this

### Task 1
Use off the shelf retriever (contriever) and extract contexts for each query to be given as input to a generative model. Use Exact Match or cover Exact Match as metric for evaluating generated answers. Experiment with k=1,3,5 for retrieving top-k contexts and report the performance on generating answers.

In [62]:
from dexter.llms.openai_engine import OpenAIEngine
from dexter.llms.llm_engine_orchestrator import LLMEngineOrchestrator
from dexter.data.loaders.WikiMultihopQADataLoader import WikiMultihopQADataLoader

config_instance = DenseHyperParams(query_encoder_path=model_path,
                                    document_encoder_path=model_path
                                    ,batch_size=32)

loader = RetrieverDataset("wikimultihopqa","wiki_musique_corpus","config.ini",Split.DEV,tokenizer=None)
result = loader.qrels()
queries: List[Question]
qrels: Dict[str, Dict[str, int]] # qrels[question id][evidence id] = 1/0 true false (i think) if relevant
corpus: List[Evidence]
queries, qrels, corpus = result
con = Contriever(config_instance)

# Setup LLM

config_instance = LLMEngineOrchestrator()
llm_instance = config_instance.get_llm_engine(data="",llm_class="openai",model_name="gpt-3.5-turbo")


# Things to consider below:

# loader = WikiMultihopQADataLoader("wikimultihopqa", config_path="est_config.ini", split=Split.DEV, batch_size=10)
# assert len(loader.raw_data) == len(loader.dataset)
# self.assertTrue(isinstance(loader.dataset, DprDataset))
# self.assertTrue(isinstance(loader.tokenizer, Tokenizer))
# self.assertTrue(isinstance(loader.raw_data[0], Sample))
# self.assertTrue(isinstance(loader.raw_data[0].evidences, Evidence))

## wikimultihop <- it had this in the google colab notebook, should we use it?
# with open("data/wiki_musique_corpus.json") as f:
#     corpus = json.load(f)

Loading passages: 100%|██████████| 563424/563424 [00:00<00:00, 1574308.94it/s]
Transforming passage dataset: 100%|██████████| 563424/563424 [00:00<00:00, 613485.72it/s]


Harley-Davidson Harley-Davidson
KeysView(<Section: Data-Path>)
12576


100%|██████████| 1200/1200 [01:46<00:00, 11.30it/s]


In [63]:
similarity_measure = CosScore()
k_values = [1,2,3]
metrics = RetrievalMetrics(k_values=k_values)

# Retrieve top-k contexts and evaluate with Exact Match
def evaluate_with_top_k(k_values: List[int]) -> Dict[str, Dict[str, float]]:
    response = con.retrieve(corpus, queries, top_k=max(k_values), score_function=similarity_measure)
    print("Indices retrieved:", len(response))
    print(metrics.evaluate_retrieval(qrels=qrels, results=response))
    return response

response: Dict[str, Dict[str, float]] = evaluate_with_top_k(k_values)

token_emb torch.Size([1200, 35, 768])
sentence_emb torch.Size([1200, 768])


  0%|          | 64/563424 [00:00<18:24, 510.17it/s]

Starting encoding of contexts....


100%|██████████| 563424/563424 [25:59<00:00, 361.40it/s]


context_embeddings torch.Size([563424, 768])
Indices retrieved: 1200
({'NDCG@1': 0.425, 'NDCG@2': 0.38567, 'NDCG@3': 0.33839}, {'MAP@1': 0.04277, 'MAP@2': 0.06457, 'MAP@3': 0.07541}, {'Recall@1': 0.04277, 'Recall@2': 0.07528, 'Recall@3': 0.09379}, {'P@1': 0.425, 'P@2': 0.37417, 'P@3': 0.31083})


### Task 2
Repeat the above experiment without the retriever, using only oracle contexts as input. Oracle
contexts are annotated documents provided for each question in dev.json.

### Task 3
Now randomly sample documents from the collection that are not relevant to the current query during inference on the evaluation set. Combine these documents with the top-k relevant documents and use them as input to the LLM for answering a query. You can decide the ratios to mix the relevant and the random documents that serve as noise. Analyze the performance.

In [72]:
from dexter.llms.llm_engine_orchestrator import LLMEngineOrchestrator
config_instance = LLMEngineOrchestrator()
llm_instance = config_instance.get_llm_engine(data="",llm_class="openai",model_name="gpt-3.5-turbo")

def evaluate_with_random_contexts(relevant_docs_by_query, relevant_ratio=0.5):
    results = {}
    i=0
    for query in queries:
        query_id = query.id()
        top_k_relevant_docs_ids = list(relevant_docs_by_query[query_id].keys())
        top_k_relevant_docs = [evidence for evidence in corpus if evidence.id in top_k_relevant_docs_ids]
        
        # Get non relevant docs from qrels (where qrels[query_id][doc_id] = 0)
        non_relevant_doc_ids = {doc_id for doc_id, relevance in qrels.get(query_id, {}).items() if relevance == 0}
        random_non_relevant_docs = [evidence for evidence in corpus if evidence.id in non_relevant_doc_ids]
        num_random_docs = int(len(top_k_relevant_docs) * (1 - relevant_ratio) / relevant_ratio)
        # sampled_docs = random_non_relevant_docs[:num_random_docs]
        random_docs = random.sample(random_non_relevant_docs, min(num_random_docs, len(non_relevant_doc_ids)))
        
        # Combine relevant and random non-relevant docs
        combined_docs = top_k_relevant_docs + random_docs
        
        # Pass to LLM for evaluation
        docs_as_string = "\n".join([doc.text for doc in combined_docs])
        answer = llm_instance.get_chat_completion(query.text, docs_as_string)
        # print(answer)
        if i < 5:
            print(answer)
            i+=1
        
        # Evaluate with Exact Match
        # results[query_id] = None

evaluate_with_random_contexts(response)

KeyError: 'huggingface_token'

### Task 4
 In this step, we will adopt a more principled approach to sample negative documents to be used as input to the RAG setup. Using a retrieval model, sample hard negatives from the collection for the
current query instead of random documents to inject as noise. hard negatives are documents that are related and close to the query in the vector space but do not help answer the question. This can be sampled by retrieving documents not in the list of ground truth documents for a query as measure by dot product.

### Task 5
Train a retrieval model using ADORE [14]. ADORE is optimized with hard negatives in a dense retrieval setup. Hence, it may be able to discern more relevant documents from large collections and lead to improved downstream answer generation performance. Using this retriever, retrieve relevant contexts followed by answer generation using LLMs. Compare it to the baseline performance of contriever based LLM QA mentioned in step 2 above.