## Environment Setup

Package Installation

In [1]:
# Suppress every warning raised through the `warnings` module from this point on,
# to keep cell output short.
# NOTE(review): this is a blanket filter and also hides warnings that may point at
# real problems — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Install the packages listed in requirements.txt using the interpreter that runs
# this kernel, so they land in the kernel's own environment.
import subprocess
import sys

pip_install_cmd = [sys.executable, "-m", "pip", "install", "-r", "requirements.txt", "--quiet"]

# check_call raises CalledProcessError on a non-zero exit; on success the cell shows 0
subprocess.check_call(pip_install_cmd)

0

In [3]:
import os
import faiss
import torch
from pathlib import Path
from functools import partial
from typing import List, Optional
from tempfile import TemporaryDirectory
from dataclasses import dataclass, field
from datasets import Value, Features, Sequence, load_dataset
from transformers import DPRContextEncoder, DPRContextEncoderTokenizerFast, HfArgumentParser, RagRetriever, RagSequenceForGeneration, RagTokenizer

# Default to the CPU and switch to the GPU only when CUDA is usable
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

## Text Preprocessing Functions

In [4]:
def split_text(text: str, n=100, character=" ") -> List[str]:
    """
        Break a document into passages for indexing.
        The text is split on `character`; every run of up to n consecutive pieces
        is joined back with `character` and stripped of surrounding whitespace.
    """

    pieces = text.split(character)

    passages = []
    for start in range(0, len(pieces), n):
        window = pieces[start : start + n]
        passages.append(character.join(window).strip())

    return passages


def split_documents(documents: dict) -> dict:
    """
        Expand a batch of documents into one row per passage.
        Documents whose text is None are dropped; a None title becomes "".
        Returns a dict with parallel "title" and "text" lists.
    """

    passage_titles, passage_texts = [], []
    for doc_title, doc_text in zip(documents["title"], documents["text"]):
        if doc_text is None:
            continue

        safe_title = "" if doc_title is None else doc_title
        chunks = split_text(doc_text)

        # Every passage carries the title of the document it came from
        passage_titles.extend([safe_title] * len(chunks))
        passage_texts.extend(chunks)

    return {"title": passage_titles, "text": passage_texts}


def embed(documents: dict, ctx_encoder: DPRContextEncoder, ctx_tokenizer: DPRContextEncoderTokenizerFast) -> dict:
    """
        Compute the DPR embeddings of document passages.

        Args:
            documents: batch with parallel "title" and "text" lists.
            ctx_encoder: DPR context encoder; its inputs are moved to the
                module-level `device`, so the encoder should live there too.
            ctx_tokenizer: tokenizer used to encode each (title, text) pair.

        Returns:
            A dict with a single "embeddings" key holding the encoder's
            `pooler_output` converted to a NumPy array.
    """

    input_ids = ctx_tokenizer(
        documents["title"], documents["text"], truncation=True, padding="longest", return_tensors="pt"
    )["input_ids"]

    # Inference only: disable autograd so no computation graph is built per batch
    with torch.no_grad():
        embeddings = ctx_encoder(input_ids.to(device=device), return_dict=True).pooler_output

    return {"embeddings": embeddings.detach().cpu().numpy()}

Note: The following code blocks clean up the dataset and build the question-answer pairs used for the RAG setup.

In [5]:
# Read the raw dataset text. The encoding is passed explicitly so decoding does not
# depend on the platform's default locale encoding.
# NOTE(review): assumes dataset.txt is UTF-8 encoded — confirm.
with open('dataset.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# Replace each pair of consecutive newlines with a single newline
data = data.replace('\n\n','\n')

In [6]:
# Cut the raw text into sections at every '---' separator
data = data.split('---')

# Remove the markdown bold markers ('**') from every section.
# In the section at index 4, a '**' that opens a line is first turned into '###'
# (presumably because that section marks its questions in bold instead of with a
# '###' heading — verify against dataset.txt).
for idx, section in enumerate(data):
    if idx == 4:
        section = section.replace('\n**', '\n###')
    data[idx] = section.replace('**', '')

In [7]:
# Map each '###' heading (the question) to the text that follows it (the answer)
ques_ans = {}
for section in data:
    # Whatever precedes the first '\n###' in a section is not a question and is skipped
    for entry in section.split('\n###')[1:]:
        # First line is the question; the remaining lines are joined into one answer string
        heading, *answer_lines = entry.split('\n')
        ques_ans[heading] = " ".join(answer_lines)

In [8]:
import pandas as pd

# Column-oriented view of the pairs: questions go to 'title', answers to 'text'
qa_dict = {
    'title': list(ques_ans.keys()),
    'text': list(ques_ans.values()),
}

qa_df = pd.DataFrame.from_dict(qa_dict)

In [9]:
from datasets import Dataset

# Build a Hugging Face Dataset from the question/answer DataFrame
dataset = Dataset.from_pandas(qa_df)

# Last expression of the cell: displays the dataset summary
dataset

Dataset({
    features: ['title', 'text'],
    num_rows: 50
})

Note: To use the DPR Context Encoder, the question/answer pairs must be stored as Title/Text columns, because the encoder takes each passage as a (title, text) pair.

In [11]:
# Load the DPR context encoder and its tokenizer from the same checkpoint
DPR_CTX_CHECKPOINT = "facebook/dpr-ctx_encoder-single-nq-base"
ctx_encoder = DPRContextEncoder.from_pretrained(DPR_CTX_CHECKPOINT).to(device=device)
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(DPR_CTX_CHECKPOINT)

# Schema of the dataset once the embeddings column is added. The embeddings are
# declared as float64 here; declaring them as float32 instead is optional and
# would save space.
new_features = Features(
    {
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float64")),
    }
)

# Embed the passages in batches of 8 and attach the result as a new column
embed_batch = partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer)
dataset = dataset.map(embed_batch, batched=True, batch_size=8, features=new_features)

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokeniz

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [12]:
# Use the Faiss implementation of HNSW for fast approximate nearest-neighbour
# search, scored by inner product.
EMBEDDING_DIM = 768  # vector size given to the index; presumably the DPR embedding width — confirm
HNSW_M = 128         # second IndexHNSWFlat argument (HNSW connectivity parameter)

index = faiss.IndexHNSWFlat(EMBEDDING_DIM, HNSW_M, faiss.METRIC_INNER_PRODUCT)

# Kept as the last expression so the cell displays the indexed dataset
dataset.add_faiss_index("embeddings", custom_index=index)

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'title', 'embeddings'],
    num_rows: 50
})

Refer to this HF Repo - https://huggingface.co/facebook/rag-token-nq and Paper - https://arxiv.org/pdf/2005.11401.pdf for more context.

In [14]:
# Retriever that looks passages up in the custom Faiss-indexed dataset
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq", index_name="custom", indexed_dataset=dataset
)

# `force_download=True` was removed: it re-fetched the ~2 GB checkpoint on every run
# even when a valid copy was already in the local cache.
# NOTE(review): a RagSequenceForGeneration model is loaded from the rag-token
# checkpoint — confirm whether RagTokenForGeneration (or the rag-sequence
# checkpoint) was intended.
model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

Downloading config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [15]:
question = "What are the documents required to apply for the new pan"

# Encode the question with the question-encoder tokenizer, generate, then decode
encoded_question = tokenizer.question_encoder(question, return_tensors="pt")
input_ids = encoded_question["input_ids"]
generated = model.generate(input_ids)
decoded_answers = tokenizer.batch_decode(generated, skip_special_tokens=True)
generated_string = decoded_answers[0]

# Show the question followed by the first generated answer
print(question, generated_string.strip(), sep="\n")

What are the documents required to apply for the new pan
a citizenship renunciation letter


In [16]:
# Typo fixed ("WHow" -> "How"): the question text is what the retriever and generator see
question = "How long does it usually take to receive the PAN card after applying?"

# Encode the question, let the RAG model generate, and keep the first decoded answer
input_ids = tokenizer.question_encoder(question, return_tensors="pt")["input_ids"]
generated = model.generate(input_ids)
generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

print(question)
print(generated_string.strip())

WHow long does it usually take to receive the PAN card after applying?
3 weeks


In [17]:
question = "What is the cost/fees of a PAN card?"

# Tokenize the question for the question encoder
question_inputs = tokenizer.question_encoder(question, return_tensors="pt")
input_ids = question_inputs["input_ids"]

# Generate, then decode the first returned sequence without special tokens
generated = model.generate(input_ids)
generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

for line in (question, generated_string.strip()):
    print(line)

What is the cost/fees of a PAN card?
us $ 2,500


## Not proceeding further, as the results are not good enough