# Dependencies

In [1]:
!pip install langchain transformers requests 'PyPDF2<3.0' pdfminer.six faiss-cpu



In [2]:
import torch
import numpy as np
import faiss
import PyPDF2
import os

from transformers import BertTokenizer, BertModel
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer, BartForQuestionAnswering
from transformers import BartForConditionalGeneration, BartTokenizer, AutoTokenizer, AutoModelWithLMHead, T5ForConditionalGeneration, T5Tokenizer


from langchain.embeddings import HuggingFaceEmbeddings
from langchain import text_splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

In [3]:
# Use the first CUDA device when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
    print("Training on GPU")
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

Training on GPU


# Loading text from file

In [4]:
file_url = "https://arxiv.org/pdf/1706.03762.pdf"
file_path = "assets/attention.pdf"

if not os.path.exists('assets'):
    os.mkdir('assets')

if not os.path.isfile(file_path):
    !curl -o $file_path $file_url
else:
    print("File already exists!")


File already exists!


# RAG

In [5]:
class Retriever:
  """Dense passage retriever over the text of a single PDF.

  The PDF text is split into token-bounded chunks, every chunk is embedded
  with a DPR context encoder, and the embeddings are stored in a flat L2 FAISS
  index. A query is embedded with the DPR question encoder and answered by a
  nearest-neighbour search over that index.

  Call order: ``load_chunks()`` -> ``load_context_embeddings()`` ->
  ``retrieve_top_k(query)``.
  """

  def __init__(self, file_path, device, context_model_name, question_model_name):
    """Load the DPR tokenizers/encoders and move the encoders to ``device``.

    Args:
      file_path: Path of the PDF to index.
      device: torch device used for both encoders and their inputs.
      context_model_name: Checkpoint name for the context (passage) encoder.
      question_model_name: Checkpoint name for the question encoder.
    """
    self.file_path = file_path
    self.device = device

    self.context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(context_model_name)
    self.context_model = DPRContextEncoder.from_pretrained(context_model_name).to(device)

    self.question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(question_model_name)
    self.question_model = DPRQuestionEncoder.from_pretrained(question_model_name).to(device)

  def extract_text_from_pdf(self, file_path):
    """Return the concatenated text of every page of the PDF at ``file_path``."""
    with open(file_path, 'rb') as file:
      reader = PyPDF2.PdfReader(file)
      # `or ''` keeps the join safe if a page yields no text (empty/None).
      return ''.join(page.extract_text() or '' for page in reader.pages)

  def get_text(self):
    """Return the full text of the PDF this retriever was created for."""
    return self.extract_text_from_pdf(self.file_path)

  def token_len(self, text):
    """Return the number of ids the context tokenizer's ``encode`` gives for ``text``."""
    return len(self.context_tokenizer.encode(text))

  def load_chunks(self, chunk_size=50, chunk_overlap=20):
    """Extract the PDF text and split it into overlapping chunks.

    Stores the raw text in ``self.text`` and the chunk list in ``self.chunks``.

    Args:
      chunk_size: Maximum chunk length, measured with ``token_len``.
      chunk_overlap: Overlap between consecutive chunks, in the same unit.
    """
    self.text = self.extract_text_from_pdf(self.file_path)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=self.token_len,
        separators=["Section", "\n\n", "\n", ".", " ", ""]
    )

    self.chunks = splitter.split_text(self.text)

  def load_context_embeddings(self, batch_size=32):
    """Embed every chunk and build the FAISS index over the embeddings.

    Stores the embedding matrix in ``self.token_embeddings`` and the index in
    ``self.index``. Chunks are encoded ``batch_size`` at a time so that a long
    document does not have to fit through the encoder in one forward pass.

    Args:
      batch_size: Number of chunks encoded per forward pass (must be >= 1).

    Raises:
      ValueError: If ``batch_size`` is not positive or there are no chunks.
    """
    if batch_size < 1:
      raise ValueError("batch_size must be a positive integer")
    if not self.chunks:
      raise ValueError("No chunks to embed; the document produced no text chunks")

    batch_embeddings = []
    with torch.no_grad():
      for start in range(0, len(self.chunks), batch_size):
        batch = self.chunks[start:start + batch_size]
        encoded_input = self.context_tokenizer(
            batch, return_tensors='pt', padding=True, truncation=True, max_length=300
        ).to(self.device)
        model_output = self.context_model(**encoded_input)
        batch_embeddings.append(model_output.pooler_output.cpu().numpy())

    # FAISS expects a contiguous float32 matrix of shape (n_chunks, dim).
    self.token_embeddings = np.ascontiguousarray(
        np.concatenate(batch_embeddings, axis=0), dtype=np.float32
    )

    self.index = faiss.IndexFlatL2(self.token_embeddings.shape[1])
    self.index.add(self.token_embeddings)

  def retrieve_top_k(self, query_prompt, k=10):
    """Return up to ``k`` chunks nearest to ``query_prompt``, closest first.

    Newlines inside each returned chunk are replaced with spaces.

    Args:
      query_prompt: The question text.
      k: Maximum number of chunks to return; clamped to the index size.

    Returns:
      A list of chunk strings (empty when ``k`` or the index size is 0).
    """
    # Asking FAISS for more neighbours than it holds pads the result with -1,
    # which would silently index the *last* chunk; clamp k up front.
    k = min(k, self.index.ntotal)
    if k <= 0:
      return []

    encoded_query = self.question_tokenizer(
        query_prompt, return_tensors="pt", truncation=True, padding=True
    ).to(self.device)

    with torch.no_grad():
      query_vector = self.question_model(**encoded_query).pooler_output

    query_vector_np = query_vector.cpu().numpy()
    _, neighbour_ids = self.index.search(query_vector_np, k)

    # Defensive filter for any remaining "no neighbour" (-1) slots.
    return [self.chunks[i].replace('\n', ' ') for i in neighbour_ids[0] if i >= 0]

In [6]:
class RAG:
    """Retrieval-augmented question answering over a single PDF.

    A ``Retriever`` supplies the passages most relevant to a question; a BART
    checkpoint then either extracts an answer span from them
    (``extractive_query``) or generates free text (``abstractive_query``).

    The extractive model is loaded in ``__init__``. The generation model is
    loaded on the first ``abstractive_query`` call and kept in separate
    attributes, so the two query modes never overwrite each other's model.
    """

    def __init__(self,
                 file_path,
                 device,
                 context_model_name="facebook/dpr-ctx_encoder-multiset-base",
                 question_model_name="facebook/dpr-question_encoder-multiset-base",
                 generator_name="valhalla/bart-large-finetuned-squadv1"):
        """Load the QA model and build the retriever index for ``file_path``.

        Args:
            file_path: Path of the PDF to answer questions about.
            device: torch device used for all models and their inputs.
            context_model_name: DPR context encoder checkpoint.
            question_model_name: DPR question encoder checkpoint.
            generator_name: BART checkpoint used for answering.
        """
        self.device = device
        self.generator_name = generator_name
        self.generator_tokenizer = AutoTokenizer.from_pretrained(generator_name)
        self.generator_model = BartForQuestionAnswering.from_pretrained(generator_name).to(device)

        # Filled in lazily by _load_abstractive_generator().
        self._abstractive_tokenizer = None
        self._abstractive_model = None

        self.retriever = Retriever(file_path, device, context_model_name, question_model_name)
        self.retriever.load_chunks()
        self.retriever.load_context_embeddings()

    def _load_abstractive_generator(self):
        """Return the (tokenizer, model) pair for generation, loading it once."""
        if getattr(self, "_abstractive_model", None) is None:
            self._abstractive_tokenizer = BartTokenizer.from_pretrained(self.generator_name)
            self._abstractive_model = (
                BartForConditionalGeneration.from_pretrained(self.generator_name).to(self.device)
            )
        return self._abstractive_tokenizer, self._abstractive_model

    def abstractive_query(self, question):
        """Generate a free-text answer to ``question`` from the top-5 passages.

        Returns:
            The decoded generation, with special tokens removed.
        """
        tokenizer, model = self._load_abstractive_generator()

        context = self.retriever.retrieve_top_k(question, k=5)
        input_text = "answer: " + " ".join(context) + " " + question

        inputs = tokenizer.encode(
            input_text, return_tensors='pt', max_length=500, truncation=True
        ).to(self.device)
        outputs = model.generate(
            inputs, max_length=150, min_length=2, length_penalty=2.0,
            num_beams=4, early_stopping=True
        )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    def extractive_query(self, question):
        """Extract an answer span for ``question`` from the top-15 passages.

        The span runs from the arg-max start logit to the arg-max end logit.

        Returns:
            The decoded span with newlines replaced by spaces, surrounding
            whitespace stripped and ``$`` characters removed.
        """
        context = self.retriever.retrieve_top_k(question, k=15)
        inputs = self.generator_tokenizer(
            question, "context: " + ". ".join(context),
            return_tensors="pt", truncation=True, padding=True
        ).to(self.device)

        with torch.no_grad():
            outputs = self.generator_model(**inputs)

        answer_start_index = int(outputs.start_logits.argmax())
        answer_end_index = int(outputs.end_logits.argmax())

        # Heuristic: when the two arg-maxes come out reversed, swap them so the
        # slice below is never empty.
        if answer_end_index < answer_start_index:
            answer_start_index, answer_end_index = answer_end_index, answer_start_index

        predict_answer_tokens = inputs["input_ids"][0, answer_start_index:answer_end_index + 1]
        answer = self.generator_tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)
        answer = answer.replace('\n', ' ').strip()
        answer = answer.replace('$', '')

        return answer

In [7]:
# DPR encoder checkpoints (context encoder, question encoder).
# Alternative pair (multiset variants):
#   "facebook/dpr-ctx_encoder-multiset-base"
#   "facebook/dpr-question_encoder-multiset-base"
context_model_name, question_model_name = (
    "facebook/dpr-ctx_encoder-single-nq-base",
    "facebook/dpr-question_encoder-single-nq-base",
)

In [8]:
# Build a standalone retriever with the encoder checkpoints chosen above:
# extract and chunk the PDF text, then embed the chunks and index them.
retriever = Retriever(file_path, device, context_model_name, question_model_name)
retriever.load_chunks()
retriever.load_context_embeddings()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.weight', 'ctx_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

In [9]:
# Sample questions about the paper; several are paraphrases of one another.
queries = [
    "What is the self-attention mechanism also known as?",
    "What is another name for self-attention?",
    "What are the benefits of using multiple attention heads?",
    "Why do we use multiple attention heads?",
    "What is the benefit of multi-head attention?",
    "Can you explain the Transformer architecture to me in simple terms?",
    "How is the self-attention mechanism different from other attention mechanisms?",
    "In what ways does self-attention improve model performance?",
    "What's the purpose behind using self-attention in the Transformer?",
    "What problem does multi-head attention solve in the Transformer architecture?",
    "How does the Transformer model use position encodings?",
    "What are the main components of the Transformer architecture?",
    "Describe the role of key-value pairs in the attention mechanism.",
    "How does attention mechanism handle sequence order?",
    "Why are positional encodings crucial in Transformers?",
    "Can you outline the advantages of the Transformer model over RNNs?",
]
# Single query used for the retrieval preview that follows.
query = queries[0]

In [10]:
# Show the passages retrieved for the sample query, numbered from 1.
retrieved_passages = retriever.retrieve_top_k(query)
for rank, passage in enumerate(retrieved_passages, start=1):
  print(f'{rank}: \t {passage}\n')


1: 	 described in section 3.2. Self-attention, sometimes called intra-attention is an attention mechanism relating different positions

2: 	 attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention.

3: 	 Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been

4: 	 3.2 Attention An attention function can be described as mapping a query and a set of key-value pairs to an output,

5: 	 during training. 4 Why Self-Attention In this section we compare various aspects of self-attention layers to the recurrent and convolu-

6: 	 attention over the output of the encoder stack. Similar to the encoder, we employ residual connections around each of the sub-layers, followed by layer normalization. We also modify the self-attention

7: 	 because it may allow the model to extrapolate 

In [11]:
# Build the full pipeline (QA model + retriever index) for the PDF.
# NOTE(review): context_model_name / question_model_name set earlier in the
# notebook are not passed here, so RAG falls back to its own default
# checkpoints — confirm this is intended.
rag = RAG(file_path, device)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-multiset-base were no

In [12]:

# Run every sample question through the extractive pipeline and print the result.
for query in queries:
  answer = rag.extractive_query(query)
  print(f'Question: {query} \n Answer: {answer}')
  print("\n")

Question: What is the self-attention mechanism also known as? 
 Answer: intra-attention is an attention mechanism relating different positions. attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention.. Self-attention, sometimes called intra-attention


Question: What is another name for self-attention? 
 Answer: intra-attention


Question: What are the benefits of using multiple attention heads? 
 Answer: reduced effective resolution


Question: Why do we use multiple attention heads? 
 Answer: to efficiently handle large inputs and outputs


Question: What is the benefit of multi-head attention? 
 Answer: significantly faster


Question: Can you explain the Transformer architecture to me in simple terms? 
 Answer: stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,. To evaluate the importance of different components of the T

Notes on the text splitter:
- Prioritizing periods (".") over whitespace (" ") as a separator produced more cohesive English phrases, but also more chunks consisting only of decimal numbers.
- Including newlines ("\n") as a separator led to undesirable chunking of figures/diagrams, because many of their components are separated by a newline.