In [58]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
class QA:
    """
    HuggingFace BERT language model pre-trained on SQUAD.
    Ref: https://huggingface.co/transformers/index.html

    How does BERT answer questions?
    Ref: https://openreview.net/pdf?id=SygMXE2vAE
    """
    def __init__(self, text_file):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

        with open(text_file, 'r') as file:
            self.passage = file.read().replace('\n', ' ')

    def ask(self, question, threshold=1.0):
        """Ask question to QA."""
        score, answer = self.query(question)
        print("NLP score:", score)
        print("Answer:", answer)

        if score > threshold:
            return answer
        else:
            return None

    def query(self, question):
      """
      Query question with reference to the previously given passage.
      Returns (score, answer)
      """
      # Truncate the input passage to fit within the maximum sequence length
      max_passage_length = self.tokenizer.model_max_length
      truncated_passage = self.passage[:max_passage_length - len(question) - 10]  # Subtracting for "[CLS] ", "[SEP] ", and some buffer
      input_text = "[CLS] " + question + " [SEP] " + truncated_passage + " [SEP]"

      input_ids = self.tokenizer.encode(input_text)
      token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
      outputs = self.model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))

      start_logits = outputs.start_logits
      end_logits = outputs.end_logits

      # Convert start_logits and end_logits to tensors if they are not already
      if not isinstance(start_logits, torch.Tensor):
          start_logits = torch.tensor(start_logits)
      if not isinstance(end_logits, torch.Tensor):
          end_logits = torch.tensor(end_logits)

      all_tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
      start_index = torch.argmax(start_logits)
      end_index = torch.argmax(end_logits) + 1
      score = self.compute_score(start_logits, end_logits)
      answer = ' '.join(all_tokens[start_index: end_index])

      return score, answer




    def compute_score(self, start_scores, end_scores):
        """
        Compute the final score based on start and end scores.
        """
        try:
            start_scores = torch.nn.functional.softmax(start_scores, dim=1)
            end_scores = torch.nn.functional.softmax(end_scores, dim=1)
            score = torch.max(start_scores) + torch.max(end_scores)
            return round(score.item(), 3)
        except Exception as e:
            print("Error computing score:", e)
            return 0.0


In [59]:
qa = QA("/content/QACorpus.txt")

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [63]:
score, answer = qa.query("How many USP admits Each year ?")
print("Answer:", answer)
print("Score:", score)

Answer: around 200
Score: 1.486


In [62]:
score, answer = qa.query("How much discount is given for school fees?")
print("Answer:", answer)
print("Score:", score)

Answer: 200
Score: 0.331
