In [1]:
# looking at the squad dataset
# getting some of the imports
import requests
import json
import pandas as pd

import torch
from typing import Optional, Union
import pickle



In [2]:
files = ['train-v2.0.json', 'dev-v2.0.json']

In [3]:
url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"

In [4]:
# now getting the files 
for file in files:
  res = requests.get(url + file)
  if res.status_code == 200:
    
    with open(file, mode="wb") as f:
     # writing as a chunks 
     for chunk in res.iter_content(chunk_size=50):
       f.write(chunk)



In [7]:
# reading in one of the files to be used with json
with open("dev-v2.0.json", mode="rb")as f:
  dev = json.load(f)

with open("train-v2.0.json", mode="rb")as f:
  train = json.load(f)

In [None]:
train.keys()

dict_keys(['version', 'data'])

In [None]:
train["data"][0]["title"]

'Beyoncé'

In [8]:
squad_list = []
context_list = []

Getting the data into a format that can be used

In [11]:
# now getting the info out of the train
for paragraph_dict in train["data"]:
  for context_with_qa_dict in paragraph_dict["paragraphs"]:
    context = context_with_qa_dict["context"]
    # doing the looping through the question and the answers dictionaries
    for qa_pair_dict in context_with_qa_dict["qas"]:
      if "answers" in qa_pair_dict and len(qa_pair_dict["answers"]) > 0:
          answer = qa_pair_dict['answers'][0]['text']
      elif "plausible_answers" in qa_pair_dict and len(qa_pair_dict["plausible_answers"]) > 0:
        answer = qa_pair_dict["plausible_answers"][0]["text"]
      else:
        answer = None
      # now making the dictionary that will be added to the list 
      squad_list.append({"context": context, "question": qa_pair_dict["question"], "answer": answer})
      # making a list that just contains the context in a list
      context_list.append(context)
      


In [8]:
len(squad_list)

130319

In [15]:
# saving 10000 of the contexts from the squad list
with open("contexts", mode="wb") as f:
    pickle.dump(context_list, f)

In [None]:
# creating the dataframe
df = pd.DataFrame(squad_list)

In [None]:
print(df.shape)
df.head()

(130319, 3)


Unnamed: 0,context,question,answer
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s


In [None]:
df.loc[0, "context"]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [8]:
# now will save the data into a json type object
with open("train_squad.json", mode="w", )as f:
  json.dump(squad_list, f)

In [5]:
dev_list = []

Making the data that for the dev into the format same as the 
train data.

In [9]:
# doing the dev to make it so that it is in the same format as the 
# train.
for paragraph_dict in dev["data"]:
  for each_context_dict in paragraph_dict["paragraphs"]:
    context = each_context_dict["context"]
    for qa_pair_dict in each_context_dict["qas"]:
      if "answers" in qa_pair_dict and len(qa_pair_dict["answers"]) > 0:
        # doing the looping through the answers
        answer_list = qa_pair_dict["answers"]
      elif "plausible_answers" in qa_pair_dict and len(qa_pair_dict["plausible_answers"]) > 0:
        answer_list = qa_pair_dict["plausible_answers"]
      else:
        answer_list = []
      # now doing the making of the list  
      answer = [item["text"] for item in answer_list]
      # removing any duplicates
      answer = list(set(answer))
      
      # adding to the dev list
      dev_list.append({"context": context, "question": qa_pair_dict["question"], "answer": answer})


In [11]:
len(dev_list)

11873

In [None]:
# building a dataframe of the dev_list
dev_dataframe = pd.DataFrame(dev_list)
dev_dataframe.head()

Unnamed: 0,context,question,answer
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,[France]
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,"[in the 10th and 11th centuries, 10th and 11th..."
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"[Denmark, Iceland and Norway]"
3,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,[Rollo]
4,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,"[10th century, 10th, the first half of the 10t..."


In [12]:
# making a json file of the new formatted dev
with open("dev_squad.json", mode="w")as f:
  json.dump(dev_list, f)

In [13]:
# now making the first model 
#pip install transformers -q
import transformers
from transformers import BertForQuestionAnswering, BertTokenizer
from transformers import pipeline

In [14]:
# making the model
model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")
tokenizer = BertTokenizer.from_pretrained("deepset/bert-base-cased-squad2")

In [18]:
# the span of the questions that we will send into the tokenizer
my_span = squad_list[:5]
my_span

[{'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
  'question': 'When did Beyonce start becoming popular?',
  'answer': 'in the late 1990s'},
 {'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas

In [19]:
# putting the 
questions = [item["question"] for item in my_span]
contexts = [item["context"] for item in my_span]
answers = [item["answer"] for item in my_span]

In [16]:
# making the function that will make the "token_type_ids"
# The token type ids are used by hugging face to use the bert model
# to answer the question.
def get_token_type_ids(input_id:list):
  t_index = input_id.index(tokenizer.sep_token_id)
  # everything from 0 upto and including the sep token are in the first segment
  front = [0] * t_index
  back = [1] * (len(input_id) - len(front))
  c = front + back
  return c

In [None]:
questions

['When did Beyonce start becoming popular?',
 'What areas did Beyonce compete in when she was growing up?',
 "When did Beyonce leave Destiny's Child and become a solo singer?",
 'In what city and state did Beyonce  grow up? ',
 'In which decade did Beyonce become famous?']

In [None]:
# doing some encoding of the text
input_ids = tokenizer.encode(questions[0], contexts[0])
t = len(input_ids)
print(f"The type of the input_ids is {type(input_ids)}")
print(f"The length of the input_ids is: {t}")


The type of the input_ids is <class 'list'>
The length of the input_ids is: 173


In [None]:
tokenizer.cls_token_id

101

In [None]:
contexts[0]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [17]:
# making a function that will be doing the encoding and print out the 
# question and the answers


def answer_question(question:[list, str], context:[list, str], 
                    show_real_answer: Optional[Union[list, str]]=None, printing=True):
  # single context is a flag to wheather their is just on context for all 
  # the questions
  """
  question:  These are the questions passed in can be a string or a list of strings.

  context:  The context:  can be a list of strings or just one string to be used repeatedly
  for each of the questions.

  show_real_answer:  if not None then will be a list of the real answers or just a single answer
  that will be printed out with the predicted answer.

  print:   if set to False then the answer and the question and the real_answer are 
  returned in a dictionary if return_as_dict is set to true or else they are 
  returned as part of a generator.

 

  """
  
  single_context = True
  imput_ids = None
  qa = None 

  if printing == False:
    qa = {"pred_ans":[],
                   "questions":[]}
    if show_real_answer != None:
      qa["true_ans"] = []

  if isinstance(context, list):
    single_context = False

  if not isinstance(question, list):
    question = [question]

  if isinstance(show_real_answer, str):
    show_real_answer = [show_real_answer]
  
  
  # running the for loop
  for i, q in enumerate(question):
    if single_context:
      input_ids = tokenizer.encode(q,context, # padding="max_length", 
                                   truncation=True,)
    else:
      input_ids = tokenizer.encode(q, context[i], # padding="max_length", 
                                   truncation=True,
                                   )
    
    # now getting the list of "token_type_ids"
    token_type_ids = get_token_type_ids(input_ids)

    if len(input_ids) != len(token_type_ids):
      continue 
    # running them in the model
    output = model(torch.tensor([input_ids]), 
                   token_type_ids=torch.tensor([token_type_ids]), return_dict=True)
    # getting the span
    start = output.start_logits
    end = output.end_logits
    # creating the span
    answer_start = torch.argmax(start)
    answer_end = torch.argmax(end) + 1 # the + 1 is added to make when doing a 
                                      # a span getting the right length

    # getting the answer for the question
    ans = tokenizer.decode(input_ids[answer_start: answer_end])
    
    if printing:
      
      # printing out the question and the answer
      print(f"question:  {q}")
      print(f"answer:  {ans}")
      if show_real_answer:
        print(f"The real answer is:  {show_real_answer[i]}")
      print()
    elif not printing:
      # adding to the dictionary
      qa["pred_ans"].append(ans)
      qa["questions"].append(q)
      if show_real_answer != None:
        qa["true_ans"].append(show_real_answer[i])

  if not printing:
    return qa
    
  

In [19]:
# using the function above
answer_question(question= questions[0], context=contexts[0])


question:  When did Beyonce start becoming popular?
answer:  late 1990s



In [20]:
# running through the first five in the squad
answer_question(question=questions, context=contexts, show_real_answer=answers)

question:  When did Beyonce start becoming popular?
answer:  late 1990s
The real answer is:  in the late 1990s

question:  What areas did Beyonce compete in when she was growing up?
answer:  singing and dancing
The real answer is:  singing and dancing

question:  When did Beyonce leave Destiny's Child and become a solo singer?
answer:  2003
The real answer is:  2003

question:  In what city and state did Beyonce  grow up? 
answer:  Houston, Texas
The real answer is:  Houston, Texas

question:  In which decade did Beyonce become famous?
answer:  1990s
The real answer is:  late 1990s



In [21]:
# now using the pipeline to do the same thing
myPipe = pipeline(task="question-answering", model=model, tokenizer=tokenizer)


In [None]:
# finding out the answer with the pipeline
myPipe(context=contexts[0], question=questions[0])

{'answer': 'late 1990s', 'end': 286, 'score': 0.562135636806488, 'start': 276}

In [None]:
# using the pipe for the first five of the squad
# dataset
for i in range(5):
  print(questions[i])
  print(myPipe({"question":questions[i], "context":contexts[i]}))
  print(f"The real answer is:  {answers[i]}")
  print()

When did Beyonce start becoming popular?
{'score': 0.562135636806488, 'start': 276, 'end': 286, 'answer': 'late 1990s'}
The real answer is:  in the late 1990s

What areas did Beyonce compete in when she was growing up?
{'score': 0.9938411116600037, 'start': 207, 'end': 226, 'answer': 'singing and dancing'}
The real answer is:  singing and dancing

When did Beyonce leave Destiny's Child and become a solo singer?
{'score': 0.996565580368042, 'start': 525, 'end': 532, 'answer': '(2003),'}
The real answer is:  2003

In what city and state did Beyonce  grow up? 
{'score': 0.8477826118469238, 'start': 166, 'end': 181, 'answer': 'Houston, Texas,'}
The real answer is:  Houston, Texas

In which decade did Beyonce become famous?
{'score': 0.677906334400177, 'start': 281, 'end': 286, 'answer': '1990s'}
The real answer is:  late 1990s



In [22]:
# will make another set of the questions and answers and context for about 100
# in the squad list and then will run them through the answer function and will
# total up the amount of exact matches
qa = {
    "questions":[],
    "answers": [], 
    "contexts": []
}
for item in squad_list[:100]:
  qa["questions"].append(item["question"])
  qa["answers"].append(item["answer"])
  qa["contexts"].append(item["context"])

print(len(qa["contexts"]))

100


In [22]:
# running through the questions and the answers and add up the number
# of exact matches.
ans_dict = answer_question(question=qa["questions"], show_real_answer=qa["answers"], context=qa["contexts"], printing=False)

In [None]:
exact_match = [] # this list will contain weather there is an exact match
                # 1 will be appended if is an exact match and 0 if not.
for i in range(len(ans_dict["questions"])):
  if ans_dict["pred_ans"][i] == ans_dict["true_ans"][i]:
    exact_match.append(1)
  else:
    exact_match.append(0)

In [None]:
for i in range(len(ans_dict["questions"])):
  pred = ans_dict["pred_ans"][i]
  ans = ans_dict["true_ans"][i]
  print(f"{exact_match[i]}  pred = {pred}   true = {ans}")

0  pred = late 1990s   true = in the late 1990s
1  pred = singing and dancing   true = singing and dancing
1  pred = 2003   true = 2003
1  pred = Houston, Texas   true = Houston, Texas
0  pred = 1990s   true = late 1990s
1  pred = Destiny's Child   true = Destiny's Child
1  pred = Dangerously in Love   true = Dangerously in Love
1  pred = Mathew Knowles   true = Mathew Knowles
1  pred = late 1990s   true = late 1990s
1  pred = lead singer   true = lead singer
1  pred = Dangerously in Love   true = Dangerously in Love
1  pred = 2003   true = 2003
1  pred = five   true = five
1  pred = lead singer   true = lead singer
1  pred = Dangerously in Love   true = Dangerously in Love
1  pred = acting   true = acting
1  pred = Jay Z   true = Jay Z
1  pred = six   true = six
1  pred = Dreamgirls   true = Dreamgirls
1  pred = 2010   true = 2010
1  pred = Beyoncé   true = Beyoncé
1  pred = Cadillac Records   true = Cadillac Records
1  pred = June 2005   true = June 2005
1  pred = B'Day   true = B'Da

In [None]:
# summing up the exact_match
the_sum = sum(exact_match)
the_sum

theLen = len(exact_match)
# the percentage of exact match
print(f"Out of {theLen}, {the_sum} are an exact match making the percentage as {(the_sum/theLen) * 100}%")

Out of 100, 90 are an exact match making the percentage as 90.0%


In [2]:
# we are going to regex to filter out those matches of punctuations and such
import re

In [23]:
exact_match = []
# looping from the predicted and true answers found in the dictionary
for i in range(len(ans_dict["pred_ans"])):
  pred = re.sub("[^0-9a-z ]", "", ans_dict["pred_ans"][i].lower())
  true = re.sub("[^0-9a-z ]", "", ans_dict["true_ans"][i].lower())

  if pred == true:
    exact_match.append(1)
  else:
    exact_match.append(0)


In [24]:
# checking to see if this will change the number of correct matches
the_sum = sum(exact_match)
print(f"The number of exact matches is {the_sum}")

The number of exact matches is 92


Going to use the ROUGE to check the exact match.

In [3]:
pip install rouge -q

Note: you may need to restart the kernel to use updated packages.


In [13]:
from rouge import  Rouge

In [32]:
# checking of the difference between two different outputs
model_output = "hello how are you doing?"
real = "hello how are you?"

rouge = Rouge()

rouge.get_scores(model_output, real)

[{'rouge-1': {'f': 0.6666666617283951, 'p': 0.6, 'r': 0.75},
  'rouge-2': {'f': 0.5714285665306124, 'p': 0.5, 'r': 0.6666666666666666},
  'rouge-l': {'f': 0.6666666617283951, 'p': 0.6, 'r': 0.75}}]

Now Using the ROUGE to get scores from the model used above

In [None]:
rouge.get_scores(hyps=pred,  refs=true, avg=True)


{'rouge-1': {'f': 0.999999995, 'p': 1.0, 'r': 1.0},
 'rouge-2': {'f': 0.999999995, 'p': 1.0, 'r': 1.0},
 'rouge-l': {'f': 0.999999995, 'p': 1.0, 'r': 1.0}}

In [None]:
from tqdm import  tqdm

In [18]:
# making a list of the context and the questions and the true_ans
answers = [item["answer"] for item in squad_list]
questions = [item["question"] for item in squad_list]
contexts = [item["context"] for item in squad_list]

In [None]:
# {"context": context, "question": qa_pair_dict["question"], "answer": answer}

In [27]:
# using the answer_question to now do it for 10,000
the_dict = {"true":[],
            "pred":[]
            }
ans = answer_question(question=questions[:10000], context=contexts[:10000], printing=False, show_real_answer=answers[:10000])


  
  



In [None]:
# now saving the ans so we don't have to run the above 
# again
with open("squad_ans", mode="wb") as f:
    pickle.dump(ans, f)

In [9]:
# bringing back from the file the ans that is found in "squad_ans"
with open("squad_ans", mode="rb") as f:
    ans = pickle.load(f)

In [10]:
ans.keys()

dict_keys(['pred_ans', 'questions', 'true_ans'])

In [11]:
# compiling a regex that will be used to clean the lists
# this one will find all things that are not letters and numbers and remove them
clean_compile = re.compile("(?i)[^0-9a-z\s]")

pred_ans = [clean_compile.sub(" ", text) for text in ans["pred_ans"]]
true_ans = [clean_compile.sub(" ", text) for text in ans["true_ans"]]

In [12]:
# looking at the result of the the regex did to clean the pred_ans and the true_ans
pred_ans[:4]

['late 1990s', 'singing and dancing', '2003', 'Houston  Texas']

In [16]:
# will now be putting "None" in the hypothesis (pred_ans) when there is an empty string
# doing another compiled regex
re_compiled = re.compile("(?i)[0-9a-z]")

In [17]:
# looking for those that have empty string as the answer in the pred and replace with "None"
for i in range(len(pred_ans)):
  val = re_compiled.search(pred_ans[i])
  if val == None:
    pred_ans[i] = "None"

In [18]:
# looking to see what the similarity of the the true and the pred ans are
rouge = Rouge()
rouge.get_scores(pred_ans, true_ans, avg=True)

{'rouge-1': {'f': 0.7677928999531958,
  'p': 0.7727694855716959,
  'r': 0.7747437797433163},
 'rouge-2': {'f': 0.5134928407486966,
  'p': 0.5179205629224809,
  'r': 0.5172526962878883},
 'rouge-l': {'f': 0.768314507202797,
  'p': 0.7731343759144244,
  'r': 0.7750138638485702}}

In [19]:
# getting the files where the lists will be saved 
with open("pred_ans_list", mode="wb") as f:
    pickle.dump(pred_ans, f)

with open("true_ans_list", mode="wb")as f:
    pickle.dump(true_ans, f)

Saving the questions for each of the answers

In [20]:
with open("questions_list", mode="wb")as f:
    pickle.dump(ans["questions"], f)

Trying to pull in from the pickled lists to make sure that everything is okay

In [21]:
# ch3cking to see what the true ans and the pred ans looks like
with open("pred_ans_list", mode="rb") as f:
    pred_ans = pickle.load(f)
with open("true_ans_list", mode="rb") as f:
    true_ans = pickle.load(f)

In [22]:
# comparing the true and the pred ans to each other
print(pred_ans[:5])
print()
print(true_ans[:5])

['late 1990s', 'singing and dancing', '2003', 'Houston  Texas', '1990s']

['in the late 1990s', 'singing and dancing', '2003', 'Houston  Texas', 'late 1990s']
