<a href="https://colab.research.google.com/github/richardOlson/nlp__tranformers/blob/main/squad_data_set.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
# looking at the squad dataset
# getting some of the imports
import requests
import json
import pandas as pd

import torch
from typing import Optional, Union



In [2]:
files = ['train-v2.0.json', 'dev-v2.0.json']

In [3]:
url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"

In [4]:
# now getting the files 
for file in files:
  res = requests.get(url + file)
  if res.status_code == 200:
    
    with open(file, mode="wb") as f:
     # writing as a chunks 
     for chunk in res.iter_content(chunk_size=50):
       f.write(chunk)



In [5]:
# reading in one of the files to be used with json
with open("dev-v2.0.json", mode="rb")as f:
  dev = json.load(f)

with open("train-v2.0.json", mode="rb")as f:
  train = json.load(f)

In [6]:
train.keys()

dict_keys(['version', 'data'])

In [7]:
train["data"][0]["title"]

'Beyoncé'

In [8]:
squad_list = []

Getting the data into a format that can be used

In [9]:
# now getting the info out of the train
for paragraph_dict in train["data"]:
  for context_with_qa_dict in paragraph_dict["paragraphs"]:
    context = context_with_qa_dict["context"]
    # doing the looping through the question and the answers dictionaries
    for qa_pair_dict in context_with_qa_dict["qas"]:
      if "answers" in qa_pair_dict and len(qa_pair_dict["answers"]) > 0:
          answer = qa_pair_dict['answers'][0]['text']
      elif "plausible_answers" in qa_pair_dict and len(qa_pair_dict["plausible_answers"]) > 0:
        answer = qa_pair_dict["plausible_answers"][0]["text"]
      else:
        answer = None
      # now making the dictionary that will be added to the list 
      squad_list.append({"context": context, "question": qa_pair_dict["question"], "answer": answer})
        
      


In [10]:
len(squad_list)

130319

In [None]:
# creating the dataframe
df = pd.DataFrame(squad_list)

In [None]:
print(df.shape)
df.head()

(130319, 3)


Unnamed: 0,context,question,answer
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s


In [None]:
df.loc[0, "context"]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [11]:
# now will save the data into a json type object
with open("train.json", mode="w", )as f:
  json.dump(squad_list, f)

In [12]:
dev_list = []

Making the data that for the dev into the format same as the 
train data.

In [13]:
# doing the dev to make it so that it is in the same format as the 
# train.
for paragraph_dict in dev["data"]:
  for each_context_dict in paragraph_dict["paragraphs"]:
    context = each_context_dict["context"]
    for qa_pair_dict in each_context_dict["qas"]:
      if "answers" in qa_pair_dict and len(qa_pair_dict["answers"]) > 0:
        # doing the looping through the answers
        answer_list = qa_pair_dict["answers"]
      elif "plausible_answers" in qa_pair_dict and len(qa_pair_dict["plausible_answers"]) > 0:
        answer_list = qa_pair_dict["plausible_answers"]
      else:
        answer_list = []
      # now doing the making of the list  
      answer = [item["text"] for item in answer_list]
      # removing any duplicates
      answer = list(set(answer))
      
      # adding to the dev list
      dev_list.append({"context": context, "question": qa_pair_dict["question"], "answer": answer})


In [14]:
len(dev_list)

11873

In [15]:
# building a dataframe of the dev_list
dev_dataframe = pd.DataFrame(dev_list)
dev_dataframe.head()

Unnamed: 0,context,question,answer
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,[France]
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,"[in the 10th and 11th centuries, 10th and 11th..."
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"[Denmark, Iceland and Norway]"
3,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,[Rollo]
4,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,"[10th century, 10th, the first half of the 10t..."


In [16]:
# making a json file of the new formatted dev
with open("dev.json", mode="w")as f:
  json.dump(dev_list, f)

In [17]:
# now making the first model 
! pip install transformers -q
import transformers
from transformers import BertForQuestionAnswering, BertTokenizer
from transformers import pipeline

[K     |████████████████████████████████| 2.6 MB 7.7 MB/s 
[K     |████████████████████████████████| 895 kB 55.2 MB/s 
[K     |████████████████████████████████| 636 kB 69.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 51.2 MB/s 
[?25h

In [18]:
# making the model
model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")
tokenizer = BertTokenizer.from_pretrained("deepset/bert-base-cased-squad2")

Downloading:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/152 [00:00<?, ?B/s]

In [None]:
# going to do it two ways -- one with using the pipelines and the other without the pipeline in 
# transformers

model.

{'input_ids': tensor([[7, 6, 0, 0, 1],
         [1, 2, 3, 0, 0],
         [0, 0, 0, 4, 5]])}

In [19]:
# the span of the questions that we will send into the tokenizer
my_span = squad_list[:5]
my_span

[{'answer': 'in the late 1990s',
  'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
  'question': 'When did Beyonce start becoming popular?'},
 {'answer': 'singing and dancing',
  'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress

In [43]:
# putting the 
questions = [item["question"] for item in my_span]
contexts = [item["context"] for item in my_span]
answers = [item["answer"] for item in my_span]

In [None]:
type(contexts[0])
ques

str

In [21]:
# making the function that will make the "token_type_ids"
def get_token_type_ids(input_id:list):
  t_index = input_id.index(tokenizer.sep_token_id)
  # everything from 0 upto and including the sep token are in the first segment
  front = [0] * t_index
  back = [1] * (len(input_id) - len(front))
  c = front + back
  return c

In [22]:
questions

['When did Beyonce start becoming popular?',
 'What areas did Beyonce compete in when she was growing up?',
 "When did Beyonce leave Destiny's Child and become a solo singer?",
 'In what city and state did Beyonce  grow up? ',
 'In which decade did Beyonce become famous?']

In [24]:
# doing some encoding of the text
input_ids = tokenizer.encode(questions[0], contexts[0])
t = len(input_ids)
print(f"The type of the input_ids is {type(input_ids)}")
print(f"The length of the input_ids is: {t}")


The type of the input_ids is <class 'list'>
The length of the input_ids is: 173


In [None]:
tokenizer.cls_token_id

101

In [None]:
contexts[0]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [53]:
# making a function that will be doing the encoding and print out the 
# question and the answers


def answer_question(question:[list, str], context:[list, str], 
                    show_real_answer: Optional[Union[list, str]]=None, no_print=False, return_as_dict=False):
  # single context is a flag to wheather their is just on context for all 
  # the questions
  """
  question:  These are the questions passed in can be a string or a list of strings.

  context:  The context:  can be a list of strings or just one string to be used repeatedly
  for each of the questions.

  show_real_answer:  if not None then will be a list of the real answers or just a single answer
  that will be printed out with the predicted answer.

  no_print:   if set to True then the answer and the question and the real_answer are 
  returned in a dictionary if return_as_dict is set to true or else they are 
  returned as part of a generator.

  """
  single_context = True
  imput_ids = None
  return_dict = None

  if return_as_dict:
    return_dict = {"pred_ans":[],
                   "questions":[]}

  if isinstance(context, list):
    single_context = False

  if not isinstance(question, list):
    question = [question]

  if show_real_answer:
    return_dict["true_ans"] = []

    if isinstance(show_real_answer, str):
      show_real_answer = [show_real_answer]
  
  
  # running the for loop
  for i, q in enumerate(question):
    if single_context:
      input_ids = tokenizer.encode(q,context)
    else:
      input_ids = tokenizer.encode(q, context[i])
    
    # now getting the list of "token_type_ids"
    token_type_ids = get_token_type_ids(input_ids)
    # running them in the model
    output = model(torch.tensor([input_ids]), 
                   token_type_ids=torch.tensor([token_type_ids]), return_dict=True)
    # getting the span
    start = output.start_logits
    end = output.end_logits
    # creating the span
    answer_start = torch.argmax(start)
    answer_end = torch.argmax(end) + 1 # the + 1 is added to make when doing a 
                                      # a span getting the right length

    # getting the answer for the question
    ans = tokenizer.decode(input_ids[answer_start: answer_end])
    if not no_print:
      # printing out the question and the answer
      print(f"question:  {q}")
      print(f"answer:  {ans}")
      if show_real_answer:
        print(f"The real answer is:  {show_real_answer[i]}")
      print()
    elif return_as_dict:
      # adding to the dictionary
      qa["pred_ans"].append(ans)
      qa["questions"].append(q)
      if show_real_answer != None:
        qa["true_ans"].append(show_real_answer[i])
    else:
      if show_real_answer != None:
        yield 

  

In [41]:
# using the function above
answer_question(question= questions[0], context=contexts[0])


When did Beyonce start becoming popular?
late 1990s


In [48]:
# running through the first five in the squad
answer_question(question=questions, context=contexts, show_real_answer=answers)

question:  When did Beyonce start becoming popular?
answer:  late 1990s
The real answer is:  in the late 1990s

question:  What areas did Beyonce compete in when she was growing up?
answer:  singing and dancing
The real answer is:  singing and dancing

question:  When did Beyonce leave Destiny's Child and become a solo singer?
answer:  2003
The real answer is:  2003

question:  In what city and state did Beyonce  grow up? 
answer:  Houston, Texas
The real answer is:  Houston, Texas

question:  In which decade did Beyonce become famous?
answer:  1990s
The real answer is:  late 1990s



In [28]:
# now using the pipeline to do the same thing
myPipe = pipeline(task="question-answering", model=model, tokenizer=tokenizer)


In [29]:
# finding out the answer with the pipeline
myPipe(context=contexts[0], question=questions[0])

{'answer': 'late 1990s', 'end': 286, 'score': 0.562135636806488, 'start': 276}

In [50]:
# using the pipe for the first five of the squad
# dataset
for i in range(5):
  print(questions[i])
  print(myPipe({"question":questions[i], "context":contexts[i]}))
  print(f"The real answer is:  {answers[i]}")
  print()

When did Beyonce start becoming popular?
{'score': 0.562135636806488, 'start': 276, 'end': 286, 'answer': 'late 1990s'}
The real answer is:  in the late 1990s

What areas did Beyonce compete in when she was growing up?
{'score': 0.9938411116600037, 'start': 207, 'end': 226, 'answer': 'singing and dancing'}
The real answer is:  singing and dancing

When did Beyonce leave Destiny's Child and become a solo singer?
{'score': 0.996565580368042, 'start': 525, 'end': 532, 'answer': '(2003),'}
The real answer is:  2003

In what city and state did Beyonce  grow up? 
{'score': 0.8477826118469238, 'start': 166, 'end': 181, 'answer': 'Houston, Texas,'}
The real answer is:  Houston, Texas

In which decade did Beyonce become famous?
{'score': 0.677906334400177, 'start': 281, 'end': 286, 'answer': '1990s'}
The real answer is:  late 1990s



In [52]:
# will make another set of the questions and answers and context for about 100 
# in the squad list and then will run them through the answer function and will
# total up the amount of exact matches
qa = {
    "questions":[],
    "answers": [], 
    "contexts": []
}
for item in squad_list[:100]:
  qa["questions"].append(item["question"])
  qa["answers"].append(item["answer"])
  qa["contexts"].append(item["context"])

print(len(qa["contexts"]))

100
