In [1]:
pip install transformers



In [2]:
import pandas as pd
import numpy as np

In [3]:
def json_to_df(json_file):
  """Flatten a nested question-answering structure into one row per answer.

  Walks ``json_file['data']`` -> ``['paragraphs']`` -> ``['qas']`` ->
  ``['answers']`` and emits a record for every answer found.

  Args:
    json_file: Mapping-like object (anything indexable with ``['data']``)
      whose ``'data'`` entries each carry a ``'title'`` and a list of
      ``'paragraphs'``; each paragraph carries a ``'context'`` and a list of
      ``'qas'``; each qa carries a ``'question'`` and a list of ``'answers'``
      with ``'text'`` and ``'answer_start'``.

  Returns:
    pandas.DataFrame with columns ``answer_text``, ``answer_start``,
    ``question``, ``context`` and ``subject``, one row per answer. Questions
    without answers contribute no rows.
  """
  def _answer_rows():
    # Lookups are done at the same nesting level as they are needed so a
    # missing key surfaces at the entry that lacks it.
    for article in json_file['data']:
      title = article['title']
      for paragraph in article['paragraphs']:
        passage = paragraph['context']
        for qa in paragraph['qas']:
          asked = qa['question']
          for ans in qa["answers"]:
            yield {
                "answer_text": ans['text'],
                'answer_start': ans['answer_start'],
                "question": asked,
                "context": passage,
                'subject': title,
            }

  return pd.DataFrame(list(_answer_rows()))

In [4]:
# Location of the training JSON (presumably SQuAD v1.1, going by the file name).
SQUAD_TRAIN_PATH = '/content/train-v1.1.json'
# Positional row range [start, stop) of the flattened frame used as the
# evaluation sample: 200 rows.
EVAL_START, EVAL_STOP = 801, 1001

# json_to_df only needs something indexable with ['data'], so the frame
# returned by read_json is passed straight through.
df2=pd.read_json(SQUAD_TRAIN_PATH)
# Keep the full flattened frame under its own name so the evaluation slice
# can be changed and re-taken without re-reading the file.
squad_df=json_to_df(df2)
test_df=squad_df.iloc[EVAL_START:EVAL_STOP]

In [5]:
# Peek at the first five rows of the evaluation sample (head() defaults to 5).
test_df.head()

Unnamed: 0,answer_text,answer_start,question,context,subject
801,Tom Ford's Spring/Summer 2011 fashion show,62,Beyonce's first modelling event was at where?,"In September 2010, Beyoncé made her runway mod...",Beyoncé
802,People,154,"""World's Most Beautiful woman"" was declared to...","In September 2010, Beyoncé made her runway mod...",Beyoncé
803,January 2013,228,Which month and year did GQ feature Beyonce on...,"In September 2010, Beyoncé made her runway mod...",Beyoncé
804,VH1,339,What TV network listed Beyonce as number 1 on ...,"In September 2010, Beyoncé made her runway mod...",Beyoncé
805,People,154,Who called Beyonce the World's most Beautiful ...,"In September 2010, Beyoncé made her runway mod...",Beyoncé


In [6]:
from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
import tensorflow as tf

# One checkpoint id shared by tokenizer and model so the two cannot drift apart.
distilbert_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = DistilBertTokenizer.from_pretrained(distilbert_checkpoint)
model = TFDistilBertForQuestionAnswering.from_pretrained(distilbert_checkpoint)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.

All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.


In [7]:
# Extractive QA with the DistilBERT model: for every (question, context) pair
# take the arg-max of the start and end logits and decode the tokens between
# them. Results are collected in row order in `predicted_answer`.
predicted_answer=[]
for question, text in zip(test_df['question'], test_df['context']):
  # Truncate only the context (second segment) so an over-long pair is cut to
  # the tokenizer's maximum length instead of being fed through whole.
  # NOTE: a truncated context may no longer contain the answer span.
  inputs = tokenizer(question, text, truncation="only_second", return_tensors="tf")
  outputs = model(**inputs)

  # Batch size is 1, so [0] selects the single example's arg-max position.
  answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
  answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])

  # The end index is inclusive, hence the + 1 in the slice.
  predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
  predicted_answer.append(tokenizer.decode(predict_answer_tokens))

In [8]:
# Strict exact-match scoring: a prediction counts as correct only when it is
# string-equal to the reference answer of the same row (no normalisation).
matches = [
    predicted_answer[i] == test_df['answer_text'].iloc[i]
    for i in range(len(test_df))
]
n_correct = sum(1 for hit in matches if hit)
scores = {'correct': n_correct, 'incorrect': len(matches) - n_correct}
print(scores)
# Fraction of rows answered exactly right.
print(scores['correct']/test_df.shape[0])

{'correct': 147, 'incorrect': 53}
0.735


In [9]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
# Second model under comparison; the same id is used for both weights and tokenizer.
model_name = "deepset/roberta-base-squad2"
nlp = pipeline(
    task='question-answering',
    model=model_name,
    tokenizer=model_name,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


In [10]:
# Run the question-answering pipeline once per row and keep only the answer
# text, in row order. This rebinds `predicted_answer` to a fresh list.
predicted_answer=[]
for question, text in zip(test_df['question'], test_df['context']):
  result = nlp({'question':question,'context':text})
  predicted_answer.append(result['answer'])

In [11]:
# Strict exact-match tally of the current `predicted_answer` list against the
# reference answers, row by row (plain string equality, no normalisation).
scores={'correct':0, 'incorrect':0}
for i, expected in enumerate(test_df['answer_text']):
  outcome = 'correct' if predicted_answer[i] == expected else 'incorrect'
  scores[outcome] += 1
print(scores)
# Fraction of rows answered exactly right.
print(scores['correct']/test_df.shape[0])

{'correct': 150, 'incorrect': 50}
0.75


In [16]:
# Show the full context passage of the third row (position 2) of the sample.
test_df['context'].iat[2]

'In September 2010, Beyoncé made her runway modelling debut at Tom Ford\'s Spring/Summer 2011 fashion show. She was named "World\'s Most Beautiful Woman" by People and the "Hottest Female Singer of All Time" by Complex in 2012. In January 2013, GQ placed her on its cover, featuring her atop its "100 Sexiest Women of the 21st Century" list. VH1 listed her at number 1 on its 100 Sexiest Artists list. Several wax figures of Beyoncé are found at Madame Tussauds Wax Museums in major cities around the world, including New York, Washington, D.C., Amsterdam, Bangkok, Hollywood and Sydney.'