# Extractive Question Answering on the SQuAD 2.0 dataset using the T5-base model

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_file_paths:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Check whether CUDA is accessible; if it is not, enable a GPU accelerator in the notebook settings.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Ask PyTorch whether a CUDA device can be used and report the result.
cuda_available = torch.cuda.is_available()

status_message = "CUDA is available." if cuda_available else "CUDA is not available."
print(status_message)

In [None]:
# Make CUDA the default device so that newly created tensors live on the GPU
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Only make CUDA the default device when a GPU is actually present. Forcing
# "cuda" unconditionally makes every later tensor creation fail on a CPU-only
# runtime, which contradicts the CPU fallback used when the model is loaded.
if torch.cuda.is_available():
    torch.set_default_device("cuda")

In [None]:
#Accessing the model from transformers library
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Checkpoint used throughout the notebook: t5-base from the transformers library.
model1_name = "t5-base"

# Run on the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the tokenizer and the seq2seq model, then place the model on `device`.
tokenizer1 = T5Tokenizer.from_pretrained(model1_name)
model1 = T5ForConditionalGeneration.from_pretrained(model1_name)
model1 = model1.to(device)

**Sample extractive question and answer**

In [None]:
# Defining the question and context
question = "why is sky blue,precisely"
context = "The sky appears blue due to the scattering of sunlight off the atmosphere. The Earth's atmosphere is composed of various gases and particles. When sunlight hits the atmosphere, shorter blue wavelengths are scattered in all directions by the gases and particles, making the sky appear blue to our eyes."

# Formatting the input according to T5's text-to-text format
input_text = f"question: {question} context: {context}"

# Tokenize the prompt and move the tensors to the device the model lives on.
# Doing this explicitly keeps the cell working regardless of the global
# default device (the model and its inputs must be on the same device).
inputs = tokenizer1(input_text, return_tensors="pt", truncation=True, padding=True).to(model1.device)

# Generate the answer; no gradients are needed for inference
with torch.no_grad():
    outputs = model1.generate(**inputs)

# Decode the generated token ids back into text
answer = tokenizer1.decode(outputs[0], skip_special_tokens=True)

print("Extractive QA Answer:", answer)

**Checking the input directories for the model and the dataset (SQuAD 2.0)**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json # to read json


from subprocess import check_output
# Run `ls` on the input directory and show its decoded output.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

**Functions to convert the SQuAD dataset from JSON to a DataFrame**

In [None]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD-format json file into a DataFrame with one row per question.

    input_file_path: path to the squad json file (read as UTF-8).
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']. The list is only read, never modified.
    verbose: 0 to suppress the progress messages, default is 1

    Returns a DataFrame with the question id as first column, followed by
    'question', 'context', the fields of the deepest level (NaN for questions
    that have no record there) and 'c_id', an integer code that is shared by
    all rows with the same context.

    NOTE(review): the answers are aligned to the questions by id, which
    presumably requires at most one answer per question; for files with
    several answers per question see squad_json_to_dataframe_dev - confirm.
    """
    if verbose:
        print("Reading the json file")
    # The context manager closes the handle deterministically, and the explicit
    # encoding avoids depending on the platform's default text encoding.
    with open(input_file_path, encoding="utf-8") as json_file:
        file = json.load(json_file)
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    js = pd.json_normalize(file, record_path)       # deepest level (answers)
    m = pd.json_normalize(file, record_path[:-1])   # one row per question
    r = pd.json_normalize(file, record_path[:-2])   # one row per paragraph

    #combining it into single dataframe
    # repeat every context once per question of its paragraph
    idx = np.repeat(r['context'].values, r.qas.str.len())
    # repeat every question id once per answer of that question
    ndx  = np.repeat(m['id'].values,m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    # outer-align questions and answers on the question id
    main = pd.concat([m[['id','question','context']].set_index('id'), js.set_index('q_idx')], axis=1, sort=False).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [None]:
def squad_json_to_dataframe_dev(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD-format json file into a DataFrame with one row per question,
    keeping the raw 'answers' list of every question in a single column.

    input_file_path: path to the squad json file (read as UTF-8).
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']. Only the question level
    (record_path[:-1]) and the paragraph level (record_path[:-2]) are parsed.
    verbose: 0 to suppress the progress messages, default is 1

    Returns a DataFrame with the columns 'id', 'question', 'context',
    'answers' and 'c_id', where 'c_id' is an integer code that is shared by
    all rows with the same context.
    """
    if verbose:
        print("Reading the json file")
    # The context manager closes the handle deterministically, and the explicit
    # encoding avoids depending on the platform's default text encoding.
    with open(input_file_path, encoding="utf-8") as json_file:
        file = json.load(json_file)
    if verbose:
        print("processing...")
    # parsing the question and paragraph levels in the json file
    m = pd.json_normalize(file, record_path[:-1])   # one row per question
    r = pd.json_normalize(file, record_path[:-2])   # one row per paragraph

    #combining it into single dataframe
    # repeat every context once per question of its paragraph
    idx = np.repeat(r['context'].values, r.qas.str.len())
    m['context'] = idx
    # explicit copy so that adding 'c_id' below never writes into a view of m
    main = m[['id','question','context','answers']].copy()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

**Creating the training dataset from SQuAD**

In [None]:
# Build the training DataFrame from the SQuAD 2.0 training json file
record_path = ['data', 'paragraphs', 'qas', 'answers']
input_file_path = '/kaggle/input/squad-20/train-v2.0.json'
train = squad_json_to_dataframe_train(input_file_path, record_path)

**Visualising the data and understanding the columns and rows**

In [None]:
# Show the context paragraph of the first training row
train.loc[0, 'context']

**Sample extraction and comparison with actual target response**

In [None]:
# Build the T5 prompt from the first training row
input_text = f"question: {train['question'][0]} context: {train['context'][0]}"

# Tokenize and move the tensors to the device the model lives on, so the cell
# does not depend on the global default device matching the model's device.
inputs = tokenizer1(input_text, return_tensors="pt", truncation=True, padding=True).to(model1.device)

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    outputs = model1.generate(**inputs)

answer = tokenizer1.decode(outputs[0], skip_special_tokens=True)

# Compare the generated answer with the reference answer of the same row
print("Extractive QA Answer:", answer)
print("Actual QA Answer:", train['text'][0])

**Extractive question answering for 50,000 instances of the SQuAD dataset, storing the responses in squad_with_t5_2.csv**

In [None]:
# Number of leading rows of `train` to run through the model (capped at the
# size of the frame so a smaller dataset does not raise a KeyError).
N_SAMPLES = 50000
# Progress is printed, and the CSV flushed to disk, once per this many rows.
PROGRESS_EVERY = 500

n_rows = min(N_SAMPLES, len(train))

# The output file is opened once, in write mode: re-running the cell starts
# from a clean file instead of appending duplicate rows, and the file is not
# reopened for every single row. newline='' lets the CSV writer control the
# line endings itself.
with open('squad_with_t5_2.csv', 'w', newline='', encoding='utf-8') as f:
    # Start at row 0 so that the first training example is part of the CSV too.
    for i in range(n_rows):
        input_text = f"question: {train['question'][i]} context: {train['context'][i]}"
        # Keep the inputs on the same device as the model.
        inputs = tokenizer1(input_text, return_tensors="pt", truncation=True, padding=True).to(model1.device)
        with torch.no_grad():
            outputs = model1.generate(**inputs)

        answer = tokenizer1.decode(outputs[0], skip_special_tokens=True)

        new_entry = {
            'Context': train['context'][i],
            'Question': train['question'][i],
            'Response': train['text'][i],
            'Response by T5':answer
        }

        # Rows are written one by one so finished work is kept if the run is
        # interrupted; only the first row carries the header line.
        pd.DataFrame([new_entry]).to_csv(f, header=(i == 0), index=False)

        if (i + 1) % PROGRESS_EVERY == 0 or i + 1 == n_rows:
            f.flush()
            print(f"{i + 1}/{n_rows}", end="-")