In [39]:
import json
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import nltk
import spacy
import string
import evaluate  # Bleu
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import numpy as np
import transformers
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast

import warnings
warnings.filterwarnings("ignore")

In [40]:

TOKENIZER = T5TokenizerFast.from_pretrained("t5-base")
MODEL = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)
OPTIMIZER = Adam(MODEL.parameters(), lr=0.00001)
Q_LEN = 256   # Question Length
T_LEN = 32    # Target Length
BATCH_SIZE = 4
DEVICE = "cuda:0"

In [26]:
from datasets import load_dataset
data=load_dataset('GEM/squad_v2')

In [7]:
data

DatasetDict({
    train: Dataset({
        features: ['gem_id', 'id', 'title', 'context', 'question', 'target', 'references', 'answers'],
        num_rows: 116397
    })
    validation: Dataset({
        features: ['gem_id', 'id', 'title', 'context', 'question', 'target', 'references', 'answers'],
        num_rows: 11873
    })
    test: Dataset({
        features: ['gem_id', 'id', 'title', 'context', 'question', 'target', 'references', 'answers'],
        num_rows: 13922
    })
})

In [5]:
# Extracting context, question, and answers from the dataset

def prepare_data(data):
    articles = []

    for article in data["data"]:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                question = qa["question"]

                if not qa["is_impossible"]:
                  answer = qa["answers"][0]["text"]

                inputs = {"context": paragraph["context"], "question": question, "answer": answer}


                articles.append(inputs)

    return articles

In [11]:
data['train']['answers'][0]

{'text': ['the Republic of Guinea-Bissau'], 'answer_start': [59]}

In [12]:
data['train']

Dataset({
    features: ['gem_id', 'id', 'title', 'context', 'question', 'target', 'references', 'answers'],
    num_rows: 116397
})

In [33]:
def prepare_data1(data):
    articles = []
    data=data[:20000]
    for x in range(len(data)):


      articles.append({
          'context':data['context'][x],
          'question':data['question'][x],
          'answer':data['answers'][x]['text'][0]
      })

    return articles

In [34]:
data1 = prepare_data1(data['train'])

# Create a Dataframe
data1 = pd.DataFrame(data1)

In [41]:
data1.head()

Unnamed: 0,context,question,answer
0,"Guinea-Bissau (i/ˈɡɪni bɪˈsaʊ/, GI-nee-bi-SOW)...",What is the official name for Guinea-Bissau?,the Republic of Guinea-Bissau
1,"Guinea-Bissau (i/ˈɡɪni bɪˈsaʊ/, GI-nee-bi-SOW)...",Where is Guinea-Bissau located?,West Africa
2,"Guinea-Bissau (i/ˈɡɪni bɪˈsaʊ/, GI-nee-bi-SOW)...",How many square miles is Guinea-Bissau?,"13,948 sq mi"
3,"Guinea-Bissau (i/ˈɡɪni bɪˈsaʊ/, GI-nee-bi-SOW)...",What is the estimated population of Guinea-Bis...,1704000
4,"Guinea-Bissau (i/ˈɡɪni bɪˈsaʊ/, GI-nee-bi-SOW)...",How many kilometers does Guinea-Bissau cover?,36125


In [42]:
data=data1

In [43]:
import torch
class QA_Dataset(Dataset):
    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer
        self.q_len = q_len
        self.t_len = t_len
        self.data = dataframe
        self.questions = self.data["question"]
        self.context = self.data["context"]
        self.answer = self.data['answer']

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        context = self.context[idx]
        answer = self.answer[idx]

        question_tokenized = self.tokenizer(question, context, max_length=self.q_len, padding="max_length",
                                                    truncation=True, pad_to_max_length=True, add_special_tokens=True)
        answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length",
                                          truncation=True, pad_to_max_length=True, add_special_tokens=True)

        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        labels[labels == 0] = -100

        return {
            "input_ids": torch.tensor(question_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
        }

In [44]:
# Dataloader

train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

train_sampler = RandomSampler(train_data.index)
val_sampler = RandomSampler(val_data.index)

qa_dataset = QA_Dataset(TOKENIZER, data, Q_LEN, T_LEN)

train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

In [50]:
DEVICE=0

In [52]:
train_loss = 0
val_loss = 0
train_batch_count = 0
val_batch_count = 0

for epoch in range(2):
    MODEL.train()
    for batch in tqdm(train_loader, desc="Training batches"):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        decoder_attention_mask = batch["decoder_attention_mask"]

        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        train_loss += outputs.loss.item()
        train_batch_count += 1




Training batches: 100%|██████████| 2/2 [00:38<00:00, 19.34s/it]
Training batches: 100%|██████████| 2/2 [00:21<00:00, 10.86s/it]


In [53]:
 #Evaluation
 for epoch in range(2):
    MODEL.eval()
    for batch in tqdm(val_loader, desc="Validation batches"):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        decoder_attention_mask = batch["decoder_attention_mask"]

        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        OPTIMIZER.zero_grad()
        outputs.loss.backward()
        OPTIMIZER.step()
        val_loss += outputs.loss.item()
        val_batch_count += 1

    print(f"{epoch+1}/{2} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss/val_batch_count}")

Validation batches: 100%|██████████| 1/1 [00:11<00:00, 11.40s/it]


1/2 -> Train loss: 5.61785352230072	Validation loss: 4.337640285491943


Validation batches: 100%|██████████| 1/1 [00:07<00:00,  7.18s/it]

2/2 -> Train loss: 5.61785352230072	Validation loss: 4.182759761810303





In [54]:
MODEL.save_pretrained("qa_model")
TOKENIZER.save_pretrained("qa_tokenizer")

# Saved files
"""('qa_tokenizer/tokenizer_config.json',
 'qa_tokenizer/special_tokens_map.json',
 'qa_tokenizer/spiece.model',
'qa_tokenizer/added_tokens.json',
'qa_tokenizer/tokenizer.json')"""

"('qa_tokenizer/tokenizer_config.json',\n 'qa_tokenizer/special_tokens_map.json',\n 'qa_tokenizer/spiece.model',\n'qa_tokenizer/added_tokens.json',\n'qa_tokenizer/tokenizer.json')"

In [59]:
def predict_answer(context, question, ref_answer=None):
    inputs = TOKENIZER(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)

    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).unsqueeze(0)

    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)

    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)

    if ref_answer:
        # Load the Bleu metric
        bleu = evaluate.load("google_bleu")
        score = bleu.compute(predictions=[predicted_answer],
                            references=[ref_answer])

        print("Context: \n", context)
        print("\n")
        print("Question: \n", question)
        return {
            "Reference Answer: ": ref_answer,
            "Predicted Answer: ": predicted_answer,
            "BLEU Score: ": score
        }
    else:
        return predicted_answer

In [60]:
idx=0
context=data.iloc[idx]['context']
question=data.iloc[idx]['question']
answer=data.iloc[idx]['answer']
predict_answer(context,question,answer)

Downloading builder script:   0%|          | 0.00/8.64k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Context: 
 Guinea-Bissau (i/ˈɡɪni bɪˈsaʊ/, GI-nee-bi-SOW), officially the Republic of Guinea-Bissau (Portuguese: República da Guiné-Bissau, pronounced: [ʁeˈpublikɐ dɐ ɡiˈnɛ biˈsaw]), is a country in West Africa. It covers 36,125 square kilometres (13,948 sq mi) with an estimated population of 1,704,000.


Question: 
 What is the official name for Guinea-Bissau?


{'Reference Answer: ': 'the Republic of Guinea-Bissau',
 'Predicted Answer: ': '(Portuguese: epublik d ',
 'BLEU Score: ': {'google_bleu': 0.0}}

In [62]:
idx=3
context=data.iloc[idx]['context']
question=data.iloc[idx]['question']
answer=data.iloc[idx]['answer']
predict_answer(context,question,answer)

Context: 
 Guinea-Bissau (i/ˈɡɪni bɪˈsaʊ/, GI-nee-bi-SOW), officially the Republic of Guinea-Bissau (Portuguese: República da Guiné-Bissau, pronounced: [ʁeˈpublikɐ dɐ ɡiˈnɛ biˈsaw]), is a country in West Africa. It covers 36,125 square kilometres (13,948 sq mi) with an estimated population of 1,704,000.


Question: 
 What is the estimated population of Guinea-Bissau?


{'Reference Answer: ': '1,704,000',
 'Predicted Answer: ': '(Portuguese: epublik d ',
 'BLEU Score: ': {'google_bleu': 0.0}}