In [1]:
import torch
torch.cuda.is_available()

True

In [13]:
import warnings
from transformers import logging

warnings.filterwarnings("ignore")
logging.set_verbosity_error()

In [14]:
import csv
import json
import pandas as pd
import numpy as np
import torch

In [15]:
from torch.utils.data import DataLoader, Dataset
from transformers import BertForQuestionAnswering, BertTokenizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.cuda.amp import autocast, GradScaler
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer

In [16]:
bs = 8
epochs = 3
lr = 2e-5

In [17]:
class SQuAD_Data(Dataset):
    def __init__(self, file_path):
        self.context_list = []
        self.question_list = []
        self.answer_list = []
        self.start_pos_list = []
        self.end_pos_list = []
        with open(file_path, 'r', encoding='cp1252') as f:
            reader = csv.reader(f)
            next(reader)
            for row in reader:
                self.context_list.append(row[0])
                self.question_list.append(row[1])
                self.answer_list.append(row[2])
                self.start_pos_list.append(int(row[3]))
                self.end_pos_list.append(int(row[4]))

    def __len__(self):
        return len(self.context_list)

    def __getitem__(self, idx):
        return {
            'context': self.context_list[idx],
            'question': self.question_list[idx],
            'answer': self.answer_list[idx],
            'start_pos': self.start_pos_list[idx],
            'end_pos': self.end_pos_list[idx]
        }


In [38]:
train_ds = SQuAD_Data('squad_train_data.csv')
train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True)
ds_size = len(train_ds)
dl_size = len(train_dl)
print(f'Training dataset size: {ds_size}')
print(f'Training loader size: {dl_size}')

test_ds = SQuAD_Data('squad_test_data.csv')
test_dl = DataLoader(test_ds, batch_size=bs, shuffle=True)
ds_test_size = len(test_ds)
dl_test_size = len(test_dl)
print(f'Testing dataset size: {ds_test_size}')
print(f'Testing loader size: {dl_test_size}')

Training dataset size: 37079
Training loader size: 4635
Testing dataset size: 5351
Testing loader size: 669


In [25]:
def train_fn(train_model, data_ld, optim, device, acc_steps):
    train_model.train()
    train_model.to(device)
    t_loss = 0
    scaler_object = GradScaler() 
    batch_count = 0
    for data in data_ld:
        inputs = tokenizer(
            data['context'],
            data['question'],
            return_tensors='pt',
            padding=True,
            truncation=True,
            stride=128,
            max_length=512
        )
        inputs = {key: val.to(device) for key, val in inputs.items()}
        start_pos = data['start_pos'].to(device)
        end_pos = data['end_pos'].to(device)
        optim.zero_grad()
        with autocast(): 
            outputs = train_model(**inputs, start_positions=start_pos, end_positions=end_pos)
            loss = outputs.loss
        scaler_object.scale(loss).backward()
        batch_count += 1
        if batch_count % acc_steps == 0:
            scaler_object.step(optim)  
            scaler_object.update()  
            optim.zero_grad()   
        loss_val = loss.item()
        if str(loss_val) == 'nan':
            loss_val = 0
        t_loss += loss_val
    if batch_count % acc_steps != 0:
        scaler_object.step(optim)  
        scaler_object.update() 
        optim.zero_grad() 
    return t_loss / len(data_ld)


def test_fn(test_model, data_ld, optim, device):    
    t_loss = 0.0
    test_model.eval()
    test_model.to(device)
    scaler_object = GradScaler() 
    for data in data_ld:
        inputs = tokenizer(
            data['context'],
            data['question'],
            return_tensors='pt',
            padding=True,
            truncation=True,
            stride=128,
            max_length=512
        )
        inputs = {key: val.to(device) for key, val in inputs.items()}
        start_pos = data['start_pos'].to(device)
        end_pos = data['end_pos'].to(device)
        optim.zero_grad()
        with autocast():  
            outputs = test_model(**inputs, start_positions=start_pos, end_positions=end_pos)
            loss = outputs.loss
        loss_val = loss.item()
        if str(loss_val) == 'nan':
            loss_val = 0
        t_loss += loss_val
    return t_loss / len(data_ld)


In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', max_length=512)
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
model = model.to(device)
for epoch in range(epochs):
    training_loss = train_fn(model, train_dl, optimizer, device, acc_steps=2)
    testing_loss = test_fn(model, test_dl, optimizer, device)
    print(f'Epoch {epoch+1} , train loss {training_loss}, test loss {testing_loss}')
    scheduler.step(testing_loss)

cuda
Epoch 1 , train loss 5.6710381880402565, test loss 5.692981338500976
Epoch 2 , train loss 5.431648567318916, test loss 5.71663028717041
Epoch 3 , train loss 5.215241692960262, test loss 5.727792015075684


In [35]:
def predicting_answer(model, tokenizer, ctx, ques):
    inputs = tokenizer(ques, ctx, return_tensors='pt', padding=True, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    input_ids = inputs['input_ids'].squeeze()
    output = model(**inputs)
    start_index = torch.argmax(output.start_logits)
    end_index = torch.argmax(output.end_logits) + 1
    if end_index < start_index:
        start_index, end_index = end_index, start_index
    predicted_answer = tokenizer.decode(input_ids[start_index:end_index])
    return predicted_answer


In [37]:
para = "super bowl fifty was an american football game to determine the champion of the national football league nfl for the twenty fifteen season. the american football conference a f c c champion denver broncos defeated the national football conference n f c c champion carolina panthers twenty four to ten to earn their third super bowl title. the game was played on february seventh twenty sixteen and levis stadium in the san francisco bay area santa clara california. as this was the fiftieth super bowl the league emphasized the golden anniversary with various goldsteins initiatives as well as temporarily suspending the tradition of naming each super bowl game with roman numerals under which they gain would have been known as super bowl l sell that the logo could prominently featured the arabic numerals fifty."
ques = "What does AFC stand for?"
predicted_answer = predicting_answer(model, tokenizer, para, ques)
print(predicted_answer)

american football conference


In [28]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased', max_length=512)
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
model = model.to(device)
for epoch in range(epochs):
    training_loss = train_fn(model, train_dl, optimizer, device, acc_steps=2)
    testing_loss = test_fn(model, test_dl, optimizer, device)
    print(f'Epoch {epoch+1} , train loss {training_loss}, test loss {testing_loss}')
    scheduler.step(testing_loss)

Epoch 1 , train loss 5.75785543769598, test loss 5.828500843048095
Epoch 2 , train loss 5.645928509533405, test loss 5.828914890289306
Epoch 3 , train loss 5.3587766364216805, test loss 5.79552303314209


In [29]:
def predicting_answer(model, tokenizer, ctx, ques):
    inputs = tokenizer(ques, ctx, return_tensors='pt', padding=True, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    input_ids = inputs['input_ids'].squeeze()
    output = model(**inputs)
    start_index = torch.argmax(output.start_logits)
    end_index = torch.argmax(output.end_logits) + 1
    if end_index < start_index:
        start_index, end_index = end_index, start_index
    predicted_answer = tokenizer.decode(input_ids[start_index:end_index])
    return predicted_answer


In [33]:
para = "super bowl fifty was an american football game to determine the champion of the national football league nfl for the twenty fifteen season. the american football conference a f c c champion denver broncos defeated the national football conference n f c c champion carolina panthers twenty four to ten to earn their third super bowl title. the game was played on february seventh twenty sixteen and levis stadium in the san francisco bay area santa clara california. as this was the fiftieth super bowl the league emphasized the golden anniversary with various goldsteins initiatives as well as temporarily suspending the tradition of naming each super bowl game with roman numerals under which they gain would have been known as super bowl l sell that the logo could prominently featured the arabic numerals fifty."
ques = "What does AFC stand for?"
predicted_answer = predicting_answer(model, tokenizer, para, ques)
print(predicted_answer)

american football conference
