In [None]:
import json
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from transformers import BertModel, BertTokenizerFast, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ExponentialLR
from tqdm import tqdm 
from evaluate import load




In [None]:
def process_train_test_data(file_path):
    """Flatten a SQuAD-style JSON file into parallel lists.

    The file is expected to contain ``data -> paragraphs -> qas -> answers``.
    One (context, question, answer) triple is emitted per *answer*, so a
    question with several answers repeats its context and question text,
    and a question without answers contributes nothing to the lists.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        tuple: ``(contexts, questions, question_count, answers)`` where
        ``contexts`` and ``questions`` are lower-cased strings, ``answers``
        holds the raw answer objects from the file, and ``question_count``
        counts every question seen (with or without answers).
    """
    with open(file_path, 'rb') as handle:
        dataset = json.load(handle)

    contexts, questions, answers = [], [], []
    question_count = 0

    for article in dataset['data']:
        for paragraph in article['paragraphs']:
            passage = paragraph['context']
            for qa in paragraph['qas']:
                question_text = qa['question']
                question_count += 1
                qa_answers = qa['answers']
                # Repeat the passage/question once per answer so that all
                # three lists stay index-aligned.
                contexts.extend(passage.lower() for _ in qa_answers)
                questions.extend(question_text.lower() for _ in qa_answers)
                answers.extend(qa_answers)

    return contexts, questions, question_count, answers

In [None]:
def process_answer_train(idx):
    """Locate the answer of training example ``idx`` inside its token ids.

    The answer text is tokenized on its own and its ids are searched for as
    a contiguous run inside ``train_encoding['input_ids'][idx]``. Every
    answer token has to match; the first occurrence wins.

    Reads the module-level names ``tokenizer``, ``train_answers``,
    ``train_encoding`` and ``MAX_LENGTH``.

    Args:
        idx: Index of the example in ``train_answers`` / ``train_encoding``.

    Returns:
        tuple: ``(start, end)`` such that ``input_ids[start:end]`` equals the
        answer tokens (``end`` is exclusive). ``(0, 0)`` when the answer is
        empty or does not occur, e.g. because it was truncated away.
    """
    answer_encoding = tokenizer(train_answers[idx]['text'], max_length=MAX_LENGTH, truncation=True, padding=True)
    # The first and last ids are the markers the tokenizer wraps around the
    # text (presumably [CLS]/[SEP]); only the ids in between are the answer.
    answer_ids = list(answer_encoding['input_ids'])[1:-1]
    input_ids = list(train_encoding['input_ids'][idx])

    span = len(answer_ids)
    if span == 0:
        return (0, 0)

    # Offset 0 is skipped because it holds the leading special token; the
    # upper bound lets the answer end on the very last position.
    for offset in range(1, len(input_ids) - span + 1):
        if input_ids[offset:offset + span] == answer_ids:
            return (offset, offset + span)
    return (0, 0)

In [None]:
def process_answer_test(idx):
    """Locate the answer of test example ``idx`` inside its token ids.

    The answer text is tokenized on its own and its ids are searched for as
    a contiguous run inside ``test_encoding['input_ids'][idx]``. Every
    answer token has to match; the first occurrence wins.

    Reads the module-level names ``tokenizer``, ``test_answers``,
    ``test_encoding`` and ``MAX_LENGTH``.

    Args:
        idx: Index of the example in ``test_answers`` / ``test_encoding``.

    Returns:
        tuple: ``(start, end)`` such that ``input_ids[start:end]`` equals the
        answer tokens (``end`` is exclusive). ``(0, 0)`` when the answer is
        empty or does not occur, e.g. because it was truncated away.
    """
    answer_encoding = tokenizer(test_answers[idx]['text'], max_length=MAX_LENGTH, truncation=True, padding=True)
    # The first and last ids are the markers the tokenizer wraps around the
    # text (presumably [CLS]/[SEP]); only the ids in between are the answer.
    answer_ids = list(answer_encoding['input_ids'])[1:-1]
    input_ids = list(test_encoding['input_ids'][idx])

    span = len(answer_ids)
    if span == 0:
        return (0, 0)

    # Offset 0 is skipped because it holds the leading special token; the
    # upper bound lets the answer end on the very last position.
    for offset in range(1, len(input_ids) - span + 1):
        if input_ids[offset:offset + span] == answer_ids:
            return (offset, offset + span)
    return (0, 0)

In [None]:
def process_start_end_train(train_encoding):
    """Compute answer spans for every training example.

    Calls ``process_answer_train`` once per row of
    ``train_encoding['input_ids']``, in order.

    Args:
        train_encoding: Mapping whose ``'input_ids'`` entry has one row per
            example; only its length is used here.

    Returns:
        tuple: ``(start_pos, end_pos, ctr)`` - the per-example start indices,
        the per-example end indices, and the number of examples whose start
        index is 0.
    """
    example_count = len(train_encoding['input_ids'])
    spans = [process_answer_train(example) for example in range(example_count)]

    start_pos = [first for first, _ in spans]
    end_pos = [last for _, last in spans]
    ctr = sum(1 for first in start_pos if first == 0)
    return start_pos, end_pos, ctr


def process_start_end_test(test_encoding):
    """Compute answer spans for every test example.

    Calls ``process_answer_test`` once per row of
    ``test_encoding['input_ids']``, in order.

    Args:
        test_encoding: Mapping whose ``'input_ids'`` entry has one row per
            example; only its length is used here.

    Returns:
        tuple: ``(start_pos, end_pos, ctr)`` - the per-example start indices,
        the per-example end indices, and the number of examples whose start
        index is 0.
    """
    example_count = len(test_encoding['input_ids'])
    spans = [process_answer_test(example) for example in range(example_count)]

    start_pos = [first for first, _ in spans]
    end_pos = [last for _, last in spans]
    ctr = sum(1 for first in start_pos if first == 0)
    return start_pos, end_pos, ctr

In [None]:
class Input(Dataset):
    """Dataset view over a tokenizer encoding that also carries span labels.

    ``encodings`` must be indexable by the five keys listed in ``_FIELDS``;
    each entry is indexed by example position.
    """

    # Keys copied into every item, in the order they appear in the result.
    _FIELDS = (
        'input_ids',
        'token_type_ids',
        'attention_mask',
        'start_positions',
        'end_positions',
    )

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the length of ``input_ids``."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, index):
        """Return example ``index`` as a dict of freshly built tensors."""
        return {
            field: torch.tensor(self.encodings[field][index])
            for field in self._FIELDS
        }

In [None]:
class BERT_Model(nn.Module):
    """BERT encoder with a small MLP head that scores answer start/end tokens.

    Args:
        bert: Optional encoder module. When omitted, the module-level
            ``model_bert`` is used, so ``BERT_Model()`` behaves as before.
            The encoder is called with ``output_hidden_states=True`` and its
            output is indexed with ``[2]`` to obtain the per-layer hidden
            states.
    """

    def __init__(self, bert=None):
        super(BERT_Model, self).__init__()
        self.bert = model_bert if bert is None else bert
        # Size the head from the encoder's config when it exposes one;
        # 768 is the previously hard-coded width and remains the fallback.
        hidden_size = getattr(getattr(self.bert, 'config', None), 'hidden_size', 768)
        self.drop_out = nn.Dropout(0.1)
        self.l1 = nn.Linear(hidden_size * 2, hidden_size * 2)
        self.l2 = nn.Linear(hidden_size * 2, 2)
        # The layers stay registered both as attributes and inside the
        # Sequential, so existing attribute access keeps working.
        self.linear_relu_stack = nn.Sequential(
            self.drop_out,
            self.l1,
            nn.LeakyReLU(),
            self.l2
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return ``(start_logits, end_logits)``, one score per input token."""
        model_output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True)
        hidden_states = model_output[2]
        # Concatenate the last and the third-from-last hidden states along
        # the feature axis. Both logits are computed from this shared
        # representation; neither layer is dedicated to start or end.
        out = torch.cat((hidden_states[-1], hidden_states[-3]), dim=-1)
        logits = self.linear_relu_stack(out)

        # The head emits two values per token: column 0 -> start, 1 -> end.
        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [None]:
def loss_function(start1, end1, start_pos, end_pos):
    """Average of the cross-entropy losses for the start and end predictions.

    Args:
        start1: Start logits, one row of scores per example.
        end1: End logits, one row of scores per example.
        start_pos: Target start index per example.
        end_pos: Target end index per example.

    Returns:
        Scalar tensor: mean of the two (batch-averaged) cross-entropy losses.
    """
    cross_entropy = nn.functional.cross_entropy
    return (cross_entropy(start1, start_pos) + cross_entropy(end1, end_pos)) / 2

In [None]:
def NLLLoss_function(start1, end1, start_pos, end_pos, gamma):
    """Focal-style loss averaged over the start and end predictions.

    For each set of logits the natural-log probabilities are scaled by
    ``(1 - p) ** gamma`` before the negative log-likelihood is taken, so
    targets that are already predicted confidently contribute less. With
    ``gamma == 0`` the weighting factor is 1 everywhere.

    Args:
        start1: Start logits; softmax is taken over dim 1.
        end1: End logits; softmax is taken over dim 1.
        start_pos: Target start index per example.
        end_pos: Target end index per example.
        gamma: Exponent applied to ``1 - p``.

    Returns:
        Scalar tensor: mean of the start and end losses.
    """
    def _weighted_nll(logits, targets):
        # NLLLoss expects log-probabilities; they are down-weighted by the
        # inverse probability raised to gamma before being passed in.
        probabilities = torch.softmax(logits, dim=1)
        log_probabilities = torch.log_softmax(logits, dim=1)
        modulated = torch.pow(1 - probabilities, gamma) * log_probabilities
        return nn.functional.nll_loss(modulated, targets)

    start_term = _weighted_nll(start1, start_pos)
    end_term = _weighted_nll(end1, end_pos)
    return (start_term + end_term) / 2

In [None]:
def training(model, loader, epoch, optimizer=None):
    """Train ``model`` for one pass over ``loader``.

    Previously a fresh optimizer and ``ExponentialLR`` scheduler were built
    on every call and the scheduler was stepped once just before being
    discarded, so every epoch ran at the initial learning rate. The decay
    (factor 0.9 per epoch) is now derived from ``epoch`` directly.

    Reads the module-level names ``device``, ``AdamW``, ``tqdm`` and
    ``NLLLoss_function``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keywords, returning start and end logits.
        loader: Iterable of batches; each batch is a mapping with the keys
            ``input_ids``, ``attention_mask``, ``token_type_ids``,
            ``start_positions`` and ``end_positions``.
        epoch: 1-based epoch number; selects the learning rate
            ``2e-5 * 0.9 ** (epoch - 1)``.
        optimizer: Optional optimizer to reuse across calls. When given it
            is used as-is (its learning rate is not touched). When omitted a
            new ``AdamW`` is created, which also means its moment estimates
            start from scratch for this epoch.

    Returns:
        tuple: ``(accuracy, loss)`` - the mean of the per-batch start/end
        exact-match accuracies and the mean per-batch loss. ``(0.0, 0.0)``
        when ``loader`` yields no batches.
    """
    base_lr = 2e-5
    lr_decay = 0.9

    model.train()
    if optimizer is None:
        epoch_lr = base_lr * lr_decay ** max(epoch - 1, 0)
        optimizer = AdamW(model.parameters(), lr=epoch_lr, weight_decay=2e-2)

    losses = []
    acc = []
    for batch in tqdm(loader, desc = 'Running Epoch '):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        output_start, output_end = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

        # gamma=1 selects the focal weighting exponent of the loss.
        loss = NLLLoss_function(output_start, output_end, start_positions, end_positions, 1)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

        start_pred = torch.argmax(output_start, dim=1)
        end_pred = torch.argmax(output_end, dim=1)

        # Start and end accuracies are tracked as two separate entries, so
        # the returned value averages over both.
        acc.append(((start_pred == start_positions).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_positions).sum()/len(end_pred)).item())

    if not losses:
        # Nothing was processed; avoid dividing by zero below.
        return (0.0, 0.0)

    ret_acc = sum(acc)/len(acc)
    ret_loss = sum(losses)/len(losses)

    return(ret_acc, ret_loss)

In [None]:
def evaluate_model(model, dataloader):
    """Decode predicted and reference answers for every example.

    The arg-max of the start/end logits is taken per example (``dim=1``),
    so batches of any size are handled; with batches of one example the
    result is the same as the earlier flattened arg-max on ``input_ids[0]``.

    Reads the module-level names ``device``, ``tokenizer`` and ``tqdm``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keywords, returning start and end logits.
        dataloader: Iterable of batches; each batch is a mapping with the
            keys ``input_ids``, ``attention_mask``, ``token_type_ids``,
            ``start_positions`` and ``end_positions``.

    Returns:
        list: One ``[predicted_text, reference_text]`` pair per example, in
        iteration order. A span whose end does not exceed its start decodes
        to an empty string.
    """
    def _decode(token_ids):
        # ids -> tokens -> single string, using the notebook's tokenizer.
        return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(token_ids))

    model = model.eval()
    answer_list = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc = 'Running Evaluation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            start_true = batch['start_positions'].to(device)
            end_true = batch['end_positions'].to(device)

            output_start, output_end = model(input_ids=input_ids, attention_mask=attention_mask,token_type_ids=token_type_ids)

            # Plain Python ints make the slicing below independent of the
            # tensor device and of the batch size.
            start_pred = torch.argmax(output_start, dim=1).tolist()
            end_pred = torch.argmax(output_end, dim=1).tolist()
            start_gold = start_true.tolist()
            end_gold = end_true.tolist()

            for row, token_ids in enumerate(input_ids.tolist()):
                answer = _decode(token_ids[start_pred[row]:end_pred[row]])
                tanswer = _decode(token_ids[start_gold[row]:end_gold[row]])
                answer_list.append([answer, tanswer])
    return answer_list

In [None]:
def evaluate_wer(epochs=3):
    """Alternate training and evaluation, tracking word error rate per epoch.

    Reads the module-level names ``model``, ``device``, ``loader_train``,
    ``loader_test``, ``training``, ``evaluate_model`` and ``load``.

    Args:
        epochs: Number of train/evaluate rounds to run (default 3, the
            previously hard-coded value).

    Returns:
        list: The WER score obtained after each epoch, in order. The list is
        also printed once at the end.
    """
    wer_eval = load("wer")
    model.to(device)

    list_wer = []

    for e in range(epochs):
        # training() expects a 1-based epoch number.
        train_accuracy, train_loss = training(model, loader_train, e+1)
        print(f"Train Accuracy: {train_accuracy}      Train Loss: {train_loss}")
        answer_list = evaluate_model(model, loader_test)

        # Empty strings are replaced by the placeholder "$" before scoring
        # (presumably because the metric cannot handle empty text - confirm).
        pred_answers = [predicted or "$" for predicted, _ in answer_list]
        true_answers = [reference or "$" for _, reference in answer_list]

        wer_score = wer_eval.compute(predictions=pred_answers, references=true_answers)
        list_wer.append(wer_score)
    print(list_wer)
    return list_wer



In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Each call returns (contexts, questions, question count, answers) for one split.
(context_train,
 train_questions,
 train_no_ques,
 train_answers) = process_train_test_data('spoken_train-v1.1.json')
(context_test,
 test_questions,
 test_no_ques,
 test_answers) = process_train_test_data('spoken_test-v1.1.json')


In [None]:
# Upper bound on the number of tokens kept per encoded sequence.
MAX_LENGTH = 250

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
print(tokenizer)

# Both splits are encoded as (question, context) pairs with identical settings.
_ENCODE_OPTIONS = {'max_length': MAX_LENGTH, 'truncation': True, 'padding': True}
train_encoding = tokenizer(train_questions, context_train, **_ENCODE_OPTIONS)
test_encoding = tokenizer(test_questions, context_test, **_ENCODE_OPTIONS)


In [None]:
# Label every training example with its answer span and attach the labels to
# the encoding. ctr1 is the number of examples whose start index came back 0.
(start_position_train,
 end_position_train,
 ctr1) = process_start_end_train(train_encoding)
train_encoding.update(start_positions=start_position_train,
                      end_positions=end_position_train)
print(ctr1)

# Same labelling for the test split; ctr2 is the matching count.
(start_position_test,
 end_position_test,
 ctr2) = process_start_end_test(test_encoding)
test_encoding.update(start_positions=start_position_test,
                     end_positions=end_position_test)
print(ctr2)

In [None]:

# Wrap both labelled encodings so they can be served as tensors.
dataset_train, dataset_test = Input(train_encoding), Input(test_encoding)

# One example per batch, which is DataLoader's default, spelled out here.
loader_train = DataLoader(dataset_train, batch_size=1)
loader_test = DataLoader(dataset_test, batch_size=1)

In [None]:
# Load the pretrained encoder first: BERT_Model.__init__ reads the
# module-level name `model_bert`, so this must run before the line below.
model_bert = BertModel.from_pretrained('bert-base-uncased')

# Wrap the encoder with the span-prediction head defined in BERT_Model.
model = BERT_Model()

In [None]:
# Start the train/evaluate loop only when this code runs as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    evaluate_wer()