In [1]:
LEARNING_RATE=5e-5
EPOCHS=3
BATCH_SIZE=16

MAX_INPUT_LENGTH=400
MAX_CONTEXT_LENGTH=350

NO_ANSWER = (0, 0)

In [2]:
import json

import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam

from transformers import BertTokenizer, BertForQuestionAnswering
from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy

#from pprint import pprint
#import textwrap
# Wrap text to 80 characters.
#wrapper = textwrap.TextWrapper(width=80) 

In [3]:
CPU_DEVICE = 'cpu'
CUDA_DEVICE = 'cuda'
DEVICE = CUDA_DEVICE if torch.cuda.is_available() else CPU_DEVICE

In [4]:
class Question:
    def __init__(self, text, answer, context, isImpossible = False) -> None:
        self.text = text
        self.context = context
        if isImpossible:
            self.answer = NO_ANSWER
        else:
            endCharIndex = answer['answer_start'] + len(answer['text']) - 1
            whitespacesBeforeAnswer = 0
            whitespacesInAnswer = 0
            for i in context.whitespaces:
                if i >= answer['answer_start']:
                    if i < endCharIndex:
                        whitespacesInAnswer += 1
                    else:
                        break
                else:
                    whitespacesBeforeAnswer += 1
            noWhitespaceStart = answer['answer_start'] - whitespacesBeforeAnswer
            noWhitespaceEnd = noWhitespaceStart + len(answer['text']) - 1 - whitespacesInAnswer
            self.answer = context.getAnswerTokenIndexes(noWhitespaceStart, noWhitespaceEnd)

    def __repr__(self) -> str:
        if self.answer == NO_ANSWER:            
            answer = ' '.join(self.context.tokens[self.answer[0]:self.answer[1]+1])
        else:
            answer = ''
        return str({
            "text": self.text,
            "answer_start": self.answer[0],
            "answer_end": self.answer[1],
            "answer": answer
        })

class QuestionContext:
    def __init__(self, text, tokenizer) -> None:
        self.text = text
        self.tokenIds = tokenizer(text, truncation=True, max_length=MAX_CONTEXT_LENGTH)['input_ids']
        self.tokens = tokenizer.convert_ids_to_tokens(self.tokenIds)
        whitespaces = []
        for i, c in enumerate(text):
            if c == ' ':
                whitespaces.append(i)
        
        self.whitespaces = tuple(whitespaces)

    def getAnswerTokenIndexes(self, startCharIndex, endCharIndex):
        answerStart = -1
        answerEnd = -1
        currChar = 0
        for index, token in enumerate(self.tokens):
            if (index != 0) and (index != len(self.tokens) - 1):
                cleanToken = token.replace('##', '')
                for c in cleanToken:
                    if currChar == startCharIndex:
                        answerStart = index
                    if currChar == endCharIndex:
                        answerEnd = index
                        return (answerStart, answerEnd)
                    currChar += 1
        return NO_ANSWER

In [5]:
questions = []
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

with open('../input/squad-20/train-v2.0.json') as samplesFile:
    samplesRaw = json.load(samplesFile)['data']
    for group in samplesRaw:
        for paragraph in group['paragraphs']:
            context = QuestionContext(paragraph['context'], tokenizer)
            for qa in paragraph['qas']:
                answer = qa['answers'][0] if not qa['is_impossible'] else None
                questions.append(Question(qa['question'], answer, context, qa['is_impossible']))
                #if not qa['is_impossible']:
                    #questions.append(Question(qa['question'], qa['answers'][0], context, qa['is_impossible']))

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
class QuestionsDataset(torch.utils.data.Dataset):
    def __init__(self, questions) -> None:
        super().__init__()
        self.questions = [q.text for q in questions]
        self.contexts = [q.context.text for q in questions]
        self.answers = [torch.tensor(q.answer) for q in questions]

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, index):
        return self.questions[index], self.contexts[index], self.answers[index]

In [7]:
trainDataset = QuestionsDataset(questions)
trainSetLoader = DataLoader(trainDataset, batch_size=BATCH_SIZE, shuffle=False)
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [8]:
def predictionsF1Score(modelAnswers, trueAnswers):

    def findAnswerF1(modelAnswer, trueAnswer):
        modelSequence = range(modelAnswer[0], modelAnswer[1] + 1 )
        trueSequence = range(trueAnswer[0], trueAnswer[1] + 1 )
        numCommon = len(set(trueSequence).intersection(modelSequence))

        if numCommon == 0:
            return 0
        
        precision = 1.0 * numCommon / len(trueSequence)
        recall = 1.0 * numCommon / len(modelSequence)
        f1 = (2 * precision * recall) / (precision + recall)
        return f1

    totalF1 = 0
    for model, gold in zip(modelAnswers, trueAnswers):
        totalF1 += findAnswerF1(model, gold)

    return totalF1/len(trueAnswers)

def predictionsExactScore(modelAnswers, trueAnswers):
    correct = 0
    for model, true in zip(modelAnswers, trueAnswers):
        correct += int( (model[0] == true[0]) and (model[1] == true[1]) )
    
    return correct/len(trueAnswers)

def getPredictedAnswers(startLogits, endLogits):
    softmaxStart = torch.log_softmax(startLogits, dim = 1)
    _, start = torch.max(softmaxStart, dim = 1)

    softmaxEnd = torch.log_softmax(endLogits, dim = 1)
    _, end = torch.max(softmaxEnd, dim = 1)
    return (start.cpu().detach().numpy(), end.cpu().detach().numpy())

In [9]:
for epoch in range(EPOCHS):
    model.train()
    epochExactBatchScores = []
    epochBatchLosses = []
    epochBatchF1 = []
    for batchQuestions, batchContexts, batchAnswers in trainSetLoader:
        qaPairs = [[question, answer] for question, answer in zip(batchQuestions, batchContexts)]
        tok = tokenizer._batch_encode_plus( qaPairs,
                                            truncation_strategy=TruncationStrategy.ONLY_SECOND,
                                            max_length=MAX_INPUT_LENGTH,
                                            padding_strategy=PaddingStrategy.MAX_LENGTH,
                                            return_tensors="pt")
        inputIds = tok['input_ids'].to(DEVICE)
        segmentIds = tok['token_type_ids'].to(DEVICE)
        attentionMask = tok['attention_mask'].to(DEVICE)
        startPositions = batchAnswers[:, 0].to(DEVICE)
        endPositions = batchAnswers[:, 1].to(DEVICE)
        
        outputs = model(input_ids=inputIds, token_type_ids=segmentIds, attention_mask=attentionMask, start_positions=startPositions, end_positions=endPositions)    
        
        optimizer.zero_grad()

        outputs.loss.backward()

        optimizer.step()

        startPredictions, endPredictions = getPredictedAnswers(outputs.start_logits, outputs.end_logits)
        modelAnswers = np.vstack((startPredictions, endPredictions)).T
        
        epochExactBatchScores.append(predictionsExactScore(modelAnswers, batchAnswers))
        epochBatchLosses.append(outputs.loss.item())
        epochBatchF1.append(predictionsF1Score(modelAnswers, batchAnswers))
    
    print(f"############ Epoch {epoch} ############")
    print(f"Exact: {sum(epochExactBatchScores)/len(epochExactBatchScores):.5f} \
\    F1: {sum(epochBatchF1)/len(epochBatchF1):.5f} Loss: {sum(epochBatchLosses)/len(epochBatchLosses):.5f}")

############ Epoch 0 ############
Exact: 0.31415 \    F1: 0.40685 Loss: 2.55958
############ Epoch 1 ############
Exact: 0.38273 \    F1: 0.55976 Loss: 1.71908
############ Epoch 2 ############
Exact: 0.47538 \    F1: 0.66463 Loss: 1.31263
