In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 4.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 2.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 46.9 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 28.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

In [2]:
# Step size handed to the optimizer when fine-tuning the model.
LEARNING_RATE = 3e-5
# Number of full passes over the training loader.
EPOCHS = 3

In [3]:
import json

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam

from transformers import BertTokenizer, BertForQuestionAnswering
from transformers.tokenization_utils_base import PaddingStrategy

from pprint import pprint
import textwrap

# Shared helper for pretty-printing long strings: lines are wrapped at 80 columns.
wrapper = textwrap.TextWrapper(width=80)

In [4]:
# Device identifiers understood by torch.
CPU_DEVICE = 'cpu'
CUDA_DEVICE = 'cuda'

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = CUDA_DEVICE
else:
    DEVICE = CPU_DEVICE

In [5]:
class Question:
    """A question about a context, with its answer stored as a token span.

    Parameters:
        text: the question string.
        answer: mapping with keys 'answer_start' (offset compared against the
            positions in ``context.whitespaces``) and 'text' (the answer
            string). Not read when ``isImpossible`` is true.
        context: object exposing ``whitespaces``, ``tokens`` and
            ``getAnswerTokenIndexes(start, end)``.
        isImpossible: when true the answer span is stored as (-1, -1).
    """

    def __init__(self, text, answer, context, isImpossible = False) -> None:
        self.text = text
        self.context = context

        # Unanswerable questions carry a sentinel span and need no alignment.
        if isImpossible:
            self.answer = (-1, -1)
            return

        firstChar = answer['answer_start']
        answerLength = len(answer['text'])
        lastChar = firstChar + answerLength - 1

        # Walk the recorded whitespace positions, counting those that precede
        # the answer and those inside it; stop at the first one at or past the
        # answer's last character.
        spacesBefore = 0
        spacesInside = 0
        for position in context.whitespaces:
            if position < firstChar:
                spacesBefore += 1
                continue
            if position >= lastChar:
                break
            spacesInside += 1

        # Translate the character span into offsets that ignore whitespace,
        # which is the coordinate system getAnswerTokenIndexes works in.
        compactStart = firstChar - spacesBefore
        compactEnd = compactStart + answerLength - 1 - spacesInside
        self.answer = context.getAnswerTokenIndexes(compactStart, compactEnd)

    def __repr__(self) -> str:
        """Render the question, its token span and the tokens inside that span."""
        spanStart = self.answer[0]
        spanEnd = self.answer[1]
        spanTokens = self.context.tokens[spanStart:spanEnd + 1]
        summary = {
            "text": self.text,
            "answer_start": spanStart,
            "answer_end": spanEnd,
            "answer": ' '.join(spanTokens)
        }
        return str(summary)

class QuestionContext:
    """A context paragraph together with its tokenization.

    Parameters:
        text: the raw context string.
        tokenizer: callable such that ``tokenizer(text)['input_ids']`` yields
            token ids, and which offers ``convert_ids_to_tokens(ids)``.

    The instance stores ``text``, ``tokenIds``, ``tokens`` and ``whitespaces``
    (a tuple with the index of every whitespace character in ``text``).
    """

    def __init__(self, text, tokenizer) -> None:
        self.text = text
        self.tokenIds = tokenizer(text)['input_ids']
        self.tokens = tokenizer.convert_ids_to_tokens(self.tokenIds)
        # Record every whitespace character, not just ' ': newlines, tabs and
        # other Unicode spaces are also absent from the tokens, so missing them
        # here would shift every whitespace-free offset that follows them.
        self.whitespaces = tuple(i for i, c in enumerate(text) if c.isspace())

    def getAnswerTokenIndexes(self, startCharIndex, endCharIndex):
        """Map a whitespace-free character span onto token indexes.

        Both arguments are offsets into the context text with all whitespace
        removed. Returns ``(answerStart, answerEnd)``, the indexes in
        ``self.tokens`` of the tokens holding the first and last character
        (``answerStart`` stays -1 if the start offset was never matched).

        Raises:
            ValueError: if ``endCharIndex`` is never reached, i.e. the span
                cannot be aligned with the tokens. Previously this fell off
                the end of the function and returned None.
        """
        answerStart = -1
        currChar = 0
        # The first and last tokens are skipped and never counted as text
        # (presumably the tokenizer's [CLS]/[SEP] markers).
        for index, token in enumerate(self.tokens[1:-1], start=1):
            # Drop the '##' word-piece continuation marker so only text
            # characters are counted.
            # NOTE(review): tokens that do not spell out the source text
            # (e.g. an unknown-token placeholder) would skew the count.
            cleanToken = token.replace('##', '')
            for _ in cleanToken:
                if currChar == startCharIndex:
                    answerStart = index
                if currChar == endCharIndex:
                    return (answerStart, index)
                currChar += 1
        raise ValueError(
            "Could not align character span ({}, {}) with the context tokens".format(
                startCharIndex, endCharIndex
            )
        )

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Parse the whole file up front so the handle is closed before the (slower)
# tokenization work starts. JSON is read as UTF-8 regardless of platform default.
with open('sample.json', encoding='utf-8') as samplesFile:
    samplesRaw = json.load(samplesFile)['data']

# Flatten data -> paragraphs -> qas into a list of Question objects. Every
# question of a paragraph shares one QuestionContext, so each context is
# tokenized only once.
questions = []
for group in samplesRaw:
    for paragraph in group['paragraphs']:
        context = QuestionContext(paragraph['context'], tokenizer)
        for qa in paragraph['qas']:
            isImpossible = qa['is_impossible']
            # Unanswerable questions may have an empty 'answers' list, so only
            # index into it when an answer is expected. Question ignores the
            # answer argument when isImpossible is true.
            # Only the first listed answer is used.
            answer = None if isImpossible else qa['answers'][0]
            questions.append(Question(qa['question'], answer, context, isImpossible))

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
class QuestionsDataset(torch.utils.data.Dataset):
    """Dataset view over question objects as three parallel lists.

    Each item is a ``(question text, context text, answer tensor)`` triple,
    where the answer tensor is built from the question's ``answer`` attribute.
    """

    def __init__(self, questions) -> None:
        super().__init__()
        # Three separate passes, one per column, mirroring the stored layout.
        self.questions = list(map(lambda item: item.text, questions))
        self.contexts = list(map(lambda item: item.context.text, questions))
        self.answers = list(map(lambda item: torch.tensor(item.answer), questions))

    def __len__(self):
        """Number of stored questions."""
        return len(self.questions)

    def __getitem__(self, index):
        """Return the (question, context, answer) triple at ``index``."""
        question = self.questions[index]
        context = self.contexts[index]
        answer = self.answers[index]
        return question, context, answer

In [8]:
#ret = tokenizer._batch_encode_plus([['is', 'hi oops'], ['of', 'hello'], ['i am good, thanks', 'haha']], max_length=10, padding_strategy=PaddingStrategy.MAX_LENGTH)
#for id in ret["input_ids"][2]:
#    print(tokenizer.convert_ids_to_tokens(id))

In [9]:
# Wrap the parsed questions in a Dataset and serve them in their original
# order, ten examples per batch.
trainDataset = QuestionsDataset(questions)
trainSetLoader = DataLoader(dataset=trainDataset, batch_size=10, shuffle=False)

# Load the pretrained checkpoint into a question-answering model and place it
# on the selected device.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model = model.to(DEVICE)

# Adam over all model parameters, using the notebook-level learning rate.
optimizer = Adam(params=model.parameters(), lr=LEARNING_RATE)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [10]:
# Fine-tune the model for EPOCHS passes over the training loader.
for epoch in range(EPOCHS):
    model.train()
    for batchQuestions, batchContexts, batchAnswers in trainSetLoader:
        # Encode every (question, context) pair as one sequence, padded to the
        # longest pair in the batch. Uses the public tokenizer call rather than
        # the private _batch_encode_plus.
        # NOTE: inputs are not truncated; over-long pairs are passed as-is.
        tok = tokenizer(list(batchQuestions), list(batchContexts), padding=True, return_tensors="pt")
        inputIds = tok['input_ids'].to(DEVICE)
        segmentIds = tok['token_type_ids'].to(DEVICE)
        attentionMask = tok['attention_mask'].to(DEVICE)

        # The stored answers index tokens of the context encoded on its own
        # ([CLS] at 0, first context token at 1). In the pair encoding the
        # context comes after the question, so the labels must be shifted.
        # The first position whose segment id is 1 is the first context token
        # (argmax returns the first maximal index), and that token had index 1
        # in the context-only encoding, hence the "- 1".
        contextStarts = tok['token_type_ids'].argmax(dim=1)
        answerOffsets = (contextStarts - 1).unsqueeze(1)
        # Impossible questions are stored as (-1, -1). Point them at position 0
        # ([CLS]), which is where the model's own clamping put them before.
        hasAnswer = (batchAnswers >= 0).all(dim=1, keepdim=True)
        labels = torch.where(hasAnswer, batchAnswers + answerOffsets, torch.zeros_like(batchAnswers))
        startPositions = labels[:, 0].to(DEVICE)
        endPositions = labels[:, 1].to(DEVICE)

        outputs = model(
            input_ids=inputIds,
            token_type_ids=segmentIds,
            attention_mask=attentionMask,
            start_positions=startPositions,
            end_positions=endPositions,
        )
        # When start/end positions are supplied, the first output is the loss.
        batchLoss = outputs.loss

        optimizer.zero_grad()
        batchLoss.backward()
        optimizer.step()

        # Kept at notebook scope (from the last processed batch) for later cells.
        startPredictions = outputs.start_logits
        endPredictions = outputs.end_logits

In [11]:
def predictionsF1Score():
    """Placeholder, not implemented yet: does nothing and returns None.

    TODO: presumably meant to compute an F1 score over predictions.
    """

def predictionsExactScore():
    """Placeholder, not implemented yet: does nothing and returns None.

    TODO: presumably meant to compute an exact-match score over predictions.
    """