In [1]:
from google.colab import drive
import io
import os
from google.colab import drive
import pandas as pd
import numpy as np
import json
import sys

drive.mount('/content/drive', force_remount=True)
sys.path.append('/content/drive/My Drive/')

!cp -r "/content/drive/My Drive/BERT/Data/train-v2.0.json" '/content/'
!cp -r "/content/drive/My Drive/BERT/Data/dev-v2.0.json" '/content/'
!cp -r "/content/drive/My Drive/BERT/Data/train_data.pkl" '/content/'
!cp -r "/content/drive/My Drive/BERT/Data/val_data.pkl" '/content/'



Mounted at /content/drive


In [2]:
collab = 1
if collab == 1:
    train_path = '/content/train-v2.0.json'
    dev_path = '/content/dev-v2.0.json'
else:
    train_path = '../Data/train-v2.0.json'
    dev_path = '../Data/dev-v2.0.json'

with open(train_path) as f:
    raw_train_data = json.load(f)

with open(dev_path) as f:
    raw_dev_data = json.load(f)



In [3]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, BertForQuestionAnswering
from tokenizers import BertWordPieceTokenizer

slowTokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
save_path = '../Data/bert_base_uncased/'
if not os.path.exists(save_path):
    os.makedirs(save_path)
slowTokenizer.save_pretrained(save_path)

# loading the tokenizer from the saved file
tokenizer = BertWordPieceTokenizer('../Data/bert_base_uncased/vocab.txt', lowercase=True)
maxLength = 384

In [4]:
class SQUADExample:
    """
    A single training/test example for the Squad dataset, as loaded from disk.
    """
    def __init__(self, context, question, basic_answer, more_answers, startingIdx):
        self.context = context
        self.question = question
        self.basic_answer = basic_answer
        self.more_answers = more_answers
        self.startingIdx = startingIdx
        self.endingIdx = None
        self.attention_mask = None
        self.input_ids = None
        self.tokenTypeIds = None
        self.offSets = None
        self.validExample = True
        self.startIdxtoken = startingIdx
        self.endIndextoken = None

    def preProcessing(self):
        newContext = str(self.context).lower().split()
        self.context = ' '.join(newContext)
        newQuestion = str(self.question).lower().split()
        self.question = ' '.join(newQuestion)
        contextTokens = tokenizer.encode(self.context)
        if self.basic_answer is not None :
            # if we have answer
            self.basic_answer = ' '.join(str(self.basic_answer).lower().split())
            self.endingIdx = self.startingIdx + len(self.basic_answer)
            if self.endingIdx >= len(self.context):
                self.validExample = False
                return

            # iterate from start to end to find the characters of context
            isPartOfAnswer = [0] * len(self.context)
            for idx in range(self.startingIdx, self.endingIdx):
                isPartOfAnswer[idx] = 1

            answerIdToken = []
            for idx, (start, end) in enumerate(contextTokens.offsets):
                if sum(isPartOfAnswer[start:end]) > 0:
                    answerIdToken.append(idx)
            # data to predict the start and end index of the answer
            if len(answerIdToken) == 0:
                self.validExample = False
                return
            self.startIdxtoken = answerIdToken[0]
            self.endIndextoken = answerIdToken[-1]

        self.offSets = contextTokens.offsets
        questionTokenizer = tokenizer.encode(self.question)
        self.input_ids = contextTokens.ids + questionTokenizer.ids[1:]
        self.attention_mask = [1] * len(self.input_ids)
        self.tokenTypeIds = [0] * len(contextTokens.ids) + [1] * len(questionTokenizer.ids[1:])

        # padding fixing
        paddingLength = maxLength - len(self.input_ids)
        if paddingLength > 0:
            self.input_ids = self.input_ids + ([0] * paddingLength)
            self.attention_mask = self.attention_mask + ([0] * paddingLength)
            self.tokenTypeIds = self.tokenTypeIds + ([0] * paddingLength)
        elif paddingLength < 0:
            self.validExample = False
            return

In [5]:
def createSquadExamples(raw_data):
    squadExamples = []
    for item in raw_data['data']:
        for para in item['paragraphs']:
            context = para['context']
            for qa in para['qas']:
                question = qa['question']
                basic_answer = None
                more_answers = []
                startingIdx = None
                if qa['is_impossible']:
                    basic_answer = None
                else:
                    basic_answer = qa['answers'][0]['text']
                    startingIdx = qa['answers'][0]['answer_start']
                squadEg = SQUADExample(context, question, basic_answer, more_answers, startingIdx)
                squadEg.preProcessing()
                squadExamples.append(squadEg)
    return squadExamples

In [6]:
def createInputsTargets(squad_example):
    datasetDict = {}
    for item in squad_example:
        if item.validExample:
            for key in ['input_ids', 'attention_mask', 'tokenTypeIds', 'startIdxtoken', 'endIndextoken']:
                if key not in datasetDict:
                    datasetDict[key] = []
                datasetDict[key].append(item.__dict__[key])

    for key in datasetDict:
        datasetDict[key] = np.array(datasetDict[key], dtype=np.float16)

    x = [datasetDict['input_ids'], datasetDict['attention_mask'], datasetDict['tokenTypeIds']]
    y = [datasetDict['startIdxtoken'], datasetDict['endIndextoken']]
    return x, y


In [7]:
# train_data = createSquadExamples(raw_train_data)
# val_data = createSquadExamples(raw_dev_data)

In [8]:
# pickle the data
# import pickle
# with open('../Data/train_data.pkl', 'wb') as f:
#     pickle.dump(train_data, f)

# with open('../Data/val_data.pkl', 'wb') as f:
#     pickle.dump(val_data, f)

In [9]:
# load the data
import pickle
with open('../content/train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)

with open('../content/val_data.pkl', 'rb') as f:
    val_data = pickle.load(f)

In [10]:
# store the data in csv format
train_df = pd.DataFrame([vars(f) for f in train_data])
val_df = pd.DataFrame([vars(f) for f in val_data])

# train_df.to_csv('../Data/train.csv', index=False)
# val_df.to_csv('../Data/val.csv', index=False)

# # load the data from csv format
# train_df = pd.read_csv('../Data/train.csv')
# val_df = pd.read_csv('../Data/val.csv')

In [11]:
# print 10 random rows from the training data
train_df[['context', 'question', 'basic_answer', 'startIdxtoken', 'endIndextoken']].sample(10)

Unnamed: 0,context,question,basic_answer,startIdxtoken,endIndextoken
63671,the yale provost's office has launched several...,who was appointed acting president of yale in ...,hanna holborn gray,20.0,24.0
40659,the ammunition and shells fired by these weapo...,smaller .50 caliber and 8 millimeter guns have...,smallest mounts,109.0,110.0
89683,the alaska railroad was one of the last railro...,the arr was one of the last railroads in the u...,cabooses,15.0,17.0
73747,"the knights of columbus, the world's largest c...",in modern day snet operates as what in new haven?,subsidiary of at&t,315.0,319.0
85431,"with this contribution of von neumann, the axi...",what was the central theme of godel's announce...,they cannot prove every truth which is express...,85.0,96.0
75669,"in january 1871, george jackson mivart's on th...",what was chapter vii entitled?,miscellaneous objections,87.0,88.0
109559,the final stage of database design is to make ...,what decisions are optional in the last stage ...,,,
62494,data compression can be viewed as a special ca...,what can be classified as data differencing wi...,,,
10711,le: life expectancy at birth mys: mean years o...,what does mis stand for?,,,
93368,classical statistical mechanics requires the e...,what is not required to exist in modern statis...,,,


In [12]:
X_train , y_train = createInputsTargets(train_data)
X_val , y_val = createInputsTargets(val_data)

In [13]:
DOC_STRIDE = 64
MAX_SEQ_LENGTH = 128
MAX_QUERY_LENGTH = 32
BATCH_SIZE = 16

In [14]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

trainData = TensorDataset(torch.tensor(X_train[0], dtype=torch.int64),
                           torch.tensor(X_train[1], dtype=torch.float),
                           torch.tensor(X_train[2], dtype=torch.int64),
                           torch.tensor(y_train[0], dtype=torch.int64),
                           torch.tensor(y_train[1], dtype=torch.int64))

train_sampler = RandomSampler(trainData)
train_dataloader = DataLoader(
    trainData, sampler=train_sampler, batch_size=BATCH_SIZE)

train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print("Training on " + str(device))

Training on cuda


In [15]:
validData = TensorDataset(torch.tensor(X_val[0], dtype=torch.int64),
                            torch.tensor(X_val[1], dtype=torch.float),
                            torch.tensor(X_val[2], dtype=torch.int64),
                            torch.tensor(y_val[0], dtype=torch.int64),
                            torch.tensor(y_val[1], dtype=torch.int64))
valid_sampler = SequentialSampler(validData)
valid_dataloader = DataLoader(
    validData, sampler=valid_sampler, batch_size=BATCH_SIZE)


In [16]:
print(train_dataloader.dataset.tensors[0])

tensor([[  101, 20768, 21024,  ...,     0,     0,     0],
        [  101, 20768, 21024,  ...,     0,     0,     0],
        [  101, 20768, 21024,  ...,     0,     0,     0],
        ...,
        [  101,  1996,  2744,  ...,     0,     0,     0],
        [  101,  1996,  2744,  ...,     0,     0,     0],
        [  101,  1996,  2744,  ...,     0,     0,     0]])


In [17]:
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(device)
param_optimizer = list(model.named_parameters())

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = torch.optim.Adam(
    lr=1e-5, betas=(0.9, 0.98), eps=1e-9, params=optimizer_grouped_parameters)


In [19]:
import string
import re

def normalizeText(text):
    if text is None or len(text)==0:
      return ""
    text = text.lower()
    text = ''.join(ch for ch in text if ch not in set(string.punctuation))
    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) # remove a, an, the
    text = re.sub(regex, ' ', text)
    text = " ".join(text.split())
    return text

In [20]:
from tqdm import tqdm, trange
def train(model, train_dataloader, val_data, validation_dataloader, optimizer, epochs=2, max_grad_norm=1.0):
    model.train()
    for _ in trange(epochs, desc='Epoch'):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2], 'start_positions': batch[3], 'end_positions': batch[4]}
            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = outputs[0]
            loss.backward()
            tr_loss += loss.item()
            nb_tr_steps += 1
            torch.nn.utils.clip_grad_norm_( parameters=model.parameters(), max_norm=max_grad_norm)
            optimizer.step()

            model.zero_grad()
            if(step % 100 == 0):
              print("Batch loss : {}".format(tr_loss/nb_tr_steps))
        print("Train loss: {}".format(tr_loss/nb_tr_steps))
        validate(model, val_data, validation_dataloader)

    print("Training complete")

def validate(model, val_data, validation_dataloader):
    model.eval()
    currentQuery = 0
    correctAns = 0
    validExamples = [x for x in val_data if x.validExample]
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, start_positions, end_positions = batch
        with torch.no_grad():
            start_scores, end_scores = model(input_ids, token_type_ids=segment_ids,
                                             attention_mask=input_mask,return_dict=False)

            pred_start, pred_end = start_scores.detach().cpu().numpy(), end_scores.detach().cpu().numpy()

        for idx, (start,end) in enumerate(zip(pred_start,pred_end)):
            squadEg = validExamples[currentQuery]
            currentQuery +=1
            offsets = squadEg.offSets
            startIdx = np.argmax(start)
            endIdx = np.argmax(end)
            if startIdx >= len(offsets):
                continue
            predCharStart = offsets[startIdx][0]
            if endIdx < len(offsets):
                predCharEnd = offsets[endIdx][1]
                predAnswer = squadEg.context[predCharStart:predCharEnd]
            else:
                predAnswer = squadEg.context[predCharStart:]
            if(predAnswer==None):
              continue
            normalizedPredAnswer = normalizeText(predAnswer)
            normalizedTrueAnswer = [normalizeText(x)
                                    for x in squadEg.more_answers]
            normalizedTrueAnswer.append(normalizeText(squadEg.basic_answer))
            if normalizedPredAnswer in normalizedTrueAnswer:
                correctAns += 1
            if(currentQuery + idx) % 50 == 0:
              print("Validated {}/{} examples".format(currentQuery+idx+1, len(validExamples)))
    acc = correctAns / len(validExamples)
    print("Validation Accuracy: {}".format(acc))
    return acc

In [21]:
train(model, train_dataloader, val_data, valid_dataloader, optimizer, epochs=1, max_grad_norm=1.0)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Batch loss : 6.155097961425781
Batch loss : 4.280938927489932
Batch loss : 3.805304542702822
Batch loss : 3.523900489870496
Batch loss : 3.34717895384144
Batch loss : 3.2149320372088463
Batch loss : 3.127440221496112
Batch loss : 3.0497878485501406
Batch loss : 2.9870957743362543
Batch loss : 2.9295461486103003
Batch loss : 2.884565957538136
Batch loss : 2.8385461699843515
Batch loss : 2.7945155470099277
Batch loss : 2.7671483360190834
Batch loss : 2.7400712230050672
Batch loss : 2.7100623318705854
Batch loss : 2.6832910738908673
Batch loss : 2.6563685961431225
Batch loss : 2.6343041676272954
Batch loss : 2.6135999516773576
Batch loss : 2.5959379107638756
Batch loss : 2.5757736853904123
Batch loss : 2.5580772478328084
Batch loss : 2.542476883404983
Batch loss : 2.5241931669417146
Batch loss : 2.5081157502723856
Batch loss : 2.4937798252292707
Batch loss : 2.481288230096972
Batch loss : 2.4660065784832272
Batch loss : 2.4537941356725996
Batch loss : 2.4396893338178325
Batch loss : 2.427

Epoch: 100%|██████████| 1/1 [2:37:22<00:00, 9442.21s/it]

Validation Accuracy: 0.4998281786941581
Training complete





In [30]:
model.save_pretrained('/content/drive/My Drive/BERT/model')
# tokenizer.save_pretrained('/content/drive/My Drive/BERT/tokenizer')

In [28]:
# # ============================================ TESTING ==========================================================
data = {"data":
        [
            {"title": "Tesla's Biography",
             "paragraphs": [
                 {
                     "context": "Nikola Tesla was a Serbian-American inventor, electrical engineer, mechanical engineer, "
                     "and futurist best known for his contributions to the design of the modern alternating "
                     "current (AC) electricity supply system. Born and raised in the Austrian Empire,Tesla "
                     "studied engineering and physics in the 1870s without receiving a degree. In 1884, he "
                     "moved to United States. In 1887, Tesla developed an induction motor that ran on alternating "
                     "current (AC), a power system format that was rapidly expanding in Europe and the United "
                     "States because of its advantages in long-distance, high-voltage transmission. Tesla became "
                     "a vegetarian in his later years, living on only milk, bread, honey, and vegetable juices. "
                     "On 7 January 1943, at the age of 86, Tesla died alone in Room 3327 of the Hotel New Yorker. "
                     "Tesla wrote a number of books and articles for magazines and journals. Among his books are "
                     "My Inventions: The Autobiography of Nikola Tesla, compiled and edited by Ben Johnston "
                     "in 1983 from a series of 1919 magazine articles by Tesla which were republished in 1977. "
                     "Tesla's legacy has endured in books, films, radio, TV, music, live theater, comics, and "
                     "video games. The impact of the technologies invented or envisioned by Tesla is a recurring "
                     "theme in several types of science fiction. ",
                     "qas": [
                         {"question": "When did Tesla becοme a vegetarian?",
                          "id": "Q1",
                          "answers": "",
                          "is_impossible": "False",
                          },
                         {"question": "When did Tesla move to United States ?",
                          "id": "Q2",
                          "answers": "",
                          "is_impossible": "False",
                          },
                         {"question": "What year did Tesla die?",
                          "id": "Q3",
                          "answers": "",
                          "is_impossible": "False",
                          },
                         {"question": "Who edited the book My Inventions: The Autobiography of Nikola Tesla?",
                          "id": "Q4",
                          "answers": "",
                          "is_impossible": "False",
                          },
                         {"question": "In what age did Tesla died?",
                          "id": "Q5",
                          "answers": "",
                          "is_impossible": "False",
                          },
                         {"question": "Who developed an induction motor?",
                          "id": "Q6",
                          "answers": "",
                          "is_impossible": "False",
                          },
                         {"question": "Where Tesla was born?",
                          "id": "Q7",
                          "answers": "",
                          "is_impossible": "False",
                          },
                         {"question": "What did Tesla study?",
                          "id": "Q8",
                          "answers": "",
                          "is_impossible": "False",
                          },
                     ]}]}]}

model.eval()
test_samples = createSquadExamples(data)
x_test, _ = createInputsTargets(test_samples)
pred_start, pred_end = model(torch.tensor(x_test[0], dtype=torch.int64, device=device),
                             torch.tensor(
                                 x_test[1], dtype=torch.float, device=device),
                             torch.tensor(x_test[2], dtype=torch.int64, device=device), return_dict=False)
pred_start, pred_end = pred_start.detach().cpu(
).numpy(), pred_end.detach().cpu().numpy()
for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
    test_sample = test_samples[idx]
    offsets = test_sample.offSets
    start = np.argmax(start)
    end = np.argmax(end)
    pred_ans = None
    if start >= len(offsets):
        continue
    pred_char_start = offsets[start][0]
    if end < len(offsets):
        pred_ans = test_sample.context[pred_char_start:offsets[end][1]]
    else:
        pred_ans = test_sample.context[pred_char_start:]
    print("Q: " + test_sample.question)
    print("A: " + pred_ans)
    print("----------------------------------------\n")


Q: when did tesla becοme a vegetarian?
A: his later years
----------------------------------------

Q: when did tesla move to united states ?
A: 1884
----------------------------------------

Q: what year did tesla die?
A: 1884, he moved to united states. in 1887, tesla developed an induction motor that ran on alternating current (ac), a power system format that was rapidly expanding in europe and the united states because of its advantages in long-distance, high-voltage transmission. tesla became a vegetarian in his later years, living on only milk, bread, honey, and vegetable juices. on 7 january 1943
----------------------------------------

Q: who edited the book my inventions: the autobiography of nikola tesla?
A: ben johnston
----------------------------------------

Q: in what age did tesla died?
A: 86
----------------------------------------

Q: who developed an induction motor?
A: tesla
----------------------------------------

Q: where tesla was born?
A: austrian empire
-----