# BERT_Q&A Task

Fine-tuning a deep neural language model (BERT) for the question-answering (Q&A) task with PyTorch.

In [None]:
# Confirm that a GPU runtime is attached before doing any expensive work.
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  # Abort early: the rest of the notebook expects an accelerator.
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
# install huggingface libraries
!pip install pytorch-pretrained-bert pytorch-nlp pytorch_transformers



In [None]:
# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from pytorch_pretrained_bert import BertForQuestionAnswering
from tqdm import tqdm, trange
import pandas as pd
import io
import os
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

from torch.optim import AdamW

# specify GPU device (falls back to the CPU when CUDA is unavailable)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Display the accelerator name. get_device_name(0) raises when no CUDA device
# is present, so only query it when at least one GPU was counted; otherwise
# show a plain label, consistent with the "cpu" fallback chosen above.
torch.cuda.get_device_name(0) if n_gpu > 0 else 'cpu'

'Tesla P100-PCIE-16GB'

In [None]:
from google.colab import drive
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [None]:
!ls /drive/My\ Drive/squad

7_17		  checkpoint-1000  checkpoint-final	   predictions.json
7_19		  checkpoint-2000  dev-v2.0.json	   train-v2.0.json
cache_train	  checkpoint-3000  nbest_predictions.json
cache_validation  checkpoint-4000  null_odds.json


In [None]:
import sys
sys.path.append('/drive/My Drive/squad/')

In [None]:
!wget 'https://raw.githubusercontent.com/nlpyang/pytorch-transformers/master/examples/utils_squad.py'
!wget 'https://raw.githubusercontent.com/nlpyang/pytorch-transformers/master/examples/utils_squad_evaluate.py'

from utils_squad import (read_squad_examples, convert_examples_to_features,
                         RawResult, write_predictions,
                         RawResultExtended, write_predictions_extended)
from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad, plot_pr_curve

--2021-07-30 02:41:25--  https://raw.githubusercontent.com/nlpyang/pytorch-transformers/master/examples/utils_squad.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41529 (41K) [text/plain]
Saving to: ‘utils_squad.py.2’


2021-07-30 02:41:25 (11.7 MB/s) - ‘utils_squad.py.2’ saved [41529/41529]

--2021-07-30 02:41:25--  https://raw.githubusercontent.com/nlpyang/pytorch-transformers/master/examples/utils_squad_evaluate.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12493 (12K) [text/plain]
Saving to: ‘utils_squad_evaluate.py.2

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
input_file = '/drive/My Drive/squad/train-v2.0.json'
examples = read_squad_examples(input_file=input_file,
                                is_training=True,
                                version_2_with_negative=True)

In [None]:
examples[:5]


[qas_id: 56be85543aeaaa14008c9063, question_text: When did Beyonce start becoming popular?, doc_tokens: [Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".], start_position: 39, end_position: 42,
 qas_id: 56be85543aeaaa14008c9065, question_text: What areas did Beyonce compete in when she was growing up?, doc_tokens: [Beyoncé Giselle Knowles-Carter (/biːˈ

In [None]:
train_data = pd.DataFrame.from_records([vars(example) for example in examples])
train_data.head()

Unnamed: 0,qas_id,question_text,doc_tokens,orig_answer_text,start_position,end_position,is_impossible
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,"[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",in the late 1990s,39,42,False
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,"[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",singing and dancing,28,30,False
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,"[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",2003,82,82,False
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,"[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...","Houston, Texas",22,23,False
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,"[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",late 1990s,41,42,False


In [None]:
sample = train_data.sample(frac=1).head(1)
context = sample.doc_tokens.values
train_data[train_data.doc_tokens.values==context]

Unnamed: 0,qas_id,question_text,doc_tokens,orig_answer_text,start_position,end_position,is_impossible
103678,572f498104bcaa1900d76817,What was a disadvantage of DC system?,"[An, early, advantage, of, AC, is, that, the, ...",power-wasting resistors,8,9,False
103679,572f498104bcaa1900d76818,How can different range of voltages be supplie...,"[An, early, advantage, of, AC, is, that, the, ...",multiple taps on the transformer,24,28,False
103680,572f498104bcaa1900d76819,What taps can provide lighting supply?,"[An, early, advantage, of, AC, is, that, the, ...",low-voltage transformer windings,36,38,False
103681,572f498104bcaa1900d7681a,What will AC/DC motor be replaced with?,"[An, early, advantage, of, AC, is, that, the, ...",three-phase induction motor,69,71,False
103682,572f498104bcaa1900d7681b,What is the main advantage of an induction motor?,"[An, early, advantage, of, AC, is, that, the, ...",can run equally well on DC or AC of any frequency,93,103,False
103683,5acd7ee207355d001abf4476,What was an early disadvantage of AC?,"[An, early, advantage, of, AC, is, that, the, ...",,-1,-1,True
103684,5acd7ee207355d001abf4477,One taps on the transformer can supply what?,"[An, early, advantage, of, AC, is, that, the, ...",,-1,-1,True
103685,5acd7ee207355d001abf4478,The development of low power semiconductors ca...,"[An, early, advantage, of, AC, is, that, the, ...",,-1,-1,True
103686,5acd7ee207355d001abf4479,No modern electric locomotives are designed to...,"[An, early, advantage, of, AC, is, that, the, ...",,-1,-1,True
103687,5acd7ee207355d001abf447a,Combined low-voltage transformer windings supp...,"[An, early, advantage, of, AC, is, that, the, ...",,-1,-1,True


In [None]:
import random
def print_squad_sample(train_data, line_length=14, separator_length=120):
  """Print one randomly chosen context with all of its questions and answers.

  Args:
    train_data: DataFrame with `doc_tokens` (list of tokens per row),
      `question_text` and `orig_answer_text` columns.
    line_length: number of context tokens printed per output line.
    separator_length: width of the '=' rules; also drives the position of
      the ANSWER column.

  Raises:
    ValueError: if `train_data` has no rows to sample from.
  """
  if len(train_data) == 0:
    raise ValueError('train_data is empty; nothing to sample')

  # Draw a single row directly instead of shuffling the whole frame.
  context_tokens = train_data.sample(n=1).doc_tokens.iloc[0]

  rule = '=' * separator_length
  print(rule)
  print('CONTEXT: ')
  print(rule)
  for idx in range(0, len(context_tokens), line_length):
    print(' '.join(context_tokens[idx:idx + line_length]))
  print(rule)

  # Every row whose token list equals the sampled context shares that
  # paragraph; compare list-to-list per row rather than relying on numpy
  # broadcasting over object arrays.
  same_context = train_data.doc_tokens.apply(lambda tokens: tokens == context_tokens)
  questions = train_data[same_context]

  column_width = 3 * separator_length // 4
  print('QUESTION:', ' ' * column_width, 'ANSWER:')
  for _, row in questions.iterrows():
    question = row.question_text
    answer = row.orig_answer_text
    # Pad so answers line up under the ANSWER header (the +9 matches the
    # width of the 'QUESTION:' label); an empty answer marks an
    # unanswerable question.
    print(question, ' ' * (column_width - len(question) + 9), (answer if answer else 'No answer found'))

In [None]:
print_squad_sample(train_data)

CONTEXT: 
Italian unification was the political and social movement that annexed different states of the
Italian peninsula into the single state of Italy in the 19th century. There is
a lack of consensus on the exact dates for the beginning and the end
of this period, but many scholars agree that the process began with the end
of Napoleonic rule and the Congress of Vienna in 1815, and approximately ended with
the Franco-Prussian War in 1871, though the last città irredente did not join the
Kingdom of Italy until after World War I.
QUESTION:                                                                                            ANSWER:
What is Italian Unification?                                                                         social movement that annexed different states of the Italian peninsula into the single state of Italy
When did the Italian Unification occur?                                                              in the 19th century
In what year do most  Scholars

In [None]:

# Context length: number of entries in the doc_tokens list for each row.
train_data['paragraph_len'] = train_data['doc_tokens'].apply(len)
# NOTE(review): question_text is a string, so len() counts characters here,
# whereas paragraph_len counts tokens — the two columns are not in the same
# unit. Confirm this is intended before comparing them to max_query_length.
train_data['question_len'] = train_data['question_text'].apply(len)
# Display five random rows to sanity-check the new columns.
train_data.sample(frac=1).head(5)

Unnamed: 0,qas_id,question_text,doc_tokens,orig_answer_text,start_position,end_position,is_impossible,paragraph_len,question_len
108606,5a2d5fa3f28ef0001a526506,Where is the headquarters for the League of Am...,"[Santa, Monica, has, a, bike, action, plan, an...",,-1,-1,True,86,64
107689,572f425b04bcaa1900d767ea,How are antennas oriented when arranged with s...,"[For, instance,, a, phased, array, consists, o...",parallel,26,26,False,98,62
65085,5acfce2e77cf76001a6860d4,How many works displayed at The Salon de la Se...,"[The, Section, d'Or,, also, known, as, Groupe,...",,-1,-1,True,115,105
130017,5735ca406c16ec1900b927df,What type of religion is Kirant Mundhum?,"[Kirant, Mundhum, is, one, of, the, indigenous...",animistic,7,7,False,84,40
2182,5a8d914edf8bba001a0f9b0b,Through what can Link's verbalizations be disc...,"[There, is, very, little, voice, acting, in, t...",,-1,-1,True,99,52


In [None]:
max_seq_length = 256
# Count the contexts whose paragraph_len is at most max_seq_length, then
# report that count as a percentage of all rows.
fits_in_window = sum(1 for l in train_data['paragraph_len'] if l <= max_seq_length)
print("Percentage of context's less than max_seq_length = %s%%" % (fits_in_window/len(train_data) * 100))

Percentage of context's less than max_seq_length = 98.19289589392184%


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
doc_stride = 128
max_seq_length = 256
max_query_length = 64
# batch size of 64 if RAM available.
batch_size = 32

In [None]:
cached_features_file = '/drive/My Drive/squad/cache_train'

In [None]:
# Reuse the cached training features when the cache file exists; otherwise
# build them from `examples` and write the cache for the next run.
# NOTE(review): torch.load unpickles the file, so only load caches you created.
if os.path.exists(cached_features_file):
  features = torch.load(cached_features_file)
else:
  features = convert_examples_to_features(
      examples=examples,
      tokenizer=tokenizer,
      max_seq_length=max_seq_length,
      doc_stride=doc_stride,
      max_query_length=max_query_length,
      is_training=True,
  )
  torch.save(features, cached_features_file)

In [None]:
def set_seed(seed=42):
    """Seed every random number generator the notebook may draw from.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices)
    with the same value so repeated runs draw the same random numbers.

    Args:
        seed: integer seed applied to all generators.
    """
    # Local import keeps this cell runnable regardless of which other cells
    # have been executed.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [None]:
# Convert to Tensors and build dataset
def _feature_tensor(field, dtype=torch.long):
    """Collect attribute `field` from every entry of `features` into one tensor."""
    return torch.tensor([getattr(f, field) for f in features], dtype=dtype)


all_input_ids = _feature_tensor('input_ids')
all_input_mask = _feature_tensor('input_mask')
all_segment_ids = _feature_tensor('segment_ids')
all_cls_index = _feature_tensor('cls_index')
all_p_mask = _feature_tensor('p_mask', dtype=torch.float)

# Answer-span labels used as training targets.
all_start_positions = _feature_tensor('start_position')
all_end_positions = _feature_tensor('end_position')

dataset = TensorDataset(
    all_input_ids, all_input_mask, all_segment_ids,
    all_start_positions, all_end_positions,
    all_cls_index, all_p_mask,
)

In [None]:
# Draw training batches in random order; drop_last=True discards a final
# batch that would be smaller than batch_size.
train_sampler = RandomSampler(dataset)
train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=batch_size, drop_last=True)

In [None]:
import glob
checkpoints = sorted(glob.glob('/drive/My Drive/squad/checkpoint*-[0-9]*'))

In [None]:
def to_list(tensor):
    """Return the tensor's values as (nested) Python lists.

    The tensor is detached from the autograd graph and copied to the CPU
    before conversion.
    """
    cpu_copy = tensor.detach().cpu()
    return cpu_copy.tolist()

In [None]:
# Resume from the newest numbered checkpoint when one exists; otherwise start
# from the pretrained 'bert-base-uncased' weights.
#
# `checkpoints` is sorted as strings, so its last entry is not necessarily the
# highest step ("checkpoint-9000" sorts after "checkpoint-10000"). Compare the
# numeric suffixes instead, ignoring any match whose suffix is not a number.
checkpoint_steps = [int(path.rsplit('-', 1)[-1])
                    for path in checkpoints
                    if path.rsplit('-', 1)[-1].isdigit()]

if checkpoint_steps:
  # Keep global_step as an int so later step arithmetic needs no conversion.
  global_step = max(checkpoint_steps)
  ckpt_name = '/drive/My Drive/squad/checkpoint-{}'.format(global_step)
  print("Loading model from checkpoint %s" % ckpt_name)
  model = BertForQuestionAnswering.from_pretrained(ckpt_name)
  train_loss_set_ckpt = torch.load(ckpt_name + '/training_loss.pt')
  train_loss_set = to_list(train_loss_set_ckpt)
  # tr_loss is the running sum of per-step losses (the training loop reports
  # tr_loss / global_step), so restore the sum rather than the last value.
  tr_loss = float(sum(train_loss_set))
else:
  global_step = 0
  train_loss_set = []
  tr_loss = 0.0
  model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Move the model to the device selected earlier instead of hard-coding CUDA;
# .to() returns the model, so the cell still displays its structure.
model.to(device)

Loading model from checkpoint /drive/My Drive/squad/checkpoint-4000


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
        

In [None]:
param_optimizer = list(model.named_parameters())
print(param_optimizer[-2])
print(param_optimizer[-1])

('qa_outputs.weight', Parameter containing:
tensor([[-0.0070,  0.0070, -0.0011,  ...,  0.0011,  0.0258,  0.0148],
        [-0.0079,  0.0184, -0.0486,  ...,  0.0001,  0.0208, -0.0173]],
       device='cuda:0', requires_grad=True))
('qa_outputs.bias', Parameter containing:
tensor([0.0051, 0.0057], device='cuda:0', requires_grad=True))


In [None]:
learning_rate = 3e-5
adam_epsilon=1e-8
no_decay = ['bias', 'LayerNorm.weight']

# Partition the parameters: names containing any `no_decay` fragment get no
# weight decay, everything else is decayed with 0.01.
decayed_params = []
undecayed_params = []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)

optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)

In [None]:
num_train_epochs = 3

# The checkpoint-loading cell may leave global_step as a string parsed from a
# directory name; normalise it once so all step arithmetic is integer-based.
global_step = int(global_step)
steps_per_epoch = len(train_dataloader)
# One optimizer step per batch, so the total is batches-per-epoch times epochs.
total_steps = steps_per_epoch * num_train_epochs

print("***** Running training *****")
print("  Num examples = %d" % len(dataset))
print("  Num Epochs = %d" % num_train_epochs)
print("  Batch size = %d" % batch_size)
print("  Total optimization steps = %d" % total_steps)

# When resuming, global_step counts steps across epochs while `step` below
# restarts at 0 every epoch. Translate the global count into "which epoch"
# and "how many batches of that epoch are already done" so that a fresh run
# skips nothing and later epochs are not skipped wholesale.
if steps_per_epoch > 0:
    start_epoch, steps_to_skip = divmod(global_step, steps_per_epoch)
else:
    start_epoch, steps_to_skip = 0, 0

model.zero_grad()
train_iterator = trange(start_epoch, num_train_epochs, desc="Epoch")
set_seed()

for epoch in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")
    for step, batch in enumerate(epoch_iterator):
      # Only the first resumed epoch has already-consumed batches to skip.
      if epoch == start_epoch and step < steps_to_skip:
        continue

      model.train()
      batch = tuple(t.to(device) for t in batch)

      inputs = {'input_ids':       batch[0],
                'attention_mask':  batch[1],
                'token_type_ids':  batch[2],
                'start_positions': batch[3],
                'end_positions':   batch[4]}

      outputs = model(**inputs)

      # Accept either a bare loss tensor or a sequence whose first item is the
      # loss: indexing a 0-dim tensor with [0] raises IndexError.
      loss = outputs if torch.is_tensor(outputs) else outputs[0]
      loss_value = loss.item()
      # Store a plain float so the history does not keep every step's autograd
      # graph alive and can be turned into a tensor when saved.
      train_loss_set.append(loss_value)
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      tr_loss += loss_value
      optimizer.step()
      model.zero_grad()
      global_step += 1

      # Checkpoint the model and the loss history every 1000 steps.
      if global_step % 1000 == 0:
        print("Train loss: {}".format(tr_loss/global_step))
        output_dir = '/drive/My Drive/squad/checkpoint-{}'.format(global_step)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        torch.save(torch.tensor(train_loss_set), os.path.join(output_dir, 'training_loss.pt'))
        print("Saving model checkpoint to %s" % output_dir)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]
Iteration:   0%|          | 0/4508 [00:00<?, ?it/s][A

***** Running training *****
  Num examples = 144262
  Num Epochs = 3
  Batch size = 32
  Total optimization steps = 1502



Iteration:   1%|▏         | 58/4508 [00:00<00:13, 329.53it/s][A
Iteration:   4%|▍         | 182/4508 [00:00<00:10, 422.43it/s][A
Iteration:   7%|▋         | 312/4508 [00:00<00:07, 529.65it/s][A
Iteration:  10%|▉         | 440/4508 [00:00<00:06, 641.63it/s][A
Iteration:  12%|█▏        | 563/4508 [00:00<00:05, 748.00it/s][A
Iteration:  15%|█▌        | 692/4508 [00:00<00:04, 855.46it/s][A
Iteration:  18%|█▊        | 829/4508 [00:00<00:03, 964.07it/s][A
Iteration:  21%|██▏       | 969/4508 [00:00<00:03, 1062.58it/s][A
Iteration:  25%|██▍       | 1107/4508 [00:00<00:02, 1141.31it/s][A
Iteration:  28%|██▊       | 1244/4508 [00:01<00:02, 1201.46it/s][A
Iteration:  31%|███       | 1375/4508 [00:01<00:02, 1215.34it/s][A
Iteration:  33%|███▎      | 1504/4508 [00:01<00:02, 1224.92it/s][A
Iteration:  36%|███▋      | 1636/4508 [00:01<00:02, 1248.38it/s][A
Iteration:  39%|███▉      | 1776/4508 [00:01<00:02, 1287.85it/s][A
Iteration:  42%|████▏     | 1913/4508 [00:01<00:01, 1309.69it/s

IndexError: ignored

In [None]:
# Write the final model and the recorded loss history to the
# 'checkpoint-final' directory on Drive.
output_dir = '/drive/My Drive/squad/checkpoint-final'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the wrapped model when `model` exposes one via `.module`
# (distributed/parallel training), otherwise the model itself.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_dir)

loss_history_path = os.path.join(output_dir, 'training_loss.pt')
torch.save(torch.tensor(train_loss_set), loss_history_path)
print("Saving model checkpoint to %s" % output_dir)

In [None]:
# train_loss_set = []
# for tmp in ['1000', '2000', '3000', '4000','5000','6000','7000','8000','9000','final']:
#   #'10000','11000','12000', '13000','14000','15000','16000','17000','18000',
#   train_loss_set_ckpt = torch.load('/drive/My Drive/squad/checkpoint-'+tmp+'/training_loss.pt')
#   train_loss_set_ckpt = to_list(train_loss_set_ckpt)
#   train_loss_set += train_loss_set_ckpt

train_loss_set_ckpt = torch.load('/drive/My Drive/squad/checkpoint-final/training_loss.pt')
train_loss_set = to_list(train_loss_set_ckpt)

In [None]:
# Plot the recorded per-batch training loss on a single wide axes.
fig, ax = plt.subplots(figsize=(15,8))
ax.set_title("Training loss")
ax.set_xlabel("Batch")
ax.set_ylabel("Loss")
ax.plot(train_loss_set)
plt.show()

**Load test dataset**

In [None]:
# Read the SQuAD 2.0 dev set in inference mode (is_training=False).
input_file = '/drive/My Drive/squad/dev-v2.0.json'
val_examples = read_squad_examples(
    input_file=input_file,
    is_training=False,
    version_2_with_negative=True,
)

# Same windowing settings as used for the training features.
doc_stride = 128
max_seq_length = 256
max_query_length = 64
cached_features_file = '/drive/My Drive/squad/cache_validation'

# Cache features for faster loading.
# Note: `features` is rebound here, replacing the training features.
if os.path.exists(cached_features_file):
  features = torch.load(cached_features_file)
else:
  features = convert_examples_to_features(
      examples=val_examples,
      tokenizer=tokenizer,
      max_seq_length=max_seq_length,
      doc_stride=doc_stride,
      max_query_length=max_query_length,
      is_training=False,
  )
  torch.save(features, cached_features_file)

In [None]:
# Convert to Tensors and build dataset
def _feature_tensor(field, dtype=torch.long):
    """Collect attribute `field` from every entry of `features` into one tensor."""
    return torch.tensor([getattr(f, field) for f in features], dtype=dtype)


all_input_ids = _feature_tensor('input_ids')
all_input_mask = _feature_tensor('input_mask')
all_segment_ids = _feature_tensor('segment_ids')
all_cls_index = _feature_tensor('cls_index')
all_p_mask = _feature_tensor('p_mask', dtype=torch.float)

# Position of each row in `features`, carried through the dataloader so a
# batch row can be mapped back to its feature.
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
dataset = TensorDataset(
    all_input_ids, all_input_mask, all_segment_ids,
    all_example_index, all_cls_index, all_p_mask,
)

In [None]:
# Walk the validation features in order. Keep the final, possibly smaller
# batch (drop_last=False): evaluation hands the complete `features` list to
# write_predictions, so any feature dropped here would have no model output.
validation_sampler = SequentialSampler(dataset)
validation_dataloader = DataLoader(dataset, sampler=validation_sampler, batch_size=batch_size, drop_last=False)

**Evaluate test dataset**

In [None]:

def evaluate(model, tokenizer):
  """Run `model` over the validation set and score it with the SQuAD script.

  Reads the notebook globals `validation_dataloader`, `dataset`,
  `batch_size`, `device`, `features` and `val_examples`. Writes the
  predictions, n-best predictions and null-odds JSON files under
  '/drive/My Drive/squad/'.

  Args:
    model: question-answering model; called with input_ids, attention_mask
      and token_type_ids, and expected to return start logits at index 0 and
      end logits at index 1.
    tokenizer: unused here; kept so existing calls keep working.

  Returns:
    The value returned by `evaluate_on_squad` for the written predictions.
  """
  print("***** Running evaluation *****")
  print("  Num examples = %d" % len(dataset))
  print("  Batch size = %d" % batch_size)
  predict_file = '/drive/My Drive/squad/dev-v2.0.json'

  all_results = []
  # Evaluation mode only needs to be set once, not on every batch.
  model.eval()
  for batch in tqdm(validation_dataloader, desc="Evaluating", miniters=100, mininterval=5.0):
    batch = tuple(t.to(device) for t in batch)
    with torch.no_grad():
      outputs = model(input_ids=batch[0],
                      attention_mask=batch[1],
                      token_type_ids=batch[2])
    # batch[3] holds each row's index into `features`.
    example_indices = batch[3]
    start_logits, end_logits = outputs[0], outputs[1]

    for i, example_index in enumerate(example_indices):
      eval_feature = features[example_index.item()]
      all_results.append(RawResult(unique_id    = int(eval_feature.unique_id),
                                   start_logits = to_list(start_logits[i]),
                                   end_logits   = to_list(end_logits[i])))

  # Compute predictions
  output_prediction_file = "/drive/My Drive/squad/predictions.json"
  output_nbest_file = "/drive/My Drive/squad/nbest_predictions.json"
  output_null_log_odds_file = "/drive/My Drive/squad/null_odds.json"

  # NOTE(review): the positional literals (10, 30, True, ..., False, True, 0.0)
  # are passed through unchanged; confirm their meaning against the
  # write_predictions signature in utils_squad.py.
  write_predictions(val_examples, features, all_results, 10,
                    30, True, output_prediction_file,
                    output_nbest_file, output_null_log_odds_file, False,
                    True, 0.0)

  # Evaluate with the official SQuAD script
  evaluate_options = EVAL_OPTS(data_file=predict_file,
                               pred_file=output_prediction_file,
                               na_prob_file=output_null_log_odds_file,
                               out_image_dir=None)
  # The debug dumps of `features` and `all_results` were removed: they printed
  # every feature and every logit list into the notebook output.
  return evaluate_on_squad(evaluate_options)


In [None]:
results = evaluate(model, tokenizer)
