Setting up the working directory (the dependency installs are commented out)

In [15]:
# Environment setup: change into the project folder on the mounted drive and list its contents.
# The pip installs are left commented out; uncomment them on a fresh runtime.
#!pip install transformers
%cd drive/My\ Drive/NQ\ Challenge
#!pip install jsonlines
!ls

[Errno 2] No such file or directory: 'drive/My Drive/NQ Challenge'
/content/drive/My Drive/NQ Challenge
bert-joint-baseline  run_nq.py		   tiny-dev
bert_model_output    test.ipynb		   v1.0_sample_nq-dev-sample.jsonl
models		     test_smalldata.ipynb  v1.0_sample_nq-train-sample.jsonl
__pycache__	     test_utils.py	   v1.0-simplified_nq-dev-all.jsonl


In [16]:
# Version-control housekeeping. The one-time repository setup lines are commented out;
# only the commit and push at the bottom actually run.
# NOTE(review): nothing is staged before the commit and the push needs credentials,
# so both steps fail as shown in the recorded output below.
# !git init
# !git add train.ipynb
#!git add tiny-dev/
#!ls
#!git remote add origin https://github.com/prashass/GoogleNQ_Challenge
# !git config --global user.email "prashass@andrew.cmu.edu"
# !git config --global user.name "Prashasti"
!git commit -m 'initial commit'
!git push -u origin master

On branch master
Changes not staged for commit:
	[31mmodified:   train.ipynb[m

Untracked files:
	[31m__pycache__/[m
	[31mbert-joint-baseline/[m
	[31mbert_model_output/[m
	[31mrun_nq.py[m
	[31mtest.ipynb[m
	[31mtest_smalldata.ipynb[m
	[31mtest_utils.py[m
	[31mv1.0-simplified_nq-dev-all.jsonl[m
	[31mv1.0_sample_nq-dev-sample.jsonl[m
	[31mv1.0_sample_nq-train-sample.jsonl[m

no changes added to commit
fatal: could not read Username for 'https://github.com': No such device or address


Training code begins here

In [3]:
import json
import os
import re

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertModel, BertConfig, BertTokenizer, BertForQuestionAnswering, BertPreTrainedModel
from transformers import AutoModel, AutoConfig, AutoTokenizer, AutoModelForQuestionAnswering

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

**DATASET, COLLATOR AND DATALOADER**

In [10]:
class NQDataset(Dataset):
  """Dataset that serves example ids only; it holds no text or tensors itself."""

  def __init__(self, id_list):
    # The ids are stored as given (no copy is made).
    self.ids = id_list

  def __len__(self):
    """Number of example ids held by the dataset."""
    num_ids = len(self.ids)
    return num_ids

  def __getitem__(self, index):
    """Return the example id stored at position ``index``."""
    example_id = self.ids[index]
    return example_id


class Collator(object):
  """Collate function that turns a batch of example ids into model inputs and labels.

  Every example id produces two rows: row ``2*i`` holds the positive candidate and
  row ``2*i + 1`` the negative candidate, each encoded as
  ``[CLS] question [SEP] candidate [SEP]`` and zero-padded to ``max_seq_len``.

  Args:
    data_dict: maps example id -> dict with the keys read below ('question_text',
      'annotations', 'annotation_idx', 'positive_*', 'negative_*').
    new_token_dict: maps HTML tag words (e.g. '<P>') to their replacement token strings.
    tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.
    max_seq_len: total length of every encoded row.
    max_ques_len: maximum number of question tokens kept.
  """

  def __init__(self, data_dict, new_token_dict, tokenizer, max_seq_len, max_ques_len):
    self.data_dict = data_dict
    self.new_token_dict = new_token_dict
    self.tokenizer = tokenizer
    self.max_seq_len = max_seq_len
    self.max_ques_len = max_ques_len

  def get_sample(self, data, candidate_words, candidate_start, candidate_end, len_ques_tokens, annotation_idx=-1, instance_type=None):
    """Tokenize one candidate and locate the short answer inside it.

    Args:
      data: the example dict (only 'annotations' is read here).
      candidate_words: words of the candidate; not modified.
      candidate_start, candidate_end: document word offsets of the candidate.
      len_ques_tokens: number of question tokens that precede the candidate.
      annotation_idx: which annotation supplies the short answer.
      instance_type: 'positive' to look up a short answer; anything else yields no span.

    Returns:
      (start_idx, end_idx, candidate_tokens). The indices are positions in the full
      encoded row, or -1 when there is no short answer or it does not fit entirely
      inside the (possibly truncated) candidate. ``end_idx`` follows the same
      convention as 'end_token': it points at the first token after the answer.
    """
    max_ans_len = self.max_seq_len - len_ques_tokens - 3    # 3 for [CLS], [SEP], [SEP]

    # Replace HTML tags on a copy so the lists stored in data_dict are left untouched.
    words = []
    for word in candidate_words:
      if re.match(r'<.+>', word):
        word = self.new_token_dict.get(word, '<')   # unknown tags collapse to '<'
      words.append(word)

    words2tokens_idx = []   # Holds indices of first token of each kept word
    candidate_tokens = []
    for word in words:
      tokens = self.tokenizer.tokenize(word)
      if len(candidate_tokens) + len(tokens) > max_ans_len:
        break               # candidate is truncated at a word boundary
      words2tokens_idx.append(len(candidate_tokens))
      candidate_tokens.extend(tokens)

    start_idx, end_idx = -1, -1
    if instance_type == 'positive':
      short_answers = data['annotations'][annotation_idx]['short_answers']
      if short_answers:
        start_pos = short_answers[0]['start_token']
        end_pos = short_answers[0]['end_token']
        if start_pos >= candidate_start and end_pos <= candidate_end:
          start_word = start_pos - candidate_start
          end_word = end_pos - candidate_start
          num_kept_words = len(words2tokens_idx)
          # Only label the span when it survived truncation; otherwise keep -1 so the
          # loss ignores it instead of indexing past the end of words2tokens_idx.
          if start_word < num_kept_words and end_word <= num_kept_words:
            offset = len_ques_tokens + 2    # [CLS] + question + [SEP]
            start_idx = offset + words2tokens_idx[start_word]
            if end_word < num_kept_words:
              end_idx = offset + words2tokens_idx[end_word]
            else:
              # Answer runs to the last kept word: the token after it is the final [SEP].
              end_idx = offset + len(candidate_tokens)

    return start_idx, end_idx, candidate_tokens

  def __call__(self, batch_ids):
    """Build the batch tensors for the given example ids.

    Returns:
      (input_ids, attention_mask, token_type_ids, start_labels, end_labels,
      class_labels), each with ``2 * len(batch_ids)`` rows.
    """
    batch_size = len(batch_ids)*2

    batch_input_ids = np.zeros((batch_size, self.max_seq_len), dtype=np.int64)
    batch_token_type_ids = np.ones((batch_size, self.max_seq_len), dtype=np.int64)
    batch_start_labels = np.zeros((batch_size,), dtype=np.int64)
    batch_end_labels = np.zeros((batch_size,), dtype=np.int64)
    batch_class_labels = np.zeros((batch_size,), dtype=np.int64)

    sep_id = self.tokenizer.convert_tokens_to_ids('[SEP]')

    for i, doc_id in enumerate(batch_ids):
      data = self.data_dict[doc_id]
      annotation_idx = data['annotation_idx']
      positive_row, negative_row = 2*i, 2*i + 1

      if data['annotations'][annotation_idx]['long_answer']['candidate_index'] != -1:
        batch_class_labels[positive_row] = 1   # If long answer exists, mark the class label as 'LONG ANSWER' (1)
      batch_class_labels[negative_row] = 0     # The negative instance of the question is 'NO ANSWER' (0)

      question_tokens = self.tokenizer.tokenize(data['question_text'])[:self.max_ques_len]

      instances = ((positive_row, 'positive', annotation_idx),
                   (negative_row, 'negative', -1))
      for row, instance_type, instance_annotation_idx in instances:
        start_idx, end_idx, answer_tokens = self.get_sample(data,
                                                            data[instance_type + '_text'],
                                                            data[instance_type + '_start'],
                                                            data[instance_type + '_end'],
                                                            len(question_tokens),
                                                            instance_annotation_idx,
                                                            instance_type)
        input_tokens = ['[CLS]'] + question_tokens + ['[SEP]'] + answer_tokens + ['[SEP]']
        input_ids = self.tokenizer.convert_tokens_to_ids(input_tokens)
        batch_input_ids[row, :len(input_ids)] = input_ids

        # BERT segment ids: 0 up to and including the first [SEP], 1 afterwards
        # (the array is initialised to ones, so only the first segment is written).
        first_sep = input_ids.index(sep_id)
        batch_token_type_ids[row, :first_sep + 1] = 0

        batch_start_labels[row] = start_idx
        batch_end_labels[row] = end_idx

    # Padding positions keep id 0, so any positive id marks a real token.
    batch_attention_mask = batch_input_ids > 0

    return torch.from_numpy(batch_input_ids), torch.from_numpy(batch_attention_mask), torch.from_numpy(batch_token_type_ids), \
          torch.LongTensor(batch_start_labels), torch.LongTensor(batch_end_labels), torch.LongTensor(batch_class_labels)

**MODEL**

In [11]:
class BertForQuestionAnswering(BertPreTrainedModel):
  """BERT encoder with two heads: per-token start/end logits and a sequence classifier.

  The span head is a single linear layer with two outputs per token; the classifier
  head applies dropout and a linear layer with ``config.num_labels`` outputs to the
  encoder's second output.
  """

  def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    # Submodules are created in this order: encoder, span head, dropout, classifier.
    self.bert = BertModel(config)
    self.qa_outputs = nn.Linear(config.hidden_size, 2)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.init_weights()

  def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
    """Run the encoder and both heads.

    Returns:
      (start_logits, end_logits, classifier_logits); the first two have the last
      dimension of the span head's output removed.
    """
    encoder_outputs = self.bert(input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids,
                                position_ids=position_ids,
                                head_mask=head_mask)
    token_states, pooled = encoder_outputs[0], encoder_outputs[1]

    # Channel 0 of the span head scores answer starts, channel 1 scores answer ends.
    span_logits = self.qa_outputs(token_states)
    start_logits = span_logits[..., 0]
    end_logits = span_logits[..., 1]

    classifier_logits = self.classifier(self.dropout(pooled))

    return start_logits, end_logits, classifier_logits

**Helper Functions for calculating Loss and Accuracy**

In [18]:
def get_class_accuracy(logits, labels):
    """Fraction of rows whose highest-scoring class equals the label.

    Args:
        logits: tensor of shape (batch, num_classes).
        labels: tensor (any device) or array-like of class indices, one per row.

    Returns:
        (accuracy, count) where count is ``len(labels)``.
    """
    predictions = np.argmax(F.softmax(logits, dim=1).cpu().data.numpy(), axis=1)
    # Bring labels to host memory once so the comparison is a plain NumPy operation.
    if hasattr(labels, 'detach'):
        labels = labels.detach().cpu().numpy()
    else:
        labels = np.asarray(labels)
    return np.float32(np.sum(predictions == labels)) / len(labels), len(labels)

def get_position_accuracy(logits, labels):
    """Accuracy of the arg-max position over rows that carry a valid label.

    Rows whose label is negative (e.g. -1 for "no span") are excluded.

    Args:
        logits: tensor of shape (batch, num_positions).
        labels: tensor (any device) or array-like of target positions, one per row.

    Returns:
        (accuracy, count). When no row has a valid label, count is 1e-7 rather than 0
        so that callers weighting a running mean by it never divide by zero.
    """
    predictions = np.argmax(F.softmax(logits, dim=1).cpu().data.numpy(), axis=1)
    # Convert once instead of comparing NumPy ints with tensor elements row by row.
    if hasattr(labels, 'detach'):
        labels = labels.detach().cpu().numpy()
    else:
        labels = np.asarray(labels)

    valid = labels >= 0
    total_num = int(np.sum(valid))
    sum_correct = int(np.sum(predictions[valid] == labels[valid]))
    if total_num == 0:
        total_num = 1e-7
    return np.float32(sum_correct) / total_num, total_num

def _cross_entropy_or_zero(logits, targets, ignore_index=-1):
    """Cross entropy that yields 0 instead of NaN when every target is ignored."""
    if bool((targets != ignore_index).any()):
        return nn.CrossEntropyLoss(ignore_index=ignore_index)(logits, targets)
    # Mean-reduced cross entropy over zero valid targets is 0/0 = NaN, which would
    # poison the summed loss. Multiplying by zero keeps the value on the autograd
    # graph so a later backward() still works.
    return logits.sum() * 0.0


def loss_fn(preds, labels):
    """Compute the three training losses.

    Args:
        preds: (start_logits, end_logits, class_logits).
        labels: (start_labels, end_labels, class_labels); entries equal to -1 are ignored.

    Returns:
        (start_loss, end_loss, class_loss) as scalar tensors. A component whose
        labels are all -1 is exactly 0.
    """
    start_preds, end_preds, class_preds = preds
    start_labels, end_labels, class_labels = labels

    start_loss = _cross_entropy_or_zero(start_preds, start_labels)
    end_loss = _cross_entropy_or_zero(end_preds, end_labels)
    class_loss = _cross_entropy_or_zero(class_preds, class_labels)
    return start_loss, end_loss, class_loss


class AverageMeter(object):
    """Keeps the most recent value together with a weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the latest value, mean, running sum and sample count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed over ``n`` samples and refresh the mean."""
        weighted_value = val * n
        self.val = val
        self.sum += weighted_value
        self.count += n
        self.avg = self.sum / self.count

In [13]:
def train(model, num_epochs, train_dataloader, optimizer=None, learning_rate=2e-5, out_dir='models/'):
  """Train ``model`` and save a checkpoint after every epoch.

  Args:
    model: module called as ``model(input_ids, attention_mask, token_type_ids)`` and
      returning (start_logits, end_logits, class_logits).
    num_epochs: number of passes over ``train_dataloader``.
    train_dataloader: yields 6-tuples (input_ids, attention_mask, token_type_ids,
      start_labels, end_labels, class_labels).
    optimizer: optimizer to step; when None an Adam optimizer over all model
      parameters is created with ``learning_rate``.
    learning_rate: learning rate used only when ``optimizer`` is None.
    out_dir: directory receiving ``model_<epoch>.pth`` after each epoch.

  The printed losses and accuracies are averages over the current epoch only.
  """
  if optimizer is None:
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  # Run batches on whatever device the model already lives on (CPU or GPU).
  device = next(model.parameters()).device
  model.train()

  for epoch in range(num_epochs):
    # Fresh meters per epoch so the printed numbers are not diluted by earlier epochs.
    losses_start = AverageMeter()
    losses_end = AverageMeter()
    losses_class = AverageMeter()
    accuracies_start = AverageMeter()
    accuracies_end = AverageMeter()
    accuracies_class = AverageMeter()

    for batch in tqdm(train_dataloader):
      batch_input_ids, batch_attention_mask, batch_token_type_ids, labels_start, labels_end, labels_class = \
          (tensor.to(device) for tensor in batch)

      optimizer.zero_grad()

      logits_start, logits_end, logits_class = model(batch_input_ids, batch_attention_mask, batch_token_type_ids)
      loss_start, loss_end, loss_class = loss_fn((logits_start, logits_end, logits_class),
                                                 (labels_start, labels_end, labels_class))
      loss = loss_start + loss_end + loss_class

      acc_start, n_start = get_position_accuracy(logits_start, labels_start)
      acc_end, n_end = get_position_accuracy(logits_end, labels_end)
      acc_class, n_class = get_position_accuracy(logits_class, labels_class)

      # Each meter is weighted by the number of rows that carried a valid label.
      losses_start.update(loss_start.item(), n_start)
      losses_end.update(loss_end.item(), n_end)
      losses_class.update(loss_class.item(), n_class)
      accuracies_start.update(acc_start, n_start)
      accuracies_end.update(acc_end, n_end)
      accuracies_class.update(acc_class, n_class)

      loss.backward()
      optimizer.step()

    print('epoch: {}, train_loss1: {}, train_loss2: {}, train_loss3: {}, train_acc1: {}, train_acc2: {}, train_acc3: {}'.format(epoch,losses_start.avg,losses_end.avg,losses_class.avg,accuracies_start.avg,accuracies_end.avg,accuracies_class.avg), flush=True)

    os.makedirs(out_dir, exist_ok=True)
    # Unwrap a parallel wrapper (which exposes the real model as .module) when present.
    model_to_save = getattr(model, 'module', model)
    torch.save(model_to_save.state_dict(), os.path.join(out_dir, 'model_{}.pth'.format(epoch)))

In [16]:
def _load_training_examples(train_json_file):
  """Read a JSONL file and pick one positive and one negative candidate per question.

  Returns:
    (ids, data_dict): the example ids in file order and a dict mapping each id to the
    question text, its annotations and the two selected candidates.
  """
  ids = []
  data_dict = {}
  with open(train_json_file) as f:
    for line in tqdm(f):
      data = json.loads(line)
      long_answer_candidates = data['long_answer_candidates']
      num_long_answer_candidates = len(long_answer_candidates)
      if num_long_answer_candidates < 2:
        # A distinct negative candidate cannot be drawn, so the example is skipped.
        continue

      data_id = data['example_id']
      doc_words = data['document_text'].split()

      # To find the positive candidate for the question
      # Positive candidate is a long answer candidate which is also one of the annotated answers
      # (falls back to candidate 0 when no annotation has a long answer).
      positive_candidate_idx = 0
      annotation_idx = -1
      for i, annotation in enumerate(data['annotations']):
        if annotation['long_answer']['candidate_index'] != -1:
          annotation_idx = i
          positive_candidate_idx = annotation['long_answer']['candidate_index']
          break
      candidate = long_answer_candidates[positive_candidate_idx]
      positive_candidate_start = candidate['start_token']
      positive_candidate_end = candidate['end_token']
      positive_candidate_words = doc_words[positive_candidate_start:positive_candidate_end]

      # To find the negative candidate for the question
      # Negative candidate is a long answer candidate which very likely isn't one of the annotated answers
      negative_candidate_idx = np.random.randint(num_long_answer_candidates)
      if negative_candidate_idx == positive_candidate_idx:
        # Step to a neighbour; with at least two candidates this stays in range.
        negative_candidate_idx = negative_candidate_idx - 1 if negative_candidate_idx == num_long_answer_candidates-1 \
                                                            else negative_candidate_idx + 1
      candidate = long_answer_candidates[negative_candidate_idx]
      negative_candidate_start = candidate['start_token']
      negative_candidate_end = candidate['end_token']
      negative_candidate_words = doc_words[negative_candidate_start:negative_candidate_end]

      # Adding these 2 instances (1 positive + 1 negative) for a question to data_dict
      ids.append(data_id)
      data_dict[data_id] = {'question_text': data['question_text'],
                            'annotations': data['annotations'],
                            'annotation_idx': annotation_idx,
                            'positive_text': positive_candidate_words,
                            'positive_start': positive_candidate_start,
                            'positive_end': positive_candidate_end,
                            'negative_text': negative_candidate_words,
                            'negative_start': negative_candidate_start,
                            'negative_end': negative_candidate_end
                           }
  return ids, data_dict


def main(train_json_file='tiny-dev/simplified-dev-sample.jsonl'):
  ''' Create data dictionary with one positive and one negative answer candidate per question
      Per epoch we train over 2*N instances given N is number of questions in train set.

      Args:
        train_json_file: path of the JSONL file to train on.
  '''
  # The input file is fully read and closed before any model work starts.
  ids, data_dict = _load_training_examples(train_json_file)

  # Hyperparameters
  max_seq_len = 360
  max_question_len = 64
  learning_rate = 2e-5
  batch_size = 3
  num_epochs = 4

  # List of HTML tokens to be added to the vocab
  new_tokens = {'<P>':'qw1',
                '<Table>':'qw2',
                '<Tr>':'qw3',
                '<Ul>':'qw4',
                '<Ol>':'qw5',
                '<Fl>':'qw6',
                '<Li>':'qw7',
                '<Dd>':'qw8',
                '<Dt>':'qw9'}

  # Instantiating model
  model_path = "bert-large-uncased-whole-word-masking-finetuned-squad"
  config_file = BertConfig.from_pretrained(model_path)
  config_file.num_labels = 2       # 2 labels for 'long answer' or 'no answer'
  config_file.vocab_size = 30522   # 30522 + 9 HTML tokens later
  tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
  model = BertForQuestionAnswering.from_pretrained(model_path, config=config_file)

  # Add HTML tokens to tokenizer
  tokenizer.add_tokens(list(new_tokens.values()))
  model.resize_token_embeddings(len(tokenizer))

  # Use the device chosen at import time so the notebook also runs without a GPU.
  model.to(DEVICE)

  train_dataset = NQDataset(id_list=ids)
  collate_func = Collator(data_dict=data_dict,
                          new_token_dict=new_tokens,
                          tokenizer=tokenizer,
                          max_seq_len=max_seq_len,
                          max_ques_len=max_question_len)
  train_dataloader = DataLoader(dataset=train_dataset,
                                collate_fn=collate_func,
                                batch_size=batch_size,
                                num_workers=1,
                                pin_memory=True)

  # NOTE(review): train() is called below without this optimizer (the call passes only
  # model, num_epochs and train_dataloader) — confirm how train() is meant to obtain it.
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)

  # Training
  train(model=model, num_epochs=num_epochs, train_dataloader=train_dataloader)

In [19]:
# Run the full pipeline: load the data, build the model and start training.
main()


0it [00:00, ?it/s][A
56it [00:00, 532.58it/s][A
110it [00:00, 534.20it/s][A
200it [00:00, 622.82it/s]
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

0it [00:00, ?it/s]


RuntimeError: ignored

In [1]:
# Scratch cell reproducing the tag-replacement loop from Collator.get_sample in isolation.
# NOTE(review): this cell cannot succeed as written — candidate_words is None, so
# enumerate() raises TypeError, and `self` is not defined outside the class.
candidate_words = None

print(candidate_words)
for i, word in enumerate(candidate_words):
  if re.match(r'<.+>', word):
    if word in self.new_token_dict: 
      candidate_words[i] = self.new_token_dict[word]
    else:
      candidate_words[i] = '<'

None


TypeError: ignored