In [None]:
pip install transformers

In [3]:
%cd drive/My\ Drive/NQ\ Challenge
!ls

[Errno 2] No such file or directory: 'drive/My Drive/NQ Challenge'
/content/drive/My Drive/NQ Challenge
bert-joint-baseline  run_nq.py		   tiny-dev
bert_model_output    test.ipynb		   v1.0_sample_nq-dev-sample.jsonl
models		     test_smalldata.ipynb  v1.0_sample_nq-train-sample.jsonl
__pycache__	     test_utils.py	   v1.0-simplified_nq-dev-all.jsonl


In [None]:
!sudo pip3 install bert-tensorflow natural-questions --no-dependencies

Simplifying data using test_utils.py

In [3]:
pip install jsonlines

Collecting jsonlines
  Downloading https://files.pythonhosted.org/packages/4f/9a/ab96291470e305504aa4b7a2e0ec132e930da89eb3ca7a82fbe03167c131/jsonlines-1.2.0-py2.py3-none-any.whl
Installing collected packages: jsonlines
Successfully installed jsonlines-1.2.0


In [5]:
from tqdm import tqdm
import json
import numpy as np
from transformers import BertModel, BertConfig, BertTokenizer, BertForQuestionAnswering, BertPreTrainedModel
from transformers import AutoModel, AutoConfig, AutoTokenizer, AutoModelForQuestionAnswering
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Device used for the model: the GPU when CUDA is available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
'''MAKE THIS SEPARATE FILE TO SIMPLIFY DATA'''
'''Getting simplified data from nq-dev-sample.no-annot.jsonl and nq-dev-sample.jsonl.gz [tiny dataset]'''

import gzip
from test_utils import simplify_nq_example
import jsonlines

# simplify tiny data input samples
tiny_json_input_file = 'tiny-dev/nq-dev-sample.no-annot.jsonl'

# Read inside a context manager so the file handle is closed as soon as parsing ends.
with open(tiny_json_input_file, 'r') as raw_input:
  tiny_data = [simplify_nq_example(json.loads(line)) for line in raw_input]

with jsonlines.open('tiny-dev/simplified-dev-sample.no-annot.jsonl', 'w') as f:
  f.write_all(tiny_data)

# simplify tiny data gold labels
tiny_json_gold_file = 'tiny-dev/nq-dev-sample.jsonl.gz'

# gzip.open in mode 'r' yields bytes lines, which json.loads accepts directly.
with gzip.open(tiny_json_gold_file, 'r') as raw_gold:
  tiny_d = [simplify_nq_example(json.loads(line)) for line in raw_gold]

with jsonlines.open('tiny-dev/simplified-dev-sample.jsonl', 'w') as f:
  f.write_all(tiny_d)

In [29]:
# Call keys() so the cell lists the field names rather than the bound-method repr.
tiny_d[10].keys()

{'annotations': [{'annotation_id': 10957934160137332476,
   'long_answer': {'candidate_index': -1, 'end_token': -1, 'start_token': -1},
   'short_answers': [],
   'yes_no_answer': 'NONE'},
  {'annotation_id': 2807704282985816749,
   'long_answer': {'candidate_index': -1, 'end_token': -1, 'start_token': -1},
   'short_answers': [],
   'yes_no_answer': 'NONE'},
  {'annotation_id': 5129692602407601925,
   'long_answer': {'candidate_index': 0, 'end_token': 122, 'start_token': 43},
   'short_answers': [],
   'yes_no_answer': 'NONE'},
  {'annotation_id': 4965838886380681126,
   'long_answer': {'candidate_index': 90,
    'end_token': 4593,
    'start_token': 4485},
   'short_answers': [{'end_token': 4570, 'start_token': 4569},
    {'end_token': 4572, 'start_token': 4571}],
   'yes_no_answer': 'NONE'},
  {'annotation_id': 9518639313383123593,
   'long_answer': {'candidate_index': -1, 'end_token': -1, 'start_token': -1},
   'short_answers': [],
   'yes_no_answer': 'NONE'}],
 'document_text': 'S

In [16]:
'''Create structures to hold inputs'''

input_file = 'tiny-dev/simplified-dev-sample.no-annot.jsonl'

# Containers filled while streaming the file:
#   exid_candid         - (example id, candidate index) for every candidate of every example
#   candidate_lens      - length of each candidate, aligned with exid_candid
#   exid_candid2candlen - (example id, candidate index) -> length of that candidate
#   ids                 - example ids in file order
#   data_dict           - example id -> the fields needed later to build model inputs
exid_candid = []
candidate_lens = []
exid_candid2candlen = {}
ids = []
data_dict = {}


with open(input_file) as f:
  for n, line in tqdm(enumerate(f)):
    data = json.loads(line)
    data_id = data['example_id']
    ids.append(data_id)

    # Keep only the three fields that later cells read.
    data_dict[data_id] = {field: data[field]
                          for field in ('document_text', 'question_text', 'long_answer_candidates')}

    # The question is measured in whitespace-separated words.
    question_len = len(data['question_text'].split())

    for i, candidate in enumerate(data['long_answer_candidates']):
      pair = (data_id, i)
      # Length = question words + span covered by the candidate's token offsets.
      candidate_len = question_len + (candidate['end_token'] - candidate['start_token'])
      exid_candid.append(pair)
      candidate_lens.append(candidate_len)
      exid_candid2candlen[pair] = candidate_len

200it [00:03, 53.49it/s]


In [17]:
# Order the (example id, candidate id) pairs by candidate length for faster inference:
# argsort gives the positions of candidate_lens in ascending order.
sorting_idx = np.argsort(np.array(candidate_lens))

exid_candid_sorted = [exid_candid[idx] for idx in sorting_idx]

Hyperparameters

In [18]:
max_seq_len = 360       # passed to Collator as max_seq_len; also the width of start_probs / end_probs
max_question_len = 64   # passed to Collator as max_ques_len (question tokens beyond this are dropped)
batch_size = 50         # candidates per DataLoader batch; the eval loop uses it to compute row offsets

In [19]:
# List of HTML tokens to be added to the vocab
# Maps an HTML tag as it appears in the document text to the placeholder word that
# replaces it before tokenization; the placeholder words are what get added to the tokenizer.
# NOTE(review): '<Fl>' may be a typo for '<Dl>' (it sits next to '<Dd>' and '<Dt>') -
# confirm against the data before changing it.

new_tokens = {'<P>':'qw1',
              '<Table>':'qw2',
              '<Tr>':'qw3',
              '<Ul>':'qw4',
              '<Ol>':'qw5',
              '<Fl>':'qw6',
              '<Li>':'qw7',
              '<Dd>':'qw8',
              '<Dt>':'qw9'}

**DATASET, COLLATOR & DATALOADER**

In [20]:
import re

class NQDataset(Dataset):
  """Thin Dataset wrapper over a list of items.

  Each item is returned unchanged; all real work happens in the collate function.
  """

  def __init__(self, ids):
    """Store the list of items to serve."""
    self.ids = ids

  def __getitem__(self, index):
    """Return the item stored at position ``index``."""
    return self.ids[index]

  def __len__(self):
    """Return the number of stored items."""
    return len(self.ids)

class Collator(object):
  """Collate function that turns (example id, candidate index) pairs into padded model inputs.

  Args:
    data_dict: mapping example id -> dict with 'document_text', 'question_text'
      and 'long_answer_candidates'.
    new_token_dict: mapping HTML tag -> placeholder word that replaces it before tokenization.
    tokenizer: object providing ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``.
    max_seq_len: maximum number of tokens in one encoded sequence (default 384).
    max_ques_len: maximum number of question tokens kept (default 64).
  """

  def __init__(self, data_dict, new_token_dict, tokenizer, max_seq_len=384, max_ques_len=64):
    self.data_dict = data_dict
    self.new_token_dict = new_token_dict
    self.tokenizer = tokenizer
    self.max_seq_len = max_seq_len
    self.max_ques_len = max_ques_len

  def get_sample(self, data_id, candidate_idx):
    """Encode one question/candidate pair as ``[CLS] question [SEP] candidate [SEP]``.

    Returns:
      A tuple ``(input_ids, candidate_start, candidate_end, words2tokens_idx, seq_len, offset)``:
      the token ids, the candidate's start/end offsets into the whitespace-split document,
      the token position at which each visited candidate word starts, the number of ids,
      and the number of positions taken by ``[CLS]`` + question + ``[SEP]``.
    """
    data = self.data_dict[data_id]
    question_tokens = self.tokenizer.tokenize(data['question_text'])[:self.max_ques_len]
    data_words = data['document_text'].split()

    # Room left for the candidate after [CLS], the question and the two [SEP] tokens.
    max_ans_len = self.max_seq_len - len(question_tokens) - 3
    candidate = data['long_answer_candidates'][candidate_idx]
    candidate_start = candidate['start_token']
    candidate_end = candidate['end_token']
    candidate_words = data_words[candidate_start:candidate_end]

    for i, word in enumerate(candidate_words):
      # Known HTML tags become their placeholder word; any other tag collapses to '<'.
      if re.match(r'<.+>', word):
        candidate_words[i] = self.new_token_dict.get(word, '<')

    words2tokens_idx = []
    candidate_tokens = []
    for word in candidate_words:
      # Recorded before the length check, so the word that triggers truncation is listed too.
      words2tokens_idx.append(len(candidate_tokens))
      tokens = self.tokenizer.tokenize(word)
      if (len(candidate_tokens) + len(tokens)) > max_ans_len:
        break
      candidate_tokens.extend(tokens)

    input_tokens = ['[CLS]'] + question_tokens + ['[SEP]'] + candidate_tokens + ['[SEP]']
    input_ids = self.tokenizer.convert_tokens_to_ids(input_tokens)

    return input_ids, candidate_start, candidate_end, words2tokens_idx, len(input_ids), len(question_tokens)+2  #cls+question+sep

  def __call__(self, batch_ids):
    """Build one padded batch from a list of (example id, candidate index) pairs.

    Returns:
      ``(input_ids, attention_mask, token_type_ids, words2tokens_idx, offsets, batch_max_seq_len)``.
      The first three are torch tensors of shape (batch, batch_max_seq_len); the next two are
      per-sample Python lists. The candidate start/end offsets from ``get_sample`` are not
      part of the returned tuple.
    """
    batch_size = len(batch_ids)
    batch_input_ids_temp = []
    batch_seq_len = []
    batch_offset = []
    batch_words2tokens_idx = []

    for data_id, candidate_idx in batch_ids:
      input_ids, _, _, words2tokens_idx, seq_len, offset = self.get_sample(data_id, candidate_idx)
      batch_input_ids_temp.append(input_ids)
      batch_seq_len.append(seq_len)
      batch_offset.append(offset)
      batch_words2tokens_idx.append(words2tokens_idx)

    # Pad every sequence with zeros up to the longest sequence in this batch.
    batch_max_seq_len = max(batch_seq_len)
    batch_input_ids = np.zeros((batch_size, batch_max_seq_len), dtype=np.int64)
    batch_token_type_ids = np.zeros((batch_size, batch_max_seq_len), dtype=np.int64)

    # Looked up once: the id of '[SEP]' does not change between samples.
    sep_id = self.tokenizer.convert_tokens_to_ids('[SEP]')

    for i in range(batch_size):
      input_ids = batch_input_ids_temp[i]
      batch_input_ids[i, :len(input_ids)] = input_ids
      # Segment ids: 0 up to and including the first [SEP], 1 for the rest of the
      # real tokens; padding positions keep 0.
      first_sep = input_ids.index(sep_id)
      batch_token_type_ids[i, first_sep + 1:len(input_ids)] = 1

    # Positions holding id 0 are treated as padding.
    batch_attention_mask = batch_input_ids > 0

    return torch.from_numpy(batch_input_ids), torch.from_numpy(batch_attention_mask), torch.from_numpy(batch_token_type_ids), batch_words2tokens_idx, batch_offset, batch_max_seq_len

**MODEL**

In [25]:
class BertForQuestionAnswering(BertPreTrainedModel):
  """BERT encoder with a span head (start/end logits) and a sequence-level classifier head.

  Note: this definition reuses the name ``BertForQuestionAnswering`` that the imports
  cell also brings in from ``transformers``; once this cell runs, the name refers to
  this class.
  """

  def __init__(self, config):
    """Build the encoder and both heads from ``config`` and initialise the weights."""
    super().__init__(config)
    self.num_labels = config.num_labels
    self.bert = BertModel(config)
    # Two outputs per token: one start logit and one end logit.
    self.qa_outputs = nn.Linear(config.hidden_size, 2)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    # One logit per label, computed from the pooled encoder output.
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.init_weights()

  def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
    """Run the encoder and both heads.

    Returns:
      ``(start_logits, end_logits, classifier_logits)``; the first two have the trailing
      size-1 dimension squeezed away.
    """
    encoder_outputs = self.bert(input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids,
                                position_ids=position_ids,
                                head_mask=head_mask)

    # Element 0 feeds the per-token span head, element 1 the classifier head.
    token_states, pooled_state = encoder_outputs[0], encoder_outputs[1]

    # Split the last dimension into its two columns, then drop that dimension.
    start_logits, end_logits = (
        logits.squeeze(-1)
        for logits in self.qa_outputs(token_states).split(1, dim=-1)
    )

    classifier_logits = self.classifier(self.dropout(pooled_state))

    return start_logits, end_logits, classifier_logits

In [None]:
''' Keep this for when using classifier layer'''

# Loads the SQuAD-finetuned BERT-large checkpoint into the custom
# BertForQuestionAnswering class defined in this notebook.
model_path = "bert-large-uncased-whole-word-masking-finetuned-squad"
config_file = BertConfig.from_pretrained(model_path)
# think about keeping labels at 2 for 'long answer' or 'no answer' 
config_file.num_labels = 5       # 5 labels for 'long answer', 'short answer', 'yes', 'no' and 'no answer'
config_file.vocab_size = 30522   # vocabulary size before the 9 HTML tokens; those are added later via tokenizer.add_tokens + model.resize_token_embeddings
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
model = BertForQuestionAnswering.from_pretrained(model_path, config=config_file)

In [None]:
''' Use this to only use AutoModel '''

# Alternative to the classifier-layer cell: running this cell rebinds model_path,
# config_file, tokenizer and model, so whichever of the two cells ran last wins.
# NOTE(review): later cells read class_probs and config_file.num_labels; presumably this
# model has no classifier output - confirm before relying on the long-answer scores.
model_path = "bert-large-uncased-whole-word-masking-finetuned-squad"
config_file = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, do_lower_case=True)
model = AutoModelForQuestionAnswering.from_pretrained(model_path, config=config_file)

In [None]:
# Register the HTML placeholder words with the tokenizer and grow the embedding matrix
# to match, before anything captures references to the model's parameters.
print("Added ", tokenizer.add_tokens(list(new_tokens.values())), "tokens")
model.resize_token_embeddings(len(tokenizer))

model.to(DEVICE)

# Build the optimizer last: created before the resize, it would keep tracking the
# pre-resize embedding parameters rather than the resized ones.
optimizer = optim.Adam(model.parameters(), lr=1e-5)

**RUN EVAL**

In [38]:
# Dataset of (example id, candidate index) pairs, already ordered by candidate length.
eval_dataset = NQDataset(exid_candid_sorted)

# Collate function that encodes and pads each batch of pairs.
eval_collate = Collator(
    data_dict=data_dict,
    new_token_dict=new_tokens,
    tokenizer=tokenizer,
    max_seq_len=max_seq_len,
    max_ques_len=max_question_len,
)

# shuffle=False: the evaluation loop stores results by batch position, so the
# order of exid_candid_sorted must be preserved.
eval_dataloader = DataLoader(
    dataset=eval_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=eval_collate,
    num_workers=8,
    pin_memory=True,
)

In [36]:
model.eval()

# One row per candidate, in the same order as exid_candid_sorted.
num_candidates = len(exid_candid_sorted)
start_probs = np.zeros((num_candidates, max_seq_len), dtype=np.float32)
end_probs = np.zeros((num_candidates, max_seq_len), dtype=np.float32)
class_probs = np.zeros((num_candidates, config_file.num_labels), dtype=np.float32)

with torch.no_grad():
  batches = tqdm(enumerate(eval_dataloader), total=len(eval_dataloader))
  for i, (batch_input_ids, batch_attention_mask, batch_token_type_ids, batch_words2tokens_idx, batch_offset, batch_max_seq_len) in batches:
    # Rows of the result arrays covered by this batch; the last batch may be smaller.
    start = i * batch_size
    end = start + batch_input_ids.size(0)

    # Move inputs to the same device as the model instead of assuming CUDA.
    batch_input_ids = batch_input_ids.to(DEVICE)
    batch_attention_mask = batch_attention_mask.to(DEVICE)
    batch_token_type_ids = batch_token_type_ids.to(DEVICE)

    outputs = model(batch_input_ids,
                    attention_mask=batch_attention_mask,
                    token_type_ids=batch_token_type_ids)

    # Both model variants return the start logits first and the end logits second.
    start_logits, end_logits = outputs[0], outputs[1]
    start_probs[start:end, :batch_max_seq_len] = F.softmax(start_logits, dim=1).cpu().numpy()
    end_probs[start:end, :batch_max_seq_len] = F.softmax(end_logits, dim=1).cpu().numpy()

    # Only the model with the classifier layer returns a third output; with the
    # AutoModel variant class_probs stays all zeros.
    if len(outputs) > 2:
      class_probs[start:end] = F.softmax(outputs[2], dim=1).cpu().numpy()

200

The outputs of the evaluation cell above are `start_probs`, `end_probs` and `class_probs`.

In [None]:
# To find the start and end of each answer span, take the position with the highest
# probability in every row.

start_label = np.argmax(start_probs, axis=1)
end_label = np.argmax(end_probs, axis=1)

**Preparing to store predictions**

In [None]:
'''initialize a temporary dictionary to store prediction values.'''

# Every example starts from the "no answer" defaults: -1 spans, -1.0 scores, 'NONE'.
temp_dict = {
    doc_id: {
        'long_answer': {'start_token': -1, 'end_token': -1},
        'long_answer_score': -1.0,
        'short_answers': [{'start_token': -1, 'end_token': -1}],
        'short_answers_score': -1.0,
        'yes_no_answer': 'NONE',
    }
    for doc_id in ids
}

In [None]:
# For every example keep the candidate with the highest long-answer score.
for i, (doc_id, candidate_idx) in tqdm(enumerate(exid_candid_sorted)):
  # Score = 1 - probability of class 0. Converted to a plain float because
  # class_probs holds np.float32 values, which json.dump cannot serialise.
  long_ans_score = float(1.0 - class_probs[i, 0])
  if long_ans_score > temp_dict[doc_id]['long_answer_score']:
    best_candidate = data_dict[doc_id]['long_answer_candidates'][candidate_idx]
    temp_dict[doc_id]['long_answer_score'] = long_ans_score
    temp_dict[doc_id]['long_answer']['start_token'] = best_candidate['start_token']
    temp_dict[doc_id]['long_answer']['end_token'] = best_candidate['end_token']

In [None]:
# Assemble the predictions in the expected output layout: one entry per example,
# in the order of `ids`. Byte offsets are always written as -1.

final_dict = {'predictions': []}

for doc_id in ids:
  entry = temp_dict[doc_id]
  long_span = entry['long_answer']
  # Only the first short-answer span is exported.
  short_span = entry['short_answers'][0]

  pred_dict = {
      'example_id': doc_id,
      'long_answer': {
          'start_byte': -1,
          'end_byte': -1,
          'start_token': long_span['start_token'],
          'end_token': long_span['end_token'],
      },
      'long_answer_score': entry['long_answer_score'],
      'short_answers': [{
          'start_byte': -1,
          'end_byte': -1,
          'start_token': short_span['start_token'],
          'end_token': short_span['end_token'],
      }],
      'short_answers_score': entry['short_answers_score'],
      'yes_no_answer': entry['yes_no_answer'],
  }
  final_dict['predictions'].append(pred_dict)

In [None]:
# Dump to JSON file
import time

with open('predictions.json', 'w') as f:
    json.dump(final_dict, f)

# No cell in this notebook defines start_time, so report the elapsed time only
# when an earlier cell has set it; otherwise this line would raise NameError.
if 'start_time' in globals():
    print("--- %s seconds ---" % (time.time() - start_time))