## **0. Settings**

In [1]:
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/db/98c3ea1a78190dac41c0127a063abf92bd01b4b0b6970a6db1c2f5b66fa0/transformers-4.0.1-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 16.3MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 56.1MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 52.6MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=354279

In [2]:
#import used library
import json
import numpy as np
import torch
import os
import textwrap
import time
import datetime
from transformers import BertTokenizer, BertForQuestionAnswering, AdamW, BertConfig, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split


In [3]:
device = torch.device("cuda")

## **1. Import the data**

In [4]:
with open(os.path.join('./data/train-v1.1.json'), "r", encoding="utf-8") as reader:
    input_data = json.load(reader)["data"]

dataset = []

for sample in input_data:
    # Title of the the article
    title = sample['title']
    # each article is divided into many paragraphs
    paragraphs = sample['paragraphs']
    # for each paragraph in the article
    for paragraph in paragraphs:
        # text of the paragraph
        context = paragraph['context']
        qas = paragraph['qas']
        for qa in qas:
            # to save each sample
            sample = {}
            sample["title"] = title
            sample["context"] = context
            # the question
            sample["question"] = qa['question']
            sample["id"] = qa['id']
            answer = qa['answers'][0]
            sample["answer_text"] = answer['text']

            sample["answer_start"] = answer['answer_start']
            # pass the sample to the dataset
            dataset.append(sample)

In [5]:
# Wrap text to 80 characters.
wrapper = textwrap.TextWrapper(width=80) 

# Select an example to check out.
ex = dataset[888]

print('Title:', ex['title'])
print('ID:', ex['id'])

print('\n======== Question =========')
print(ex['question'])

print('\n======== Context =========')
print(wrapper.fill(ex['context']))

print('\n======== Answer =========')
print(ex['answer_text'])

Title: Beyoncé
ID: 56becb8d3aeaaa14008c9499

Who was the first female to achieve the International Artist Award at the American Music Awards?

Beyoncé has received numerous awards. As a solo artist she has sold over 15
million albums in the US, and over 118 million records worldwide (a further 60
million additionally with Destiny's Child), making her one of the best-selling
music artists of all time. The Recording Industry Association of America (RIAA)
listed Beyoncé as the top certified artist of the 2000s, with a total of 64
certifications. Her songs "Crazy in Love", "Single Ladies (Put a Ring on It)",
"Halo", and "Irreplaceable" are some of the best-selling singles of all time
worldwide. In 2009, The Observer named her the Artist of the Decade and
Billboard named her the Top Female Artist and Top Radio Songs Artist of the
Decade. In 2010, Billboard named her in their "Top 50 R&B/Hip-Hop Artists of the
Past 25 Years" list at number 15. In 2012 VH1 ranked her third on their list of
th

In [6]:
from transformers import BertTokenizer

# Load the tokenizer for our model.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-cased',
    do_lower_case=False
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [7]:
for elem in dataset[0]:
  print(elem)

title
context
question
id
answer_text
answer_start


In [8]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [9]:
# Track the time. Tokenizing all training examples takes around 3 minutes.
t0 = time.time()

all_input_ids = []
attention_masks = []
segment_ids = [] 
start_positions = []
end_positions = []

num_dropped = 0

update_interval = 10000

print('Tokenizing {:,} examples...'.format(len(dataset)))

for (ex_num, ex) in enumerate(dataset):

    if (ex_num % update_interval) == 0 and not (ex_num == 0):

        elapsed = format_time(time.time() - t0)
        
        ex_per_sec = (time.time() - t0) / ex_num
        remaining_sec = ex_per_sec * (len(dataset) - ex_num)
        remaining = format_time(remaining_sec)

        # Report progress.
        print('  Example {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(ex_num, len(dataset), elapsed, remaining))


    # Tokenize the answer--it may be broken into multiple words and/or subwords.
    answer_tokens = tokenizer.tokenize(ex['answer_text'])

    # Create our sentinel string, e.g., "[MASK] [MASK] [MASK]"
    sentinel_str = ' '.join(['[MASK]']*len(answer_tokens))

    start_char_i = ex['answer_start']
    end_char_i = start_char_i + len(ex['answer_text'])

    # To make the replacement, we use slicing and string concatenation.
    context_w_sentinel = ex['context'][:start_char_i] + \
                         sentinel_str + \
                         ex['context'][end_char_i:]

    encoded_dict = tokenizer.encode_plus(
        ex['question'], 
        context_w_sentinel,
        add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
        max_length = 384,       # Pad & truncate all sentences.
        pad_to_max_length = True,
        truncation = True,
        return_attention_mask = True, # Construct attention masks.
        return_tensors = 'pt',        # Return pytorch tensors.
    )

    # Retrieve the encoded sequence.
    input_ids = encoded_dict['input_ids']

    # =============================
    #     Locate Answer Tokens
    # =============================

    # First, compare all of the tokens to the mask token. 
    is_mask_token = (input_ids[0] == tokenizer.mask_token_id)

    # Then get the indeces of the '1's using the `nonzero` function.
    mask_token_indeces = is_mask_token.nonzero(as_tuple=False)[:, 0]

    # Make sure the number of MASK tokens we found is the same as the number of
    # answer tokens. If not, we presumably lost the answer due to truncation.
    if not len(mask_token_indeces) == len(answer_tokens):
        
        # Tally the number of training samples that we skip due to this issue.
        num_dropped += 1

        continue


    start_index = mask_token_indeces[0]
    end_index = mask_token_indeces[-1]

    # =============================
    #     Restore Answer Tokens
    # =============================

    # Encode the answer tokens (to token ids).
    answer_token_ids = tokenizer.encode(answer_tokens, 
                                        add_special_tokens=False, 
                                        return_tensors='pt')

    # Restore the answer within the reference text. (Replace the `[MASK]` tokens
    # with the answer tokens)
    input_ids[0, start_index : end_index + 1] = answer_token_ids

    # =============================
    #     Store Encoded Sample
    # =============================

    # Add the encoded sentence to the list.    
    all_input_ids.append(input_ids)

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])    

    # Store the segment IDs, which indicate which tokens belong to the question
    # vs. the context.
    segment_ids.append(encoded_dict['token_type_ids'])

    # Store the start and end indeces of the correct answer.
    start_positions.append(start_index)
    end_positions.append(end_index)


# =========================
#        Wrap-Up
# =========================

# Convert the lists of tensors into 2D tensors.
all_input_ids = torch.cat(all_input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
segment_ids = torch.cat(segment_ids, dim=0)

# Convert the "labels" (the start and end indeces) into tensors.
start_positions = torch.tensor(start_positions)
end_positions = torch.tensor(end_positions)

print('DONE.  Tokenization took {:}'.format(format_time(time.time() - t0)))

Tokenizing 87,599 examples...




  Example  10,000  of   87,599.    Elapsed: 0:00:23. Remaining: 0:03:02
  Example  20,000  of   87,599.    Elapsed: 0:00:43. Remaining: 0:02:26
  Example  30,000  of   87,599.    Elapsed: 0:01:06. Remaining: 0:02:07
  Example  40,000  of   87,599.    Elapsed: 0:01:32. Remaining: 0:01:49
  Example  50,000  of   87,599.    Elapsed: 0:01:57. Remaining: 0:01:28
  Example  60,000  of   87,599.    Elapsed: 0:02:23. Remaining: 0:01:06
  Example  70,000  of   87,599.    Elapsed: 0:02:48. Remaining: 0:00:42
  Example  80,000  of   87,599.    Elapsed: 0:03:13. Remaining: 0:00:18
DONE.  Tokenization took 0:03:33


In [10]:
print('Tokenized {:,} examples.'.format(len(all_input_ids)))

print('\nDropped {:,} examples.'.format(num_dropped))

Tokenized 87,502 examples.

Dropped 97 examples.


In [11]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(all_input_ids, 
                        attention_masks, 
                        segment_ids, 
                        start_positions, 
                        end_positions)

print('Dataset size: {:} samples'.format(len(dataset)))

Dataset size: 87502 samples


In [12]:
# Calculate the number of samples to include in each set.
train_size = int(0.98 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

85,751 training samples
1,751 validation samples


In [13]:
from torch.utils.data import DataLoader, RandomSampler, SubsetRandomSampler, SequentialSampler

import numpy.random
import numpy as np

batch_size = 16

# Randomly select 10,000 indeces from the training set to use. 
#indeces = np.random.permutation(len(train_dataset))[:10000]

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            #sampler = SubsetRandomSampler(indeces, train_dataset),
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

print('{:,} training batches & {:,} validation batches'.format(len(train_dataloader), len(validation_dataloader)))

5,360 training batches & 110 validation batches


In [14]:

model = BertForQuestionAnswering.from_pretrained(
    "bert-base-cased", 
    output_attentions = False, 
    output_hidden_states = False
)

desc = model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and a

In [15]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [16]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []


# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    
    print('Training {:,} batches...'.format(len(train_dataloader)))

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # Pick an interval on which to print progress updates.
    update_interval = 100


    # The total number of batches per epoch.
    num_batches = len(train_dataloader)

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every, e.g., 500 batches.
        if step % update_interval == 0 and not step == 0:
            
            # Calculate elapsed time and format it.
            elapsed = format_time(time.time() - t0)
            
            # Calculate the time remaining based on our progress.
            step_per_sec = (time.time() - t0) / step
            remaining_sec = step_per_sec * (num_batches - step)
            remaining = format_time(remaining_sec)

            # Report progress.
            print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(step, num_batches, elapsed, remaining))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_seg_ids = batch[2].to(device)
        b_start_pos = batch[3].to(device)
        b_end_pos = batch[4].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # It returns different numbers of parameters depending on what arguments
        # arge given and what flags are set. For our useage here, it returns
        # the loss (because we provided labels) and the "logits"--the model
        # outputs prior to activation.
        outputs = model(b_input_ids, 
                        attention_mask=b_input_mask, 
                        token_type_ids = b_seg_ids,
                        start_positions=b_start_pos,
                        end_positions=b_end_pos)

        # You can see the outputs in the source code here:
        # https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_bert.py#L1601
        # 
        # The forward pass returns the loss, start_logits, and end_logits.
        #(loss, start_logits, end_logits) = outputs
        loss = outputs[0]
        start_logits = outputs[1]
        end_logits = outputs[2]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0

    t0_val = time.time()

    # Tracking results. 
    pred_start, pred_end, true_start, true_end = [], [], [], []

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using 
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_seg_ids = batch[2].to(device)
        b_start_pos = batch[3].to(device)
        b_end_pos = batch[4].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            outputs = model(b_input_ids, 
                            token_type_ids=b_seg_ids, 
                            attention_mask=b_input_mask,
                            start_positions=b_start_pos,
                            end_positions=b_end_pos)

        # You can see the outputs in the source code here:
        # https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_bert.py#L1601
        # 
        # The forward pass returns the loss, start_logits, and end_logits.
        loss = outputs[0]
        start_logits = outputs[1]
        end_logits = outputs[2]
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        start_logits = start_logits.detach().cpu().numpy()
        end_logits = end_logits.detach().cpu().numpy()
        
        # Move the correct start and end positions back to the CPU.
        b_start_pos = b_start_pos.to('cpu').numpy()
        b_end_pos = b_end_pos.to('cpu').numpy()

        # Find the tokens with the highest `start` and `end` scores.
        answer_start = np.argmax(start_logits, axis=1)
        answer_end = np.argmax(end_logits, axis=1)

        # Store predictions and true labels
        pred_start.append(answer_start)
        pred_end.append(answer_end)
        true_start.append(b_start_pos)
        true_end.append(b_end_pos)

    # Combine the results across the batches.
    pred_start = np.concatenate(pred_start, axis=0)
    pred_end = np.concatenate(pred_end, axis=0)
    true_start = np.concatenate(true_start, axis=0)
    true_end = np.concatenate(true_end, axis=0)
        
    # Count up the number of start index predictions and end index predictions 
    # which match the correct indeces.
    num_start_correct = np.sum(pred_start == true_start)
    num_end_correct = np.sum(pred_end == true_end)

    total_correct = num_start_correct + num_end_correct
    total_indeces = len(true_start) + len(true_end)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = float(total_correct) / float(total_indeces)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    validation_time = format_time(time.time() - t0_val)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")


Training 5,360 batches...
  Batch     100  of    5,360.    Elapsed: 0:02:00. Remaining: 1:45:10
  Batch     200  of    5,360.    Elapsed: 0:04:07. Remaining: 1:46:24
  Batch     300  of    5,360.    Elapsed: 0:06:16. Remaining: 1:45:40
  Batch     400  of    5,360.    Elapsed: 0:08:24. Remaining: 1:44:09
  Batch     500  of    5,360.    Elapsed: 0:10:32. Remaining: 1:42:27
  Batch     600  of    5,360.    Elapsed: 0:12:41. Remaining: 1:40:33
  Batch     700  of    5,360.    Elapsed: 0:14:49. Remaining: 1:38:36
  Batch     800  of    5,360.    Elapsed: 0:16:57. Remaining: 1:36:36
  Batch     900  of    5,360.    Elapsed: 0:19:05. Remaining: 1:34:33
  Batch   1,000  of    5,360.    Elapsed: 0:21:13. Remaining: 1:32:31
  Batch   1,100  of    5,360.    Elapsed: 0:23:21. Remaining: 1:30:26
  Batch   1,200  of    5,360.    Elapsed: 0:25:30. Remaining: 1:28:23
  Batch   1,300  of    5,360.    Elapsed: 0:27:37. Remaining: 1:26:16
  Batch   1,400  of    5,360.    Elapsed: 0:29:46. Remaining: 1

In [18]:
import json

# Open the training dataset file.
with open(os.path.join('./data/dev-v1.1.json'), "r", encoding="utf-8") as reader:
    input_data = json.load(reader)["data"]


print_count = 0

print('Unpacking SQuAD Examples...')

print('Articles:')

# We'll unpack all of the 
examples = []

# For each Wikipedia article in the dataset...
for entry in input_data:

    # The Wikipedia Article title.
    title = entry["title"]
    print('  ', title)

    # The article contains multiple paragraphs...
    for paragraph in entry["paragraphs"]:
        
        # The paragraph, where the answer is found, is referred to as the
        # "context".
        context_text = paragraph["context"]
        
        # There can be multiple questions per paragraph.
        for qa in paragraph["qas"]:
            
            # Define a dictionary to store the properties.
            ex = {}

            # The unique ID of this question.
            ex['qas_id'] = qa["id"]

            # The question.
            ex['question_text'] = qa["question"]

            # In the test data, there are three answers per question, so we'll
            # store all three. 
            # Each answer has two fields: `answer_start` and `text`.
            ex['answers'] = qa["answers"]

            # Store the title and paragraph text.
            ex['title'] = title
            ex['context_text'] = context_text

            examples.append(ex)

print('DONE!')

Unpacking SQuAD Examples...
Articles:
   Super_Bowl_50
   Warsaw
   Normans
   Nikola_Tesla
   Computational_complexity_theory
   Teacher
   Martin_Luther
   Southern_California
   Sky_(United_Kingdom)
   Victoria_(Australia)
   Huguenot
   Steam_engine
   Oxygen
   1973_oil_crisis
   Apollo_program
   European_Union_law
   Amazon_rainforest
   Ctenophora
   Fresno,_California
   Packet_switching
   Black_Death
   Geology
   Newcastle_upon_Tyne
   Victoria_and_Albert_Museum
   American_Broadcasting_Company
   Genghis_Khan
   Pharmacy
   Immune_system
   Civil_disobedience
   Construction
   Private_school
   Harvard_University
   Jacksonville,_Florida
   Economic_inequality
   Doctor_Who
   University_of_Chicago
   Yuan_dynasty
   Kenya
   Intergovernmental_Panel_on_Climate_Change
   Chloroplast
   Prime_number
   Rhine
   Scottish_Parliament
   Islamism
   Imperialism
   United_Methodist_Church
   French_and_Indian_War
   Force
DONE!


In [20]:
def good_update_interval(total_iters, num_desired_updates):
    '''
    This function will try to pick an intelligent progress update interval 
    based on the magnitude of the total iterations.

    Parameters:
      `total_iters` - The number of iterations in the for-loop.
      `num_desired_updates` - How many times we want to see an update over the 
                              course of the for-loop.
    '''
    # Divide the total iterations by the desired number of updates. Most likely
    # this will be some ugly number.
    exact_interval = total_iters / num_desired_updates

    # The `round` function has the ability to round down a number to, e.g., the
    # nearest thousandth: round(exact_interval, -3)
    #
    # To determine the magnitude to round to, find the magnitude of the total,
    # and then go one magnitude below that.

    # Get the order of magnitude of the total.
    order_of_mag = len(str(total_iters)) - 1

    # Our update interval should be rounded to an order of magnitude smaller. 
    round_mag = order_of_mag - 1

    # Round down and cast to an int.
    update_interval = int(round(exact_interval, -round_mag))

    # Don't allow the interval to be zero!
    if update_interval == 0:
        update_interval = 1

    return update_interval

In [21]:
import time
import torch

import logging

# By default, the tokenizer will spit out a warning whenever we tokenize a 
# sample which ends up being more than 512 tokens. We don't care about that for
# now, though, and this cell will produce a lot of those warnings! So we'll 
# adjust the logging settings to suppress those warnings and keep the output
# cell cleaner.
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

# Track the time. Tokenizing all training examples takes around 3 minutes.
t0 = time.time()

# Lists to store locations
start_positions = []
end_positions = []

# We'll count up the number of answers which are truncated, as well as the
# number of test samples for which all three answers were truncated (it's 
# impossible for us to answer these).
num_clipped_answers = 0
num_impossible = 0

# Pick an interval on which to print progress updates.
update_interval = good_update_interval(
            total_iters = len(examples), 
            num_desired_updates = 15
        )

print('Processing {:,} examples...'.format(len(examples)))

# For each of the training examples...
for (ex_num, ex) in enumerate(examples):

    # =====================
    #   Progress Update
    # =====================

    # Progress update every, e.g., 10k samples.
    if (ex_num % update_interval) == 0 and not (ex_num == 0):

        # Calculate elapsed time and format it.
        elapsed = format_time(time.time() - t0)
        
        # Calculate the time remaining based on our progress.
        ex_per_sec = (time.time() - t0) / ex_num
        remaining_sec = ex_per_sec * (len(examples) - ex_num)
        remaining = format_time(remaining_sec)

        # Report progress.
        print('  Example {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(ex_num, len(examples), elapsed, remaining))

    # To store the start and end indeces of the three possible answers.
    start_options = []
    end_options = []

    # Flag to indicate whether we've saved the encoded form of the input yet.
    # We'll tokenize the input three times, but only need to store it once!
    encoded_stored = False

    # For each of the three possible answers...
    for answer in ex['answers']:

        # =============================
        #     Add Sentinel String
        # =============================
        # To help us determine which of the BERT tokens correspond to the answer,
        # we'll replace the answer with, e.g., "[MASK] [MASK] [MASK]" (based on 
        # the number of tokens in the answer).

        # Tokenize the answer--it may be broken into multiple words and/or subwords.
        answer_tokens = tokenizer.tokenize(answer['text'])

        # Create our sentinel string, e.g., "[MASK] [MASK] [MASK]"
        sentinel_str = ' '.join(['[MASK]']*len(answer_tokens))

        # Within the "context" string, replace the answer with our sentinel.
        # Python doesn't appear to have a built-in function for replacing a 
        # substring *starting at a specific index*, so we'll implement it in a 
        # more manual way.

        # Locate the exact start and end of the answer text within the "context"
        # string. The dataset gives us this information because the answer text
        # may occur more than once in the context!
        start_char_i = answer['answer_start']
        end_char_i = start_char_i + len(answer['text'])

        # To make the replacement, we use slicing and string concatenation.
        context_w_sentinel = ex['context_text'][:start_char_i] + \
                            sentinel_str + \
                            ex['context_text'][end_char_i:]

        # =============================
        #      Tokenize & Encode
        # =============================
        # Combine the question and the context strings and encode them.
        input_ids = tokenizer.encode(
            ex['question_text'], 
            context_w_sentinel,
            add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
            #max_length = max_len,       # Pad & truncate all sentences.
            pad_to_max_length = False,
            truncation = False,
        )

        # =============================
        #     Locate Answer Tokens
        # =============================
        # Locate all of the instances of the '[MASK]' token. 

        # Find all indeces of the [MASK] token.
        mask_token_indeces = np.where(np.array(input_ids) == tokenizer.mask_token_id)[0]

        # Note: You can use the alternate code below if the input_ids are in a 
        #       PyTorch tensor
        # First, compare all of the tokens to the mask token. 
        #is_mask_token = (input_ids[0] == tokenizer.mask_token_id)
        # Then get the indeces of the '1's using the `nonzero` function.
        #mask_token_indeces = is_mask_token.nonzero(as_tuple=False)[:, 0]

        # As a sanity check, make sure the number of MASK tokens we found is the
        # same as the number of answer tokens. 
        assert(len(mask_token_indeces) == len(answer_tokens))           

        # `mask_token_indeces` is the range of indeces (e.g., [68, 69, 70, 71]), 
        # but we really just want the start and end indeces (e.g., 68 and 71).
        start_index = mask_token_indeces[0]
        end_index = mask_token_indeces[-1]

        # Store these indeces in our lists.
        start_options.append(start_index)
        end_options.append(end_index)
    
    # Store the start and end indeces of the three possible correct answers.
    start_positions.append(start_options)
    end_positions.append(end_options)
    
    # Continue looping through all of the test samples.

# =========================
#        Wrap-Up
# =========================

print('DONE.  Tokenization took {:}'.format(format_time(time.time() - t0)))

Processing 10,570 examples...
  Example   1,000  of   10,570.    Elapsed: 0:00:06. Remaining: 0:00:55
  Example   2,000  of   10,570.    Elapsed: 0:00:11. Remaining: 0:00:47
  Example   3,000  of   10,570.    Elapsed: 0:00:17. Remaining: 0:00:42
  Example   4,000  of   10,570.    Elapsed: 0:00:25. Remaining: 0:00:41
  Example   5,000  of   10,570.    Elapsed: 0:00:34. Remaining: 0:00:38
  Example   6,000  of   10,570.    Elapsed: 0:00:41. Remaining: 0:00:31
  Example   7,000  of   10,570.    Elapsed: 0:00:49. Remaining: 0:00:25
  Example   8,000  of   10,570.    Elapsed: 0:00:56. Remaining: 0:00:18
  Example   9,000  of   10,570.    Elapsed: 0:01:03. Remaining: 0:00:11
  Example  10,000  of   10,570.    Elapsed: 0:01:10. Remaining: 0:00:04
DONE.  Tokenization took 0:01:16


In [23]:
num_impossible = 0
num_clipped = 0

# For each of the test samples...
for (start_options, end_options) in zip(start_positions, end_positions):

    is_possible = False

    # For each of the three options...
    for i in range(0, len(start_options)):
        
        # If at least one of the possible answers is captured, then this test 
        # sample is possible.
        if (start_options[i] < 384) and (end_options[i] < 384):
            is_possible = True
        
        # Tally the number of answers (across all test samples) for which
        # the answer is partially or fully clipped by our truncation.
        if (start_options[i] > 384) or (end_options[i] > 384):
            num_clipped += 1

    # Tally the number with no available answers.
    if not is_possible:
        num_impossible += 1

print('')

print('Samples w/ all answers clipped: {:,} of {:,} ({:.2%})'.format(num_impossible, len(examples), float(num_impossible) / float(len(examples))))

addtl_clipped = num_clipped - (num_impossible * 3)
total_answers = len(examples) * 3
print('\n    Additional clipped answers: {:,} of {:,}'.format(addtl_clipped, total_answers))


Samples w/ all answers clipped: 31 of 10,570 (0.29%)

    Additional clipped answers: 19 of 31,710


In [25]:
import time
import torch

# Track the time. Tokenizing all training examples takes around 3 minutes.
t0 = time.time()

# Lists to store the encoded samples.
all_input_ids = []
attention_masks = []
segment_ids = [] 

# Pick an interval on which to print progress updates.
update_interval = good_update_interval(
            total_iters = len(examples), 
            num_desired_updates = 15
        )

print('Tokenizing {:,} examples...'.format(len(examples)))

# For each of the training examples...
for (ex_num, ex) in enumerate(examples):

    # =====================
    #   Progress Update
    # =====================

    # Progress update every, e.g., 10k samples.
    if (ex_num % update_interval) == 0 and not (ex_num == 0):

        # Calculate elapsed time and format it.
        elapsed = format_time(time.time() - t0)
        
        # Calculate the time remaining based on our progress.
        ex_per_sec = (time.time() - t0) / ex_num
        remaining_sec = ex_per_sec * (len(examples) - ex_num)
        remaining = format_time(remaining_sec)

        # Report progress.
        print('  Example {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(ex_num, len(examples), elapsed, remaining))

    # =============================
    #      Tokenize & Encode
    # =============================
    # Combine the question and the context strings, and tokenize them all 
    # together.
    # `encode_plus` will:    
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Place an `[SEP]` token between the question and reference text, and 
    #       and at the end of the reference text.
    #   (4) Map tokens to their IDs ("encode" the text)
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    #   (7) Create the list of segment IDs, indicating which tokens belong
    #       to the question vs. the context.
    #   (8) Casts everything as PyTorch tensors.

    encoded_dict = tokenizer.encode_plus(
        ex['question_text'], 
        ex['context_text'],
        add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
        max_length = 384,       # Pad & truncate all sentences.
        pad_to_max_length = True,
        truncation = True,
        return_attention_mask = True, # Construct attention masks.
        return_tensors = 'pt',        # Return pytorch tensors.
    )

    # Retrieve the encoded sequence.
    input_ids = encoded_dict['input_ids']

    # =============================
    #     Store Encoded Sample
    # =============================

    # Add the encoded sentence to the list.    
    all_input_ids.append(input_ids)

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])    

    # Store the segment IDs, which indicate which tokens belong to the question
    # vs. the context.
    segment_ids.append(encoded_dict['token_type_ids'])

    # ^^^ Continue looping through all of the test samples. ^^^

# =========================
#        Wrap-Up
# =========================

# Convert the lists of tensors into 2D tensors.
all_input_ids = torch.cat(all_input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
segment_ids = torch.cat(segment_ids, dim=0)

# We don't need the indeces to be tensors, since we're not doing training here.
# Convert the "labels" (the start and end indeces) into tensors.
#start_positions = torch.tensor(start_positions)
#end_positions = torch.tensor(end_positions)

print('DONE.  Tokenization took {:}'.format(format_time(time.time() - t0)))

Tokenizing 10,570 examples...




  Example   1,000  of   10,570.    Elapsed: 0:00:02. Remaining: 0:00:19
  Example   2,000  of   10,570.    Elapsed: 0:00:04. Remaining: 0:00:16
  Example   3,000  of   10,570.    Elapsed: 0:00:06. Remaining: 0:00:15
  Example   4,000  of   10,570.    Elapsed: 0:00:08. Remaining: 0:00:13
  Example   5,000  of   10,570.    Elapsed: 0:00:11. Remaining: 0:00:12
  Example   6,000  of   10,570.    Elapsed: 0:00:13. Remaining: 0:00:10
  Example   7,000  of   10,570.    Elapsed: 0:00:16. Remaining: 0:00:08
  Example   8,000  of   10,570.    Elapsed: 0:00:18. Remaining: 0:00:06
  Example   9,000  of   10,570.    Elapsed: 0:00:20. Remaining: 0:00:04
  Example  10,000  of   10,570.    Elapsed: 0:00:22. Remaining: 0:00:01
DONE.  Tokenization took 0:00:24


In [26]:
import time
import numpy as np

# Prediction on test set

# Put model in evaluation mode
model.eval()

t0 = time.time()

# Tracking variables 
pred_start = []
pred_end = []

# Get the total number of test samples (not answers).
num_test_samples = all_input_ids.shape[0]

# We'll batch the samples to speed up processing. 
batch_size = 16

num_batches = int(np.ceil(num_test_samples / batch_size))

print('Evaluating on {:,} test batches...'.format(num_batches))

batch_num = 0

# Predict 
for start_i in range(0, num_test_samples, batch_size):
    
    # Report progress.
    if ((batch_num % 50) == 0) and not (batch_num == 0):

        # Calculate elapsed time and format it.
        elapsed = format_time(time.time() - t0)
        
        # Calculate the time remaining based on our progress.
        batches_per_sec = (time.time() - t0) / batch_num
        remaining_sec = batches_per_sec * (num_batches - batch_num)
        remaining = format_time(remaining_sec)

        # Report progress.
        print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(batch_num, num_batches, elapsed, remaining))

    # Calculate the ending index for this batch.
    # `end_i` is equal to the index of the last sample in the batch, +1.
    end_i = min(start_i + batch_size, num_test_samples)

    # Select our batch inputs (`b` stands for batch here).
    b_input_ids = all_input_ids[start_i:end_i, :]
    b_attn_masks = attention_masks[start_i:end_i, :]
    b_seg_ids = segment_ids[start_i:end_i, :]   

    # Copy these to the GPU.
    b_input_ids = b_input_ids.to(device)
    b_attn_masks = b_attn_masks.to(device)
    b_seg_ids = b_seg_ids.to(device)
    
    # Telling the model not to compute or store the compute graph, saving memory 
    # and speeding up prediction
    with torch.no_grad():
        
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, 
                        attention_mask=b_attn_masks,
                        token_type_ids=b_seg_ids)
                        
    start_logits = outputs[0]
    end_logits = outputs[1]
    # Move logits and labels to CPU
    start_logits = start_logits.detach().cpu().numpy()
    end_logits = end_logits.detach().cpu().numpy()
    
    # Find the tokens with the highest `start` and `end` scores.
    answer_start = np.argmax(start_logits, axis=1)
    answer_end = np.argmax(end_logits, axis=1)

    # Store predictions and true labels
    pred_start.append(answer_start)
    pred_end.append(answer_end)

    batch_num += 1

    # ^^^ Continue looping through the batches. ^^^

# Combine the results across the batches.
pred_start = np.concatenate(pred_start, axis=0)
pred_end = np.concatenate(pred_end, axis=0)

print('    DONE.')

print('\nEvaluation took {:.0f} seconds.'.format(time.time() - t0))

Evaluating on 661 test batches...
  Batch      50  of      661.    Elapsed: 0:00:22. Remaining: 0:04:31
  Batch     100  of      661.    Elapsed: 0:00:45. Remaining: 0:04:15
  Batch     150  of      661.    Elapsed: 0:01:07. Remaining: 0:03:49
  Batch     200  of      661.    Elapsed: 0:01:29. Remaining: 0:03:25
  Batch     250  of      661.    Elapsed: 0:01:51. Remaining: 0:03:03
  Batch     300  of      661.    Elapsed: 0:02:14. Remaining: 0:02:41
  Batch     350  of      661.    Elapsed: 0:02:36. Remaining: 0:02:18
  Batch     400  of      661.    Elapsed: 0:02:58. Remaining: 0:01:56
  Batch     450  of      661.    Elapsed: 0:03:20. Remaining: 0:01:34
  Batch     500  of      661.    Elapsed: 0:03:42. Remaining: 0:01:12
  Batch     550  of      661.    Elapsed: 0:04:05. Remaining: 0:00:49
  Batch     600  of      661.    Elapsed: 0:04:27. Remaining: 0:00:27
  Batch     650  of      661.    Elapsed: 0:04:49. Remaining: 0:00:05
    DONE.

Evaluation took 294 seconds.


In [27]:

total_correct = 0

# For each test sample...
for i in range(0, len(pred_start)):

    match_options = []

    # For each of the three possible answers...
    for j in range (0, len(start_positions[i])):
    
        matches = 0

        # Add a point if the start indeces match.
        if pred_start[i] == start_positions[i][j]:
            matches += 1

        # Add a point if the end indeces match.
        if pred_end[i] == end_positions[i][j]:
            matches += 1

        # Store the total.
        match_options.append(matches)

    # Between the three possible answers, pick the one with the highest "score".
    total_correct += (max(match_options))

    # ^^^ Continue looping through test samples ^^^

total_indeces = len(pred_start) + len(pred_end)

print('Correctly predicted indeces: {:,} of {:,} ({:.2%})'.format(
    total_correct,
    total_indeces,
    float(total_correct) / float(total_indeces)
))


Correctly predicted indeces: 17,729 of 21,140 (83.86%)


In [28]:
# The final F1 score for each sample.
f1s = []

# For each test sample...
for i in range(0, len(pred_start)):

    # Expand the start and end indeces into sequences of indeces stored as sets.
    # For example, if pred_start = 137 and pred_end = 140, then
    #   pred_span = {137, 138, 139, 140}
    pred_span = set(range(pred_start[i], pred_end[i] + 1))


    f1_options = []

    # For each of the three possible answers...
    for j in range (0, len(start_positions[i])):
    
        # Expand this answer into a range, as above.
        true_span = set(range(start_positions[i][j], end_positions[i][j] + 1))    

        # Use the `intersection` function from Python `set` to get the set of 
        # indeces occurring in both spans. Take the length of this resulting set
        # as the number of overlapping indeces between the two spans.
        num_same = len(pred_span.intersection(true_span))    

        # If there's no overlap, then the F1 score is 0 for this sample.
        if num_same == 0:
            f1_options.append(0)
            continue

        # Precision - How many tokens overlap relative to the total number of tokens
        #             in the predicted span? If the model predicts too large of a 
        #             span, it has bad precision.      
        precision = float(num_same) / float(len(pred_span))
    
        # Recall - How many of the correct tokens made it into the predicted span?
        #          A model could have perfect recall if it just predicted the entire
        #          paragraph as the answer :).    
        recall = float(num_same) / float(len(true_span))

        # F1 - Does the model have both good precision and good recall?
        f1 = (2 * precision * recall) / (precision + recall)

        # Store the score.
        f1_options.append(f1)

        # ^^^ Continue looping through possible answers ^^^

    # Take the highest of the three F1 scores as our score for this sample.
    f1s.append(max(f1_options))

    # ^^^ Continue looping through test samples ^^^


print('Average F1 Score: {:.3f}'.format(np.mean(f1s)))

Average F1 Score: 0.862


In [29]:
output_dir = './pretrained_model/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to ./pretrained_model/


('./pretrained_model/tokenizer_config.json',
 './pretrained_model/special_tokens_map.json',
 './pretrained_model/vocab.txt',
 './pretrained_model/added_tokens.json')