# BERT MODEL - Final

# 1.Load Dataset

## 1.1 Import Torch

Instructing PyTorch to use the GPU.

In [None]:
# Install necessary files
!pip install torch==1.4.0
!pip install sentencepiece
!pip install transformers==3.5.1
!pip install wget

In [None]:
# Instructing PyTorch to use the GPU.
import torch

if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('Current GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Releases all unoccupied cached memory 
torch.cuda.empty_cache()

There are 1 GPU(s) available.
Current GPU: Tesla T4


## 1.2 Download Dataset

In [None]:
# The dataset source: https://rajpurkar.github.io/SQuAD-explorer/
import wget
import os

# Setup local directory
print('Downloading dataset...')
local_dir = './squad_dataset/'

# The filenames and URLs for the dataset files.
files = [('train-v1.1.json', 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json'), 
         ('dev-v1.1.json', 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json'),
         ('evaluate-v1.1.py', 'https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py')]

# Create directory if needed
if not os.path.exists(local_dir):
    os.mkdir(local_dir)

# Download data-files
for (filename, url) in files:
    file_path = local_dir + filename
    if not os.path.exists(file_path):
        print('  ' + file_path)
        wget.download(url, local_dir + filename)

Downloading dataset...
  ./squad_dataset/train-v1.1.json
  ./squad_dataset/dev-v1.1.json
  ./squad_dataset/evaluate-v1.1.py
Done!


In [None]:
# Printing file size and location in the drive.
data_dir = './squad_dataset/'
files = list(os.listdir(data_dir))

print('Dataset Location:', data_dir)
for f in files:
    f_size = float(os.stat(data_dir + '/' + f).st_size) / 2**20
    print("     {:25s}    {:>6.2f} MB".format(f, f_size))

Dataset Location:
./squad_dataset/
     train-v1.1.json               28.89 MB
     evaluate-v1.1.py               0.19 MB
     dev-v1.1.json                  4.63 MB


## 1.3 Parse Dataset

The SQuAD dataset is stored in 'json' format. There 87,599 training samples in the dataset.

In [None]:
# The SQuAD dataset is stored in 'json' format. 
# There 87,599 training samples in the dataset.
import json

with open(os.path.join('./squad_dataset/train-v1.1.json'), "r", encoding="utf-8") as reader:
    input_data = json.load(reader)["data"]

# List of dictionary of each row
examples = []

for entry in input_data:
    title = entry["title"] # Extract the title
    # print('  ', title)
    for paragraph in entry["paragraphs"]:
        context_text = paragraph["context"] # Extract the context
        for qa in paragraph["qas"]:
            # Store Question and answer data in dictionary
            ex = {}
            ex['qas_id'] = qa["id"]
            ex['question_text'] = qa["question"]
            answer = qa["answers"][0]
            ex['answer_text'] = answer["text"]
            ex['start_position_character'] = answer["answer_start"]                
            ex['title'] = title
            ex['context_text'] = context_text
            examples.append(ex)

   University_of_Notre_Dame
   Beyoncé
   Montana
   Genocide
   Antibiotics
   Frédéric_Chopin
   Sino-Tibetan_relations_during_the_Ming_dynasty
   IPod
   The_Legend_of_Zelda:_Twilight_Princess
   Spectre_(2015_film)
   2008_Sichuan_earthquake
   New_York_City
   To_Kill_a_Mockingbird
   Solar_energy
   Tajikistan
   Anthropology
   Portugal
   Kanye_West
   Buddhism
   American_Idol
   Dog
   2008_Summer_Olympics_torch_relay
   Alfred_North_Whitehead
   Financial_crisis_of_2007%E2%80%9308
   Saint_Barth%C3%A9lemy
   Genome
   Comprehensive_school
   Republic_of_the_Congo
   Prime_minister
   Institute_of_technology
   Wayback_Machine
   Dutch_Republic
   Symbiosis
   Canadian_Armed_Forces
   Cardinal_(Catholicism)
   Iranian_languages
   Lighting
   Separation_of_powers_under_the_United_States_Constitution
   Architecture
   Human_Development_Index
   Southern_Europe
   BBC_Television
   Arnold_Schwarzenegger
   Plymouth
   Heresy
   Warsaw_Pact
   Materialism
   Space_Race
   Pub
 

In [None]:
print('There are {:,} training examples.'.format(len(examples)))

There are 87,599 training examples.


## 1.4 Inspecting Examples:

Each example has a **question**, and a **context**, which is the reference text in which the answer can be found. 


Here are some of the field descriptions from the code:
* **qas_id**: The example's unique identifier
* **title**: Article title
* **question_text**: The question string
* **context_text**: The context string
* **answer_text**: The answer string


In [None]:
import textwrap

wrapper = textwrap.TextWrapper(width=80) 
ex = examples[260]
print('Title:', ex['title'])
print('ID:', ex['qas_id'])

print('\n======== Question =========')
print(ex['question_text'])

print('\n======== Context =========')
print(wrapper.fill(ex['context_text']))

print('\n======== Answer =========')
print(ex['answer_text'])


Title: University_of_Notre_Dame
ID: 5733ccbe4776f41900661271

In what film did a parody of the "Win one for the Gipper" speech appear?

In the film Knute Rockne, All American, Knute Rockne (played by Pat O'Brien)
delivers the famous "Win one for the Gipper" speech, at which point the
background music swells with the "Notre Dame Victory March". George Gipp was
played by Ronald Reagan, whose nickname "The Gipper" was derived from this role.
This scene was parodied in the movie Airplane! with the same background music,
only this time honoring George Zipp, one of Ted Striker's former comrades. The
song also was prominent in the movie Rudy, with Sean Astin as Daniel "Rudy"
Ruettiger, who harbored dreams of playing football at the University of Notre
Dame despite significant obstacles.

Airplane!


## 1.5 Helper Functions

In [None]:
import time
import datetime

# Helper function for formatting elapsed times.
# Converts floating point seconds into hh:mm:ss
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Helper function to automatically pick a reasonable interval for printing out a progress update during training.
# For printing updates, this will choose an interval.
def good_update_interval(total_iters, num_desired_updates):
    '''
    Progress update interval based on the magnitude of the total iterations.
    Parameters:
      `total_iters` - The number of iterations in the for-loop.
      `num_desired_updates` - How many times we want to see an update over the 
                              course of the for-loop.
    '''
    exact_interval = total_iters / num_desired_updates
    order_of_mag = len(str(total_iters)) - 1
    round_mag = order_of_mag - 1
    update_interval = int(round(exact_interval, -round_mag))
    if update_interval == 0:
        update_interval = 1
    return update_interval

import pandas as pd
import csv

# Helper function to report current GPU memory usage.
# Reports how much of the GPU's memory we're using.
def check_gpu_mem():
    '''
    Uses Nvidia's SMI tool to check the current GPU memory usage.
    '''
    buf = os.popen('nvidia-smi --query-gpu=memory.total,memory.used --format=csv')
    reader = csv.reader(buf, delimiter=',')
    df = pd.DataFrame(reader)
    new_header = df.iloc[0]
    df = df[1:]
    df.columns = new_header
    return df

# 2.Data Preprocessing




## 2.1 Import Tokenizer

In [None]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False)

# Importing the tokenizer (Secondary)
from transformers import DistilBertTokenizer
tokenizer2 = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
# Distributing Sequence Length 
# Choosing max_len
max_len = 384

## 2.2 Tokenizing the training set

In [None]:
import torch

# Time
t0 = time.time()

# Lists
all_input_ids = []
attention_masks = []
segment_ids = [] 
start_positions = []
end_positions = []

num_dropped = 0

# for Update-Interval
update_interval = good_update_interval(total_iters = len(examples), num_desired_updates = 15)

print('Tokenizing {:,} examples...'.format(len(examples)))

for (ex_num, ex) in enumerate(examples):
    # Display update information
    if (ex_num % update_interval) == 0 and not (ex_num == 0):
        elapsed = format_time(time.time() - t0)
        ex_per_sec = (time.time() - t0) / ex_num
        remaining_sec = ex_per_sec * (len(examples) - ex_num)
        remaining = format_time(remaining_sec)
        print('  Example {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(ex_num, len(examples), elapsed, remaining))
    
    answer_tokens = tokenizer.tokenize(ex['answer_text']) # Tokenize the answer
    sentinel_str = ' '.join(['[MASK]']*len(answer_tokens)) # "[MASK] [MASK] [MASK] [MASK] [MASK]"
    start_char_i = ex['start_position_character']
    end_char_i = start_char_i + len(ex['answer_text']) # Compute position of end character
    context_w_sentinel = ex['context_text'][:start_char_i] + sentinel_str + ex['context_text'][end_char_i:] # context-string with sentinel_str in position of answer
    
    # Returns a dictionary containing the encoded sequence or sequence pair and additional information: the mask for sequence classification and the overflowing elements if a max_length is specified.
    encoded_dict = tokenizer.encode_plus(
        ex['question_text'], 
        context_w_sentinel,
        add_special_tokens = True,
        max_length = max_len,
        pad_to_max_length = True,
        truncation = True,
        return_attention_mask = True,
        return_tensors = 'pt')
    
    # They are token indices, numerical representations of tokens building the sequences that will be used as input by the model.
    input_ids = encoded_dict['input_ids']

    # A special token representing a masked token (used by masked-language modeling pretraining objectives, like BERT).
    is_mask_token = (input_ids[0] == tokenizer.mask_token_id)
    
    mask_token_indices = is_mask_token.nonzero(as_tuple=False)[:, 0]
    if not len(mask_token_indices) == len(answer_tokens):
        num_dropped += 1
        continue
    
    start_index = mask_token_indices[0]
    end_index = mask_token_indices[-1]
    
    # Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.
    # Returns The tokenized ids of the text.
    answer_token_ids = tokenizer.encode(answer_tokens, 
                                        add_special_tokens=False, 
                                        return_tensors='pt') # Return Pytorch model
    

    input_ids[0, start_index : end_index + 1] = answer_token_ids
    
    all_input_ids.append(input_ids)
    attention_masks.append(encoded_dict['attention_mask'])    
    segment_ids.append(encoded_dict['token_type_ids'])
    start_positions.append(start_index)
    end_positions.append(end_index)

# Concatenates the given sequence of seq tensors in the given dimension. All tensors must either have the same shape (except in the concatenating dimension) or be empty.
all_input_ids = torch.cat(all_input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
segment_ids = torch.cat(segment_ids, dim=0)
# Constructs a tensor with no autograd history by copying data
start_positions = torch.tensor(start_positions)
end_positions = torch.tensor(end_positions)

print('DONE.  Tokenization took {:}'.format(format_time(time.time() - t0)))

Tokenizing 87,599 examples...




  Example   6,000  of   87,599.    Elapsed: 0:00:17. Remaining: 0:03:58
  Example  12,000  of   87,599.    Elapsed: 0:00:33. Remaining: 0:03:29
  Example  18,000  of   87,599.    Elapsed: 0:00:49. Remaining: 0:03:08
  Example  24,000  of   87,599.    Elapsed: 0:01:03. Remaining: 0:02:48
  Example  30,000  of   87,599.    Elapsed: 0:01:20. Remaining: 0:02:34
  Example  36,000  of   87,599.    Elapsed: 0:01:39. Remaining: 0:02:22
  Example  42,000  of   87,599.    Elapsed: 0:02:00. Remaining: 0:02:10
  Example  48,000  of   87,599.    Elapsed: 0:02:18. Remaining: 0:01:54
  Example  54,000  of   87,599.    Elapsed: 0:02:36. Remaining: 0:01:37
  Example  60,000  of   87,599.    Elapsed: 0:02:54. Remaining: 0:01:20
  Example  66,000  of   87,599.    Elapsed: 0:03:14. Remaining: 0:01:03
  Example  72,000  of   87,599.    Elapsed: 0:03:32. Remaining: 0:00:46
  Example  78,000  of   87,599.    Elapsed: 0:03:50. Remaining: 0:00:28
  Example  84,000  of   87,599.    Elapsed: 0:04:08. Remaining: 

# 3.Fine-Tuning BERT

## 3.1 Loading Initial Weights

In [None]:
# The AlbertForQuestionAnswering class from the transformers library
from transformers import BertForQuestionAnswering, AdamW, BertConfig
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased", output_attentions = False, output_hidden_states = False)

desc = model.cuda() # .cuda() Function Can Only Specify GPU.

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and a

## 3.2 Sampling and Validation Set


In [None]:
# Represents a Python iterable over a dataset
from torch.utils.data import TensorDataset # Dataset wrapping tensors. Each sample will be retrieved by indexing tensors along the first dimension.
import numpy as np

subsample = True
if subsample:
  # Randomly permute a sequence
    all_indices = np.random.permutation(all_input_ids.shape[0])
    indices = all_indices[0:87000]
    dataset = TensorDataset(all_input_ids[indices, :], 
                            attention_masks[indices, :], 
                            segment_ids[indices, :], 
                            start_positions[indices], 
                            end_positions[indices])
else:
    dataset = TensorDataset(all_input_ids, 
                            attention_masks, 
                            segment_ids, 
                            start_positions, 
                            end_positions)
    
print('Dataset size: {:} samples'.format(len(dataset)))

Dataset size: 87000 samples


In [None]:
#This dataset already has a train / test split, but I'm dividing this training set to use 98% for training and 2% for validation
from torch.utils.data import random_split

train_size = int(0.98 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

85,260 training samples
1,740 validation samples


## 3.3 Batch Size and DataLoaders

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SubsetRandomSampler, SequentialSampler
import numpy.random
import numpy as np

batch_size = 12 
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )
validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )
print('{:,} training batches & {:,} validation batches'.format(len(train_dataloader), len(validation_dataloader)))

7,105 training batches & 145 validation batches


**Optimizer:**


In [None]:
# Optimizer with fine-tuning recommended
optimizer = AdamW(model.parameters(), lr = 3e-5,
                  eps = 1e-8
                  )

## 3.4 Epochs and Learning Rate Scheduler

In [None]:
from transformers import get_linear_schedule_with_warmup

epochs = 2
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
print('Total number of steps: {}'.format(total_steps))

Total number of steps: 14210


## 3.5 Training Loop

In [None]:
import random
import numpy as np

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []

for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training {:,} batches...'.format(len(train_dataloader)))

    t0 = time.time()
    total_train_loss = 0
    model.train()

    # Setup the update interval
    update_interval = good_update_interval(
                total_iters = len(train_dataloader), 
                num_desired_updates = 15
            )

    num_batches = len(train_dataloader)

    # iterate through each batch
    for step, batch in enumerate(train_dataloader):
        # Display the update interval
        if step % update_interval == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            step_per_sec = (time.time() - t0) / step
            remaining_sec = step_per_sec * (num_batches - step)
            remaining = format_time(remaining_sec)
            print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(step, num_batches, elapsed, remaining))

        # moves the model to the device
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_seg_ids = batch[2].to(device)
        b_start_pos = batch[3].to(device)
        b_end_pos = batch[4].to(device)

        # Sets the gradients of all optimized torch.Tensor s to zero
        model.zero_grad()

        # Ouput
        outputs = model(b_input_ids, 
                        attention_mask=b_input_mask, 
                        token_type_ids = b_seg_ids,
                        start_positions=b_start_pos,
                        end_positions=b_end_pos)
       
        # Output Tuple ( Total span extraction loss is the sum of a Cross-Entropy for the start and end positions, Span-start scores (before SoftMax) , Span-end scores (before SoftMax))
        (loss, start_logits, end_logits) = outputs

        total_train_loss += loss.item() # Returns the value of this tensor as a standard Python number. This only works for tensors with one element. For other cases, see tolist().
        loss.backward() # Computes the gradient of current tensor w.r.t. graph leaves.
        
        # Clips gradient norm of an iterable of parameters.
        # The norm is computed over all gradients together, as if they were concatenated into a single vector. Gradients are modified in-place.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) 

        optimizer.step() # method that updates the parameters
        scheduler.step()
    
    # END OF INNER FOR LOOP .........................................................................................


    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    print("")
    print("Running Validation...")

    # In addition, the common practice for evaluating/validation is using torch.no_grad() in pair with model.eval() to turn off gradients computation
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0

    t0_val = time.time()
    pred_start, pred_end, true_start, true_end = [], [], [], []

    # Compute Validation Metrics
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_seg_ids = batch[2].to(device)
        b_start_pos = batch[3].to(device)
        b_end_pos = batch[4].to(device)
        with torch.no_grad():
            outputs = model(b_input_ids, 
                            token_type_ids=b_seg_ids, 
                            attention_mask=b_input_mask,
                            start_positions=b_start_pos,
                            end_positions=b_end_pos)

        (loss, start_logits, end_logits) = outputs        

        total_eval_loss += loss.item()
        start_logits = start_logits.detach().cpu().numpy()
        end_logits = end_logits.detach().cpu().numpy()
      
        b_start_pos = b_start_pos.to('cpu').numpy()
        b_end_pos = b_end_pos.to('cpu').numpy()

        answer_start = np.argmax(start_logits, axis=1)
        answer_end = np.argmax(end_logits, axis=1)

        pred_start.append(answer_start)
        pred_end.append(answer_end)
        true_start.append(b_start_pos)
        true_end.append(b_end_pos)

    pred_start = np.concatenate(pred_start, axis=0)
    pred_end = np.concatenate(pred_end, axis=0)
    true_start = np.concatenate(true_start, axis=0)
    true_end = np.concatenate(true_end, axis=0)

    num_start_correct = np.sum(pred_start == true_start)
    num_end_correct = np.sum(pred_end == true_end)

    total_correct = num_start_correct + num_end_correct
    total_indices = len(true_start) + len(true_end)

    avg_val_accuracy = float(total_correct) / float(total_indices)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    validation_time = format_time(time.time() - t0_val)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")


Training 7,105 batches...
  Batch     500  of    7,105.    Elapsed: 0:07:49. Remaining: 1:43:14
  Batch   1,000  of    7,105.    Elapsed: 0:15:50. Remaining: 1:36:38
  Batch   1,500  of    7,105.    Elapsed: 0:23:51. Remaining: 1:29:06
  Batch   2,000  of    7,105.    Elapsed: 0:31:51. Remaining: 1:21:18
  Batch   2,500  of    7,105.    Elapsed: 0:39:52. Remaining: 1:13:26
  Batch   3,000  of    7,105.    Elapsed: 0:47:52. Remaining: 1:05:30
  Batch   3,500  of    7,105.    Elapsed: 0:55:53. Remaining: 0:57:33
  Batch   4,000  of    7,105.    Elapsed: 1:03:53. Remaining: 0:49:36
  Batch   4,500  of    7,105.    Elapsed: 1:11:54. Remaining: 0:41:37
  Batch   5,000  of    7,105.    Elapsed: 1:19:54. Remaining: 0:33:38
  Batch   5,500  of    7,105.    Elapsed: 1:27:55. Remaining: 0:25:39
  Batch   6,000  of    7,105.    Elapsed: 1:35:55. Remaining: 0:17:40
  Batch   6,500  of    7,105.    Elapsed: 1:43:56. Remaining: 0:09:40
  Batch   7,000  of    7,105.    Elapsed: 1:51:57. Remaining: 0

In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/bert1.pkl')

In [None]:
torch.save(model, '/content/drive/MyDrive/bert2.pkl')

In [None]:
model_save = model 

In [None]:
model = torch.load('/content/drive/MyDrive/bert2.pkl')

# 4.Performance On Test Set

## 4.1 Load saved and pre-tuned model

In [None]:
from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering

pre_tuned = False

if pre_tuned:
    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad',
        do_lower_case=True
    )

    model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad', 
    )
    desc = model.cuda()

## 4.2 Parsing Test Set

In [None]:
# highest F1 score that BERT gets among the three is considered
import json

with open(os.path.join('./squad_dataset/dev-v1.1.json'), "r", encoding="utf-8") as reader:
    input_data = json.load(reader)["data"]

print_count = 0
print('Unpacking SQuAD Examples...')

print('Articles:')

examples = []
for entry in input_data:
    title = entry["title"]
    print('  ', title)
    for paragraph in entry["paragraphs"]:
        context_text = paragraph["context"]
        for qa in paragraph["qas"]:
            ex = {}
            ex['qas_id'] = qa["id"]
            ex['question_text'] = qa["question"]
            ex['answers'] = qa["answers"]
            ex['title'] = title
            ex['context_text'] = context_text
            examples.append(ex)
print('DONE!')

Unpacking SQuAD Examples...
Articles:
   Super_Bowl_50
   Warsaw
   Normans
   Nikola_Tesla
   Computational_complexity_theory
   Teacher
   Martin_Luther
   Southern_California
   Sky_(United_Kingdom)
   Victoria_(Australia)
   Huguenot
   Steam_engine
   Oxygen
   1973_oil_crisis
   Apollo_program
   European_Union_law
   Amazon_rainforest
   Ctenophora
   Fresno,_California
   Packet_switching
   Black_Death
   Geology
   Newcastle_upon_Tyne
   Victoria_and_Albert_Museum
   American_Broadcasting_Company
   Genghis_Khan
   Pharmacy
   Immune_system
   Civil_disobedience
   Construction
   Private_school
   Harvard_University
   Jacksonville,_Florida
   Economic_inequality
   Doctor_Who
   University_of_Chicago
   Yuan_dynasty
   Kenya
   Intergovernmental_Panel_on_Climate_Change
   Chloroplast
   Prime_number
   Rhine
   Scottish_Parliament
   Islamism
   Imperialism
   United_Methodist_Church
   French_and_Indian_War
   Force
DONE!


In [None]:
print('There are {:,} test examples.'.format(len(examples)))

There are 10,570 test examples.


## 4.3 Locating Test Answers

In [None]:
# 2-pass approach
import time
import torch
import logging

logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

t0 = time.time()
start_positions = []
end_positions = []
num_clipped_answers = 0
num_impossible = 0

update_interval = good_update_interval(
            total_iters = len(examples), 
            num_desired_updates = 15
        )

print('Processing {:,} examples...'.format(len(examples)))

for (ex_num, ex) in enumerate(examples):

    if (ex_num % update_interval) == 0 and not (ex_num == 0):

        elapsed = format_time(time.time() - t0)
        ex_per_sec = (time.time() - t0) / ex_num
        remaining_sec = ex_per_sec * (len(examples) - ex_num)
        remaining = format_time(remaining_sec)
        print('  Example {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(ex_num, len(examples), elapsed, remaining))
    start_options = []
    end_options = []

    encoded_stored = False
    for answer in ex['answers']:
        answer_tokens = tokenizer.tokenize(answer['text'])
        sentinel_str = ' '.join(['[MASK]']*len(answer_tokens))
        start_char_i = answer['answer_start']
        end_char_i = start_char_i + len(answer['text'])
        context_w_sentinel = ex['context_text'][:start_char_i] + \
                            sentinel_str + \
                            ex['context_text'][end_char_i:]
        input_ids = tokenizer.encode(
            ex['question_text'], 
            context_w_sentinel,
            add_special_tokens = True, 
            #max_length = max_len,
            pad_to_max_length = False,
            truncation = False,
        )
        mask_token_indices = np.where(np.array(input_ids) == tokenizer.mask_token_id)[0]
        assert(len(mask_token_indices) == len(answer_tokens))           
        start_index = mask_token_indices[0]
        end_index = mask_token_indices[-1]
        start_options.append(start_index)
        end_options.append(end_index)
    
    start_positions.append(start_options)
    end_positions.append(end_options)

print('DONE.  Tokenization took {:}'.format(format_time(time.time() - t0)))

Processing 10,570 examples...
  Example   1,000  of   10,570.    Elapsed: 0:00:07. Remaining: 0:01:09
  Example   2,000  of   10,570.    Elapsed: 0:00:14. Remaining: 0:00:59
  Example   3,000  of   10,570.    Elapsed: 0:00:21. Remaining: 0:00:52
  Example   4,000  of   10,570.    Elapsed: 0:00:31. Remaining: 0:00:51
  Example   5,000  of   10,570.    Elapsed: 0:00:42. Remaining: 0:00:47
  Example   6,000  of   10,570.    Elapsed: 0:00:51. Remaining: 0:00:39
  Example   7,000  of   10,570.    Elapsed: 0:01:01. Remaining: 0:00:31
  Example   8,000  of   10,570.    Elapsed: 0:01:11. Remaining: 0:00:23
  Example   9,000  of   10,570.    Elapsed: 0:01:20. Remaining: 0:00:14
  Example  10,000  of   10,570.    Elapsed: 0:01:29. Remaining: 0:00:05
DONE.  Tokenization took 0:01:36


In [None]:
num_impossible = 0
num_clipped = 0

for (start_options, end_options) in zip(start_positions, end_positions):

    is_possible = False
    for i in range(0, len(start_options)):
        if (start_options[i] < max_len) and (end_options[i] < max_len):
            is_possible = True
        if (start_options[i] > max_len) or (end_options[i] > max_len):
            num_clipped += 1
    if not is_possible:
        num_impossible += 1

print('')

print('Samples w/ all answers clipped: {:,} of {:,} ({:.2%})'.format(num_impossible, len(examples), float(num_impossible) / float(len(examples))))

addtl_clipped = num_clipped - (num_impossible * 3)
total_answers = len(examples) * 3
print('\n    Additional clipped answers: {:,} of {:,}'.format(addtl_clipped, total_answers))


Samples w/ all answers clipped: 31 of 10,570 (0.29%)

    Additional clipped answers: 19 of 31,710


## 4.4 Tokenizing and Encoding the Test Samples

In [None]:
import time
import torch

t0 = time.time()
all_input_ids = []
attention_masks = []
segment_ids = [] 
all_input_ids2 = []
attention_masks2 = []
segment_ids2 = []
update_interval = good_update_interval(
            total_iters = len(examples), 
            num_desired_updates = 15
        )

print('Tokenizing {:,} examples...'.format(len(examples)))

for (ex_num, ex) in enumerate(examples):

    if (ex_num % update_interval) == 0 and not (ex_num == 0):
        elapsed = format_time(time.time() - t0)
        ex_per_sec = (time.time() - t0) / ex_num
        remaining_sec = ex_per_sec * (len(examples) - ex_num)
        remaining = format_time(remaining_sec)
        print('  Example {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(ex_num, len(examples), elapsed, remaining))

    encoded_dict = tokenizer.encode_plus(
        ex['question_text'], 
        ex['context_text'],
        add_special_tokens = True,
        max_length = max_len,
        pad_to_max_length = True,
        truncation = True,
        return_attention_mask = True,
        return_tensors = 'pt',
    )
    encoded_dict2 = tokenizer2.encode_plus(
        ex['question_text'], 
        ex['context_text'],
        add_special_tokens = True,
        max_length = max_len,
        pad_to_max_length = True,
        truncation = True,
        return_attention_mask = True,
        return_tensors = 'pt',
    )


    input_ids = encoded_dict['input_ids']
    input_ids2 = encoded_dict2['input_ids']
 
    all_input_ids.append(input_ids)
    attention_masks.append(encoded_dict['attention_mask'])    
    segment_ids.append(encoded_dict['token_type_ids'])
    all_input_ids2.append(input_ids2)
    attention_masks2.append(encoded_dict2['attention_mask'])    
    #segment_ids2.append(encoded_dict2['token_type_ids'])

all_input_ids = torch.cat(all_input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
segment_ids = torch.cat(segment_ids, dim=0)
all_input_ids2 = torch.cat(all_input_ids2, dim=0)
attention_masks2 = torch.cat(attention_masks2, dim=0)
#segment_ids2 = torch.cat(segment_ids2, dim=0)

print('DONE.  Tokenization took {:}'.format(format_time(time.time() - t0)))

Tokenizing 10,570 examples...




  Example   1,000  of   10,570.    Elapsed: 0:00:06. Remaining: 0:00:53
  Example   2,000  of   10,570.    Elapsed: 0:00:11. Remaining: 0:00:45
  Example   3,000  of   10,570.    Elapsed: 0:00:16. Remaining: 0:00:40
  Example   4,000  of   10,570.    Elapsed: 0:00:22. Remaining: 0:00:36
  Example   5,000  of   10,570.    Elapsed: 0:00:30. Remaining: 0:00:33
  Example   6,000  of   10,570.    Elapsed: 0:00:36. Remaining: 0:00:28
  Example   7,000  of   10,570.    Elapsed: 0:00:43. Remaining: 0:00:22
  Example   8,000  of   10,570.    Elapsed: 0:00:50. Remaining: 0:00:16
  Example   9,000  of   10,570.    Elapsed: 0:00:56. Remaining: 0:00:10
  Example  10,000  of   10,570.    Elapsed: 0:01:02. Remaining: 0:00:04
DONE.  Tokenization took 0:01:07


## 4.5 Evaluate On Test Set

In [None]:
the_model = torch.load('/content/drive/MyDrive/bert2.pkl')

In [None]:
# model_dis = torch.load('/content/drive/MyDrive/distil_model.pkl')
import pickle
model_dis = pickle.load(open('/content/drive/MyDrive/distil_model.pkl', 'rb'))

In [None]:
import time
import numpy as np

model.eval()
model_dis.eval()

t0 = time.time()
pred_start = []
pred_end = []
num_test_samples = all_input_ids.shape[0]
batch_size = 16

num_batches = int(np.ceil(num_test_samples / batch_size))

print('Evaluating on {:,} test batches...'.format(num_batches))

batch_num = 0
for start_i in range(0, num_test_samples, batch_size):
    if ((batch_num % 50) == 0) and not (batch_num == 0):
      elapsed = format_time(time.time() - t0)
      batches_per_sec = (time.time() - t0) / batch_num
      remaining_sec = batches_per_sec * (num_batches - batch_num)
      remaining = format_time(remaining_sec)
      print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(batch_num, num_batches, elapsed, remaining))

    end_i = min(start_i + batch_size, num_test_samples)
    b_input_ids = all_input_ids[start_i:end_i, :]
    b_attn_masks = attention_masks[start_i:end_i, :]
    b_seg_ids = segment_ids[start_i:end_i, :]   

    b_input_ids2 = all_input_ids[start_i:end_i, :]
    b_attn_masks2 = attention_masks[start_i:end_i, :]
    #b_seg_ids = segment_ids[start_i:end_i, :]

    b_input_ids = b_input_ids.to(device)
    b_attn_masks = b_attn_masks.to(device)
    b_seg_ids = b_seg_ids.to(device)

    b_input_ids2 = b_input_ids2.to(device)
    b_attn_masks2 = b_attn_masks2.to(device)
    #b_seg_ids = b_seg_ids.to(device)

    with torch.no_grad():
        (start_logits, end_logits) = model(b_input_ids, 
                                           attention_mask=b_attn_masks,
                                           token_type_ids=b_seg_ids)
        (start_logits2, end_logits2) = model_dis(b_input_ids2, 
                                           attention_mask=b_attn_masks2)
    start_logits = start_logits.detach().cpu().numpy()
    end_logits = end_logits.detach().cpu().numpy()
    start_logits2 = start_logits2.detach().cpu().numpy()
    end_logits2 = end_logits2.detach().cpu().numpy()
    
    start_logits_a, end_logits_a = (start_logits+end_logits)/2 , (start_logits2+end_logits2)/2
    
    answer_start = np.argmax(start_logits_a, axis=1)
    answer_end = np.argmax(end_logits_a, axis=1)

    pred_start.append(answer_start)
    pred_end.append(answer_end)

    batch_num += 1

pred_start = np.concatenate(pred_start, axis=0)
pred_end = np.concatenate(pred_end, axis=0)

print('    DONE.')

print('\nEvaluation took {:.0f} seconds.'.format(time.time() - t0))

Evaluating on 661 test batches...
  Batch      50  of      661.    Elapsed: 0:00:29. Remaining: 0:05:58
  Batch     100  of      661.    Elapsed: 0:01:01. Remaining: 0:05:40
  Batch     150  of      661.    Elapsed: 0:01:31. Remaining: 0:05:11
  Batch     200  of      661.    Elapsed: 0:02:02. Remaining: 0:04:41
  Batch     250  of      661.    Elapsed: 0:02:33. Remaining: 0:04:11
  Batch     300  of      661.    Elapsed: 0:03:04. Remaining: 0:03:41
  Batch     350  of      661.    Elapsed: 0:03:34. Remaining: 0:03:10
  Batch     400  of      661.    Elapsed: 0:04:05. Remaining: 0:02:40
  Batch     450  of      661.    Elapsed: 0:04:36. Remaining: 0:02:09
  Batch     500  of      661.    Elapsed: 0:05:07. Remaining: 0:01:39
  Batch     550  of      661.    Elapsed: 0:05:38. Remaining: 0:01:08
  Batch     600  of      661.    Elapsed: 0:06:09. Remaining: 0:00:37
  Batch     650  of      661.    Elapsed: 0:06:40. Remaining: 0:00:07
    DONE.

Evaluation took 406 seconds.


#5.Results

Exact Match:  Number of  predicted start and end indices that are equal to the correct ones are added up for this metric

In [1]:
total_correct = 0

for i in range(0, len(pred_start)):

    match_options = []
    for j in range (0, len(start_positions[i])):
        matches = 0
        if pred_start[i] == start_positions[i][j] or pred_start2[i] == start_positions[i][j]:
            matches += 1
        if pred_end[i] == end_positions[i][j]:
            matches += 1

        match_options.append(matches)

    total_correct += (max(match_options))
total_indices = len(pred_start) + len(pred_end)

print('Correctly predicted indeces: {:,} of {:,} ({:.2%})'.format(
    total_correct,
    total_indices,
    float(total_correct) / float(total_indices)
))

Correctly predicted indeces: 17,770 of 21,140 (84.06%)


**F1 Score**

precision = 1.0 * num_same / len(pred_toks)

recall = 1.0 * num_same / len(gold_toks)

f1 = (2 * precision * recall) / (precision + recall)



In [2]:
f1s = []
for i in range(0, len(pred_start)):
    pred_span = set(range(pred_start[i], pred_end[i] + 1))
    f1_options = []
    for j in range (0, len(start_positions[i])):
        true_span = set(range(start_positions[i][j], end_positions[i][j] + 1))    
        num_same = len(pred_span.intersection(true_span))
        if num_same == 0:
            f1_options.append(0)
            continue
        precision = float(num_same) / float(len(pred_span))
        recall = float(num_same) / float(len(true_span))
        f1 = (2 * precision * recall) / (precision + recall)
        f1_options.append(f1)
    f1s.append(max(f1_options))

print('Average F1 Score: {:.3f}'.format(np.mean(f1s)))

Average F1 Score: 0.865


**Final Score of our fine tuned BERT base model:**


Correctly predicted indeces: 17,770 of 21,140 (84.06%)

Average F1 Score: 0.865




**Score of our Pre tuned BERT base model:**


Correctly predicted indeces: 18446 of 21,140 (88.26%)

Average F1 Score: 0.884