### Load SQuAD Dataset

In [3]:
!mkdir squad
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

--2021-05-08 15:27:21--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘squad/train-v2.0.json’


2021-05-08 15:27:21 (263 MB/s) - ‘squad/train-v2.0.json’ saved [42123633/42123633]

--2021-05-08 15:27:21--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘squad/dev-v2.0.json’


2021-05-08 15:27:21 (60.7 MB/s) - ‘squad/dev-v2.0.json’ saved [4370528/4370528]



### Install the Transformers Library from Hugging Face

In [4]:
!pip install transformers -q

[K     |████████████████████████████████| 2.1MB 18.9MB/s 
[K     |████████████████████████████████| 901kB 58.0MB/s 
[K     |████████████████████████████████| 3.3MB 51.6MB/s 
[?25h

### **Required Libraries**

In [5]:
import json
import pandas as pd
import numpy as np

from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from transformers import (
    DistilBertTokenizerFast,
    DistilBertForQuestionAnswering,
    AdamW)

from sklearn.model_selection import train_test_split

### **Data Preparation**

In [6]:
def read_squad(path):
  """Flatten a SQuAD-format JSON file into three parallel lists.

  Args:
    path: Location of the JSON file. Its top-level "data" list holds groups,
      each with "paragraphs"; every paragraph has a "context" and a list of
      "qas"; every qa has a "question" and a list of "answers".

  Returns:
    A tuple (contexts, questions, answers) of equal-length lists with one
    entry per answer. A question that has several answers is repeated once
    per answer; a question with no answers contributes nothing.
  """
  with open(path, "rb") as handle:
    squad = json.load(handle)

  contexts, questions, answers = [], [], []

  for article in squad["data"]:
    for paragraph in article["paragraphs"]:
      passage_text = paragraph["context"]
      for item in paragraph["qas"]:
        query = item["question"]
        gold_answers = item["answers"]

        # Repeat the context/question once per answer so the lists stay aligned
        contexts.extend([passage_text] * len(gold_answers))
        questions.extend([query] * len(gold_answers))
        answers.extend(gold_answers)

  return contexts, questions, answers

In [7]:
# Flatten each downloaded split into parallel context / question / answer lists
# (one entry per answer, as returned by read_squad)
train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

In [8]:
# Inspect one raw answer record from each split
train_answers[3], val_answers[3]

({'answer_start': 166, 'text': 'Houston, Texas'},
 {'answer_start': 159, 'text': 'France'})

In [9]:
def add_end_idx(answers, contexts):
  """Add an exclusive ``answer_end`` character index to every answer, in place.

  The annotated ``answer_start`` is sometimes one or two characters too far
  to the right. For each answer the annotated offset is tried first, then
  offsets shifted 1 and 2 characters to the left; the nearest offset at which
  the context actually contains the answer text wins, and ``answer_start`` is
  corrected to it.

  Args:
    answers: List of dicts with "text" and "answer_start" keys. Each dict is
      mutated: "answer_end" is added and "answer_start" may be shifted left.
    contexts: List of context strings, aligned index-by-index with ``answers``.

  Returns:
    None.
  """
  for answer, context in zip(answers, contexts):
    # The text we expect to find in the context
    gold_text = answer["text"]

    # Annotated start and the end it implies (end is exclusive)
    start_idx = answer["answer_start"]
    end_idx = start_idx + len(gold_text)

    # Nearest left-shift (0, 1 or 2 characters) at which the text really sits.
    # The >= 0 guard keeps a negative start from wrapping around to the end
    # of the string.
    shift = next(
        (n for n in (0, 1, 2)
         if start_idx - n >= 0
         and context[start_idx - n:end_idx - n] == gold_text),
        None)

    if shift is None:
      # No alignment found: keep the annotated start and fall back to the
      # nominal end so that every answer is guaranteed to carry "answer_end"
      answer["answer_end"] = end_idx
    else:
      answer["answer_start"] = start_idx - shift
      answer["answer_end"] = end_idx - shift

In [10]:
# Add "answer_end" to every answer dict in place (this may also shift "answer_start")
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [11]:
# Spot-check the first few training answers, which now carry "answer_end"
train_answers[:3]

[{'answer_end': 286, 'answer_start': 269, 'text': 'in the late 1990s'},
 {'answer_end': 226, 'answer_start': 207, 'text': 'singing and dancing'},
 {'answer_end': 530, 'answer_start': 526, 'text': '2003'}]

### **Encoding**

In [12]:
# Load the DistilBertTokenizerFast for the "distilbert-base-uncased" checkpoint;
# add_token_position later calls char_to_token on the encodings it produces
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [13]:
# Tokenize each split as (context, question) pairs: the context is passed as
# the first sequence and the question as the second, with truncation and
# padding both enabled
train_encodings = tokenizer(train_contexts, 
                            train_questions, 
                            truncation=True, 
                            padding=True)

val_encodings = tokenizer(val_contexts, 
                          val_questions, 
                          truncation=True, 
                          padding=True)

In [15]:
def add_token_position(encodings, answers, max_length=None):
  """Attach token-level answer spans to ``encodings``, in place.

  Each answer's character offsets are converted to token indices with
  ``encodings.char_to_token`` and stored under the "start_positions" and
  "end_positions" keys.

  Args:
    encodings: Tokenizer output exposing ``char_to_token(batch_index,
      char_index)`` and ``update(dict)``; row ``i`` must correspond to
      ``answers[i]``.
    answers: List of dicts with "answer_start" (inclusive) and "answer_end"
      (exclusive) character offsets into the context.
    max_length: Label used for both positions when the answer start has no
      token (treated as truncated away). Defaults to the notebook-level
      ``tokenizer.model_max_length``, which is only looked up when needed.
      NOTE(review): presumably an out-of-range label the QA loss ignores -
      confirm against the model documentation.

  Returns:
    None.
  """
  start_positions = []
  end_positions = []

  for i, answer in enumerate(answers):
    start_char = answer["answer_start"]
    start_token = encodings.char_to_token(i, start_char)

    if start_token is None:
      # The answer start is not in the encoded context: label both ends with
      # the sentinel instead of pairing it with an unrelated end token
      sentinel = tokenizer.model_max_length if max_length is None else max_length
      start_positions.append(sentinel)
      end_positions.append(sentinel)
      continue

    # "answer_end" is exclusive, so the last answer character is one before it
    end_char = answer["answer_end"] - 1
    end_token = encodings.char_to_token(i, end_char)

    # The last character may have no token (e.g. whitespace, or cut off by
    # truncation): walk left, but never past the start of the answer
    while end_token is None and end_char > start_char:
      end_char -= 1
      end_token = encodings.char_to_token(i, end_char)
    if end_token is None:
      # Degenerate (empty) span: collapse onto the start token
      end_token = start_token

    start_positions.append(start_token)
    end_positions.append(end_token)

  # Update the encodings once, after all positions are known
  encodings.update({"start_positions": start_positions, "end_positions": end_positions})

In [16]:
# Add "start_positions" / "end_positions" to both encodings objects in place
add_token_position(train_encodings, train_answers)
add_token_position(val_encodings, val_answers)

### **Custom Dataset**

In [17]:
class SQuaDDataset(Dataset):
  """Dataset that serves tokenizer encodings one example at a time.

  ``encodings`` must expose an ``input_ids`` attribute (used for the length)
  and ``items()`` yielding (name, per-example sequence) pairs.
  """

  def __init__(self, encodings):
    self.encodings = encodings

  def __len__(self):
    # One example per row of input ids
    return len(self.encodings.input_ids)

  def __getitem__(self, idx):
    # Build a {field name: tensor} dict for the requested example
    example = {}
    for name, column in self.encodings.items():
      example[name] = torch.tensor(column[idx])
    return example

In [18]:
# Wrap both encodings objects (now including start/end positions) as datasets
train_dataset = SQuaDDataset(train_encodings)
val_dataset = SQuaDDataset(val_encodings)

### Model

In [19]:
# Load DistilBERT with a question-answering head from the base checkpoint.
# As the load message below reports, the `qa_outputs` weights are newly
# initialized, so the model has to be fine-tuned before it is useful.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the selected device
model.to(device)

# Put the model in training mode
model.train()

# AdamW optimizer over all model parameters; only the learning rate is set
# here, so every other hyperparameter (including weight decay) is left at
# the optimizer's default
optim = AdamW(model.parameters(), lr=5e-5)

# Shuffled mini-batches of the training data
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

EPOCHS = 3
for epoch in range(EPOCHS):
  # (Re)assert training mode at the start of every epoch
  model.train()

  # Wrap the loader in tqdm to get a progress bar
  loop  = tqdm(train_loader, leave=True)
  for batch in loop:
    
    # Clear the gradients left over from the previous step
    optim.zero_grad()

    # Move the tensors needed for this step to the model's device
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    start_positions = batch["start_positions"].to(device)
    end_positions = batch["end_positions"].to(device)

    # Forward pass; the start/end positions are passed in as labels so the
    # outputs include the loss
    outputs = model(input_ids,
                    attention_mask=attention_mask,
                    start_positions=start_positions,
                    end_positions=end_positions)
    # The loss is taken from the first element of the outputs
    loss = outputs[0]

    # Backpropagate: compute gradients of the loss for the parameters
    loss.backward()

    # Apply the optimizer step using those gradients
    optim.step()

    # Show the current epoch and batch loss on the progress bar
    loop.set_description(f"Epoch {epoch}")
    loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 2714/2714 [1:13:57<00:00,  1.64s/it, loss=0.599]
Epoch 1:  27%|██▋       | 741/2714 [20:18<54:01,  1.64s/it, loss=1.42]

### Save Model

In [None]:
# Save the fine-tuned model and the tokenizer side by side in one directory
model_path = "models/distilbert-custom"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

### Load the model

In [None]:
# Reload the model and tokenizer from the directory used by the save cell.
# NOTE(review): the reloaded model is not moved to `device` in this cell.
model_path = 'models/distilbert-custom'
model = DistilBertForQuestionAnswering.from_pretrained(model_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)

### Measuring Performance

##### Exact Match

In [None]:
# Re-derive the device so this cell also works when the training cell was
# skipped (e.g. the model was only reloaded from disk)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The reload cell does not move the model, so make sure it sits on the same
# device the batches are sent to (a no-op when it is already there)
model.to(device)

# Switch model out of training mode
model.eval()

# Initialize validation set data loader
val_loader = DataLoader(val_dataset, batch_size=16)

# Running counts over the whole validation set. Counting examples (instead of
# averaging per-batch accuracies) keeps a smaller final batch from being
# over-weighted.
start_correct = 0
end_correct = 0
both_correct = 0
n_examples = 0

# We don't need to calculate gradients as we're not training
with torch.no_grad():
    for batch in val_loader:

        # Pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # We will use true positions for accuracy calc
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Make predictions
        outputs = model(input_ids, attention_mask=attention_mask)

        # Pull prediction tensors out and argmax to get predicted tokens
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Per-example hits for the start token, the end token, and both
        start_hit = start_pred == start_true
        end_hit = end_pred == end_true

        start_correct += start_hit.sum().item()
        end_correct += end_hit.sum().item()
        both_correct += (start_hit & end_hit).sum().item()
        n_examples += len(start_pred)

# Mean of start-token and end-token accuracy over all examples
acc = (start_correct + end_correct) / (2 * n_examples)

# Exact match: fraction of examples where BOTH start and end are correct
exact_match = both_correct / n_examples

In [None]:
# Display the validation accuracy computed in the previous cell
acc

### References:
 - [Fine-tuning with custom datasets](https://huggingface.co/transformers/custom_datasets.html?highlight=custom#qa-squad)
 - [How-to Fine-Tune a Q&A Transformer](https://towardsdatascience.com/how-to-fine-tune-a-q-a-transformer-86f91ec92997)