# Finetune a DistilBERT model on the SQuAD Dataset

Task Description: Fine-tune a model on question–answer pairs so that, given a question and a context passage, it can locate the span of the context that answers the question.

Original Tutorial: https://huggingface.co/docs/transformers/tasks/question_answering

In [None]:
!pip install -q transformers datasets evaluate accelerate

# Load SQuAD dataset

In [None]:
from datasets import load_dataset

# Only the first 5,000 examples of the SQuAD training split are loaded.
squad = load_dataset(path="squad", split="train[:5000]")

In [None]:
# Hold out 20% of the examples as a test split. The fixed seed makes the split,
# and therefore every number computed from it, reproducible across runs.
squad = squad.train_test_split(test_size=0.2, seed=42)

In [None]:
# Pretty-print one raw training example to see which fields it contains
import pprint

first_example = squad["train"][0]
pprint.pprint(first_example)

# The question and context columns are our model inputs; the answers column provides the labels


In [None]:
# Load the pretrained checkpoint: its tokenizer and the question-answering model
from transformers import AutoModelForQuestionAnswering
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

In [None]:
# Check Model: print the module hierarchy of the loaded model
print(model)

In [None]:
# List every model parameter with its trainable flag and its shape
separator = "=" * 30
for name, param in model.named_parameters():
    print(
        f"Parameter name: {name}",
        f"Requires gradients: {param.requires_grad}",
        f"Parameter shape: {param.shape}",
        separator,
        sep="\n",
    )

In [None]:
# # Optional: Specify layers to fine tune and which to freeze by setting requires_grad to True and False
# # (entries are matched as substrings of the parameter names printed above, e.g. the QA head 'qa_outputs')
# layers_to_fine_tune = ['qa_outputs']

# # Freeze layers
# for name, param in model.named_parameters():
#     if not any(layer_name in name for layer_name in layers_to_fine_tune):
#         param.requires_grad = False

# # Unfreeze Fine-tune layers
# for name, param in model.named_parameters():
#     if any(layer_name in name for layer_name in layers_to_fine_tune):
#         param.requires_grad = True

# Preprocessing
We need to create a preprocess function that we will apply to every instance in the dataset. The preprocess function has to handle the following:

1. Some examples in a dataset may have a very long context that exceeds the maximum input length of the model. To deal with longer sequences, truncate only the context by setting truncation="only_second".

2. Next, map the start and end positions of the answer to the original context by setting return_offsets_mapping=True.

3. With the mapping in hand, now you can find the start and end tokens of the answer. Use the sequence_ids method to find which part of the offset corresponds to the question and which corresponds to the context.

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and label the answer span.

    Each question is tokenized together with its context (only the context is
    truncated, everything is padded to 384 tokens). The character span of the
    first answer is then converted to start/end token indices.

    Args:
        examples: A batch given as a dict of lists with the keys "question",
            "context" and "answers"; each entry of "answers" holds the lists
            "answer_start" and "text".

    Returns:
        The tokenizer output for the batch (without "offset_mapping") with two
        extra keys, "start_positions" and "end_positions". Examples whose
        answer is missing or not fully inside the (possibly truncated)
        context are labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # The offsets are only needed to build the labels; they are not a model input.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No annotated answer: nothing to locate, use the (0, 0) label.
        if not answer["answer_start"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context (sequence id 1)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # Both ends must be checked: an answer that starts inside the kept
        # context but was cut off by truncation would otherwise receive an end
        # label pointing at the token after the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer start
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) that ends at or after the answer end
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Tokenize and label both splits. batched=True passes many examples to
# preprocess_function per call; the raw columns are dropped afterwards.
raw_columns = squad["train"].column_names
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

In [None]:
# Inspect the first preprocessed training example (tokenizer outputs plus
# start_positions / end_positions; the raw columns were removed by map)
tokenized_squad['train'][0]

In [None]:
# Create the collator that stacks preprocessed examples into batches.
# NOTE: there is no dynamic padding here - DefaultDataCollator does not pad,
# and every example was already padded to max_length=384 in preprocess_function.
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

# Evaluate

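We want to create a `compute_metrics` function that monitors a metric during training. For this task, we use the accuracy metric, computed separately for the predicted start token position and the predicted end token position.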

In [None]:
import evaluate

# Accuracy metric, later applied to predicted vs. labelled token positions
accuracy = evaluate.load(path="accuracy")

In [None]:
# tokenized_squad_trch is a second name for the same object (no copy is made),
# so switching the output format to torch also applies to tokenized_squad.
tokenized_squad_trch = tokenized_squad
tokenized_squad_trch.set_format(type="torch")

tokenized_squad_trch

In [None]:
# Inspect the first training example again, now returned in the "torch" format set above
tokenized_squad_trch['train'][0]

In [None]:
from torch.utils.data import DataLoader

# One batch size shared by both loaders
batch_size = 16

# Training batches are reshuffled on every pass over the data
train_dataloader = DataLoader(
    tokenized_squad_trch['train'],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Evaluation does not benefit from shuffling; a fixed order keeps the
# per-batch evaluation output comparable between runs
test_dataloader = DataLoader(
    tokenized_squad_trch['test'],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:
import pprint

# Pull a single batch to sanity-check the loader: number of batches per epoch,
# then the shape of each tokenizer output in the batch
batch = next(iter(train_dataloader))

print(len(train_dataloader))
for key in ("input_ids", "attention_mask"):
    print(f"{key} batch shape: {batch[key].shape}")

# Train Model Using PyTorch

In [None]:
import numpy as np
import torch
from torch.optim import AdamW
from transformers import get_scheduler

# Define Optimiser
# torch.optim.AdamW is used because the AdamW class exported by transformers
# is deprecated and no longer available in recent transformers releases.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)


# Metric helpers. No loss function is defined here: the training loop reads
# the loss from the model output (`outputs.loss`).
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predicted and reference strings.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A tuple `(preds, labels)` where `preds` is a list of stripped strings
        and `labels` is a list of one-element lists of stripped strings.
    """
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(predictions, labels):
    """Return the accuracy of `predictions` against `labels`.

    Args:
        predictions: Predicted class indices (here: token positions).
        labels: Reference class indices of the same length.

    Returns:
        Whatever `accuracy.compute` returns for the given inputs.
    """
    return accuracy.compute(predictions=predictions, references=labels)


# Initialize variables to track the best model
best_loss = float('inf')
best_checkpoint_path = None

# Collect Statistics
train_loss = []
train_start_pos_metrics = []
train_end_pos_metrics = []
test_start_pos_metrics = []
test_end_pos_metrics = []

## Place training on a GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

# Define Learning Rate Scheduler: linear decay over all optimizer steps, no warmup
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Paths of every checkpoint written during training
checkpoint_paths = []

model.train()

# Training loop
for epoch in range(num_epochs):
    # Per-batch losses of this epoch, used to score the epoch's checkpoint
    epoch_losses = []

    for i, batch in enumerate(train_dataloader):
        # Set Gradients to 0
        optimizer.zero_grad()

        # Put the batch onto the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)

        # Loss as returned by the model; keep a plain float copy for logging
        # so no tensor (and no device reference) ends up in the statistics
        loss = outputs.loss
        loss_value = loss.item()

        # Compute Metric: most likely start / end token vs. the labelled ones
        start_pos_predictions = torch.argmax(outputs.start_logits, dim=-1)
        end_pos_predictions = torch.argmax(outputs.end_logits, dim=-1)

        start_pos_metrics = compute_metrics(
            predictions=start_pos_predictions, labels=batch['start_positions']
        )
        end_pos_metrics = compute_metrics(
            predictions=end_pos_predictions, labels=batch['end_positions']
        )

        # Store Metrics
        train_loss.append(loss_value)
        epoch_losses.append(loss_value)
        train_start_pos_metrics.append(start_pos_metrics)
        train_end_pos_metrics.append(end_pos_metrics)

        # Backward pass: compute gradients with respect to the model parameters
        loss.backward()

        # Optimizer step: use the gradients to update the model parameters
        optimizer.step()

        # Update Learning Rate according to the schedule
        lr_scheduler.step()

        # Print Progress
        print(
            f"epoch {epoch} batch_number {i} loss {loss_value:.4f} "
            f"start_pos_metrics {start_pos_metrics} end_pos_metrics {end_pos_metrics}"
        )

    # Nothing was trained this epoch (empty dataloader): no checkpoint to write
    if not epoch_losses:
        continue

    # Score the epoch by its mean training loss instead of the last batch's
    # loss, which is a noisy single-batch value
    epoch_loss = sum(epoch_losses) / len(epoch_losses)

    # Save a checkpoint at the end of every epoch
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': epoch_loss,
        'metrics': start_pos_metrics,  # start-position accuracy of the last batch
        # Add other relevant information if needed
    }
    checkpoint_path = f'checkpoint_epoch_{epoch}_batch_{i}.bin'
    torch.save(checkpoint, checkpoint_path)
    checkpoint_paths.append(checkpoint_path)

    # Update best_loss and best_checkpoint_path if needed
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_checkpoint_path = checkpoint_path




In [None]:
# Load the best model checkpoint
if best_checkpoint_path is None:
    # No epoch improved on the initial best_loss (e.g. training did not run
    # or the loss was NaN), so there is no checkpoint file to load.
    raise RuntimeError(
        "No best checkpoint was recorded; run the training loop before loading."
    )

# map_location places the stored tensors on the device currently in use, so a
# checkpoint written on a GPU can also be restored on a CPU-only machine
best_checkpoint = torch.load(best_checkpoint_path, map_location=device)

model.load_state_dict(best_checkpoint['model_state_dict'])
optimizer.load_state_dict(best_checkpoint['optimizer_state_dict'])

best_epoch = best_checkpoint['epoch']
best_loss = best_checkpoint['loss']
best_metrics = best_checkpoint['metrics']


In [None]:
# Show the epoch, loss and metrics restored from the best checkpoint, one per line
for restored_value in (best_epoch, best_loss, best_metrics):
    print(restored_value)

In [None]:
model.eval()

eval_start_pos_metrics = []
eval_end_pos_metrics = []

# A single pass over the test set: repeating it once per training epoch would
# only append the same per-batch results again.
for i, batch in enumerate(test_dataloader):
    # Put the batch onto the same device as the model
    batch = {k: v.to(device) for k, v in batch.items()}

    # Forward pass without gradient tracking - no parameters are updated here
    with torch.no_grad():
        outputs = model(**batch)

    # Compute Metric: most likely start / end token vs. the labelled ones
    start_pos_predictions = torch.argmax(outputs.start_logits, dim=-1)
    end_pos_predictions = torch.argmax(outputs.end_logits, dim=-1)

    start_pos_metrics = compute_metrics(
        predictions=start_pos_predictions, labels=batch['start_positions']
    )
    end_pos_metrics = compute_metrics(
        predictions=end_pos_predictions, labels=batch['end_positions']
    )

    # Store Metrics
    eval_start_pos_metrics.append(start_pos_metrics)
    eval_end_pos_metrics.append(end_pos_metrics)

    # Print Progress
    print(f"batch_number {i} start_pos_metrics {start_pos_metrics} end_pos_metrics {end_pos_metrics}")

# Summarise the run as the unweighted mean of the per-batch accuracies
if eval_start_pos_metrics:
    mean_start_accuracy = sum(m["accuracy"] for m in eval_start_pos_metrics) / len(eval_start_pos_metrics)
    mean_end_accuracy = sum(m["accuracy"] for m in eval_end_pos_metrics) / len(eval_end_pos_metrics)
    print(f"mean start_pos accuracy {mean_start_accuracy:.4f} mean end_pos accuracy {mean_end_accuracy:.4f}")


# Save and Load Tokenizer and Model

In [None]:
# Suggested from Docs: https://huggingface.co/transformers/v1.2.0/serialization.html
# Save Tokenizer and Model
import os

output_dir = "./squad_qa_model/"
os.makedirs(output_dir, exist_ok=True)

# Step 1: Save a model, configuration and vocabulary that you have fine-tuned

# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
model_to_save = model.module if hasattr(model, 'module') else model

# If we save using the predefined names, we can load using `from_pretrained`
WEIGHTS_NAME = "pytorch_model.bin"
CONFIG_NAME = "config.json"
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

# Save from the unwrapped model so the weights and config come from the
# encapsulated model even when `model` is a parallel wrapper
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_pretrained(output_dir)

In [None]:
# Reload the fine-tuned model from the directory it was saved to
saved_model_dir = "squad_qa_model"
pretrained_loaded_model = AutoModelForQuestionAnswering.from_pretrained(saved_model_dir)

# Inference

Use the fine-tuned model for inference with PyTorch.

In [None]:
# Example question and the context passage the answer is extracted from
question = "When was Pakistan founded?"
context = "Pakistan was founded in 1947. It is a country based in South Asia with neighbours such as Iran and Afghanistan. Its majority population follows the religion of Islam."

In [None]:
# Imports are repeated so that this cell does not depend on the training cells
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Load the tokenizer that was saved next to the fine-tuned model
tokenizer = AutoTokenizer.from_pretrained("squad_qa_model")

# Tokenize with the same limits as in preprocessing (only the context is
# truncated, at most 384 tokens) so that a long context cannot exceed the
# input length the model was fine-tuned with
inputs = tokenizer(
    question,
    context,
    max_length=384,
    truncation="only_second",
    return_tensors="pt",
)

# Load Model
pretrained_loaded_model = AutoModelForQuestionAnswering.from_pretrained("squad_qa_model")

# Inference only: no gradients are needed
with torch.no_grad():
    outputs = pretrained_loaded_model(**inputs)


In [None]:
# The predicted answer starts / ends at the token with the largest start / end logit
answer_start_index, answer_end_index = (
    logits.argmax() for logits in (outputs.start_logits, outputs.end_logits)
)

In [None]:
# Take the token ids from the predicted start to the predicted end (inclusive)
# and turn them back into text
answer_span = slice(answer_start_index, answer_end_index + 1)
predict_answer_tokens = inputs.input_ids[0][answer_span]
tokenizer.decode(predict_answer_tokens)