# Finetune a DistilRoBERTa model on the ELI5 Dataset

Task Description: Masked Language Modelling (MLM) trains a model to predict tokens that have been masked out of a sequence. Given a sequence in which some tokens are replaced by a mask token, the model predicts the original tokens from the surrounding context.

Original Tutorial: https://huggingface.co/docs/transformers/tasks/masked_language_modeling

In [None]:
!pip install -q transformers datasets evaluate accelerate

# Load ELI5 dataset

In [None]:
from datasets import load_dataset

# Load only the first 5,000 rows of the `train_asks` split to keep the demo small.
# NOTE(review): confirm that the "eli5" dataset id is still downloadable from the
# Hub; the upstream tutorial may since have moved to a different ELI5 variant.
eli5 = load_dataset("eli5", split = "train_asks[:5000]")

In [None]:
# Split the dataset into a train and test set (20% of the rows go to "test").
# No `seed` is passed, so the split is presumably different on every run - TODO confirm.
eli5 = eli5.train_test_split(test_size=0.2)

In [None]:
# Look at the data: pretty-print the first training example
import pprint
pprint.pprint(eli5['train'][0])

# The answer text (read as the "answers.text" column once the dataset is flattened
# further down) is our model input


In [None]:
# Preprocessing
## Load Model
from transformers import AutoTokenizer, AutoModelForMaskedLM

# The tokenizer and the masked-LM model are both loaded from the same checkpoint
# name so that their vocabularies match.
model_checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
# Check Model: print the model's repr to inspect how it is put together
print(model)

In [None]:
# Report the name, trainability flag and shape of every model parameter,
# followed by a separator line.
separator = "=" * 30
for param_name, tensor in model.named_parameters():
    report = (
        f"Parameter name: {param_name}",
        f"Requires gradients: {tensor.requires_grad}",
        f"Parameter shape: {tensor.shape}",
        separator,
    )
    print("\n".join(report))

In [None]:
# # Optional: Specify layers to fine tune and which to freeze by setting requires grad to true and false
# layers_to_fine_tune = ['decoder.final_layer_norm.weight']

# # Freeze layers
# for name, param in model.named_parameters():
#     if not any(layer_name in name for layer_name in layers_to_fine_tune):
#         param.requires_grad = False

# # Unfreeze Fine-tune layers
# for name, param in model.named_parameters():
#     if any(layer_name in name for layer_name in layers_to_fine_tune):
#         param.requires_grad = True

# Preprocessing
We need to create a preprocess function that we will apply to every instance in the dataset. The preprocess function needs to:

1. Flatten the instance so that the text column is easily accessible
2. Join any list of strings
3. Tokenize result

Some token sequences will be **longer** than the maximum input length for the model. Hence we use a second preprocessing function to:

1. Concatenate all token sequences
2. Split the concatenated sequences into shorter chunks defined by a `block_size` parameter.

In [None]:
# The text field is nested so we need to flatten each instance.
# After flattening, the nested field is read below as the "answers.text" column.
eli5 = eli5.flatten()
# Print the first training example again to see the flattened column names
pprint.pprint(eli5['train'][0])

In [None]:
# Operation to apply to every instance: join all answers of one example into a
# single string, then tokenize that string.
# The row is indexed first so that only one example is read, instead of
# materialising the whole "answers.text" column (twice) just to look at its
# first entry.
first_answers_text = " ".join(eli5['train'][0]['answers.text'])
print(first_answers_text, "\n")
print(tokenizer(first_answers_text))

In [None]:
# Wrap in a preprocess function
def preprocess_function(examples):
  """Tokenize a batch of examples.

  Every entry of ``examples["answers.text"]`` (a list of strings) is joined
  with single spaces into one string; the resulting list of strings is passed
  to the module-level ``tokenizer`` in a single call and its result is
  returned unchanged.
  """
  joined_answers = list(map(" ".join, examples["answers.text"]))
  return tokenizer(joined_answers)

In [None]:
# Apply preprocessing over the entire dataset.
# batched=True hands `preprocess_function` several examples per call, num_proc=4
# requests four worker processes, and every original column is removed so only
# the tokenizer output remains.
tokenized_eli5 = eli5.map(preprocess_function, batched = True, num_proc=4, remove_columns=eli5['train'].column_names)

In [None]:
def group_texts(examples, block_size: int = 128):
  """Concatenate a batch of tokenized examples and re-split it into blocks.

  Args:
    examples: Mapping from feature name (for example ``"input_ids"``) to a
      list of per-example token lists. It must contain an ``"input_ids"`` key.
    block_size: Length of every chunk that is produced.

  Returns:
    A dict with one list of chunks per input key, plus a ``"labels"`` entry
    that is a shallow copy of the ``"input_ids"`` chunks. When the
    concatenated length is at least ``block_size`` the trailing remainder is
    dropped; shorter input is returned as a single, shorter chunk.
  """
  # Concatenate all texts. A nested comprehension is linear in the number of
  # tokens, whereas ``sum(lists, [])`` re-copies the accumulated list on every
  # addition and is quadratic in the batch size.
  concatenated_examples = {
      k: [token for sequence in examples[k] for token in sequence]
      for k in examples.keys()
  }
  # Every feature is assumed to have the same total length, so the first key
  # is used as the reference.
  total_length = len(concatenated_examples[list(examples.keys())[0]])

  # Drop the remainder so that all chunks have exactly ``block_size`` tokens
  if total_length >= block_size:
    total_length = (total_length // block_size) * block_size
  # Split by chunks of block size
  result = {
      k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
      for k, t in concatenated_examples.items()
  }
  result["labels"] = result["input_ids"].copy()
  return result


In [None]:
# Apply second preprocessing over the entire dataset.
# `group_texts` is called with its default block_size (128) on batches of
# tokenized examples, using four worker processes.
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

In [None]:
# Pretty-print the first chunk produced by `group_texts`
pprint.pprint(lm_dataset['train'][0])

In [None]:
# Create a batch of examples, with dynamic padding. Use the appropriate collator function
from transformers import DataCollatorForLanguageModeling

# Pad with the end-of-sequence token.
# NOTE(review): the checkpoint's tokenizer may already define its own pad token -
# confirm that overriding it here is intended.
tokenizer.pad_token = tokenizer.eos_token
# mlm_probability=0.15 is the masking probability handed to the collator
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm_probability = 0.15)

# Evaluate

We want to create a `compute_metrics` function that monitors a metric during training.

In [None]:
!pip install evaluate

In [None]:
# Plain assignment: `tokenized_eli5_trch` is a second name for the same object as
# `lm_dataset`, not a copy, so the format change below applies to both names.
tokenized_eli5_trch = lm_dataset

# Ask the dataset to return its columns in "torch" format
tokenized_eli5_trch.set_format("torch")
# Display the dataset summary as the cell output
tokenized_eli5_trch

In [None]:
# Display the first training example after the "torch" format was applied
tokenized_eli5_trch['train'][0]

In [None]:
from torch.utils.data import DataLoader

# Training batches: 16 examples each, reshuffled, assembled by the
# language-modelling collator.
train_dataloader = DataLoader(
    tokenized_eli5_trch['train'], shuffle = True, batch_size = 16, collate_fn=data_collator
)

# Held-out batches use the same settings (including shuffle = True).
test_dataloader = DataLoader(
    tokenized_eli5_trch['test'], shuffle = True, batch_size = 16, collate_fn=data_collator
)

In [None]:
import pprint
# Pull a single batch from the training loader as a sanity check
batch = next(iter(train_dataloader))

#print(batch)
# Number of batches in the training loader
print(len(train_dataloader))
# Shapes of the three tensors in the collated batch
print(f"input_ids batch shape: {batch.input_ids.shape}")
print(f"attention_mask batch shape: {batch.attention_mask.shape}")
print(f"labels batch shape: {batch.labels.shape}")

In [None]:
import evaluate

# Load the "perplexity" metric.
# NOTE(review): in the code visible here `metric` is only referenced from
# commented-out code, so this load currently has no effect on training.
metric = evaluate.load("perplexity")

# Train Model Using PyTorch

In [None]:
import numpy as np
import torch
# `AdamW` is taken from PyTorch: the implementation that used to be exported by
# `transformers` was deprecated and is not available in recent releases.
from torch.optim import AdamW
from transformers import get_scheduler

# Define Optimiser: AdamW over all model parameters with decoupled weight decay
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Define Loss Function

# def compute_metrics(predictions, labels):

#     if isinstance(predictions, tuple):
#         predictions = predictions[0]
#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

#     labels = torch.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

#     result = metric.compute(predictions=decoded_preds, references=decoded_labels)
#     result = {"bleu": result["score"]}

#     prediction_lens = [torch.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
#     result["gen_len"] = torch.stack(prediction_lens).float().mean()
#     result = {k: v for k, v in result.items()}
#     return result

# Best-model bookkeeping: lowest loss seen so far and the checkpoint file that
# produced it (none yet).
best_checkpoint_path = None
best_loss = float('inf')

# Collect Statistics
train_loss, train_metrics, test_metrics = [], [], []

## Place training on a GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

# Define Learning Rate Scheduler: "linear" schedule, no warm-up steps, one
# scheduler step per training batch.
num_epochs = 1
num_training_steps = len(train_dataloader) * num_epochs
scheduler_options = {
    "optimizer": optimizer,
    "num_warmup_steps": 0,
    "num_training_steps": num_training_steps,
}
lr_scheduler = get_scheduler("linear", **scheduler_options)

In [None]:
# Set up a list to store checkpoints
checkpoint_paths = []

model.train()

# Training loop
for epoch in range(num_epochs):
    # Running statistics for this epoch. Checkpoints are ranked by the mean
    # loss over the whole epoch: the loss of the last batch alone is too noisy
    # to decide which checkpoint is "best".
    epoch_loss_sum = 0.0
    num_batches = 0

    for i, batch in enumerate(train_dataloader):
        # Set Gradients to 0
        optimizer.zero_grad()

        # Perform a forward model pass
        ## Put the batch onto a GPU
        batch = {k: v.to(device) for (k, v) in batch.items()}

        ## Forward Pass
        outputs = model(**batch)

        # Compute Loss
        loss = outputs.loss

        # Compute Metric inputs (only consumed by the disabled metric hook below)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        labels = batch["labels"]

        #metrics = compute_metrics(predictions=predictions, labels=labels)

        # Store Metrics as a plain Python float so that no tensor (and no
        # autograd graph) is kept alive by the statistics.
        batch_loss = loss.item()
        train_loss.append(batch_loss)
        epoch_loss_sum += batch_loss
        num_batches += 1
        #train_metrics.append(metrics)

        # Backward pass to update parameters
        ## Compute gradients with respect to model parameters
        loss.backward()

        # Optimizer step
        ## Use the computed gradients to update the model parameters - adjust parameters in the direction that reduces the loss
        optimizer.step()

        # Update Learning Rate - according to a schedule. This adjusts learning rate dynamically
        lr_scheduler.step()

        # Print Progress
        #print(f"epoch {epoch} batch_number {i} loss {loss} metrics {metrics}")
        print(f"epoch {epoch} batch_number {i} loss {loss}")

    if num_batches == 0:
        raise RuntimeError("train_dataloader produced no batches; nothing to checkpoint")
    epoch_loss = epoch_loss_sum / num_batches

    # Save a checkpoint at the end of every epoch
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        # A float rather than the live loss tensor: the tensor is bound to the
        # training device and its autograd graph, which makes the checkpoint
        # harder to load elsewhere.
        'loss': epoch_loss,
        #'metrics': metrics,
        # Add other relevant information if needed
    }
    checkpoint_path = f'checkpoint_epoch_{epoch}_batch_{i}.bin'
    torch.save(checkpoint, checkpoint_path)
    checkpoint_paths.append(checkpoint_path)

    # Update best_loss and best_checkpoint_path if needed
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_checkpoint_path = checkpoint_path




In [None]:
# Load the best model checkpoint
if best_checkpoint_path is None:
    raise RuntimeError("No checkpoint was recorded; run the training loop first")

# `map_location` places the stored tensors on the device currently in use, so a
# checkpoint written on a GPU can still be restored on a CPU-only runtime.
best_checkpoint = torch.load(best_checkpoint_path, map_location=device)

# Restore the model weights and the optimizer state saved with them
model.load_state_dict(best_checkpoint['model_state_dict'])
optimizer.load_state_dict(best_checkpoint['optimizer_state_dict'])

best_epoch = best_checkpoint['epoch']
best_loss = best_checkpoint['loss']
#best_metrics = best_checkpoint['metrics']


In [None]:
# Report the epoch index and loss read back from the best checkpoint
print(best_epoch)
print(best_loss)
#print(best_metrics)

In [None]:
import math

model.eval()

eval_metrics = []
# Per-batch loss on the held-out data. Without it the loop below would run the
# model but measure nothing.
eval_losses = []
# NOTE(review): the outer loop repeats the evaluation `num_epochs` times - kept
# from the original; a single pass is presumably enough.
for epoch in range(num_epochs):
  for i, batch in enumerate(test_dataloader):
    # Perform a forward model pass
    ## Put the batch onto a GPU
    batch = {k: v.to(device) for (k, v) in batch.items()}

    ## Forward Pass - Set no grad because we don't want to update parameters in validation
    with torch.no_grad():
        outputs = model(**batch)

    # Record the loss of this batch as a plain float
    eval_losses.append(outputs.loss.item())

    # Compute Metric inputs (only consumed by the disabled metric hook below)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim = -1)
    labels = batch["labels"]

    #metrics = compute_metrics(predictions=predictions, labels=labels)

    # Store Metrics
    #eval_metrics.append(metrics)

    # Print Progress
    #print(f"epoch {epoch} batch_number {i} metrics {metrics}")
    print(f"epoch {epoch} batch_number {i}")

# Summarise the evaluation: batch-averaged loss and the matching perplexity
# (the exponential of the mean loss).
if eval_losses:
  mean_eval_loss = sum(eval_losses) / len(eval_losses)
  print(f"mean eval loss {mean_eval_loss:.4f} perplexity {math.exp(mean_eval_loss):.2f}")


# Save and Load Tokenizer and Model

In [None]:
# Suggested from Docs: https://huggingface.co/transformers/v1.2.0/serialization.html
# Save Tokenizer and Model
import os

output_dir = "./eli5_mlm/"
os.makedirs(output_dir, exist_ok=True)

# Step 1: Save a model, configuration and vocabulary that you have fine-tuned

# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
model_to_save = model.module if hasattr(model, 'module') else model

# If we save using the predefined names, we can load using `from_pretrained`
WEIGHTS_NAME = "pytorch_model.bin"
CONFIG_NAME = "config.json"
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

# Save from the unwrapped model: a parallel wrapper prefixes every state-dict
# key with "module." and does not expose `.config`, so using `model` here would
# defeat the unwrapping done above.
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_pretrained(output_dir)

# Inference


In [None]:
# Prompt for the fill-mask demo; it contains a single `<mask>` placeholder that
# the model is asked to fill in.
text = "Pakistan is a country that is in the continent of <mask>"

In [None]:
# Inference Pipeline using Pytorch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Reload the tokenizer that was saved next to the fine-tuned weights
tokenizer = AutoTokenizer.from_pretrained("eli5_mlm")
inputs = tokenizer(text, return_tensors="pt").input_ids
# Position of the mask token in the sequence; `.item()` requires that the
# prompt contains exactly one mask.
mask_token_index = torch.where(inputs == tokenizer.mask_token_id)[1].item()

model = AutoModelForMaskedLM.from_pretrained("eli5_mlm")
# Inference only: skip autograd bookkeeping so no graph is built for the
# forward pass.
with torch.no_grad():
    logits = model(inputs).logits
# Vocabulary scores at the masked position of the (single) input sequence
mask_token_logits = logits[0, mask_token_index, :]


In [None]:
# Take the three highest-scoring vocabulary ids at the masked position and print
# the prompt once per candidate with the mask replaced by the decoded token.
top_3_tokens = torch.topk(mask_token_logits, k=3, dim=0).indices.tolist()

for candidate_id in top_3_tokens:
  filled_text = text.replace(tokenizer.mask_token, tokenizer.decode([candidate_id]))
  print(filled_text)