# Finetune a T5 model to translate from English to French

Task Description: We will fine-tune Google's T5 transformer model to translate from English to French. The dataset used is a subset of the OPUS Books dataset.

Original Tutorial: https://huggingface.co/docs/transformers/tasks/translation

In [None]:
!pip install -qqq transformers datasets evaluate accelerate

# Load the OPUS Books Dataset

In [None]:
from datasets import load_dataset

# Load the English-French ("en-fr") configuration of OPUS Books, keeping only
# rows 0-4999 of the train split
books = load_dataset("opus_books", "en-fr", split = 'train[0:5000]')

In [None]:
# Show the loaded dataset object as the cell output
books

In [None]:
# Split the dataset into a train (80%) and test (20%) set.
# A fixed seed makes the shuffle/split reproducible, so results from
# different notebook runs are evaluated on the same held-out examples.
books = books.train_test_split(test_size=0.2, seed=42)

In [None]:
# Look at the data: pretty-print the training example at index 1
import pprint
pprint.pprint(books['train'][1])

# The `translation` column holds both the English source text (the model input) and the French target text


In [None]:
# Preprocessing
## Load Model
## The tokenizer and the seq2seq model are both loaded from the same checkpoint name
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Check Model: print the model's string representation
print(model)

In [None]:
# Report, for every named parameter, whether it requires gradients and its
# shape; each record is followed by a 30-character separator line
for name, param in model.named_parameters():
    print(
        f"Parameter name: {name}",
        f"Requires gradients: {param.requires_grad}",
        f"Parameter shape: {param.shape}",
        "=" * 30,
        sep="\n",
    )

In [None]:
# # Optional: Specify layers to fine tune and which to freeze by setting requires grad to true and false
# layers_to_fine_tune = ['decoder.final_layer_norm.weight']

# # Freeze layers
# for name, param in model.named_parameters():
#     if not any(layer_name in name for layer_name in layers_to_fine_tune):
#         param.requires_grad = False

# # Unfreeze Fine-tune layers
# for name, param in model.named_parameters():
#     if any(layer_name in name for layer_name in layers_to_fine_tune):
#         param.requires_grad = True

# Preprocessing

We need to create a preprocessing function that performs the following operations:

1. Prefix the input with a prompt so T5 knows this is a translation task. Some models capable of multiple NLP tasks require prompting for specific tasks.

2. Tokenize the input (English) and target (French) separately because you can’t tokenize French text with a tokenizer pretrained on an English vocabulary.

3. Truncate sequences to be no longer than the maximum length set by the max_length parameter.

In [None]:
# Keys used to pick the source and target sentence out of each translation pair
source_lang = "en"
target_lang = "fr"
# Task prompt prepended to every source sentence before tokenization
prefix = "translate English to French: "

def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Args:
        examples: Batch whose ``"translation"`` entry is a list of dicts
            keyed by language code (``source_lang`` / ``target_lang``).

    Returns:
        The tokenizer output for the prefixed source sentences, with the
        target sentences passed as ``text_target``; sequences are truncated
        to at most 128 tokens.
    """
    pairs = examples["translation"]
    # Prepend the task prompt to every source sentence
    sources = [prefix + pair[source_lang] for pair in pairs]
    references = [pair[target_lang] for pair in pairs]
    return tokenizer(sources, text_target=references, max_length=128, truncation=True)

In [None]:
# Apply the preprocessing function to the dataset; batched=True hands the
# function batches of examples rather than one example at a time
tokenized_books = books.map(preprocess_function, batched=True)

In [None]:
# Create a batch of examples, with dynamic padding. Use the appropriate collator function.
# Pass the model object itself rather than the checkpoint name: the collator
# can only prepare `decoder_input_ids` from the labels when it is given a
# model that exposes `prepare_decoder_input_ids_from_labels`; a plain string
# silently disables that step.
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Evaluate

We want to create a `compute_metrics` function that monitors a metric during training. For this task, use the SacreBLEU metric.

In [None]:
!pip install sacrebleu

In [None]:
# Drop the raw 'id' and 'translation' columns so only the remaining
# (tokenized) columns reach the data loader
tokenized_books_trch = tokenized_books.remove_columns(['id', 'translation'])

# Switch the dataset's output format to "torch"
tokenized_books_trch.set_format("torch")
# Show the resulting dataset object as the cell output
tokenized_books_trch


In [None]:
# Inspect the first tokenized training example
tokenized_books_trch['train'][0]

In [None]:
from torch.utils.data import DataLoader

# Training batches are drawn in shuffled order; the collator pads each batch
train_dataloader = DataLoader(
    tokenized_books_trch["train"], shuffle=True, batch_size=16, collate_fn=data_collator
)

# Evaluation gains nothing from shuffling; a fixed order keeps the per-batch
# metrics comparable from one run to the next
test_dataloader = DataLoader(
    tokenized_books_trch["test"], shuffle=False, batch_size=16, collate_fn=data_collator
)

In [None]:
import pprint
# Pull a single batch from the training loader to sanity-check its contents
batch = next(iter(train_dataloader))

#print(batch)
# Number of batches per epoch
print(len(train_dataloader))
# Shapes of the three tensors read from the batch
print(f"input_ids batch shape: {batch.input_ids.shape}")
print(f"attention_mask batch shape: {batch.attention_mask.shape}")
print(f"labels batch shape: {batch.labels.shape}")

In [None]:
import evaluate

# Load the SacreBLEU metric; `compute_metrics` calls `metric.compute` on it
metric = evaluate.load("sacrebleu")

# Train Model Using PyTorch

In [None]:
import numpy as np
from transformers import AdamW, get_scheduler
import torch

# Define Optimiser
# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in
# favour of `torch.optim.AdamW` (NOTE(review): the `AdamW` name imported from
# `transformers` above is no longer needed once this is applied).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Define metric helper functions (the loss itself is returned by the model's forward pass)
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a flat list of stripped predictions and
        a list of one-element lists, each holding one stripped reference.
    """
    stripped_preds = [text.strip() for text in preds]
    stripped_labels = (text.strip() for text in labels)
    # Each reference is wrapped in its own list (one reference per prediction)
    return stripped_preds, [[reference] for reference in stripped_labels]


def compute_metrics(predictions, labels):
    """Compute the SacreBLEU score and mean prediction length for a batch.

    Args:
        predictions: Tensor of predicted token ids, or a tuple whose first
            element is that tensor.
        labels: Tensor of reference token ids in which -100 marks positions
            that must be replaced before decoding.

    Returns:
        Dict with ``"bleu"`` (the SacreBLEU ``score``) and ``"gen_len"`` (mean
        count of non-pad tokens per prediction), both plain Python floats
        rounded to 4 decimals so they are cheap to log, store and serialise.
    """
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Swap the -100 placeholders for the pad token id so the labels can be decoded
    labels = torch.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)["score"]

    # Number of non-pad tokens in each predicted sequence
    prediction_lens = [torch.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    # .item() turns the 0-dim (possibly GPU) tensor into a Python float so the
    # returned dict holds no tensor references
    gen_len = torch.stack(prediction_lens).float().mean().item()

    return {"bleu": round(bleu, 4), "gen_len": round(gen_len, 4)}

# Initialize variables to track the best model
# Start at +inf so the first recorded loss always counts as an improvement
best_loss = float('inf')
best_checkpoint_path = None

# Collect Statistics
train_loss = []
train_metrics = []
# NOTE(review): nothing in the visible code appends to test_metrics; the
# evaluation cell collects its results in `eval_metrics` instead
test_metrics = []

## Place training on a GPU when CUDA is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

# Define Learning Rate Scheduler ("linear" schedule, no warmup steps)
num_epochs = 1
# One scheduler step is taken per training batch
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Set up a list to store checkpoints
checkpoint_paths = []

# Switch the model to training mode
model.train()

# Training loop
for epoch in range(num_epochs):
    for i, batch in enumerate(train_dataloader):
        # Set Gradients to 0
        optimizer.zero_grad()

        # Perform a forward model pass
        ## Put the batch onto the training device
        batch = {k: v.to(device) for (k, v) in batch.items()}

        ## Forward Pass
        outputs = model(**batch)

        # Compute Loss
        loss = outputs.loss
        # Convert to a Python float once; logging, statistics, the checkpoint
        # and the best-loss comparison then never hold on to an autograd tensor
        loss_value = loss.item()

        # Compute Metric on the argmax token predictions
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        labels = batch["labels"]

        metrics = compute_metrics(predictions=predictions, labels=labels)

        # Store Metrics
        train_loss.append(loss_value)
        train_metrics.append(metrics)

        # Backward pass to update parameters
        ## Compute gradients with respect to model parameters
        loss.backward()

        # Optimizer step
        ## Use the computed gradients to update the model parameters - adjust parameters in the direction that reduces the loss
        optimizer.step()

        # Update Learning Rate - according to a schedule. This adjusts learning rate dynamically
        lr_scheduler.step()

        # Print Progress
        print(f"epoch {epoch} batch_number {i} loss {loss_value} metrics {metrics}")

    # Save a checkpoint at the end of every epoch.
    # NOTE(review): 'loss' and 'metrics' are those of the LAST batch of the
    # epoch only, which is a noisy criterion for picking the best checkpoint.
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss_value,
        'metrics': metrics,
        # Add other relevant information if needed
    }
    checkpoint_path = f'checkpoint_epoch_{epoch}_batch_{i}.bin'
    torch.save(checkpoint, checkpoint_path)
    checkpoint_paths.append(checkpoint_path)

    # Update best_loss and best_checkpoint_path if needed
    if loss_value < best_loss:
        best_loss = loss_value
        best_checkpoint_path = checkpoint_path




In [None]:
# Load the best model checkpoint.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can still be restored on a CPU-only machine.
best_checkpoint = torch.load(best_checkpoint_path, map_location=device)

# Restore the model weights and optimizer state saved with that checkpoint
model.load_state_dict(best_checkpoint['model_state_dict'])
optimizer.load_state_dict(best_checkpoint['optimizer_state_dict'])

# Bookkeeping values stored alongside the weights
best_epoch = best_checkpoint['epoch']
best_loss = best_checkpoint['loss']
best_metrics = best_checkpoint['metrics']


In [None]:
# Report the epoch, loss and metrics recorded in the best checkpoint
print(best_epoch)
print(best_loss)
print(best_metrics)

In [None]:
# Switch the model to evaluation mode
model.eval()

eval_metrics = []
# One pass over the test set is enough: the model is not updated here, so
# repeating the pass once per training epoch would only duplicate the results.
# NOTE(review): predictions are the argmax of a forward pass that receives the
# labels, not the output of `model.generate`; the BLEU reported here is
# presumably more optimistic than real translation quality - confirm.
for i, batch in enumerate(test_dataloader):
    # Perform a forward model pass
    ## Put the batch onto the evaluation device
    batch = {k: v.to(device) for (k, v) in batch.items()}

    ## Forward Pass - no gradients are needed because parameters are not updated
    with torch.no_grad():
        outputs = model(**batch)

    # Compute Metric
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    labels = batch["labels"]

    metrics = compute_metrics(predictions=predictions, labels=labels)

    # Store Metrics
    eval_metrics.append(metrics)

    # Print Progress
    print(f"batch_number {i} metrics {metrics}")


# Save and Load Tokenizer and Model

In [None]:
# Suggested from Docs: https://huggingface.co/transformers/v1.2.0/serialization.html
# Save Tokenizer and Model
import os

output_dir = "./eng_fr_translation_model/"
os.makedirs(output_dir, exist_ok=True)

# Step 1: Save a model, configuration and vocabulary that you have fine-tuned

# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
model_to_save = model.module if hasattr(model, 'module') else model

# If we save using the predefined names, we can load using `from_pretrained`
WEIGHTS_NAME = "pytorch_model.bin"
CONFIG_NAME = "config.json"
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

# Save through `model_to_save` (not `model`) so that, when the model is
# wrapped, the weights and config of the unwrapped model are what get written
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_pretrained(output_dir)

In [None]:
# Load Model
# Reload the fine-tuned model from the directory written by the save step
# (same location as `output_dir`, given here as a literal path)
pretrained_loaded_model = AutoModelForSeq2SeqLM.from_pretrained("eng_fr_translation_model")

# Inference

In [None]:
# Sentence to translate; it starts with the same task prefix that was
# prepended to the training inputs
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."

In [None]:
# Inference
## Tokenize inputs: keep only the token ids, returned as a PyTorch tensor
inputs = tokenizer(text, return_tensors="pt").input_ids
print(inputs)

In [None]:
# Generate output token ids with the reloaded model: at most 35 new tokens,
# with sampling disabled
outputs = pretrained_loaded_model.generate(inputs,
                                           max_new_tokens=35,
                                           do_sample=False,
                                           )
print(outputs)

In [None]:
import pprint

# Show the source sentence next to the decoded model output.
# The result is a translation, so it is labelled as such (the previous
# "summary_text" label was left over from a summarization example).
original_text = {"original_text": text}
# Decode the first generated sequence, dropping special tokens
translation_text = {"translation_text": tokenizer.decode(outputs[0],
                                                         skip_special_tokens=True
                                                         )}

pprint.pprint(original_text)
pprint.pprint(translation_text)