In [None]:
# created on GCP Vertex AI Notebook using  `Python 3 (CUDA Toolkit 11.0)` environment
# using n1-standard-4 (4 vCPUS, 15 GB RAM) compute w/ 1 NVIDIA T4 GPU

# dependencies
# %pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
# %pip install transformers datasets evaluate rouge-score nltk py7zr
# import nltk; nltk.download("punkt")  # one-time download of the sentence-tokenizer data needed by nltk.sent_tokenize

# note: installing an older version of pytorch so that cuda versions match
# note: py7zr is needed for the `samsum` dataset, may or may not be needed for other datasets

In [1]:
import torch
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

In [2]:
# sanity check: confirm that PyTorch can see a CUDA device before going any further
torch.cuda.is_available()

True

In [3]:
# show which GPU model the notebook is running on
torch.cuda.get_device_name()

'Tesla T4'

## Notebook Params

In [4]:
# what to fine-tune, and on which dataset (both are passed to the Hugging Face loaders below)
model_name = "google/flan-t5-base"
dataset_name = "samsum"

# directory handed to the loaders as `cache_dir`, so downloads are reused between runs
CACHE_DIR = "/home/jupyter/data/transformers"

# seed used when shuffling the dataset splits
SEED = 0

# NOTE(review): N_SAMPLES is not referenced by any cell shown in this notebook — confirm whether it is still needed
N_SAMPLES = 100

## Load Data, Tokenizer, Model, and Evaluation Metric

We will be using the `samsum` dataset, which contains text-message conversations (dialogues) and their human-written summaries.

https://huggingface.co/datasets/samsum

In [5]:
# load the dataset using a custom cache location;
# the cache prevents re-downloading the dataset every time the notebook runs
dataset = load_dataset(dataset_name, cache_dir=CACHE_DIR)
dataset

Found cached dataset samsum (/home/jupyter/data/transformers/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [6]:
# look up the tokenizer that matches `model_name`; it converts raw text into
# the token-id format that the model expects (files are cached under CACHE_DIR)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)

In [7]:
# load the pretrained seq2seq checkpoint named by `model_name` (`google/flan-t5-base`);
# this is the model that will be fine-tuned on the dataset loaded above
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=CACHE_DIR)

The most commonly used metric for evaluating summarization tasks is the [ROUGE score](https://en.wikipedia.org/wiki/ROUGE_(metric)) (short for Recall-Oriented Understudy for Gisting Evaluation). This metric does not behave like standard accuracy: it compares a generated summary against one or more reference summaries.

In [8]:
# load the ROUGE metric; it is used in `compute_metrics` to score generated summaries
metric = evaluate.load("rouge")

## Prepare Data

Read more about padding and truncation when using the tokenizer here: \
https://huggingface.co/docs/transformers/pad_truncation

In [17]:
def preprocess(sample):
    """Tokenize a batch of dialogues together with their reference summaries.

    Intended to be applied with `dataset.map(..., batched=True)`, so
    `sample["dialogue"]` and `sample["summary"]` are sequences of strings.

    Returns the tokenizer output for the prefixed dialogues, with an extra
    `labels` entry holding the token ids of the summaries.
    """
    # T5-style models are told which task to perform through a text prefix
    task_prefix = "summarize: "
    prompts = [task_prefix + dialogue for dialogue in sample["dialogue"]]

    # No padding is requested for inputs or targets: the data collator pads
    # each batch later on. Truncation is enabled for both.
    encoded = tokenizer(text=prompts, truncation=True)

    # `text_target` tokenizes the summaries as targets; only their ids are
    # kept, stored under the `labels` key
    encoded["labels"] = tokenizer(text_target=sample["summary"], truncation=True)["input_ids"]

    return encoded

In [18]:
# Tokenize every split of the dataset with `preprocess`.
# note 1: if the tokenizer calls had used padding=True, the observations would be
#         padded when tokenized here rather than per batch by the data collator
# note 2: `map` adds the tokenized columns; `remove_columns` drops the raw text
#         columns that are no longer needed
tokenized_dataset = dataset.map(
    function=preprocess,
    batched=True,
    remove_columns=["id", "dialogue", "summary"],
)
tokenized_dataset

Loading cached processed dataset at /home/jupyter/data/transformers/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-79ed9ef5fd96705e.arrow
Loading cached processed dataset at /home/jupyter/data/transformers/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-b81ee4e2b0b6277f.arrow
Loading cached processed dataset at /home/jupyter/data/transformers/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-e1c8959da63e034a.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

In [19]:
# Show how one training example looks raw, then tokenized, then decoded back to text.
# Note again that no padding has been applied at this point, so the two rows in
# `sample` can have different lengths.
sample = tokenized_dataset["train"][25:27]

print("~~~~original inputs~~~~~")
# index the row first, then the column, so only one example is read
# instead of materialising the whole `dialogue` column
print(dataset["train"][25]["dialogue"])

print("~~~~encoded inputs~~~~~")
print(sample["input_ids"][0])

print("~~~~decoded inputs~~~~~")
print(tokenizer.decode(sample["input_ids"][0]))

print("~~~~encoded targets~~~~~")
print(sample["labels"][0])

# header fixed: this section prints the decoded *targets* (labels), not the inputs
print("~~~~decoded targets~~~~~")
print(tokenizer.decode(sample["labels"][0]))

print("~~~~sample length in batch~~~~~")
print([len(x) for x in sample["input_ids"]])

~~~~original inputs~~~~~
Julius: dude, your assessment of manutd
Lawrence: i have nothing to say, im so offended and hopeless of them this season
Julius: me too
Lawrence: i dont even know whats wrong with the team
Julius: the quality is there but nothing is happening
Lawrence: the players look tired of something
Julius:  with mourinhos conservative football!!
Lawrence: its so boring
Julius: so lifeless
Lawrence: man!!
Julius: it needs to change, hope the board sees it
Lawrence: sooner than later
Julius: yeah
Lawrence: yeah
~~~~encoded inputs~~~~~
[21603, 10, 9983, 302, 10, 146, 221, 6, 39, 4193, 13, 954, 4796, 26, 16617, 10, 3, 23, 43, 1327, 12, 497, 6, 256, 78, 326, 14550, 11, 897, 924, 13, 135, 48, 774, 9983, 302, 10, 140, 396, 16617, 10, 3, 23, 2483, 237, 214, 125, 7, 1786, 28, 8, 372, 9983, 302, 10, 8, 463, 19, 132, 68, 1327, 19, 4626, 16617, 10, 8, 1508, 320, 7718, 13, 424, 9983, 302, 10, 28, 3, 51, 1211, 23738, 7, 11252, 3370, 1603, 16617, 10, 165, 78, 13006, 9983, 302, 10, 78, 2

## Fine Tune

We first need to define a function that computes the evaluation metrics (ROUGE) from the model's predictions and the reference labels.

In [20]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred: a `(predictions, labels)` pair of token-id arrays.
            `predictions` may also be a tuple whose first element holds the
            generated token ids.

    Returns:
        dict mapping each metric name returned by `metric.compute` (plus
        `gen_len`, the mean number of non-padding tokens per prediction)
        to its value rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # when extra outputs come along with the generated ids, keep only the ids
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the "ignore" id used for label padding, and it can also appear in
    # the gathered predictions; it cannot be decoded, so swap it for the pad
    # token in BOTH arrays before decoding
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # token ids -> text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # returns a dictionary of metric: score pairs
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Add mean generated length, will be shown during training loop output.
    # Counted after the -100 replacement so padding of either kind is excluded.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [21]:
# dynamically pad the inputs of each batch to the longest item in that batch,
# as opposed to padding everything to the max length of the entire dataset
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    # pass the model object itself (not its name string): the collator's `model`
    # argument expects the model being trained
    model=model,
    padding=True,
    label_pad_token_id=-100 # pytorch ignores during loss when label ids are -100
)

In [22]:
# Training configuration for the fine-tuning run
training_args = Seq2SeqTrainingArguments(
    # --- output / checkpoints ---
    output_dir="flan-t5-base-tune/",
    save_total_limit=2,
    push_to_hub=False,
    # --- optimisation ---
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=False,  # setting this to True produced NaNs during evaluation
    # --- batch sizes (kept small, important for avoiding OOM) ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- evaluate, checkpoint and log once per epoch ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    # --- use generation during evaluation ---
    predict_with_generate=True,
)

In [23]:
# Create smaller training and test samples to speed up training.
# This is optional, though recommended to check that everything runs without
# errors before scaling up to the full dataset.
TRAIN_SUBSET_SIZE = 500
EVAL_SUBSET_SIZE = 500

train_split = tokenized_dataset["train"]
test_split = tokenized_dataset["test"]

# clamp the requested size to the split length so that a dataset with fewer
# rows than requested does not make `select` fail on out-of-range indices
small_train = train_split.shuffle(seed=SEED).select(range(min(TRAIN_SUBSET_SIZE, len(train_split))))
small_test = test_split.shuffle(seed=SEED).select(range(min(EVAL_SUBSET_SIZE, len(test_split))))

Loading cached shuffled indices for dataset at /home/jupyter/data/transformers/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-7bfdb573b34d23fd.arrow
Loading cached shuffled indices for dataset at /home/jupyter/data/transformers/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-d0e26d897b4501cc.arrow


In [24]:
# Wire the model, training arguments, data and metric function into a trainer.
# To use the full dataset instead of the small subsets, pass
# tokenized_dataset["train"] as train_dataset and tokenized_dataset["test"] as eval_dataset.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=small_train,
    eval_dataset=small_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [25]:
# Start training; `result.metrics` holds the run statistics reported below
result = trainer.train()

# total training time (presumably in seconds — confirm) and training throughput
print(f"Time: {result.metrics['train_runtime']:.2f}")
print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.5107,1.427117,0.4597,0.2255,0.3849,0.4257,17.142
2,1.3819,1.429349,0.461,0.2247,0.3857,0.4235,17.222
3,1.2831,1.444056,0.4579,0.2223,0.3811,0.4227,17.408
4,1.2365,1.448948,0.461,0.2272,0.3861,0.4251,17.262
5,1.1968,1.452174,0.4615,0.2268,0.3847,0.4251,17.374


Time: 606.22
Samples/second: 4.12


In [28]:
# evaluate the model held by the trainer on its eval_dataset (`small_test`, a
# subset of the test split); since load_best_model_at_end=True this is the best
# checkpoint from training rather than the last one
trainer.evaluate()

{'eval_loss': 1.4271172285079956,
 'eval_rouge1': 0.4597,
 'eval_rouge2': 0.2255,
 'eval_rougeL': 0.3849,
 'eval_rougeLsum': 0.4257,
 'eval_gen_len': 17.142,
 'eval_runtime': 53.5001,
 'eval_samples_per_second': 9.346,
 'eval_steps_per_second': 1.178,
 'epoch': 5.0}

In [58]:
# save the fine-tuned model to a local directory
# NOTE(review): the trainer was created without a tokenizer, so the tokenizer is
# presumably not written here — confirm, and save it separately if needed
trainer.save_model("flan-t5-based-tuned-to-max")

# Resources

https://www.philschmid.de/fine-tune-flan-t5  
https://huggingface.co/course/chapter7/5?fw=pt