# Fine-tune T5-small on CNN/DailyMail

## Libraries and environment preparation

In [1]:
#Install essential packages
%%capture
! pip install datasets transformers rouge-score nltk wandb
!apt install git-lfs

In [2]:
#Colab Environment Check for GPU and RAM
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

#GPU check
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Your runtime has 27.3 gigabytes of available RAM

Fri Feb  4 00:43:28 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    30W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------------

Make sure your version of Transformers is at least 4.11.0, since this notebook relies on functionality that was introduced in that version:

In [3]:
# The code below needs Transformers >= 4.11.0 to run correctly; print the
# installed version so it can be compared against that minimum.
import datasets
import transformers

print(transformers.__version__)

4.16.2


In [4]:
# Import wandb and configure its credentials.
# The API key must never be hard-coded in the notebook: anything committed or
# shared with the file is readable by everyone. A key that was previously
# stored here should be treated as leaked and revoked in the W&B settings.
import os
from getpass import getpass

import wandb

# Reuse a key already exported in the environment; otherwise prompt for it
# without echoing, so the secret is not saved in the notebook source or output.
API_KEY = os.environ.get("WANDB_API_KEY") or getpass("Weights & Biases API key: ")
os.environ["WANDB_API_KEY"] = API_KEY

## Loading and preprocessing the dataset

In [5]:
# Load the 'cnn_dailymail' dataset in its '3.0.0' configuration
raw_datasets = datasets.load_dataset(path='cnn_dailymail', name='3.0.0')

Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Display the loaded splits with their features and row counts
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [7]:
# Checkpoint to fine-tune; the tokenizer, the task prefix and the model are
# all derived from this single name.
from transformers import T5TokenizerFast

model_checkpoint = "t5-small"
# Load the tokenizer from the same checkpoint variable (not a repeated
# literal) so it cannot drift out of sync with the model if the name changes.
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint)

In [8]:
# If you are using one of the five T5 checkpoints we have to prefix
# the inputs with "summarize:" (t5 is a multi-task model).
# Any other checkpoint gets an empty prefix, so `prefix` is always defined
# when preprocess_function concatenates it with the article text.

if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

In [9]:
# Tokenize the articles (inputs) and highlights (targets) for use with map

max_input_length = 512
max_target_length = 128

def preprocess_function(examples):
    """Tokenize one batch of examples into model inputs plus labels.

    Args:
        examples: mapping with an "article" list (source texts) and a
            "highlights" list (reference summaries).

    Returns:
        The tokenizer output for the prefixed articles, with an extra
        "labels" entry holding the token ids of the highlights.

    Relies on the module-level `prefix`, `tokenizer`, `max_input_length`
    and `max_target_length`.
    """
    prefixed_articles = []
    for article in examples["article"]:
        prefixed_articles.append(prefix + article)
    model_inputs = tokenizer(
        prefixed_articles,
        truncation=True,
        max_length=max_input_length,
    )

    # Targets are encoded inside the tokenizer's target-mode context
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(
            examples["highlights"],
            truncation=True,
            max_length=max_target_length,
        )

    model_inputs["labels"] = target_encodings["input_ids"]
    return model_inputs

In [10]:
# Apply preprocess_function to every split; batched=True passes the function
# a batch of examples per call instead of a single example
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234/cache-050c2bd6f70abd9a.arrow


  0%|          | 0/14 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234/cache-48bc424f15731890.arrow


In [11]:
# Display the tokenized splits to confirm the columns added by the mapping
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11490
    })
})

## Fine-tuning the model

In [12]:
# Bring in the Hugging Face seq2seq model/trainer classes and load the
# pretrained model for the selected checkpoint

from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [13]:
# Data collator: pads the inputs and labels batch by batch, which saves
# space compared with padding everything up front
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [14]:
# Start a Weights & Biases run in the "T5-small-cnn" project to keep track of training
wandb.init(project="T5-small-cnn")

[34m[1mwandb[0m: Currently logged in as: [33mshusunny[0m (use `wandb login --relogin` to force relogin)


Define the `compute_metrics` function that the `Seq2SeqTrainer` will use to compute the metrics from the predictions; it also does a bit of pre-processing to decode the predictions into text:

In [15]:
# Metric setup: ROUGE scorer plus the NLTK 'punkt' data needed for
# sentence splitting
import numpy as np
import nltk

nltk.download('punkt')

metric = datasets.load_metric("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict mapping each ROUGE key to its mid F-measure scaled to 0-100,
        plus ``"gen_len"`` (mean count of non-padding prediction tokens);
        every value is rounded to 4 decimals.

    Relies on the module-level ``tokenizer``, ``metric``, ``nltk`` and ``np``.
    """
    predictions, labels = eval_pred
    # Some trainer/model combinations hand back a tuple of outputs; the
    # generated token ids are then the first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is the ignore index used when padding; it is not a real token id,
    # so it cannot be decoded and must not be counted as generated text.
    # Predictions get the same replacement as labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each score, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (padding excluded)
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
# Define training args, batch size and epochs
# NOTE(review): an earlier note here said "batch size max 8 for input length
# 1024 on Colab Pro"; inputs in this notebook are truncated to 512 tokens and
# a batch size of 16 is used - confirm it fits on the target GPU.

batch_size = 16
epochs = 1
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-cnn",  # output directory for checkpoints
    # load_best_model_at_end is a boolean switch; the metric used to pick the
    # best checkpoint is named separately (lower eval_loss is better).
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=1e-3,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    logging_steps=1000,  # log the training loss every 1000 steps
    # Saving and evaluation share one interval so every saved checkpoint has
    # an eval_loss to compare when the best model is loaded at the end.
    save_steps=1250,
    eval_steps=1250,
    save_total_limit=3,  # keep at most 3 checkpoints on disk
    num_train_epochs=epochs,
    # NOTE(review): evaluation generates summaries with the model's default
    # generation length; the logged "Gen Len" of ~19 tokens suggests outputs
    # are cut short relative to max_target_length. Consider passing
    # generation_max_length=max_target_length (slower evaluation) - confirm.
    predict_with_generate=True,
    fp16=True,
    report_to="wandb",
)

In [17]:
# Assemble the trainer from the model, training arguments, data splits,
# collator, tokenizer and metric function

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp half precision backend


We can now fine-tune our model by just calling the `train` method:

In [18]:
# Run fine-tuning; evaluation and checkpointing follow the step intervals set in the training arguments
trainer.train()

The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: article, id, highlights.
***** Running training *****
  Num examples = 287113
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 17945
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1250,2.1964,1.936945,24.1122,11.2715,19.8766,22.6875,18.9999
2500,2.1643,1.919584,24.0384,11.3177,19.8258,22.7463,18.9999
3750,2.1626,1.89103,24.1466,11.3331,19.8913,22.7995,18.9999
5000,2.1155,1.882262,24.0329,11.3216,19.8389,22.6311,19.0
6250,2.103,1.866311,24.2303,11.395,19.9275,22.8117,18.9997
7500,2.0793,1.851798,24.4045,11.5897,20.1026,22.9898,18.9999
8750,2.0774,1.835055,24.3567,11.5969,20.0902,22.9236,19.0
10000,2.0545,1.828629,24.3185,11.5127,20.0527,22.954,19.0
11250,2.0417,1.825096,24.2362,11.6083,20.0507,22.8681,19.0
12500,2.0332,1.802971,24.3926,11.6255,20.1828,23.0191,18.9999


The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: article, id, highlights.
***** Running Evaluation *****
  Num examples = 13368
  Batch size = 16
Saving model checkpoint to t5-small-finetuned-cnn/checkpoint-1250
Configuration saved in t5-small-finetuned-cnn/checkpoint-1250/config.json
Model weights saved in t5-small-finetuned-cnn/checkpoint-1250/pytorch_model.bin
tokenizer config file saved in t5-small-finetuned-cnn/checkpoint-1250/tokenizer_config.json
Special tokens file saved in t5-small-finetuned-cnn/checkpoint-1250/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: article, id, highlights.
***** Running Evaluation *****
  Num examples = 13368
  Batch size = 16
Saving model checkpoint to t5-small-finetuned-cnn/checkpoint-2500
Configuration saved in t5-small-finetuned-cnn/ch

TrainOutput(global_step=17945, training_loss=2.0680589303361834, metrics={'train_runtime': 13714.6917, 'train_samples_per_second': 20.935, 'train_steps_per_second': 1.308, 'total_flos': 3.885839064603034e+16, 'train_loss': 2.0680589303361834, 'epoch': 1.0})

In [20]:
# Close the Weights & Biases run started with wandb.init
wandb.finish()

In [21]:
# Mount Google Drive at /content/drive so files can be copied out of the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [23]:
# List the checkpoint directories left in the training output folder
!ls t5-small-finetuned-cnn/

checkpoint-15000  checkpoint-16250  checkpoint-17500


In [24]:
# Recursively archive the checkpoint-17500 directory into /content/t5-small-finetuned-cnn.zip
!zip -r /content/t5-small-finetuned-cnn.zip /content/t5-small-finetuned-cnn/checkpoint-17500/

  adding: content/t5-small-finetuned-cnn/checkpoint-17500/ (stored 0%)
  adding: content/t5-small-finetuned-cnn/checkpoint-17500/scheduler.pt (deflated 49%)
  adding: content/t5-small-finetuned-cnn/checkpoint-17500/training_args.bin (deflated 49%)
  adding: content/t5-small-finetuned-cnn/checkpoint-17500/scaler.pt (deflated 55%)
  adding: content/t5-small-finetuned-cnn/checkpoint-17500/tokenizer_config.json (deflated 80%)
  adding: content/t5-small-finetuned-cnn/checkpoint-17500/special_tokens_map.json (deflated 83%)
  adding: content/t5-small-finetuned-cnn/checkpoint-17500/trainer_state.json (deflated 80%)
  adding: content/t5-small-finetuned-cnn/checkpoint-17500/config.json (deflated 62%)
  adding: content/t5-small-finetuned-cnn/checkpoint-17500/optimizer.pt (deflated 7%)
  adding: content/t5-small-finetuned-cnn/checkpoint-17500/tokenizer.json (deflated 74%)
  adding: content/t5-small-finetuned-cnn/checkpoint-17500/rng_state.pth (deflated 27%)
  adding: content/t5-small-finetuned-cnn

In [25]:
!cp t5-small-finetuned-cnn.zip '/content/drive/My Drive/weights/'