# Text summarization with T5 on XSum

We are going to fine-tune the [T5 model, implemented by HuggingFace](https://huggingface.co/t5-small), for text summarization on the [Extreme Summarization (XSum)](https://huggingface.co/datasets/xsum) dataset.
The data is composed of news articles and their corresponding summaries.

We will be using the following model sizes available from HuggingFace

| Variant                                     |   Parameters    |
|:-------------------------------------------:|----------------:|
| [T5-small](https://huggingface.co/t5-small) |    60,506,624   | 
| [T5-large](https://huggingface.co/t5-large) |   737,668,096   | 
| [T5-3b](https://huggingface.co/t5-3b)       | 2,851,598,336   | 


More info:
* This notebook is based on the script [run_summarization_no_trainer.py](https://github.com/huggingface/transformers/blob/v4.12.5/examples/pytorch/summarization/run_summarization_no_trainer.py) from HuggingFace
* [T5 on HuggingFace docs](https://huggingface.co/transformers/model_doc/t5.html)

In [1]:
import os
import datasets
import numpy as np
import torch
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm
2022-10-13 13:04:40.762304: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-13 13:04:40.934954: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-13 13:04:42.119263: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /apps/daint/UES/6.0.UP04/sandboxes/sarafael/software/cuDNN/8.1.0/lib:/opt/nvidia/hpc_sdk/Linux_x86_64/21.5/cuda/11.3/compat:/usr/local/cuda-11.3/compat:/opt/nvidi

In [2]:
from datasets.utils import disable_progress_bar
from datasets import disable_caching


# Call the two `datasets` helpers imported in this cell: one switches off the
# library's progress bars, the other its caching.
disable_progress_bar()
disable_caching()

In [3]:
# Model identifier passed to both `from_pretrained` calls below.
hf_model = 't5-large'
# Root of the download cache: a `cache` directory inside the current working directory.
t5_cache = os.path.join(os.getcwd(), 'cache')

In [4]:
# Load the tokenizer for `hf_model`, asking for the fast implementation.
# Downloaded files are cached under `<t5_cache>/<hf_model>_tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(
    hf_model,
    use_fast=True,
    cache_dir=os.path.join(t5_cache, f'{hf_model}_tokenizer')
)

Downloading: 100%|██████████| 1.17k/1.17k [00:00<00:00, 1.48MB/s]
Downloading: 100%|██████████| 773k/773k [00:00<00:00, 1.57MB/s]
Downloading: 100%|██████████| 1.32M/1.32M [00:00<00:00, 1.53MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [5]:
# Load the pretrained sequence-to-sequence model for `hf_model`.
# Downloaded weights are cached under `<t5_cache>/<hf_model>_model`.
model = AutoModelForSeq2SeqLM.from_pretrained(
    hf_model,
    cache_dir=os.path.join(t5_cache, f'{hf_model}_model')
)

Downloading: 100%|██████████| 1.17k/1.17k [00:00<00:00, 1.27MB/s]
Downloading: 100%|██████████| 2.75G/2.75G [01:20<00:00, 36.4MB/s]


In [6]:
# Collect the trainable parameters once. A list (unlike a `filter` object,
# which is exhausted after a single pass) can be iterated again later.
parameters = [p for p in model.parameters() if p.requires_grad]

# `Tensor.numel()` returns the number of elements of each parameter tensor,
# so the sum is the total count of trainable weights.
num_params = sum(p.numel() for p in parameters)
print(f'{num_params:,} parameters\n')

737,668,096 parameters



In [7]:
# Load the 'xsum' dataset; later cells index the result by split name
# (e.g. `hf_dataset["train"]`).
hf_dataset = load_dataset('xsum')

Using custom data configuration default
Reusing dataset xsum (/users/class424/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


In [8]:
def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        examples: Batch mapping with a 'document' entry (source texts) and a
            'summary' entry (target texts).

    Returns:
        The tokenizer output for the prefixed documents, with an added
        "labels" entry holding the token ids of the summaries.
    """
    documents = examples['document']
    summaries = examples['summary']

    # Every source text is prefixed with the task prompt before tokenization.
    prompts = [f'summarize: {document}' for document in documents]

    # Sources: truncated to 1024 tokens, no padding at this stage.
    encoded = tokenizer(
        prompts,
        max_length=1024,
        padding=False,
        truncation=True,
    )

    # Targets are tokenized in the tokenizer's target mode, truncated to 128 tokens.
    with tokenizer.as_target_tokenizer():
        encoded_summaries = tokenizer(
            summaries,
            max_length=128,
            padding=False,
            truncation=True,
        )

    encoded["labels"] = encoded_summaries["input_ids"]
    return encoded

In [9]:
# Run `preprocess_function` over `hf_dataset` in batched mode with 12 worker
# processes. The columns listed for the train split are removed, so the result
# holds what `preprocess_function` returns.
processed_datasets = hf_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=hf_dataset["train"].column_names,
    num_proc=12
)

In [10]:
# Collator used as `collate_fn` by both loaders. Tokenization was done with
# `padding=False`, so assembling equal-length batches is left to it.
data_collator = DataCollatorForSeq2Seq(tokenizer)

# Pick the two splits used in this notebook.
train_dataset, eval_dataset = (
    processed_datasets[split] for split in ("train", "validation")
)

# Training loader: shuffled, 4 examples per batch.
train_dataloader = DataLoader(
    train_dataset, batch_size=4, shuffle=True, collate_fn=data_collator
)

# Evaluation loader: shuffled, one example per batch.
eval_dataloader = DataLoader(
    eval_dataset, batch_size=1, shuffle=True, collate_fn=data_collator
)

In [11]:
# Parameters whose name contains any of these substrings are kept out of
# weight decay. T5 names its layer-norm parameters `...layer_norm.weight` /
# `...final_layer_norm.weight`, which "LayerNorm.weight" does not match, so
# the lowercase pattern is listed as well.
no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]

# Decay applied to every parameter not matched by `no_decay`.
# 0.0 keeps the optimizer behaviour of this notebook unchanged.
weight_decay = 0.0

# Two parameter groups for the optimizer: the first receives `weight_decay`,
# the second (names matching `no_decay`) is never decayed.
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

In [12]:
# AdamW over the parameter groups built above, with a learning rate of 5e-5.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5)

In [13]:
# Target device for the model and the batches; the integer 0 presumably
# selects the first CUDA GPU — confirm for your system.
device = 0
model.to(device)

# Switch the model to training mode. The bare `model.training` expression
# makes the notebook display the flag as the cell output.
model.train()
model.training

True

In [14]:
# Fine-tuning loop: one optimizer update per batch drawn from `train_dataloader`.
for step, batch in enumerate(train_dataloader):  
    optimizer.zero_grad()  # reset gradients before the new backward pass
    # Move the batch to `device`, run the forward pass and read the loss
    # from the model output.
    outputs = model(**batch.to(device))
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    
    # Stop early for the demo. `step` starts at 0 and this check runs after
    # the update, so 102 updates (steps 0 through 101) complete before the break.
    if step > 100:
        break

RuntimeError: CUDA out of memory. Tried to allocate 248.00 MiB (GPU 0; 15.90 GiB total capacity; 14.73 GiB already allocated; 223.75 MiB free; 14.86 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## Evaluation

In [None]:
# only to print with style
from rich import print as pprint
from rich.console import Console

In [None]:
# Qualitative check: generate summaries for a few validation examples and
# print the source text, the model output and the reference summary.
model.eval()

# Generation settings forwarded to `model.generate`.
gen_kwargs = {
    "max_length": 128,
    "num_beams": None,
}
# Entries left as None are dropped so that `generate` uses the model's own
# configured value for them instead of receiving an explicit None.
active_gen_kwargs = {k: v for k, v in gen_kwargs.items() if v is not None}

for step, batch in enumerate(eval_dataloader):
    # Only the first 11 batches (steps 0 through 10) are shown.
    if step > 10:
        break

    # No gradients are needed for generation.
    with torch.no_grad():
        generated_tokens = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            **active_gen_kwargs
        )

    # Unwrap a tuple result before converting to a NumPy array on the host.
    if isinstance(generated_tokens, tuple):
        generated_tokens = generated_tokens[0]
    generated_tokens = generated_tokens.cpu().numpy()

    # Replace the -100 entries in the labels (presumably the collator's label
    # padding value — confirm) with the pad token id so they can be decoded.
    labels = batch["labels"].cpu().numpy()
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Source document, generated summary, reference summary, then a separator rule.
    pprint(':page_facing_up:', tokenizer.batch_decode(batch["input_ids"])[0])
    pprint(':robot_face:', decoded_preds[0])
    pprint(':white_check_mark:', decoded_labels[0])
    Console().rule(style='black')