# Fine-tune T5-small on x-sum

## Libraries and environment preparation

In [2]:
#Install essential packages
! pip install datasets transformers rouge-score nltk wandb

Collecting datasets
  Downloading datasets-1.15.1-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 4.2 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 59.0 MB/s 
[?25hCollecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Collecting wandb
  Downloading wandb-0.12.7-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 42.0 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 5.9 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 47.9 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████

In [3]:
#install Git-LFS
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 2s (981 kB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 155222 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...
Unpacking git-lfs (2.3.4-1) ...
Setting up git-lfs (2.3.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [4]:
#Colab Environment Check for GPU and RAM
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

#GPU check
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Your runtime has 27.3 gigabytes of available RAM

Sat Nov 20 17:35:51 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    30W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------------

Make sure your version of Transformers is at least 4.11.0 since the functionality was introduced in that version:

In [5]:
# Make sure your version of Transformers is at least 4.11.0 
# to run the following code correctly:

import transformers
print(transformers.__version__)

4.12.5


In [6]:
from transformers import AutoTokenizer    
# Huggingface Automodel class
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model_checkpoint = "t5-small"

In [7]:
# Import Wandb 
import os
import wandb
API_KEY = '39991c538626bee25c64d4f8a4c3403dd635537c'
os.environ["WANDB_API_KEY"] = API_KEY

## Loading the dataset

In [8]:
# import dataset and metrics with huggingface
from datasets import load_dataset, load_metric

raw_datasets = load_dataset("xsum")
metric = load_metric("rouge")

Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/954 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset xsum/default (download: 245.38 MiB, generated: 507.60 MiB, post-processed: Unknown size, total: 752.98 MiB) to /root/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [9]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [10]:
# Visualize the Data

import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=3):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [11]:
show_random_elements(raw_datasets["train"], num_examples=4)

Unnamed: 0,document,summary,id
0,"Under the plan, 18 to 24-year-olds out of work for a year will be offered a taxpayer-funded job for six months - with those who refuse losing benefits.\nLabour says it would pay for this by taxing bank bonuses and changing tax rules for the pensions of high earners.\nThe Tories say the sums ""don't add up"", with the money already allocated.\nLabour sources say the five-year £5.5bn initiative is the party's most substantial manifesto pledge to date, coming on top of previous commitments to reverse cuts to housing benefit and to reinstate the 10p tax rate.\nBut the Conservatives are insisting that Labour has already committed to spending the proceeds of a bank bonus tax on other policies, such as building 25,000 new affordable homes, and questioned whether it can fund the move without resorting to extra borrowing.\nThere were 917,000 unemployed 16-24 year olds in the three months to December 2013, including those in full-time education but looking for work.\nUnder Labour's Compulsory Jobs Guarantee, those aged 18-25 out of work for 12 months or more would be offered 25 hours' work a week on the minimum wage and the employer would have to guarantee compulsory training.\nUp to 80% of the jobs that will be created will be in the private sector, Labour hopes.\nSpeaking during a visit to a building site in south London, Labour leader Ed Miliband said all sections of society had to benefit from the recovery in the economy.\n""We've got 56,000 young people who have been unemployed for over 12 months,"" he said.\n""That is double what it was when this government came to power. They are not taking action to help our young people and a future Labour government will.""\nShadow chancellor Mr Balls said the offer to young people would be a ""tough"" one, signalling that life on welfare would ""no longer be an option"" under a future Labour government.\n""Those who can work will be required to take up the jobs on offer or lose their benefits,"" he said. ""A life on benefits will simply not be an option.""\nThe initiative will also apply to adults aged 25 or over claiming Jobseeker's Allowance for two years or more.\nLast month, the BBC learned that Labour had committed to fund the scheme for only a single year after the 2015 election but this has been extended to the whole 2015-2020 period.\nThe party is promising to fund the bulk of the scheme in 2015-16 by repeating its one-off 2009-10 tax on bank bonuses, which it says would raise between £1.5bn and £2bn.\nIn the subsequent four years of the Parliament, it says the scheme will be paid for by restricting pensions tax relief for people earning more than £150,000 to the same rate as basic-rate taxpayers, expected to raise about £900m a year.\nLabour has been criticised for past statements in which it said it would use the proceeds of a bank bonus tax for a range of other policies as well.\nAhead of the 2011 Budget, Labour leader Ed Miliband said the bank bonus tax would pay for a £1.2bn house building scheme and £200m worth of regional economic projects as well as a £600m youth jobs fund.\nThe Conservatives said the new policy was a rehash of the Future Jobs Fund scheme introduced by Labour while it was in power but subsequently scrapped by the coalition.\nTreasury minister David Gauke said his own department's research suggested any scheme modelled on the Future Jobs Fund could cost up to £2.6bn a year.\n""This is a more expensive policy than Labour are prepared to acknowledge,"" he told BBC Radio 4's World at One. ""And I am afraid the consequence would be more borrowing.""\nBut Mr Balls said the new initiative was very different to past schemes and the Tories' figures were ""out of date"".\nAmid an argument over the cost of the scheme, the Institute for Fiscal Studies said Labour's previous bank bonus tax had been ""relatively effective"" in raising up to £2.3bn, more than was expected at the time.\nBut its director Paul Johnson said announcing a similar plan so far in advance of its implementation would enable bankers to change their arrangements to avoid the tax.","Labour says its ""jobs guarantee"" scheme for young unemployed people will last for the whole of the next parliament, if it wins the 2015 general election.",26506522
1,"Myndtown Church, in Shropshire, has secured Heritage Lottery Fund money to restore the Grade II-listed building.\nThe plans will see £20,600 spent on the development phase, which will last up to one year, with another £202,500 earmarked for construction.\nReverend Norman Morris, rector of Myndtown, welcomed the news.\n""Myndtown is one of a number of hill villages in Shropshire with tiny populations, and it's not got many parishioners, but it's got a lot of history,"" he said.\n""It's in a wonderful position, it's got potential for walkers and cyclists to use it as a kind of centre, and a lot of people really value it.""\nThe building work will provide Myndtown Church with a new roof, repairs to stonework, re-rendering of interior and exterior walls and underground drainage.\nLocal fundraisers had already raised £8,000 towards the restoration scheme.\nJohn Burt, organiser of the project, thanked the Heritage Lottery Fund for the grant.\n""Without them, it would have been impossible to raise so much money and the historic church would have been likely to close and fall into ruin,"" he said.\n""I don't think you could justify necessarily spending that kind of money to keep open a small church with very few people attending, but I think the significance of the building goes beyond that.\n""In Victorian times, when many churches were completely changed, this has been largely untouched - it's like a building from 800 years ago.""","More than £220,000 will be used to renovate a 12th Century village church, even though it has just 19 registered members.",34616409
2,"The dual Guineas winner scored by two-and-a-half-lengths from Latharnach (25-1) to give trainer Aidan O'Brien a record seventh win in the race.\nMoore went on to clock up wins aboard Clondaw Warrior and Washington DC.\nSolow led a French 1-2 in the opening Queen Anne Stakes, while Goldream held off fellow outsider Medicean Man to land the King's Stand Stakes.\nBuratino was an impressive winner of the Coventry Stakes and is rated a 12-1 chance for next year's 2,000 Guineas.\nMoore clocked up a double when winning the Ascot Stakes on Clondaw Warrior for trainer Willie Mullins, with top jump jockey Ruby Walsh among those celebrating as his wife Gillian owns the horse with a group of friends.\nAnd three-time champion Moore sealed the treble with Washington DC in the concluding Windsor Castle Stakes, bringing up a double for O'Brien.\n""He's the best in the yard at the minute and the best miler we've ever had,"" said trainer O'Brien after Gleneagles continued his unbeaten run this season.\nThe son of leading stallion Galileo showed thrilling acceleration to pick up long-time leader Consort.\nLatharnach came through to claim the runner-up spot, with Make Believe a long last of the five runners.\nBookmakers claimed they had suffered one of their worst opening days of the meeting - with Solow, Clondaw Warrior and Washington DC joining Gleneagles as winning favourites.\nSome of the edge has been taken off Wednesday's feature race, the Prince of Wales's Stakes with the late absence of the 2014 US Horse of the Year California Chrome after a setback.\nIrish challenger Free Eagle, trained by Dermot Weld, heads the field for the 10-furlong contest.\nMoore rides Cannock Chase for trainer Sir Michael Stoute, while other contenders include last year's Irish Champion Stakes and French Derby winner The Grey Gatsby.\nThe Queen seeks her 23rd Royal Ascot winner as an owner, with two horses - Touchline and Pack Together - running in the concluding Sandringham Stakes. She also has Ring Of Truth in the Queen Mary Stakes.\n14:30 BST : Jersey Stakes\n15:05: Queen Mary Stakes\n15:40: Duke of Cambridge Stakes\n16:20: Prince of Wales's Stakes\n17:00: Royal Hunt Cup\n17:35: Sandringham Stakes",Gleneagles won the St James's Palace Stakes and headed a 54-1 treble for jockey Ryan Moore at Royal Ascot.,33157428
3,"Independents, the largest group of councillors elected last week, are holding talks with others.\nThe SNP group has said a deal has been done between the independents, Liberal Democrats and Labour councillors.\nHowever, the independents have said that nothing has been agreed and negotiations were continuing.\nThe SNP has accused the independents of failing to negotiate with them on the potential of forming an administration.\nThe results of last week's local elections saw 28 independent candidates elected followed by the SNP on 22, Conservatives and Liberal Democrats with 10 each, Labour three and Scottish Greens one.\nThe Conservatives were the first to be elected to Highland Council in 22 years.\nPippa Hadley's win for the Scottish Greens was a first for the local authority, which had no representative from that party previously.\nA minority independent administration led Highland Council before the election.",Negotiations are being held on forming a coalition of councillors that would then seek to form an administration to run Highland Council.,39861733


## Preprocessing the data

In [12]:
# Import tokenizer from model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

In [13]:
# If you are using one of the five T5 checkpoints we have to prefix 
# the inputs with "summarize:" (t5 is a multi-task model).

if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

For Xsum, the input tokens are about 1500 and the length of the summaries are about 160. Here we truncate to 1024 and 128

In [14]:
# tokenlize inputs into map

max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [15]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/205 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

In [16]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'document', 'id', 'input_ids', 'labels', 'summary'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['attention_mask', 'document', 'id', 'input_ids', 'labels', 'summary'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['attention_mask', 'document', 'id', 'input_ids', 'labels', 'summary'],
        num_rows: 11334
    })
})

In [17]:
# Code to find out the max tokenized length
"""
max_document = 0
max_summary = 0
my_splits = ['test', 'train', 'validation']
for i in my_splits:
  for item in tokenized_datasets[i]['input_ids']:
      if len(item) > max_document:
          max_document = len(item)


  for item in tokenized_datasets[i]['labels']:
      if len(item) > max_summary:
          max_summary = len(item)

max_document, max_summary

"""

"\nmax_document = 0\nmax_summary = 0\nmy_splits = ['test', 'train', 'validation']\nfor i in my_splits:\n  for item in tokenized_datasets[i]['input_ids']:\n      if len(item) > max_document:\n          max_document = len(item)\n\n\n  for item in tokenized_datasets[i]['labels']:\n      if len(item) > max_summary:\n          max_summary = len(item)\n\nmax_document, max_summary\n\n"

## Fine-tuning the model

In [18]:
# Import tokenizer from model checkpoint and print detail
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model.config

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": true,

In [19]:
# data collator: pad the inputs and labels during each batch to save space
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Define `Seq2SeqTrainer` to compute the metrics from the predictions, and also do a bit of pre-processing to decode the predictions into texts:

In [20]:
# Define traing args, batch size and epoch
# batch size max 8 for input length 1024 on Colab Pro

batch_size = 8
epochs = 1
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-xsum",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_strategy = "epoch",
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,
    fp16=True,
    #push_to_hub=True,
)

In [21]:
import nltk
import numpy as np
nltk.download('punkt')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [22]:
# Pass into the trainer

train_dataset=tokenized_datasets["train"]
eval_dataset=tokenized_datasets["validation"]

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Using amp fp16 backend


We can now finetune our model by just calling the `train` method:

In [23]:
# keep track with wandb
wandb.init(project="T5-small")

[34m[1mwandb[0m: Currently logged in as: [33mshusunny[0m (use `wandb login --relogin` to force relogin)


In [24]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: document, summary, id.
***** Running training *****
  Num examples = 204045
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 25506
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.68,2.450021,28.7115,8.0254,22.6179,22.6134,18.8176


The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: document, summary, id.
***** Running Evaluation *****
  Num examples = 11332
  Batch size = 8
Saving model checkpoint to t5-small-finetuned-xsum/checkpoint-25506
Configuration saved in t5-small-finetuned-xsum/checkpoint-25506/config.json
Model weights saved in t5-small-finetuned-xsum/checkpoint-25506/pytorch_model.bin
tokenizer config file saved in t5-small-finetuned-xsum/checkpoint-25506/tokenizer_config.json
Special tokens file saved in t5-small-finetuned-xsum/checkpoint-25506/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=25506, training_loss=2.7389122431188304, metrics={'train_runtime': 10609.458, 'train_samples_per_second': 19.232, 'train_steps_per_second': 2.404, 'total_flos': 5.260162153729229e+16, 'train_loss': 2.7389122431188304, 'epoch': 1.0})

In [25]:
# Init new logging params
wandb.init(project="T5-small")

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/gen_len,▁
eval/loss,▁
eval/rouge1,▁
eval/rouge2,▁
eval/rougeL,▁
eval/rougeLsum,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████

0,1
eval/gen_len,18.8176
eval/loss,2.45002
eval/rouge1,28.7115
eval/rouge2,8.0254
eval/rougeL,22.6179
eval/rougeLsum,22.6134
eval/runtime,656.3659
eval/samples_per_second,17.265
eval/steps_per_second,2.159
train/epoch,1.0


## Trying with a smaller dataset

In [26]:
# Select to get smaller dataset
small_train = raw_datasets['train'].select(list(range(0, 5000)))
small_val = raw_datasets['validation'].select(list(range(0, 500)))
small_train

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 5000
})

In [27]:
tokenized_train = small_train.map(preprocess_function, batched=True)
tokenized_val = small_val.map(preprocess_function, batched=True)
tokenized_train

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'document', 'id', 'input_ids', 'labels', 'summary'],
    num_rows: 5000
})

In [28]:
# Import a new T5-small
model_small = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "pre

In [29]:
# data collator: pad the inputs and labels during each batch to save space
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_small)

In [30]:
# Define traing args, batch size and epoch
# batch size max 16 on Colab Pro

batch_size = 8
epochs = 40
model_name = model_checkpoint.split("/")[-1]
args_small = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-xsum-small",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_strategy = "epoch",
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,
    fp16=True,
    #push_to_hub=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [31]:
# Pass into the trainer

train_dataset=tokenized_train
eval_dataset=tokenized_val

trainer_small = Seq2SeqTrainer(
    model_small,
    args_small,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Using amp fp16 backend


In [32]:
trainer_small.train()

The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: document, summary, id.
***** Running training *****
  Num examples = 5000
  Num Epochs = 40
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 25000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.1009,2.69048,23.2447,4.5076,18.0278,18.0057,18.734
2,2.906,2.6417,25.1824,5.4581,19.689,19.6467,18.776
3,2.8251,2.612609,25.8234,5.9366,20.0147,19.9614,18.824
4,2.7689,2.596452,26.2913,6.3642,20.604,20.5376,18.768
5,2.7401,2.584049,26.6425,6.5261,20.7916,20.768,18.782
6,2.7199,2.573628,26.6405,6.6038,20.9268,20.8756,18.82
7,2.7038,2.564651,26.9122,6.8424,21.0682,21.0388,18.788
8,2.6644,2.559441,27.12,6.7011,21.0555,21.0347,18.838
9,2.6361,2.554441,27.3211,6.9305,21.3409,21.3189,18.826
10,2.6274,2.548549,27.2393,6.9437,21.4515,21.4303,18.822


The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: document, summary, id.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
Saving model checkpoint to t5-small-finetuned-xsum-small/checkpoint-625
Configuration saved in t5-small-finetuned-xsum-small/checkpoint-625/config.json
Model weights saved in t5-small-finetuned-xsum-small/checkpoint-625/pytorch_model.bin
tokenizer config file saved in t5-small-finetuned-xsum-small/checkpoint-625/tokenizer_config.json
Special tokens file saved in t5-small-finetuned-xsum-small/checkpoint-625/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: document, summary, id.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
Saving model checkpoint to t5-small-finetuned-xsum-small/checkpoint-1250
Configuration saved i

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.1009,2.69048,23.2447,4.5076,18.0278,18.0057,18.734
2,2.906,2.6417,25.1824,5.4581,19.689,19.6467,18.776
3,2.8251,2.612609,25.8234,5.9366,20.0147,19.9614,18.824
4,2.7689,2.596452,26.2913,6.3642,20.604,20.5376,18.768
5,2.7401,2.584049,26.6425,6.5261,20.7916,20.768,18.782
6,2.7199,2.573628,26.6405,6.6038,20.9268,20.8756,18.82
7,2.7038,2.564651,26.9122,6.8424,21.0682,21.0388,18.788
8,2.6644,2.559441,27.12,6.7011,21.0555,21.0347,18.838
9,2.6361,2.554441,27.3211,6.9305,21.3409,21.3189,18.826
10,2.6274,2.548549,27.2393,6.9437,21.4515,21.4303,18.822


The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: document, summary, id.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
Saving model checkpoint to t5-small-finetuned-xsum-small/checkpoint-11875
Configuration saved in t5-small-finetuned-xsum-small/checkpoint-11875/config.json
Model weights saved in t5-small-finetuned-xsum-small/checkpoint-11875/pytorch_model.bin
tokenizer config file saved in t5-small-finetuned-xsum-small/checkpoint-11875/tokenizer_config.json
Special tokens file saved in t5-small-finetuned-xsum-small/checkpoint-11875/special_tokens_map.json
Deleting older checkpoint [t5-small-finetuned-xsum-small/checkpoint-10000] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: document, summary, id.
***** Running Evaluation *****
  Num examples = 5

TrainOutput(global_step=25000, training_loss=2.5277778515625, metrics={'train_runtime': 11051.3765, 'train_samples_per_second': 18.097, 'train_steps_per_second': 2.262, 'total_flos': 5.14327394254848e+16, 'train_loss': 2.5277778515625, 'epoch': 40.0})

In [33]:
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/gen_len,▁▃▅▃▃▅▃▆▅▅▄▇▆▆▇▇█▆▇██▇▇▆▄▆▆▆█▆▇▇▆▇▆▆▇▇▆▆
eval/loss,█▆▅▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/rouge1,▁▃▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇████▇███████████████
eval/rouge2,▁▃▄▅▅▅▆▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇█████████▇▇
eval/rougeL,▁▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇██████▇██████████▇
eval/rougeLsum,▁▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇██████████████████
eval/runtime,▃▆▄▃▂▆▂▁▂▁▄▃▃▅▄▃▇▇█▇▇▆▆▇▅▆▇▆▅▅▆▇▇▅▆▅▆▇▅▅
eval/samples_per_second,▆▃▅▆▇▂▇█▆█▅▆▆▄▄▆▂▂▁▂▂▃▃▂▄▃▂▃▄▃▂▂▂▄▃▃▃▂▄▃
eval/steps_per_second,▆▃▅▆▇▂▇█▆█▅▆▆▄▄▆▂▂▁▁▂▃▃▂▄▃▂▃▄▃▂▂▂▄▃▃▃▂▄▃
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████

0,1
eval/gen_len,18.84
eval/loss,2.531
eval/rouge1,28.4441
eval/rouge2,7.7137
eval/rougeL,22.2668
eval/rougeLsum,22.2343
eval/runtime,29.8607
eval/samples_per_second,16.744
eval/steps_per_second,2.11
train/epoch,40.0


In [60]:
from transformers import T5ForConditionalGeneration

## Results of T5 small batch

In [124]:
small_test = raw_datasets['test'].select(list(range(50, 55)))
small_test

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 5
})

In [125]:
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token # to avoid an error

sentences = [prefix + sentence for sentence in small_test['document']] # use different length sentences to test batching
inputs = tokenizer([prefix + sentence for sentence in sentences], max_length=max_input_length, return_tensors="pt", padding=True)

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


In [126]:
output_sequences = model.generate(
    input_ids=inputs['input_ids'].cuda(),
    attention_mask=inputs['attention_mask'].cuda(),
    do_sample=False, # disable sampling to test if batching affects output
)
prediction = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)

In [None]:
output_sequences_small = model_small.generate(
    input_ids=inputs['input_ids'].cuda(),
    attention_mask=inputs['attention_mask'].cuda(),
    do_sample=False, # disable sampling to test if batching affects output
)
prediction_small = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)

In [130]:
for i in range(5):
    print("Original Text: %s" % small_test[i]['document'])
    print("\nActual Summary: %s" % small_test[i]['summary'])
    print("\nBatch Predicted: %s" % prediction[i])
    print("\nSmall_Set Summary: %s" % prediction_small[i])
    print("=====================================================================\n")

Original Text: Mr Spencer, head of Xbox, said the console, then codenamed Project Scorpio, “must deliver true 4K gaming and high-fidelity VR [virtual reality]”.
The Xbox One X, as it will now be known, looks set to deliver on that first promise of 4K visuals. But on the second? Virtual reality didn’t get a single mention at the company’s bonanza press event on Sunday.
So what’s happening?
I think it’s fair, despite the mild protests of his PR team afterwards, to characterise Mr Spencer as something of a virtual reality sceptic - at least for now.
“It’s important to listen to your customers and what they are actually looking for,” he told me.
“I don’t get many questions about consoles and mixed reality in the living room.
"When I do this on my PC, I’m closer to my PC - that seems to be a much more user-friendly scenario today.”
What he’s saying is - the technology isn’t quite there yet, in his view. Contrast that with Sony, whose PlayStation VR has sold more than one million headsets si