# Fine-tune T5 on CNN/Daily Mail

## Libraries and environment preparation

In [None]:
#Install essential packages
%%capture
! pip install datasets transformers rouge-score nltk wandb

In [None]:
#install Git-LFS
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [None]:
#Colab Environment Check for GPU and RAM
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

#GPU check
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Your runtime has 13.6 gigabytes of available RAM

Sun Nov 28 00:49:05 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------------

Make sure your version of Transformers is at least 4.11.0 since the functionality was introduced in that version:

In [None]:
# Make sure your version of Transformers is at least 4.11.0 
# to run the following code correctly:
import datasets
import transformers
print(transformers.__version__)

4.12.5


In [None]:
# Configure Weights & Biases credentials.
# Never store an API key in the notebook: read it from the environment and
# fall back to an interactive (non-echoing) prompt when it is not set.
# NOTE: the key that used to be hard-coded in this cell has been exposed and
# should be revoked in the W&B account settings.
import os
from getpass import getpass

import wandb

API_KEY = os.environ.get("WANDB_API_KEY") or getpass("Weights & Biases API key: ")
os.environ["WANDB_API_KEY"] = API_KEY

## Loading the dataset

In [None]:
# Load the CNN/DailyMail summarization dataset (configuration 3.0.0) from the Hugging Face hub
raw_datasets = datasets.load_dataset(path='cnn_dailymail', name='3.0.0')

Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [None]:
# Visualize the Data

import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=3):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: object that supports `len()`, indexing with a list of row
            indices, and exposes a `.features` mapping of column name to
            feature type.
        num_examples: number of distinct rows to show.

    Raises:
        AssertionError: if `num_examples` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the picks are distinct
    # without a retry loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    # The id column is not worth displaying; tolerate datasets that have none.
    df = df.drop(columns=["id"], errors="ignore")
    for column, typ in dataset.features.items():
        # Show class labels by name instead of by integer index.
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [None]:
show_random_elements(raw_datasets["train"])

Unnamed: 0,article,highlights
0,"By . Chris Pleasance . PUBLISHED: . 13:41 EST, 2 December 2013 . | . UPDATED: . 15:19 EST, 2 December 2013 . Police Constable Gareth Browning is in critical but stable condition in hospital after allegedly being run over while trying to stop a stolen car . Three people have been arrested after a police officer was allegedly hit by a stolen car leaving him in critical condition. PC Gareth Browning was his by a black Mazda Areosport as he tried to stop the car at around 5.30pm on Saturday. One man, 28, and a 31-year-old woman have been arrested today on suspicion of attempted murder. A third suspect, a 34-year-old woman, has been detained on suspicion of assisting an offender. All three, from Reading, are in custody. PC Browning is in a critical but stable condition in the John Radcliffe Hospital, Oxford. His family, thought to be from Thatcham, Berkshire, are at his bedside. Browning and his colleague were on foot patrol on Lower Early Way in Reading on Saturday when they saw the car, which had been stolen from a house 10 days earlier, speeding around a residential area. They tried to stop it by throwing a stop stick, similar to a stinger, into the road but the car swerved onto the pavement and hit Browning so hard he was thrown into the air. He is believed to have landed on his his head, causing severe injuries. As his colleague ran to help the driver avoided the stinger, escaped, and the vehicle was found abandoned a short time later. Thames Valley Police’s Assistant Chief Constable, John Campbell, said: 'That officer went on duty that afternoon with the sole purpose of protecting the public and he’s ended up in a very seriously ill condition in hospital. 'His family are very shocked by what has gone on and we are offering them all the support we can. According to police a stolen car similar to this one was used to ram PC Browning during the attack . 'His colleagues are also shocked and upset by what has happened. 
Professionally, they are continuing with the job as we always do when there are tragic consequences. It's had an impact but we carry on as you would expect.' PC Browning works at Loddon Valley Police Station where he has served for 10 years. The 07-registered Mazda was stolen during a burglary in the affluent area of Sonning Common on November 21. Thieves broke into the property and took the keys before making off with the car. Police are still appealing for witnesses anybody with information is asked to call 101 or Crimestoppers anonymously on 0800 555 111.","PC Gareth Browning was hit while on foot patrol last Saturday .\nHe was trying to stop a stolen black Mazda using a stinger .\nHowever the car swerved onto the pavement and ran into him .\nHe is believed to have landed on his head causing severe injuries .\nThe driver fled and the car was found abandoned later .\nMan, 28, and woman, 31, arrested on suspicion of attempted murder .\nWoman, 34, has been arrested on suspicion of assisting an offender ."
1,"By . Phil Vinter . PUBLISHED: . 07:45 EST, 30 May 2012 . | . UPDATED: . 14:50 EST, 30 May 2012 . When dieter Melanie Swan-Horton saw her weight plummet from a size 16 to a size ten she was thrilled. She finally had a body to be proud of and put her tiredness and lethargy down to the hours spent in the gym. But her exhaustion and her weight loss were in fact signs of bowel cancer. Dramatic: Melanie Swan-Horton went down six dress sizes but unfortunately it was because she had developed bowel cancer . Unfortunately Mel ignored the symptoms and put them down to the side effects of her diet and exercise regime. Now after surgery and chemotherapy the married mother of two from Norwich, in Norfolk, has been told nothing more can be done to save her life. Because she missed the warning signs for so long the cancer has now spread to her liver and lungs. Brave: Mrs Swan-Horton with her husband Kristian. She says she wants to let make . everyone aware of the symptoms of bowel cancer before she dies . Bowel cancer affects more than 36,500 people in the UK every year. It is the second most common cancer in women - around 18,400 new cases were diagnosed in the UK in 2009. The biggest single risk factor is age as the chances of having bowel cancer go up after 50. 1 in 10 people are under 50 when they get bowel cancer - Melanie Swan-Horton (featured) is 46. Early bowel cancer may have no symptoms. Initial symptoms can also occur in people with less serious problems such as haemorrhoids.However, the NHS recommends seeing your doctor if you notice the following: . If you are worried about any symptoms you can also call the Bowel Cancer UK Support line on 0800 8403540 . She said: ‘I’m still having treatment, but it’s palliative care now just to keep me going. ‘I’m . mentally and physically exhausted and I’m finding it hard to keep going . at the moment but I don’t want to let this horrible disease win. ‘Before . 
I die I want every man and woman in Britain to know the signs of bowel . cancer. I thought I was getting really slim and fit but it was the . cancer killing me and other slimmers need to be aware.’ She . is concerned that in the run up to the holiday period women who are . dieting may also miss the tell tale signs and symptoms which are putting . their own lives at risk. The former charity manager said: ‘ I’m not trying to panic people but I genuinely thought I was slimmer, fitter and happier than ever. When I heard them say incurable cancer I was shocked and devastated. ‘I just want people to know the facts so they can get checked out. If I had it may have saved my life.’ Melanie, 46, started slimming to rebuild her confidence after a divorce two years ago. She followed a high protein plan for fast results and soon dropped from a size 16 to a size ten. Melanie met her new husband Kristian, 36, who regularly worked out at the gym and encouraged Melanie to join him. Within three months of working out with weights she had achieved an enviable bikini body. ‘When I looked in the mirror I couldn’t believe it was me. I’d gone from a frumpy middle-aged mother to toned and slim. It was astonishing and I loved buying a bikini for the first time in years.’ ‘My confidence rocketed and I wished I had done it sooner.' Big change: Mrs Swan-horton followed a high protein plan for fast results after meeting her now husband Kristian who regularly worked out . Working out: Training in the gym hid the signs of Mrs Swan-Horton's underlying bowel . cancer. As she thought she was getting healthier the cancer was . spreading . But at the same time she started feeling drained and tired. ‘ It was no surprise really because I was spending up to two hours in the gym so I thought nothing of it. When Melanie noticed blood in the toilet she was alarmed. ‘ I was shocked at first but then I remembered hearing that lifting weights could cause small bleeds so again I put it down to the exercise. 
‘All the pieces of the puzzle seemed to fit. Of course if it had come out of the blue and if I hadn’t been dieting and lifting weights I would have gone to the GP immediately.’ But three months later with the bleeding persisting she finally saw her doctor. He referred her to hospital and two weeks later she went back for the results. It was then that she was told she was suffering from bowel cancer. Tying the knot: Melanie Swan-Horton and husband Kristian on their wedding day . Chemotherapy treatment to shrink the tumour to an operable size started immediately. But on the operating table doctors discovered the cancer had now spread to her liver. Despite further chemotherapy, tumours were then found in her lungs. A week after the devastating news Melanie married Kristian last September in Cyprus in front of her twin daughters aged 24. Back home she continued chemotherapy but was recently told it had stopped working. Disguised: Melanie says she doesn't want to scare people but wants them to know the facts about bowel cancer so they can get checked out . Battling: Mrs Swan-Horton has been told there is nothing more that can be done for her but says she doesn't want to give up fighting for more time with her daughters, husband and family and friends . ‘I was told there was nothing more that could be done for me. There was a slight possibility of taking part in clinical trials to keep it at bay for a little longer but I knew it would be draining so my doctors agreed I could go on holiday and think it over. ‘I decided to try but it’s very hard and I feel very poorly now. I’ll do my best to keep going though. ‘I haven’t asked how long I have because I don’t want to give up fighting for more time with my daughters, husband, family and friends. ‘They have all been amazing and I love them all to bits. I don’t think I would have got this far without their support. 
‘If bowel cancer is found in time it can be cured so if you read my story and you have any of the symptoms please see your GP.’",Melanie Swan-Horton's dramatic weight loss was due to bowel cancer .\nShe had started high protein diet to slim down after divorce two years ago .\nAfter chemo and surgery nothing more can be done for married mum .\nDying mission is to educate people on how to spot cancer symptoms .
2,"By . Thomas Durante and Associated Press . PUBLISHED: . 16:57 EST, 19 March 2013 . | . UPDATED: . 16:58 EST, 19 March 2013 . Would-be killer: UCF student James Oliver Seevakumaran was planning a campus killing spree before killing himself, cops said . A student at the University of Central Florida student has been hailed a hero for his role in preventing a mass shooting by his roommate, who killed himself instead of putting his plan into action. Arabo 'BK' Babakhani says that he was the man who called 911 to report a gunman in a dorm room after his roommate, James Oliver Seevakumaran had just pulled a fire alarm, and raised a gun to Babakhani's face just after midnight on Monday. At a press conference on Monday evening, UCF Police Chief Richard Beary said that Babakhani's call, combined with the quick response time of UCF officers, 'may have changed [Seevakumaran's] ability to think quickly on his feet.' Pulling the fire alarm was the first step of an elaborate campus massacre plot by Seevakumaran, police have said, but as officers approached, he shot himself in the head in his bedroom. There, cops found Seevakumaran's body, two guns, hundreds of rounds of ammunition and a backpack filled with improvised explosives. Babakhani told the UCF student newspaper Knightly News that he darted into his bedroom and called 911. He told the paper: 'I got away from the door in case he barred into it. I crouched in front of my chair in case he fired into the door. I crouched in front of my chest of [drawers] in case he fired into a wall... I just didnt know where he was gonna fire.' Scroll down for video . Deadly weapon: University of Central Florida Police Chief Richard Beary, right, shows an example of the assault rifle found in the dorm room of James Oliver Seevakumaran . Death on campus: Cops investigating a report of a man with a gun found the body of Seevakumaran after he had shot himself in the head . 
Shooting: University of Central Florida police block off a street near the Tower 1 dorm after explosive devices were found while investigating a suicide in the dorm . Babakhani also described Seevakumaran as a loner, and said isn't sure he and his roommate ever locked eyes before pointing an assault rifle at him. He told the paper: 'I don’t know if he’s got any family. I've never heard him talk on his cell phone or I don't know if he’s got any family. I don’t know if he’s got any friends cause I've never seen them.' Babakhani added in the interview: 'For Thanksgiving... he didn't invite anyone. He just cooked a whole Thanksgiving meal for himself.' Plotter: Seevakumaran's roommate described the 30-year-old as a loner, who once cooked an entire Thanksgiving meal for himself . As police investigated, they found writings in Seevakumaran's room that detailed a plan of attack. A list found along with his dead body included getting drunk at a bar near campus before pulling the fire alarm. Beary says the final item was 'give them hell.' About 500 students were evacuated from the dorm on the Orlando campus after the fire alarm was pulled at about 12:21am, Beary said. Beary added that Seevakumaran had not . yet paid his school fees for this semester, and a process had begun to . kick him out of the dorm. He . had no prior disciplinary record at UCF, but he was arrested in 2006 . for misuse of a temporary tag and driving with a suspended license. His . writings indicated that he had been planning the attack since February, . and he had bought his weapons in an Orlando-area gunshop, police said. More details emerged Tuesday about Seevakumaran's solitary lifestyle. Seevakumaran's family said he was a loner who didn't have a history of violence in a brief statement released by authorities. Beary told the news conference that he acted alone and didn't have any friends. 'He didn't like to talk to people,' Beary said. UCF spokesman Grant Heston . 
said the Orange County Sheriff's Office and the FBI are helping with the investigation. Distraught: Evacuated students wait near the residence hall after the student's suicide early Monday morning . Response team: Various police agencies are seen during an investigation of a Seevakumaran's suicide on the University of Central Florida campus . Grisly scene: Arriving officers found a student dead from what appeared to be a self-inflicted gunshot wound inside a residence at the Tower 1 dorm . Investigation: About 500 students were evacuated from the dorm as police examined the explosives found inside . Just after noon on Monday, the explosives were removed from the dorm and classes resumed as scheduled. The school utilized text message alerts to keep students informed about the situation. Scare: Students were evacuated from the tower and moved to an open area about 1,000 feet from the UCF Arena . Antonio Whitehead, 21, a junior from . Hollywood, Florida, said he heard the fire alarm go off after midnight . and thought it was a routine alarm. He headed outside where he saw a crowd already heading across the street from the dorm. 'All . of a sudden, I felt the crowd move a little faster. And a police . officer with a machine gun or something told everyone to start moving a . lot faster,' he said. Whitehead, who has lived in the dorm . for two years, said the students were moved to an open area about 1,000 . feet from UCF Arena. Grant . Hernandez, 20, a sophomore from Orlando who also is a resident at the . dorm, said he woke up sometime after midnight when police were . evacuating the building. 'We weren't allowed to get our cars. We weren't allowed to get our personal effects,' Hernandez said. 'All we saw were people running, and they were not telling us what was going on,' he added. 'We were left unsure of things. It wasn't till about 6 o'clock that we got more information and a clearer picture of what was going on.' 
He said officers on the scene began providing more information, and students checked updates on the university's website. The area is a very busy section of the campus, with restaurants and shops nearby. A statement on the university's website said the UCF Arena would open to accommodate displaced students. Counselors would be available to talk to students who need assistance. Waiting game: University of Central Florida students wait outside the college sports arena after explosive devices were found in a nearby dorm . Homeless: A statement on the university's website said the UCF Arena would open to accommodate displaced students . Emergency: The Orange County Sheriff's Office and the FBI are helping with the investigation .","Roommate Arabo 'BK' Babakhani says James Seevakumaran was a loner, and didn't know if he had family'\nSeevakumaran was found dead early on Monday morning with a handgun, assault rifle and bag of explosives .\nPolice were dispatched to investigate a fire alarm, but received a call while en route from Babakhani about a man with a gun .\nCops believe Seevakumaran pulled the fire alarm to get his fellow students out in the open for a campus killing spree ."


## Preprocessing the data

In [None]:
model_checkpoint = "t5-base"

In [None]:
# Import tokenizer from model checkpoint
from transformers import AutoTokenizer   
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# T5 is a multi-task model: when using one of the five original T5 checkpoints
# the inputs have to be prefixed with "summarize: " to select the task.
# (Fixed typo: "t5-larg" never matched the "t5-large" checkpoint.)
T5_CHECKPOINTS = ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")

if model_checkpoint in T5_CHECKPOINTS:
    prefix = "summarize: "
else:
    prefix = ""

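For Xsum, the inputs are about 1500 tokens long and the summaries about 160 tokens. Here we truncate the inputs to 512 tokens (`max_input_length`) and the summaries to 128 tokens (`max_target_length`). Note that this notebook trains on CNN/DailyMail rather than Xsum.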

In [None]:
# Tokenize inputs and targets (applied to the datasets with `map`)

max_input_length = 512
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    The task `prefix` is prepended to every entry of `examples["article"]`
    and the result is tokenized, truncated to `max_input_length`. The entries
    of `examples["highlights"]` are tokenized in target mode, truncated to
    `max_target_length`, and their token ids are stored under "labels".

    Returns:
        The tokenizer output for the articles with an added "labels" entry.
    """
    prefixed_articles = [prefix + article for article in examples["article"]]
    encoded = tokenizer(prefixed_articles, max_length=max_input_length, truncation=True)

    # Switch the tokenizer to target mode while encoding the summaries
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(
            examples["highlights"],
            max_length=max_target_length,
            truncation=True,
        )

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [None]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/288 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['article', 'attention_mask', 'highlights', 'id', 'input_ids', 'labels'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'attention_mask', 'highlights', 'id', 'input_ids', 'labels'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'attention_mask', 'highlights', 'id', 'input_ids', 'labels'],
        num_rows: 11490
    })
})

## Fine-tuning the model

In [None]:
# Import Huggingface Automodel class from model checkpoint

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [None]:
# data collator: pad the inputs and labels during each batch to save space
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# keep track with wandb
wandb.init(project="T5-sum")

[34m[1mwandb[0m: Currently logged in as: [33mshusunny[0m (use `wandb login --relogin` to force relogin)


In [None]:
# Define compute_metrics
import nltk
import numpy as np
nltk.download('punkt')

metric = datasets.load_metric("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred: pair `(predictions, labels)` of token-id arrays;
            `predictions` may also be a tuple whose first element holds the
            generated token ids.

    Returns:
        dict mapping each ROUGE key to its mid F-measure scaled to 0-100, plus
        "gen_len" (mean number of non-pad tokens per prediction); all values
        are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the generated ids; keep the ids only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Replace -100 (the ignore index used for padding) as we can't decode it.
    # Predictions can be padded with -100 too when batches are gathered.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep the mid F-measure of every ROUGE variant, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (pad tokens are not counted)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Define training args, batch size and number of epochs

batch_size = 8
epochs = 1
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-cnn",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=1000,  # log the training loss every 1000 optimizer steps
    save_steps=2000,  # write a checkpoint every 2000 steps
    eval_steps=6000,  # evaluate on the validation set every 6000 steps
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,
    # Without an explicit limit, evaluation generates at most 20 tokens (the
    # generation default), so ROUGE is computed on cut-off summaries
    # (the logged "Gen Len" was ~19). Generate up to the label length instead.
    generation_max_length=max_target_length,
    # T5 checkpoints are prone to overflow under fp16 mixed precision; the
    # validation metrics logged for this run stop changing after step 12000,
    # which is consistent with that. Train in full precision; if memory runs
    # out, lower `batch_size` and add gradient accumulation.
    fp16=False,
    report_to="wandb",
)

In [None]:
# Wire model, arguments, data and metrics into the trainer for the full run

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


We can now finetune our model by just calling the `train` method:

In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: highlights, id, article.
***** Running training *****
  Num examples = 287113
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 35890
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
6000,2.205,1.920674,23.4855,10.086,19.0242,21.9616,18.9963
12000,2.218,1.932958,23.4304,10.0688,18.9898,21.9075,18.9958
18000,2.2153,1.932958,23.4304,10.0688,18.9898,21.9075,18.9958
24000,2.2149,1.932958,23.4304,10.0688,18.9898,21.9075,18.9958


Saving model checkpoint to t5-base-finetuned-cnn/checkpoint-2000
Configuration saved in t5-base-finetuned-cnn/checkpoint-2000/config.json
Model weights saved in t5-base-finetuned-cnn/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in t5-base-finetuned-cnn/checkpoint-2000/tokenizer_config.json
Special tokens file saved in t5-base-finetuned-cnn/checkpoint-2000/special_tokens_map.json
Saving model checkpoint to t5-base-finetuned-cnn/checkpoint-4000
Configuration saved in t5-base-finetuned-cnn/checkpoint-4000/config.json
Model weights saved in t5-base-finetuned-cnn/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in t5-base-finetuned-cnn/checkpoint-4000/tokenizer_config.json
Special tokens file saved in t5-base-finetuned-cnn/checkpoint-4000/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: highlights, id, article.
***** Running Evaluation *****
  

## Trying with a smaller dataset

In [None]:
# Init new logging params
wandb.init(project="T5-sum")

[34m[1mwandb[0m: Currently logged in as: [33mshusunny[0m (use `wandb login --relogin` to force relogin)


In [None]:
# Carve out small subsets: the first 5000 training rows and the first 500 validation rows
small_train = raw_datasets['train'].select(range(5000))
small_val = raw_datasets['validation'].select(range(500))
small_train

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 5000
})

In [None]:
# Processing
tokenized_train = small_train.map(preprocess_function, batched=True)
tokenized_val = small_val.map(preprocess_function, batched=True)
tokenized_train

  0%|          | 0/5 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234/cache-34bfa482b3a753b2.arrow


Dataset({
    features: ['article', 'attention_mask', 'highlights', 'id', 'input_ids', 'labels'],
    num_rows: 5000
})

In [None]:
# Import a new T5
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
model_small = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# data collator: pad the inputs and labels during each batch to save space
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_small)

In [None]:
# Define training args, batch size and number of epochs
# batch size depends on the memory of GPU

batch_size = 8
epochs = 20
model_name = model_checkpoint.split("/")[-1]
args_small = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-cnn-small",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    # `batch_size` was defined but never handed to the trainer, so changing it
    # had no effect; pass it explicitly.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy = "epoch",
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,
    # Without an explicit limit, evaluation generates at most 20 tokens (the
    # generation default), so ROUGE is computed on cut-off summaries
    # (the logged "Gen Len" was ~19). Generate up to the label length instead.
    generation_max_length=max_target_length,
    # T5 checkpoints are prone to overflow under fp16 mixed precision; in this
    # run the metrics collapsed after epoch 5 and never recovered, which is
    # consistent with that. Train in full precision; if memory runs out,
    # lower `batch_size` and add gradient accumulation.
    fp16=False,
    report_to="wandb",
)

In [None]:
# Wire the fresh model, arguments, small subsets and metrics into a second trainer

train_dataset = tokenized_train
eval_dataset = tokenized_val

trainer_small = Seq2SeqTrainer(
    model=model_small,
    args=args_small,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


In [None]:
trainer_small.train()

The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: article, id, highlights.
***** Running training *****
  Num examples = 5000
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 12500
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.7194,1.799034,26.221,11.0392,21.5077,24.0001,18.986
2,1.6473,1.794493,26.4134,11.0294,21.6155,24.1051,18.988
3,1.6025,1.799427,26.4672,11.1227,21.629,24.0712,18.994
4,1.549,1.80126,26.591,11.1225,21.7523,24.2295,18.984
5,1.5205,1.788304,26.3528,10.9932,21.6714,24.1115,18.996
6,1.5585,1.930049,24.1521,9.6696,19.8839,22.1657,19.0
7,1.777,2.099338,24.2821,9.3409,19.4313,21.9407,19.0
8,1.983,2.083384,24.1763,9.3217,19.468,21.9202,19.0
9,1.9561,2.085913,24.3888,9.4484,19.6017,22.0593,19.0
10,1.9518,2.087633,24.3328,9.3848,19.5043,22.047,19.0


The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: article, id, highlights.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
Saving model checkpoint to t5-base-finetuned-cnn-small/checkpoint-625
Configuration saved in t5-base-finetuned-cnn-small/checkpoint-625/config.json
Model weights saved in t5-base-finetuned-cnn-small/checkpoint-625/pytorch_model.bin
tokenizer config file saved in t5-base-finetuned-cnn-small/checkpoint-625/tokenizer_config.json
Special tokens file saved in t5-base-finetuned-cnn-small/checkpoint-625/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: article, id, highlights.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
Saving model checkpoint to t5-base-finetuned-cnn-small/checkpoint-1250
Configuration saved in t5-bas

TrainOutput(global_step=12500, training_loss=1.8421403125, metrics={'train_runtime': 10104.5209, 'train_samples_per_second': 9.897, 'train_steps_per_second': 1.237, 'total_flos': 7.134444078170112e+16, 'train_loss': 1.8421403125, 'epoch': 20.0})

In [None]:
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/gen_len,▂▃▅▁▆███████████████
eval/loss,▁▁▁▁▁▄██████████████
eval/rouge1,▇▇██▇▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂
eval/rouge2,████▇▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/rougeL,▇████▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁
eval/rougeLsum,▇████▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▅▃▃▆▄▄▁▄▄▅▄▃█▇▅▄▄█▄▄
eval/samples_per_second,▃▆▆▃▅▅█▅▅▄▅▆▁▂▄▅▅▁▅▅
eval/steps_per_second,▄▆▆▃▅▅█▅▅▄▅▅▁▂▄▄▅▁▅▅
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/gen_len,19.0
eval/loss,2.08789
eval/rouge1,24.3279
eval/rouge2,9.4026
eval/rougeL,19.5101
eval/rougeLsum,22.0486
eval/runtime,41.1198
eval/samples_per_second,12.16
eval/steps_per_second,1.532
train/epoch,20.0


In [None]:
!ls

sample_data  t5-base-finetuned-cnn  t5-base-finetuned-cnn-small  wandb


In [None]:
!ls t5-base-finetuned-cnn/

checkpoint-22000  checkpoint-24000  checkpoint-26000


In [None]:
!ls t5-base-finetuned-cnn-small/

checkpoint-11250  checkpoint-11875  checkpoint-12500


In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, T5Tokenizer, T5Model

# The checkpoint was saved from T5ForConditionalGeneration (see "architectures"
# in its config.json). T5Model is the bare encoder-decoder without the
# language-modeling head, so it cannot be used with `.generate()`; load the
# checkpoint with the seq2seq LM class that was used for training instead.
model = AutoModelForSeq2SeqLM.from_pretrained('t5-base-finetuned-cnn/checkpoint-22000')

# from_pretrained loads on the CPU; move the model to the GPU when there is one
# so it sits on the same device as the inputs used for generation.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

loading configuration file t5-base-finetuned-cnn/checkpoint-22000/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefi

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM

# The checkpoint was saved from T5ForConditionalGeneration (see "architectures"
# in its config.json). T5Model has no language-modeling head and cannot be
# used with `.generate()`, so load it with the seq2seq LM class instead.
model_small = AutoModelForSeq2SeqLM.from_pretrained('t5-base-finetuned-cnn-small/checkpoint-11250')

# from_pretrained loads on the CPU; move the model to the GPU when there is one.
model_small = model_small.to("cuda" if torch.cuda.is_available() else "cpu")

loading configuration file t5-base-finetuned-cnn-small/checkpoint-11250/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      

In [None]:
# Batched map function that adds a generated summary to every example
def generate_summary(batch):
    """Generate a summary for every article in `batch` with the global `model`.

    Args:
        batch: mapping whose "article" entry is a list of article strings.

    Returns:
        The same `batch` with an added "pred" entry holding the decoded
        summaries (special tokens removed).
    """
    # The shared tokenizer is deliberately left untouched: T5 already defines
    # a pad token, and overriding `pad_token`/`padding_side` here would also
    # change the tokenizer seen by the data collator and `compute_metrics`.
    sentences = [prefix + sentence for sentence in batch["article"]]
    # Truncate to the same input length that was used for training
    inputs = tokenizer(
        sentences,
        truncation=True,
        max_length=max_input_length,
        padding=True,
        return_tensors="pt",
    )
    # Put the inputs on whatever device the model lives on instead of assuming CUDA
    device = model.device
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # Without max_length, generation stops at the default of 20 tokens and
    # the summaries are cut off.
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_target_length,
    )

    # All special tokens (padding, end-of-sequence) are removed when decoding
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred"] = output_str

    return batch

In [None]:
test_data = raw_datasets['test']
test_data = test_data.select(range(16))

In [None]:
# The shared tokenizer is left as it is: T5 already defines a pad token, and
# changing `pad_token`/`padding_side` would affect every later use of it.
sentences = test_data["article"]
# `truncation=True` is required for `max_length` to take effect; without it
# the limit is ignored and over-long articles are passed through in full.
inputs = tokenizer(
    [prefix + sentence for sentence in sentences],
    max_length=max_input_length,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
