In [1]:
# list the GPUs visible to this runtime
!nvidia-smi -L

# colab resource monitor
# NOTE(review): the lines below download a script over plain HTTP and exec() it inside the
# kernel with no integrity check -- whoever controls that host or the network path can run
# arbitrary code in this session. Consider vendoring/pinning the script or using HTTPS.
from urllib.request import urlopen
exec(urlopen("http://colab-monitor.smankusors.com/track.py").read())
# ColabMonitor is not defined anywhere in this notebook; presumably the exec'd script defines it
_colabMonitor = ColabMonitor().start()

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-acf355a9-b5c7-8443-7a6f-f569475764b9)
Now live at : http://colab-monitor.smankusors.com/60da786800953


In [2]:
!pip install transformers
!pip install datasets
!pip install rouge-score
!pip nltk

from IPython.display import clear_output
clear_output()

# Fine-tuning a model on a summarization task
This notebook shows how to fine-tune one of the 🤗 Transformers models on a **summarization task**. We will use the **XSum dataset** (for extreme summarization), which contains BBC articles, each accompanied by a single-sentence summary.<br>
- [XSum from huggingface](https://huggingface.co/datasets/xsum)

In [3]:
# set parameters for model
# checkpoint name handed to AutoTokenizer / AutoModelForSeq2SeqLM .from_pretrained() below
model_checkpoint = "t5-small"
# passed to Seq2SeqTrainingArguments(output_dir=...), so checkpoints are written here.
# NOTE(review): relative path -- presumably expects Google Drive to be mounted under ./drive
# (no mount call is visible in this notebook); confirm before running
train_output_dir = 'drive/MyDrive/공부/huggingface-transformers/10_summarization_models'

# set parameters for training
batch_size = 8  # used for both the per-device train and eval batch size
epochs = 2  # passed as num_train_epochs


# check execution time for whole code
# s_time is read again in the last cell to report the total elapsed time
import time
s_time = time.time()

# import packages
import datasets

import pandas as pd
import numpy as np

import random
import collections
import tqdm
import nltk
nltk.download('punkt')

import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

from transformers import default_data_collator

import torch
# run on the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# report the library versions in use; versions recorded from the original run:
# datasets : 1.8.0  |  pd : 1.1.5  |  np : 1.19.5  |  tqdm : 4.41.1  |  nltk : 3.2.5  |  transformers : 4.8.1  |  torch : 1.9.0+cu102
library_versions = [
    ('datasets', datasets.__version__),
    ('pd', pd.__version__),
    ('np', np.__version__),
    ('tqdm', tqdm.__version__),
    ('nltk', nltk.__version__),
    ('transformers', transformers.__version__),
    ('torch', torch.__version__),
]
print('  |  '.join(f'{label} : {version}' for label, version in library_versions))
print('device :', device)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
datasets : 1.8.0  |  pd : 1.1.5  |  np : 1.19.5  |  tqdm : 4.41.1  |  nltk : 3.2.5  |  transformers : 4.8.1  |  torch : 1.9.0+cu102
device : cuda


In [4]:
%%time

############### Prepare Data ###############

# load dataset & metric
# dset_dict is indexed by split name; the 'train' and 'validation' splits are used for
# fine-tuning below, and each example's 'document' / 'summary' fields are tokenized
dset_dict = datasets.load_dataset('xsum')
# ROUGE metric object, called from compute_metrics() during evaluation
metric = datasets.load_metric('rouge')

# optional sanity check, left disabled: uncomment to inspect the dataset object and one sample
# # check dataset
# print('\n>>> dataset object :')
# display(dset_dict)
# print('\n>>> sample data :')
# display(dset_dict['train'][0])


# load tokenizer
# uses the same checkpoint name as the model, so the vocabulary matches the pretrained weights
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


# add prefix when using t5 model
# T5 checkpoints get the "summarize: " task prefix prepended to every input document;
# any other checkpoint is fed the raw document.
# (fixed: the list previously contained "t5-larg", so "t5-large" never got the prefix)
if model_checkpoint in ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"):
    prefix = "summarize: "
else:
    prefix = ""


# tokenization limits (in tokens) for the source documents and the target summaries
max_input_length = 1024
max_target_length = 128


def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs with labels.

    Args:
        examples: batch with a 'document' field (iterable of source texts) and a
            'summary' field (the matching target texts).

    Returns:
        The tokenizer output for the prefixed documents, truncated to
        ``max_input_length``, with an added 'labels' entry holding the input ids
        of the summaries truncated to ``max_target_length``.
    """
    # prepend the task prefix to every source document before encoding
    prefixed_docs = list(map(lambda doc: prefix + doc, examples['document']))
    encoded = tokenizer(prefixed_docs, truncation=True, max_length=max_input_length)

    # summaries are encoded with the tokenizer switched to target mode
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(
            examples['summary'], truncation=True, max_length=max_target_length
        )['input_ids']

    encoded['labels'] = target_ids
    return encoded


# batched=True -> preprocess_function receives a batch of examples per call (its
# examples['document'] is a list of texts), so the tokenizer encodes many texts at once
# instead of one example at a time
tokenized_datasets = dset_dict.map(preprocess_function, batched=True)
# tokenized_datasets
print('\n>>> Successfully prepared data!\n')

Using custom data configuration default
Reusing dataset xsum (/root/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499)
Loading cached processed dataset at /root/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499/cache-884dd787067e182e.arrow


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

Loading cached processed dataset at /root/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499/cache-fc028db041fba585.arrow




>>> Successfully prepared data!

CPU times: user 31.4 s, sys: 365 ms, total: 31.8 s
Wall time: 11 s


In [5]:
############### Fine-tuning the model ###############

# start from the pretrained seq2seq checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# mixed precision is only switched on when running on a GPU (to go a bit faster)
use_fp16 = (device == 'cuda')

# training configuration: evaluate and checkpoint once per epoch
args = Seq2SeqTrainingArguments(
    output_dir=train_output_dir,
    # schedule
    num_train_epochs=epochs,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # save_steps=30000,
    save_total_limit=3,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=use_fp16,
    # evaluation generates summaries so that they can be scored in compute_metrics
    predict_with_generate=True,
)

# collator pads the inputs & labels to the maximum length in each batch
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

# function for metric computation
def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays produced by the
            trainer during evaluation; ``predictions`` may also be a tuple whose
            first element holds the generated ids.

    Returns:
        dict mapping every key returned by ``metric.compute`` to its mid F-measure
        multiplied by 100, plus ``gen_len`` (mean number of non-pad tokens per
        prediction); all values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # keep only the generated ids when extra outputs are returned alongside them
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be decoded:
    # map it back to the pad token in both predictions and labels. Doing this for the
    # predictions also keeps such padding from being counted in gen_len below.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # decode token ids back to text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # add a new line after each sentence (for Rouge)
    decoded_preds = ['\n'.join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ['\n'.join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # compute metric
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # keep only the mid F-measure of each score, as a percentage
    result = {key: val.mid.fmeasure * 100 for key, val in result.items()}

    # add mean generated length (non-pad tokens per prediction)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result['gen_len'] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


# wire the model, data splits, collator and metric function into a trainer
train_split = tokenized_datasets['train']
validation_split = tokenized_datasets['validation']

trainer = Seq2SeqTrainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=validation_split,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# run fine-tuning; kept as the last expression so the cell displays the returned value
trainer.train()

Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: id, document, summary.
***** Running training *****
  Num examples = 204045
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 51012
  args.max_grad_norm,


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.6457,2.418404,29.1157,8.3596,23.0023,23.0034,18.8143
2,2.6089,2.391186,29.5538,8.6947,23.4383,23.4395,18.8216


  args.max_grad_norm,
  args.max_grad_norm,
  args.max_grad_norm,
  args.max_grad_norm,
  args.max_grad_norm,
  args.max_grad_norm,
  args.max_grad_norm,
  args.max_grad_norm,
  args.max_grad_norm,
  args.max_grad_norm,
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: id, document, summary.
***** Running Evaluation *****
  Num examples = 11332
  Batch size = 8
Saving model checkpoint to drive/MyDrive/공부/huggingface-transformers/10_summarization_models/checkpoint-25506
Configuration saved in drive/MyDrive/공부/huggingface-transformers/10_summarization_models/checkpoint-25506/config.json
Model weights saved in drive/MyDrive/공부/huggingface-transformers/10_summarization_models/checkpoint-25506/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/공부/huggingface-transformers/10_summarization_models/checkpoint-25506/tokenizer_config.json
Special tokens file saved in drive/MyDrive

TrainOutput(global_step=51012, training_loss=2.6724714020658995, metrics={'train_runtime': 22377.5083, 'train_samples_per_second': 18.237, 'train_steps_per_second': 2.28, 'total_flos': 1.444857222976082e+17, 'train_loss': 2.6724714020658995, 'epoch': 2.0})

In [6]:
# report the wall-clock time elapsed since s_time was recorded at the top of the notebook
e_time = time.time()
time_elapsed = e_time - s_time
elapsed_min, elapsed_sec = divmod(time_elapsed, 60)
print(f'>>> Total time elapsed : {int(elapsed_min)} min {int(elapsed_sec)} sec')

>>> Total time elapsed : 373 min 16 sec
