In [1]:
!pip install transformers datasets evaluate rouge_score



In [2]:
!pip install transformers accelerate



In [3]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [5]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
model_ckpt = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
#moves it to GPU
model_t5 = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
dataset_samsum = load_dataset("samsum")

In [7]:
dataset_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [8]:
prefix = "summarize: "
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [9]:
dataset_samsum_pt = dataset_samsum.map(preprocess_function, batched = True)

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

In [10]:
dataset_samsum_pt

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

In [11]:
from transformers import DataCollatorForSeq2Seq
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_t5)

In [12]:
import evaluate
rouge = evaluate.load("rouge")

In [13]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [14]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TrainingArguments, Trainer

training_args = Seq2SeqTrainingArguments(
    output_dir="my_summarization_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    gradient_accumulation_steps=16,
    num_train_epochs=5)

trainer = Seq2SeqTrainer(
    model=model_t5,
    args=training_args,
    train_dataset=dataset_samsum_pt["train"],
    eval_dataset=dataset_samsum_pt["test"],
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
0,2.2305,1.834195
1,1.9943,1.793797
2,1.9334,1.767391
4,1.9017,1.755429


Checkpoint destination directory my_summarization_model/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=4600, training_loss=1.9741998423700746, metrics={'train_runtime': 4172.4572, 'train_samples_per_second': 17.654, 'train_steps_per_second': 1.102, 'total_flos': 2935456087769088.0, 'train_loss': 1.9741998423700746, 'epoch': 5.0})

In [16]:
model_t5.save_pretrained("t5-model")

In [17]:
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [18]:
tokenizer = AutoTokenizer.from_pretrained("/content/tokenizer")

In [19]:
#Prediction

gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 100}



sample_text = dataset_samsum["test"]["dialogue"][15]

reference = dataset_samsum["test"]["summary"][15]

pipe = pipeline("summarization", model="t5-model",tokenizer=tokenizer)

##
print("Dialogue:")
print(sample_text)


print("\nReference Summary:")
print(reference)


print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

Dialogue:
Greg: Hi, honey. I need to stay after hours :-(
Betsy: Again?
Greg: I'm sorry!
Betsy: What about Johnny?
Greg: Well, could you pick him up? 
Betsy: What if I can't?
Greg: Betsy?
Betsy: What if I can't?
Greg: Can't you, really?
Betsy: I can't. Today I need to work long hours as well. Tuesdays are your days in the kindergarten.
Greg: Talk to you later. I'll see what I can do.
Betsy: You'd better think of something.
Greg: Oh. Just stop it now.

Reference Summary:
Greg and Betsy have a lot of work today, so they cannot pick up Johnny from the kindergarten. However, it's Greg's turn to do it. Greg will try to find a solution.

Model Summary:
Greg needs to stay after hours. Betsy can't pick Johnny up. He needs to work long hours. Tuesdays are his days in the kindergarten.


In [20]:
src_text=dataset_samsum["test"]["dialogue"][15]
batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(device)
translated = model_t5.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)



In [21]:
tgt_text

["Greg can't do it. Betsy will talk to Greg later."]

In [15]:
#https://huggingface.co/docs/transformers/en/tasks/summarization