## Using T5 for abstractive summarization

In [5]:
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq

import pandas as pd
from datasets import Dataset
import random

In [6]:
base_model = T5ForConditionalGeneration.from_pretrained('t5-small')
base_tokenizer = T5Tokenizer.from_pretrained('t5-small')

tokenizer_config.json: 100%|██████████| 2.32k/2.32k [00:00<00:00, 14.0MB/s]
spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 825kB/s]
tokenizer.json: 100%|██████████| 1.39M/1.39M [00:01<00:00, 1.11MB/s]
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# https://kaggle.com/snap/amazon-fine-food-reviews?select=Review.csv

reviews = pd.read_csv('BERT_LLM/BERT_LLM/data/data/reviews.csv')

# Pre-processing step
# Punctuation is important in grammar and important for complex decoding architectures to knoe when to stop!
def add_punc(s):
    if s[-1] not in ('.', '!', '?'):
        s = s + '.'
    return s

reviews.dropna(inplace = True)

reviews['Summary'] = reviews['Summary'].map(add_punc)

print(reviews.shape)

reviews.head()


(96486, 3)


Unnamed: 0,Text,Summary,Score
0,Great taffy at a great price. There was a wid...,Great taffy.,5
1,This taffy is so good. It is very soft and ch...,"Wonderful, tasty taffy.",5
2,Right now I'm mostly just sprouting this so my...,Yay Barley.,5
3,This is a very healthy dog food. Good for thei...,Healthy Dog Food.,5
4,good flavor! these came securely packed... the...,fresh and greasy!,4


In [8]:
reviews = reviews[(reviews['Summary'].str.len() < 100) & (reviews['Summary'].str.len() >= 30)]

reviews.shape

(13073, 3)

In [9]:
random.seed(0)

reviews_dataset = Dataset.from_pandas(reviews.astype(str).sample(5000))

In [11]:
# We have a prompt but only as a prefix in the encoder
prefix = "summarize: "

# we will manually add our own labels because unlike GPT, we cannot asseume the labels are based on the inputs
def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["Text"]]
    model_inputs = base_tokenizer(inputs, max_length = 1024, truncation = True)

    labels = base_tokenizer(examples["Summary"], max_length = 128, truncation = True)

    model_inputs['labels'] = labels["input_ids"]
    return model_inputs

In [17]:
tokenized_reviews_dataset = reviews_dataset.map(preprocess_function, batched = True)

                                                                 

In [21]:
tokenized_reviews_dataset = tokenized_reviews_dataset.train_test_split(test_size = .1)

In [22]:
# Data collator specifically for generic sequence to sequence tasks
# Use when we are translating one sequence to another like translation, summarization, etc
data_collator = DataCollatorForSeq2Seq(tokenizer = base_tokenizer, model = base_model)

In [25]:
training_args = TrainingArguments(
    output_dir = './t5_summary_results',
    evaluation_strategy = 'epoch',
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    num_train_epochs = 20,
    load_best_model_at_end = True,
    logging_steps = 50,
    save_strategy = 'epoch'
)

trainer = Trainer(
    model = base_model,
    args = training_args,
    train_dataset = tokenized_reviews_dataset['train'],
    eval_dataset = tokenized_reviews_dataset['test'],
    data_collator = data_collator,
)

trainer.evaluate()

Trainer is attempting to log a value of "{'summarization': {'early_stopping': True, 'length_penalty': 2.0, 'max_length': 200, 'min_length': 30, 'no_repeat_ngram_size': 3, 'num_beams': 4, 'prefix': 'summarize: '}, 'translation_en_to_de': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to German: '}, 'translation_en_to_fr': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to French: '}, 'translation_en_to_ro': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to Romanian: '}}" for key "task_specific_params" as a parameter. MLflow's log_param() only accepts values no longer than 250 characters so we dropped this attribute. You can use `MLFLOW_FLATTEN_PARAMS` environment variable to flatten the parameters and avoid this message.


{'eval_loss': 4.420648097991943,
 'eval_runtime': 0.705,
 'eval_samples_per_second': 709.248,
 'eval_steps_per_second': 22.696}

In [26]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.698,3.362388
2,3.5087,3.269763
3,3.4266,3.209758
4,3.2941,3.163584
5,3.2639,3.127627
6,3.1929,3.105354
7,3.195,3.076183
8,3.1417,3.05489
9,3.1076,3.037829
10,3.0417,3.02393


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2820, training_loss=3.129840888706505, metrics={'train_runtime': 345.0947, 'train_samples_per_second': 260.798, 'train_steps_per_second': 8.172, 'total_flos': 1216903603421184.0, 'train_loss': 3.129840888706505, 'epoch': 20.0})

In [27]:
trainer.evaluate()

{'eval_loss': 2.9659423828125,
 'eval_runtime': 0.7663,
 'eval_samples_per_second': 652.475,
 'eval_steps_per_second': 20.879,
 'epoch': 20.0}

In [28]:
trainer.save_model()

In [30]:
loaded_model = T5ForConditionalGeneration.from_pretrained('./t5_summary_results')

# summarization pipeline prepends a default prefix of summarize:
generator = pipeline(
    'summarization', model = loaded_model, tokenizer = base_tokenizer
)

In [34]:
sam = reviews.sample(1)

print(sam['Summary'])

text = sam['Text'].tolist()[0]
text

13440    Love the enriched original rice, milk, not pri...
Name: Summary, dtype: object


'We like this item, but wish that it was available in larger sizes, these are way too expensive......'

In [35]:
# Generate a summary
generator(text, min_length = 3, max_length = 15, early_stopping = True, num_beams = 2)

[{'summary_text': 'Great item, but wish it was available in larger sizes.'}]

In [37]:
# Try the base t5 on the same text
base_generator = pipeline(
    'summarization', model = 't5-small', tokenizer = 't5-small'
)

# Summary is a bit more extractive than our fine-tuned version and style isn't quite the same as our dataset
base_generator(text, min_length = 3, max_length = 15, early_stopping = True, num_beams = 2)

[{'summary_text': 'this item is too expensive to find . we like it, but'}]

In [40]:
# Sanity check: trying a different prefix. Not a good result

inputs = base_tokenizer("not my prompt: " + text, return_tensors = "pt")
outputs = loaded_model.generate(
    inputs["input_ids"], min_length = 3, max_length = 15
)

print(base_tokenizer.decode(outputs[0], skip_special_tokens = True))

This is a great item, but it is too expensive....
