In [1]:
%%capture
!pip install --upgrade transformers
!pip install datasets
!pip install rouge_score
!pip install rouge

In [2]:
import transformers
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import load_dataset, load_metric, Dataset
import torch
import numpy as np
import pandas as pd
import io
import math
import time

In [3]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
path = "/content/drive/MyDrive/Colab Notebooks/CSC413/amazon_review_dataset_processed.csv"
df = pd.read_csv(path)
amazon = Dataset.from_pandas(df)
amazon.shape

(11848, 3)

In [5]:
tokenizer = AutoTokenizer.from_pretrained('flax-community/t5-base-cnn-dm')

Downloading:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

In [6]:
task_prefix = "summarize: "
max_source_length = 512
max_target_length = 175

def preprocess_function(reviews):
  input_sequences = reviews['reviewText']
  inputs = [task_prefix + sequence for sequence in input_sequences]
  model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True, padding=True)

  summaries = reviews['summary']
  labels = tokenizer(summaries, max_length=max_target_length, truncation=True, padding=True)

  model_inputs['labels'] = labels['input_ids']
  return model_inputs

In [7]:
tokenized_amazon = amazon.map(preprocess_function, batched=True)


NotTest_Test = tokenized_amazon.train_test_split(test_size=0.1, seed=42)
NotTest = NotTest_Test["train"]
test = NotTest_Test["test"]

Train_Val = NotTest.train_test_split(test_size=0.1, seed=42)
train = Train_Val["train"]
val = Train_Val["test"]

print(train.shape, val.shape, test.shape)

  0%|          | 0/12 [00:00<?, ?ba/s]

(9596, 6) (1067, 6) (1185, 6)


In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained('flax-community/t5-base-cnn-dm')
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(torch_device)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Downloading:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [9]:
training_args = Seq2SeqTrainingArguments(
    output_dir = "./results",
    evaluation_strategy = 'epoch',
    learning_rate = 2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    fp16=True,
    predict_with_generate=True
)

trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset = train,
    eval_dataset = val,
    tokenizer = tokenizer,
    data_collator = data_collator
)

trainer.train()

Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, Unnamed: 0, reviewText. If summary, Unnamed: 0, reviewText are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 9596
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3600


Epoch,Training Loss,Validation Loss
1,1.4958,1.395991
2,1.3389,1.370459
3,1.2273,1.376825


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, Unnamed: 0, reviewText. If summary, Unnamed: 0, reviewText are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evalua

TrainOutput(global_step=3600, training_loss=1.3884881401062013, metrics={'train_runtime': 2248.1219, 'train_samples_per_second': 12.805, 'train_steps_per_second': 1.601, 'total_flos': 1.753067975344128e+16, 'train_loss': 1.3884881401062013, 'epoch': 3.0})

In [10]:
trainer.save_model("./content/drive/MyDrive/Colab Notebooks/CSC413/finetunedModel2")

Saving model checkpoint to ./content/drive/MyDrive/Colab Notebooks/CSC413/finetunedModel2
Configuration saved in ./content/drive/MyDrive/Colab Notebooks/CSC413/finetunedModel2/config.json
Model weights saved in ./content/drive/MyDrive/Colab Notebooks/CSC413/finetunedModel2/pytorch_model.bin
tokenizer config file saved in ./content/drive/MyDrive/Colab Notebooks/CSC413/finetunedModel2/tokenizer_config.json
Special tokens file saved in ./content/drive/MyDrive/Colab Notebooks/CSC413/finetunedModel2/special_tokens_map.json


In [11]:
finetuned = AutoModelForSeq2SeqLM.from_pretrained("./content/drive/MyDrive/Colab Notebooks/CSC413/finetunedModel2")

loading configuration file ./content/drive/MyDrive/Colab Notebooks/CSC413/finetunedModel2/config.json
Model config T5Config {
  "_name_or_path": "./content/drive/MyDrive/Colab Notebooks/CSC413/finetunedModel2",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,


In [12]:
generated_summaries = []

In [13]:
x = torch.Tensor(test['input_ids']).long()
output_batch_size = 20
num_of_generate_loops=10

for l in range(num_of_generate_loops):
  print("In loop number: ", l)
  current_batch = x[l*output_batch_size:(l+1)*output_batch_size]
  start = time.time()
  outputs = finetuned.generate(current_batch, max_length=25, min_length=2, num_beams = 2, repetition_penalty = 2.5, early_stopping=True)
  print("Loop : ", l, "took: ", time.time() - start, "to run")
  decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
  generated_summaries.append(decoded_outputs)

In loop number:  0
Loop :  0 took:  33.54739546775818 to run
In loop number:  1
Loop :  1 took:  32.33180856704712 to run
In loop number:  2
Loop :  2 took:  31.822972059249878 to run
In loop number:  3
Loop :  3 took:  31.652050495147705 to run
In loop number:  4
Loop :  4 took:  31.862390518188477 to run
In loop number:  5
Loop :  5 took:  32.0033392906189 to run
In loop number:  6
Loop :  6 took:  31.77623176574707 to run
In loop number:  7
Loop :  7 took:  31.608933687210083 to run
In loop number:  8
Loop :  8 took:  32.06151580810547 to run
In loop number:  9
Loop :  9 took:  31.94618511199951 to run


In [15]:
flattened_outputs = np.array(generated_summaries).flatten()

In [16]:
df_result = pd.DataFrame({'review':test['reviewText'][:200],'generated_summaries':flattened_outputs,'target_summaries':test['summary'][:200]})
df_result.to_csv('/content/drive/MyDrive/Colab Notebooks/CSC413/summarization_pretrained_T5_outputs.csv')

In [17]:
metric = load_metric('rouge')

def calc_rouge_scores(candidates, references):
    result = metric.compute(predictions=candidates, references=references, use_stemmer=True)
    result = {key: round(value.mid.fmeasure * 100, 1) for key, value in result.items()}
    return result

calc_rouge_scores(flattened_outputs, test['summary'][:200])

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

{'rouge1': 18.7, 'rouge2': 8.3, 'rougeL': 16.4, 'rougeLsum': 16.4}