In [1]:
#Install relevant libraires
!pip install transformers torch datasets

import transformers
import torch
import datasets

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset

#Load pretrained FLAN-T5 model, small for compatibility with Collab GPU runtime limit
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

#Ensure GPU is being used
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print(device)

#Loading the xsum dataset
dataset = load_dataset("EdinburghNLP/xsum", split="train")

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m471.6/471.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

cuda


0000.parquet:   0%|          | 0.00/304M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/17.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [2]:
dataset_split = dataset.train_test_split(test_size=0.1)

#Only using 10% of initial training set for runtime limit adherence
small_train_dataset = dataset_split['train'].train_test_split(test_size=0.90)['train']
eval_dataset = dataset_split['test']

def preprocess_function(examples):
  inputs = [inp for inp in examples['document']]    #extracting input
  model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(examples['summary'], max_length=128, padding="max_length", truncation=True, return_tensors="pt")
  model_inputs["labels"] = labels["input_ids"]
  model_inputs = {k: v.to(device) for k, v in model_inputs.items()}   #moving to GPU
  return model_inputs

tokenized_train_dataset = small_train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/18364 [00:00<?, ? examples/s]



Map:   0%|          | 0/20405 [00:00<?, ? examples/s]

In [3]:
#Setting parameters for training

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    #Batch size variable for training quality
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    #Epoch number variable for training quality
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer
)



In [4]:
#Main training stage
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.6384,0.551748
2,0.6155,0.548738
3,0.6096,0.548065


TrainOutput(global_step=6888, training_loss=1.1346906314189702, metrics={'train_runtime': 4181.886, 'train_samples_per_second': 13.174, 'train_steps_per_second': 1.647, 'total_flos': 1.0241079685152768e+16, 'train_loss': 1.1346906314189702, 'epoch': 3.0})

In [5]:
metrics = trainer.evaluate()
print(metrics)

#Making a function to run the model
def summarize(text):
  inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
  summary_ids = model.generate(inputs["input_ids"], max_length=128, num_beams=4, early_stopping=True)
  return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

{'eval_loss': 0.5480645298957825, 'eval_runtime': 336.1734, 'eval_samples_per_second': 60.698, 'eval_steps_per_second': 7.588, 'epoch': 3.0}


In [7]:
print(summarize(
    """
Halifax, the capital of Nova Scotia, is a vibrant coastal city known for its rich
 maritime history and lively waterfront. Situated on the Atlantic Ocean, Halifax
 boasts a natural harbor that has been a critical port for centuries. The city
 is home to historic landmarks such as the Halifax Citadel, a star-shaped
 fortress overlooking the city, and Pier 21, which served as an entry point
 for many immigrants to Canada. Halifax offers a mix of modern culture, with
 bustling markets, local eateries, and lively festivals, all while maintaining
 its charm with scenic parks, walking trails, and beautiful ocean views.
"""
))

Halifax, Nova Scotia, is the capital of Nova Scotia.
