**Appraoch & Highlights Of This Notebook**

Model & Training Startegy For Summarisation - T5 Small Fine Tuning Without PEFT<br>

In [1]:
# Install the essential libraries
!pip install -U pip
!pip install transformers datasets evaluate rouge_score
!pip install torch torchvision torchaudio #--index-url https://download.pytorch.org/whl/cu121  # adjust for your CUDA/CPU
!pip install transformers datasets peft accelerate evaluate rouge-score nltk sentencepiece



Collecting pip
  Downloading pip-26.0.1-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-26.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-26.0.1
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (pyproject.toml) ... [?25l[?25hdone
  Created wheel for rouge_score:

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import torch, os, platform
print("platform:", platform.platform())
print("cuda available:", torch.cuda.is_available())
print("mps available:", torch.backends.mps.is_available())

In [4]:
import torch

print(f"PyTorch version: {torch.__version__}")

# Check for CUDA (NVIDIA GPU) availability first, then MPS, then CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA available: True")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS available: True")
else:
    device = torch.device("cpu")
    print("CUDA available: False")
    print("MPS available: False")

print(f"Using device: {device}")

PyTorch version: 2.10.0+cu128
CUDA available: True
Using device: cuda


In [5]:
from datasets import load_dataset

# This one command downloads, splits, and structures the data
dataset = load_dataset("cnn_dailymail", "3.0.0")
# https://huggingface.co/datasets/ccdv/cnn_dailymail

# You will now have a 'DatasetDict' with all three splits
print(dataset)

#small_train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))
#small_eval_dataset = dataset["validation"].shuffle(seed=42).select(range(200))

#small_train_dataset = dataset["train"].shuffle(seed=42).select(range(20000))
#small_eval_dataset = dataset["validation"].shuffle(seed=42).select(range(500))

small_train_dataset = dataset["train"].shuffle(seed=42).select(range(100000))
small_eval_dataset = dataset["validation"].shuffle(seed=42).select(range(1000))


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# 1. Choose the model checkpoint
model_checkpoint = "t5-small"

# 2. Load the tokenizer
# The tokenizer will turn your text into 'input_ids'
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

# 3. Load the model
# T5ForConditionalGeneration is the T5 model for tasks like summarization
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/131 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
# The prefix tells T5 what task to perform
prefix = "summarize: "

# Define the preprocessing function
def preprocess_function(examples):
    # 1. Add the prefix to all articles
    inputs = [prefix + doc for doc in examples["article"]]

    # 2. Tokenize the articles (our inputs)
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # 3. Tokenize the highlights (our labels/targets)
    # UPDATED: No longer using as_target_tokenizer()
    labels= tokenizer(
        text_target=examples["highlights"],
          max_length=128, 
          truncation=True)

    # 4. Set the 'labels' for the model
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



# Apply this function to all splits in our dataset
# 'batched=True' processes multiple examples at once for speed
#tokenized_datasets = dataset.map(preprocess_function, batched=True)

tokenized_train_dataset = small_train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = small_eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
import evaluate
import nltk
import numpy as np

# Load the ROUGE metric
rouge = evaluate.load("rouge")

# We need to download this for ROUGE to work
nltk.download("punkt", quiet=True)

# The function that will be called to compute metrics
def compute_metrics(eval_pred):
    # 'eval_pred' gives us model predictions and the true labels
    predictions, labels = eval_pred

    # 1. Decode the generated IDs back to text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # 2. Decode the label IDs back to text
    # We replace -100 (which are padding tokens)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # 3. Add newlines for ROUGE (it expects summaries to be on separate lines)
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    # 4. Compute the ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    
    # 5. Extract the 'rouge1', 'rouge2', 'rougeL' scores
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
import os
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from transformers.trainer_utils import get_last_checkpoint

# --------- Checkpointing / Resume config (Colab-safe) ---------
# Make sure you've run the Drive mount cell before this.
DRIVE_BASE = "/content/drive/MyDrive/text-summarization"
RUN_NAME = "t5_small_no_peft_upscaled"

# All checkpoints will be saved under this folder on Drive
output_dir = os.path.join(DRIVE_BASE, RUN_NAME, "checkpoints")
final_model_dir = os.path.join(DRIVE_BASE, RUN_NAME, "final_model")
metrics_path = os.path.join(DRIVE_BASE, RUN_NAME, "upscaled_metrics.json")

os.makedirs(output_dir, exist_ok=True)
os.makedirs(final_model_dir, exist_ok=True)
print("Checkpoint output_dir:", output_dir)

# 1. Define the Training Arguments for the upscaled run
training_kwargs = dict(
    output_dir=output_dir,

    # --- Critical Changes for Upscaling ---
    eval_strategy="no",                    # some versions use `evaluation_strategy` (handled below)
    report_to="none",
    logging_steps=200,
    # -------------------------------------

    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,

    # --- NEW: save checkpoints periodically ---
    save_strategy="steps",
    save_steps=1000,                        # adjust based on runtime (e.g., 500/1000/2000)
    save_total_limit=3,
    # -----------------------------------------
 )

# Compatibility: older Transformers uses `evaluation_strategy` instead of `eval_strategy`
try:
    training_args = Seq2SeqTrainingArguments(**training_kwargs)
except TypeError as e:
    if "eval_strategy" in training_kwargs:
        training_kwargs["evaluation_strategy"] = training_kwargs.pop("eval_strategy")
        training_args = Seq2SeqTrainingArguments(**training_kwargs)
    else:
        raise e

# 2. Create the Data Collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 3. Initialize the Trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
 )

# 4. Start / Resume Training
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
if last_checkpoint is not None:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
else:
    print("No checkpoint found. Starting training from scratch.")

print("Starting training...")
trainer.train(resume_from_checkpoint=last_checkpoint)
print("Training complete!")

# 5. Save your final model (also on Drive)
trainer.save_model(final_model_dir)
print("Final model saved to:", final_model_dir)

Checkpoint output_dir: /content/drive/MyDrive/text-summarization/t5_small_no_peft_upscaled/checkpoints
Resuming training from checkpoint: /content/drive/MyDrive/text-summarization/t5_small_no_peft_upscaled/checkpoints/checkpoint-28000
Starting training...


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Step,Training Loss
28200,1.829778
28400,1.850463
28600,1.865371
28800,1.840033
29000,1.822613
29200,1.820055
29400,1.838468
29600,1.828364
29800,1.858482
30000,1.835651


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Training complete!


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Final model saved to: /content/drive/MyDrive/text-summarization/t5_small_no_peft_upscaled/final_model


In [15]:
import nltk

# Make downloads deterministic in Colab
NLTK_DIR = "/content/nltk_data"
output_dir = os.path.join(DRIVE_BASE, RUN_NAME, NLTK_DIR)
os.makedirs(output_dir, exist_ok=True)
nltk.data.path.append(output_dir)

nltk.download("punkt", download_dir=output_dir)
# Newer NLTK may require this as well
nltk.download("punkt_tab", download_dir=output_dir)

print("NLTK data paths:", nltk.data.path)

NLTK data paths: ['/root/nltk_data', '/usr/nltk_data', '/usr/share/nltk_data', '/usr/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/content/nltk_data', '/content/nltk_data', '/content/nltk_data']


[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /content/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [16]:
# --- Run Final Evaluation ---

print("Running final evaluation on the validation examples...")
print("This may take several minutes as it generates summaries.")

# This command tells the trainer to run the evaluation
# on the 'eval_dataset' one time.
metrics = trainer.evaluate()

print("\n--- FINAL ROUGE SCORES ---")
print(metrics)

# Save scores to Drive (path is defined in the training cell)
import json
with open(metrics_path, "w") as f:
    json.dump(metrics, f)
print("Metrics saved to:", metrics_path)

Running final evaluation on the validation examples...
This may take several minutes as it generates summaries.

--- FINAL ROUGE SCORES ---
{'eval_loss': 1.6220036745071411, 'eval_rouge1': 25.6728, 'eval_rouge2': 12.6292, 'eval_rougeL': 21.0744, 'eval_rougeLsum': 24.1449, 'eval_runtime': 88.3245, 'eval_samples_per_second': 11.322, 'eval_steps_per_second': 1.415, 'epoch': 3.0}
Metrics saved to: /content/drive/MyDrive/text-summarization/t5_small_no_peft_upscaled/upscaled_metrics.json


In [17]:
import nltk
from datasets import load_dataset
from tqdm.auto import tqdm # Shows a progress bar

nltk.download('punkt') # Make sure the sentence tokenizer is downloaded

# Load your (non-tokenized) eval dataset again
eval_dataset = dataset["validation"].shuffle(seed=42).select(range(500))

# 1. The Lead-3 Function
def lead3_summarizer(article):
    # Split into sentences
    sentences = nltk.sent_tokenize(article)
    # Get the first 3
    lead_3 = " ".join(sentences[:3])
    return lead_3

# 2. Get all predictions and labels
print("Generating Lead-3 summaries...")
lead_3_predictions = [lead3_summarizer(article) for article in tqdm(eval_dataset["article"])]
human_labels = [summary for summary in tqdm(eval_dataset["highlights"])]

# 3. Get the ROUGE score
# We can't use the full 'compute_metrics' function as-is 
# because it's designed for the Trainer. We'll call the metric directly.

# This is the same ROUGE metric from your Step 5
from evaluate import load
rouge = load("rouge")

# Prepare for ROUGE (it likes newlines between sentences)
decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in lead_3_predictions]
decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in human_labels]

# Calculate ROUGE
result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
result = {key: value * 100 for key, value in result.items()}
result = {k: round(v, 4) for k, v in result.items()}

print("\n--- LEAD-3 BASELINE ROUGE SCORES ---")
print(result)

# Your T5-Small model (25.9 ROUGE-1) is now "Baseline 2"

Generating Lead-3 summaries...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]


--- LEAD-3 BASELINE ROUGE SCORES ---
{'rouge1': np.float64(41.8318), 'rouge2': np.float64(19.3325), 'rougeL': np.float64(26.4416), 'rougeLsum': np.float64(38.1535)}


In [18]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import os
import torch

# Set the device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your fine-tuned model and tokenizer from Drive
DRIVE_BASE = "/content/drive/MyDrive/text-summarization"
RUN_NAME = "t5_small_no_peft_upscaled"
model_path = os.path.join(DRIVE_BASE, RUN_NAME, "final_model")

tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)

# Define a new, unseen article to summarize
ARTICLE_TO_SUMMARIZE = (
    "In a significant advancement for artificial intelligence, researchers today "
    "announced the development of a new algorithm that can learn from "
    "vastly smaller datasets. This breakthrough, named 'Sparse Learning', "
    "could democratize AI by allowing smaller companies and individuals to "
    "build powerful models without the need for massive computational resources. "
    "The algorithm works by identifying and focusing on the most critical "
    "pieces of information, ignoring redundant data, which leads to faster "
    "training times and reduced computational cost."
 )

# Prepare the input text (add the T5 prefix)
text = "summarize: " + ARTICLE_TO_SUMMARIZE

# Tokenize the text and move tensors to the device
inputs = tokenizer(
    text,
    return_tensors="pt",
    max_length=1024,
    truncation=True,
 ).to(device)

# Generate the summary
summary_ids = model.generate(
    inputs["input_ids"],
    num_beams=4,
    max_length=150,
    min_length=30,
    early_stopping=True,
 )

# Decode the generated IDs back to text
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("\n--- ORIGINAL ARTICLE ---")
print(ARTICLE_TO_SUMMARIZE)
print("\n--- GENERATED SUMMARY ---")
print(summary)

Loading weights:   0%|          | 0/131 [00:00<?, ?it/s]


--- ORIGINAL ARTICLE ---
In a significant advancement for artificial intelligence, researchers today announced the development of a new algorithm that can learn from vastly smaller datasets. This breakthrough, named 'Sparse Learning', could democratize AI by allowing smaller companies and individuals to build powerful models without the need for massive computational resources. The algorithm works by identifying and focusing on the most critical pieces of information, ignoring redundant data, which leads to faster training times and reduced computational cost.

--- GENERATED SUMMARY ---
'Sparse Learning' could democratize AI by allowing smaller companies and individuals to build powerful models without the need for massive computational resources.
