**Appraoch & Highlights Of This Notebook**

Model & Training Startegy For Summarisation - T5 Small Fine Tuning Without PEFT<br>

In [None]:
# Install the essential libraries
!pip install -U pip
!pip install transformers datasets evaluate rouge_score
!pip install torch torchvision torchaudio #--index-url https://download.pytorch.org/whl/cu121  # adjust for your CUDA/CPU
!pip install transformers datasets peft accelerate evaluate rouge-score nltk sentencepiece



In [3]:
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"MPS available: {torch.backends.mps.is_available()}")
print(f"MPS built: {torch.backends.mps.is_built()}")

# Set device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

PyTorch version: 2.10.0
MPS available: True
MPS built: True
Using device: mps


In [4]:
from datasets import load_dataset

# This one command downloads, splits, and structures the data
dataset = load_dataset("cnn_dailymail", "3.0.0")
# https://huggingface.co/datasets/ccdv/cnn_dailymail

# You will now have a 'DatasetDict' with all three splits
print(dataset)

#small_train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))
#small_eval_dataset = dataset["validation"].shuffle(seed=42).select(range(200))

#small_train_dataset = dataset["train"].shuffle(seed=42).select(range(20000))
#small_eval_dataset = dataset["validation"].shuffle(seed=42).select(range(500))

small_train_dataset = dataset["train"].shuffle(seed=42).select(range(100000))
small_eval_dataset = dataset["validation"].shuffle(seed=42).select(range(1000))


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# 1. Choose the model checkpoint
model_checkpoint = "t5-small"

# 2. Load the tokenizer
# The tokenizer will turn your text into 'input_ids'
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

# 3. Load the model
# T5ForConditionalGeneration is the T5 model for tasks like summarization
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

Loading weights: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 131/131 [00:00<00:00, 1990.92it/s, Materializing param=shared.weight]                                                      


In [6]:
# The prefix tells T5 what task to perform
prefix = "summarize: "

# Define the preprocessing function
def preprocess_function(examples):
    # 1. Add the prefix to all articles
    inputs = [prefix + doc for doc in examples["article"]]

    # 2. Tokenize the articles (our inputs)
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # 3. Tokenize the highlights (our labels/targets)
    # UPDATED: No longer using as_target_tokenizer()
    labels= tokenizer(
        text_target=examples["highlights"],
          max_length=128, 
          truncation=True)

    # 4. Set the 'labels' for the model
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



# Apply this function to all splits in our dataset
# 'batched=True' processes multiple examples at once for speed
#tokenized_datasets = dataset.map(preprocess_function, batched=True)

tokenized_train_dataset = small_train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = small_eval_dataset.map(preprocess_function, batched=True)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100000/100000 [00:51<00:00, 1951.88 examples/s]


In [7]:
import evaluate
import nltk
import numpy as np

# Load the ROUGE metric
rouge = evaluate.load("rouge")

# We need to download this for ROUGE to work
nltk.download("punkt", quiet=True)

# The function that will be called to compute metrics
def compute_metrics(eval_pred):
    # 'eval_pred' gives us model predictions and the true labels
    predictions, labels = eval_pred

    # 1. Decode the generated IDs back to text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # 2. Decode the label IDs back to text
    # We replace -100 (which are padding tokens)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # 3. Add newlines for ROUGE (it expects summaries to be on separate lines)
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    # 4. Compute the ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    
    # 5. Extract the 'rouge1', 'rouge2', 'rougeL' scores
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

Using the latest cached version of the module from /Users/mruksad/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--rouge/6e5315f72865c2eaa764c8361360bb938740b9c120a2cf3a7ad218aa0ce452ed (last modified on Wed Feb 11 07:39:47 2026) since it couldn't be found locally at evaluate-metric--rouge, or remotely on the Hugging Face Hub.


In [8]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# 1. Define the Training Arguments for the upscaled run
training_args = Seq2SeqTrainingArguments(
    output_dir="my_t5_summarizer_upscaled", # Use a new directory
    
    # --- Critical Changes for Upscaling ---
    eval_strategy="no",                   # 1. DO NOT evaluate during training
    report_to="none",                     # 2. Keep our fix from before
    logging_steps=200,                    # 3. Just log progress
    # -------------------------------------

    learning_rate=2e-5,
    per_device_train_batch_size=8,   
    per_device_eval_batch_size=8,    
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,              
    predict_with_generate=True,      
    fp16=True,                       
    push_to_hub=False,
)

# 2. Create the Data Collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 3. Initialize the Trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,  
    eval_dataset=tokenized_eval_dataset,    
    data_collator=data_collator,
    compute_metrics=compute_metrics,      
)

# 4. Start Training! ðŸš€
# This will take 1-2 hours on a Kaggle GPU, but it will not stop.
print("Starting upscaled training...")
trainer.train()
print("Training complete!")

# 5. Save your final model
trainer.save_model("my_final_t5_model_upscaled")
print("Upscaled model saved successfully!")

Starting upscaled training...


  super().__init__(loader)


Step,Training Loss
200,2.182427
400,0.0
600,0.0
800,0.0
1000,0.0
1200,0.0
1400,0.0
1600,0.0
1800,0.0
2000,0.0


Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  5.91it/s]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  5.97it/s]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  3.72it/s]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  4.96it/s]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  5.25it/s]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  9.09it/s]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  4.96it/s]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  9.06it/s]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  7.48it/s]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  7.96it/s]
Writing model shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  7.15it/s]
Writing model shards: 100%|â–ˆâ–

RuntimeError: MPS backend out of memory (MPS allocated: 18.46 GiB, other allocations: 11.50 GiB, max allowed: 30.19 GiB). Tried to allocate 256.00 MiB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [7]:
# --- NEW STEP: Run Final Evaluation ---

print("Running final evaluation on the 500 validation examples...")
print("This may take 5-15 minutes as it generates summaries.")

# This command tells the trainer to run the evaluation
# on the 'eval_dataset' one time.
metrics = trainer.evaluate()

print("\n--- FINAL ROUGE SCORES ---")
print(metrics)

# This will save your scores to a file for your report
import json
with open("upscaled_metrics.json", "w") as f:
    json.dump(metrics, f)

Running final evaluation on the 500 validation examples...
This may take 5-15 minutes as it generates summaries.





--- FINAL ROUGE SCORES ---
{'eval_loss': 1.667940378189087, 'eval_rouge1': 25.9107, 'eval_rouge2': 12.897, 'eval_rougeL': 21.4397, 'eval_rougeLsum': 24.4141, 'eval_runtime': 38.0104, 'eval_samples_per_second': 13.154, 'eval_steps_per_second': 0.842, 'epoch': 3.0}


In [9]:
import nltk
from datasets import load_dataset
from tqdm.auto import tqdm # Shows a progress bar

nltk.download('punkt') # Make sure the sentence tokenizer is downloaded

# Load your (non-tokenized) eval dataset again
eval_dataset = dataset["validation"].shuffle(seed=42).select(range(500))

# 1. The Lead-3 Function
def lead3_summarizer(article):
    # Split into sentences
    sentences = nltk.sent_tokenize(article)
    # Get the first 3
    lead_3 = " ".join(sentences[:3])
    return lead_3

# 2. Get all predictions and labels
print("Generating Lead-3 summaries...")
lead_3_predictions = [lead3_summarizer(article) for article in tqdm(eval_dataset["article"])]
human_labels = [summary for summary in tqdm(eval_dataset["highlights"])]

# 3. Get the ROUGE score
# We can't use the full 'compute_metrics' function as-is 
# because it's designed for the Trainer. We'll call the metric directly.

# This is the same ROUGE metric from your Step 5
from evaluate import load
rouge = load("rouge")

# Prepare for ROUGE (it likes newlines between sentences)
decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in lead_3_predictions]
decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in human_labels]

# Calculate ROUGE
result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
result = {key: value * 100 for key, value in result.items()}
result = {k: round(v, 4) for k, v in result.items()}

print("\n--- LEAD-3 BASELINE ROUGE SCORES ---")
print(result)

# Your T5-Small model (25.9 ROUGE-1) is now "Baseline 2"

Generating Lead-3 summaries...


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]


--- LEAD-3 BASELINE ROUGE SCORES ---
{'rouge1': 41.8318, 'rouge2': 19.3325, 'rougeL': 26.4416, 'rougeLsum': 38.1535}


In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Set the device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load your fine-tuned model and tokenizer
# This is the directory where 'trainer.save_model()' saved your files
model_path = "my_final_t5_model_upscaled" 
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device) # Move model to GPU

# 2. Define a new, unseen article to summarize
ARTICLE_TO_SUMMARIZE = (
    "In a significant advancement for artificial intelligence, researchers today "
    "announced the development of a new algorithm that can learn from "
    "vastly smaller datasets. This breakthrough, named 'Sparse Learning', "
    "could democratize AI by allowing smaller companies and individuals to "
    "build powerful models without the need for massive computational resources. "
    "The algorithm works by identifying and focusing on the most critical "
    "pieces of information, ignoring redundant data, which leads to faster "
    "training times and reduced computational cost."
)

# 3. Prepare the input text (add the T5 prefix)
text = "summarize: " + ARTICLE_TO_SUMMARIZE

# 4. Tokenize the text and move tensors to the GPU
inputs = tokenizer(
    text, 
    return_tensors="pt",  # Return PyTorch tensors
    max_length=1024,      # Max input length (matches what we trained on)
    truncation=True
).to(device) # Move the input tensors to the GPU

# 5. Generate the summary
# model.generate() is the main function for inference
# We use 'num_beams' for beam search to get a better quality summary
summary_ids = model.generate(
    inputs["input_ids"], 
    num_beams=4,       # Number of "paths" to explore
    max_length=150,    # Set a max length for the output summary
    min_length=30,     # Set a min length
    early_stopping=True # Stop when the model is confident the summary is done
)

# 6. Decode the generated IDs back to text
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# 7. Print the results
print("\n--- ORIGINAL ARTICLE ---")
print(ARTICLE_TO_SUMMARIZE)
print("\n--- GENERATED SUMMARY ---")
print(summary)


--- ORIGINAL ARTICLE ---
In a significant advancement for artificial intelligence, researchers today announced the development of a new algorithm that can learn from vastly smaller datasets. This breakthrough, named 'Sparse Learning', could democratize AI by allowing smaller companies and individuals to build powerful models without the need for massive computational resources. The algorithm works by identifying and focusing on the most critical pieces of information, ignoring redundant data, which leads to faster training times and reduced computational cost.

--- GENERATED SUMMARY ---
This breakthrough could democratize AI by allowing smaller companies and individuals to build powerful models without the need for massive computational resources. The algorithm works by identifying and focusing on the most critical pieces of information, ignoring redundant data, which leads to faster training times and reduced computational cost.
