In [None]:
# Install all required libraries
!pip install transformers[torch] datasets rouge_score nltk evaluate sacrebleu -q

# Import everything needed
import torch
import nltk
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

# Fetch the NLTK sentence-splitting resources ("punkt" and "punkt_tab").
# nltk.sent_tokenize relies on them when summaries are split into sentences
# for the ROUGE metric.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource, quiet=True)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): this import only resolves inside Google Colab, and no cell
# visible in this notebook reads from or writes to the mount (the trainer's
# output_dir is ./results) — confirm whether the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the T5 model and tokenizer
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load the CNN/Daily Mail dataset
dataset = load_dataset("cnn_dailymail", '3.0.0')

# --- IMPORTANT ---
# To make training faster, only a random subset of the training split is used.
# Validation and test use their full (shuffled) splits.
# Set a size to None to use the whole split; lower it for quick experiments.
SAMPLE_SEED = 42
TRAIN_SAMPLE_SIZE = 100_000
VAL_SAMPLE_SIZE = None
TEST_SAMPLE_SIZE = None


def _take_sample(split, size, seed=SAMPLE_SEED):
    """Return `split` shuffled with `seed`, truncated to at most `size` rows.

    Args:
        split: a dataset split supporting `.shuffle(seed=...)`, `.select(...)` and `len()`.
        size: number of rows to keep, or None to keep every row.
        seed: shuffle seed, so the same rows are selected on every run.

    Returns:
        The shuffled split, limited to `size` rows when `size` is not None.
    """
    shuffled = split.shuffle(seed=seed)
    if size is None:
        return shuffled
    # Clamp so a size larger than the split does not raise an index error.
    return shuffled.select(range(min(size, len(shuffled))))


train_sample = _take_sample(dataset["train"], TRAIN_SAMPLE_SIZE)
val_sample = _take_sample(dataset["validation"], VAL_SAMPLE_SIZE)
test_sample = _take_sample(dataset["test"], TEST_SAMPLE_SIZE)

In [None]:
# The T5 model was trained with this specific prefix for summarization tasks
prefix = "summarize: "

def preprocess_function(examples):
    """Tokenizes the articles (inputs) and summaries (labels).

    Args:
        examples: a batch mapping with an "article" list (source texts) and a
            "highlights" list (reference summaries).

    Returns:
        The tokenizer output for the prefixed articles (truncated to 512
        tokens) with an added "labels" entry holding the token ids of the
        summaries (truncated to 128 tokens).
    """
    # Prepare the input text by adding the prefix
    inputs = [prefix + doc for doc in examples["article"]]

    # No padding here: sequences are padded per batch by the data collator.
    # Padding labels to a fixed length at this stage would put pad-token ids
    # into the labels, and the loss would then be computed on padding.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # `text_target` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["highlights"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run the batched tokenization over the train, validation and test samples,
# in that order, and bind each result to its own name.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_sample, val_sample, test_sample)
)

print("âœ… Data preprocessing complete.")

In [None]:
# Metric objects from the 'evaluate' library: ROUGE first, then sacreBLEU.
_METRIC_IDS = ("rouge", "sacrebleu")
rouge_metric, bleu_metric = map(evaluate.load, _METRIC_IDS)

def compute_metrics(eval_pred):
    """Computes ROUGE and BLEU scores for a batch of predictions.

    Args:
        eval_pred: a ``(predictions, labels)`` pair of token-id arrays, as
            handed over by the trainer. ``predictions`` may itself be a tuple,
            in which case its first element is used as the generated ids.

    Returns:
        dict with every ROUGE key (multiplied by 100) plus ``"bleu"`` (the
        ``score`` field of the sacreBLEU result), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the "ignore" index used for padding labels (and for padding
    # predictions of unequal length when batches are gathered). It is not a
    # real token id, so swap it for the pad token before decoding.
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode the generated summaries and reference summaries
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE score expects a newline between sentences
    rouge_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    rouge_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    rouge_result = rouge_metric.compute(predictions=rouge_preds, references=rouge_labels, use_stemmer=True)
    rouge_result = {key: value * 100 for key, value in rouge_result.items()}

    # BLEU score expects a list of references for each prediction
    bleu_labels = [[label] for label in decoded_labels]
    bleu_result = bleu_metric.compute(predictions=decoded_preds, references=bleu_labels)

    # Combine all metrics into one dictionary
    combined_metrics = {**rouge_result, "bleu": bleu_result["score"]}

    return {k: round(v, 4) for k, v in combined_metrics.items()}

In [None]:
# The DataCollator handles dynamic padding of batches
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,      # Crucial for seq2seq tasks
    # Generate up to the same 128-token budget the reference summaries are
    # truncated to. Without this, evaluation falls back to the model's default
    # generation length, which can be much shorter, and ROUGE/BLEU would then
    # be scored on cut-off summaries.
    generation_max_length=128,
    fp16=torch.cuda.is_available(),  # Enable mixed-precision training if on GPU
)

# Initialize the Trainer with all components
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `processing_class` supersedes the deprecated `tokenizer=` argument
    # (requires transformers >= 4.46).
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics, # This is where the evaluation function is passed
)

# Start training!
print("ðŸš€ Starting the fine-tuning process...")
trainer.train()
print("âœ… Fine-tuning complete!")

In [None]:
print("\n--- Testing with a sample article ---")

# Select an article from the test set (fetch the row once)
sample_row = test_sample[25]
article_text = sample_row["article"]
reference_summary = sample_row["highlights"]

# Prepare the input for the model.
# Use the device the model actually lives on instead of guessing from CUDA
# availability, so inputs and weights can never end up on different devices.
device = model.device
inputs = tokenizer(prefix + article_text, return_tensors="pt", max_length=512, truncation=True).to(device)

# Make sure dropout is disabled before generating
model.eval()

# Generate the summary (passing the attention mask along with the input ids)
summary_ids = model.generate(
    **inputs,
    max_length=150,
    min_length=40,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True
)
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the results
print("\nOriginal Article:\n", article_text)
print("\nReference Summary:\n", reference_summary)
print("\nâœ¨ Generated Summary:\n", generated_summary)

In [None]:
# Final held-out evaluation: predict() runs the model over the tokenized test
# set and returns the predictions together with the computed metrics.
print("\n--- Starting Final Evaluation on the Test Set ---")

results = trainer.predict(test_dataset=tokenized_test)

# Header line followed by the metrics dictionary on the next line.
print("\n--- ROUGE & BLEU Evaluation Results ---", results.metrics, sep="\n")