T5-Large

In [None]:
#!/usr/bin/env python3
# Complete T5-Large Summarization Pipeline with Fixed Metric Loading
import torch
import evaluate
import nltk
import numpy as np
import os
import shutil
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

# --------------------------------------------
# 1. Setup Environment
# --------------------------------------------
print(" Setting up environment...")
nltk.download("punkt", quiet=True)

# Delete entries in the working directory that are named like the ROUGE
# packages (presumably so they cannot be picked up instead of the real
# metric implementation -- confirm against how the metric is resolved).
# `filter` is lazy, so each path is checked right before it is removed.
for stale_path in filter(os.path.exists, ("rouge", "rouge_score")):
    print(f" Removing conflicting path: {stale_path}")
    shutil.rmtree(stale_path, ignore_errors=True)

# --------------------------------------------
# 2. Hardware Configuration
# --------------------------------------------
# Probe CUDA once and reuse the answer for every hardware-dependent decision.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
print(f" Using device: {device}")

# bf16 support is a property of the current CUDA device, so it is only
# queried when such a device exists; on a CPU-only host it is reported as
# unsupported instead of touching the CUDA runtime.
bf16_supported = bool(cuda_available and torch.cuda.is_bf16_supported())
print(f"BF16 Supported: {bf16_supported}")
if cuda_available:
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB")

# --------------------------------------------
# 3. Load and Prepare Dataset
# --------------------------------------------
print("\n Loading dataset...")
try:
    dataset = load_dataset("cnn_dailymail", "3.0.0")

    # Keep only the source text and its reference summary.
    wanted_columns = ("article", "highlights")
    extra_columns = [
        name for name in dataset["train"].column_names if name not in wanted_columns
    ]
    dataset = dataset.remove_columns(extra_columns)

    # Reduced dataset for quick execution: keep the first N rows of each split.
    for split_name, row_count in (("train", 1000), ("validation", 200), ("test", 100)):
        dataset[split_name] = dataset[split_name].select(range(row_count))

    print(" Dataset samples:")
    n_train = len(dataset["train"])
    n_val = len(dataset["validation"])
    n_test = len(dataset["test"])
    print(f"Train: {n_train} | Val: {n_val} | Test: {n_test}")
except Exception as err:
    raise RuntimeError(f"Failed to load dataset: {err}")

# --------------------------------------------
# 4. Initialize Tokenizer and Model
# --------------------------------------------
print("\n Loading model...")
model_name = "t5-large"
try:
    # Tokenizer first, then the model weights, which are moved to the
    # device selected in the hardware section.
    tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    model = model.to(device)
except Exception as err:
    raise RuntimeError(f"Model loading failed: {err}")

# --------------------------------------------
# 5. Data Preprocessing
# --------------------------------------------
max_input_len = 1024   # token cap applied to "summarize: <article>"
max_target_len = 256   # token cap applied to the reference summary

def preprocess(example):
    """Tokenize one batch of articles and reference summaries.

    Intended for ``dataset.map(..., batched=True)``.

    Args:
        example: Mapping with an "article" list and a "highlights" list.
            Falsy articles become an empty string and falsy highlights a
            single space before tokenization.

    Returns:
        The tokenizer output for the prefixed articles, with the token ids
        of the highlights stored under "labels". Sequences are truncated
        to ``max_input_len`` / ``max_target_len`` but not padded.

    Raises:
        Any tokenizer error is propagated so a failing batch stops the
        ``map`` call instead of silently yielding ``None``.
    """
    inputs = ["summarize: " + (article or "") for article in example["article"]]
    highlights = [highlight or " " for highlight in example["highlights"]]

    # No padding here: the DataCollatorForSeq2Seq passed to the trainer pads
    # every batch to its own longest sequence. Padding labels to max_length
    # at this point would store pad-token ids as targets and make the loss
    # count padding positions.
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_len,
        truncation=True,
    )
    labels = tokenizer(
        text_target=highlights,
        max_length=max_target_len,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("\n Tokenizing dataset...")
# Run `preprocess` over every split in batches of 128 rows and drop the raw
# columns so only the tokenizer outputs remain.
raw_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    batch_size=128,
    remove_columns=raw_columns,
)

# --------------------------------------------
# 6. Metrics Setup (Fixed Version)
# --------------------------------------------
print("\n Setting up metrics...")


class RougeWrapper:
    """Expose a ``rouge_score`` scorer through an ``evaluate``-style ``compute``.

    The scorer is stored on the instance so that rebinding the module-level
    name ``rouge`` to the wrapper cannot hide the scorer from ``compute``.
    """

    def __init__(self, scorer):
        # Object providing ``score(target, prediction)``.
        self._scorer = scorer

    def compute(self, predictions, references, **kwargs):
        """Return the mean F-measure of rouge1 / rouge2 / rougeL.

        Args:
            predictions: Iterable of predicted strings.
            references: Iterable of reference strings, or lists of reference
                strings (only the first entry of a list is used).
            **kwargs: Accepted and ignored for call compatibility.

        Returns:
            Dict with keys "rouge1", "rouge2", "rougeL"; all zero when there
            are no prediction/reference pairs.
        """
        totals = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}
        count = 0
        for pred, ref in zip(predictions, references):
            if isinstance(ref, list):
                # Take first reference if multiple; an empty list scores as "".
                ref = ref[0] if ref else ""
            # The scorer takes the reference (target) first.
            score = self._scorer.score(ref, pred)
            for key in totals:
                totals[key] += score[key].fmeasure
            count += 1
        if count == 0:
            return totals
        return {key: total / count for key, total in totals.items()}


class _ZeroMeteor:
    """Stand-in metric that always reports a METEOR score of 0."""

    def compute(self, *args, **kwargs):
        return {"meteor": 0}


try:
    # First try loading from evaluate
    rouge = evaluate.load("rouge")
except Exception:
    # Fallback to the rouge-score package
    print(" Failed to load ROUGE from evaluate, trying alternative approach...")
    try:
        try:
            from rouge_score import rouge_scorer
        except ImportError:
            # Install through the running interpreter so this also works
            # outside a notebook (no IPython "!" shell escape).
            import importlib
            import subprocess
            import sys

            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "-q", "rouge-score"]
            )
            importlib.invalidate_caches()
            from rouge_score import rouge_scorer

        rouge = RougeWrapper(
            rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        )
    except Exception as e:
        raise RuntimeError(f"Failed to load ROUGE metric: {e}") from e

try:
    meteor = evaluate.load("meteor")
except Exception:
    print(" METEOR not available, using dummy metric")
    meteor = _ZeroMeteor()

def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with ROUGE and METEOR.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first item holds
            the token ids.

    Returns:
        Dict with "rouge1", "rouge2", "rougeL" and "meteor", each rounded to
        four decimals. If anything fails, the error is printed and all four
        values are reported as 0 so evaluation does not abort the run.
    """
    try:
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]

        # -100 marks ignored/padded positions and is not a valid token id,
        # so swap it for the pad token before decoding. Predictions get the
        # same treatment as labels in case they were padded with -100 too.
        pad_id = tokenizer.pad_token_id
        preds = np.where(preds != -100, preds, pad_id)
        labels = np.where(labels != -100, labels, pad_id)

        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Handle single reference vs multiple references
        if decoded_labels and isinstance(decoded_labels[0], str):
            decoded_labels = [[label] for label in decoded_labels]

        rouge_result = rouge.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            use_stemmer=True
        )
        meteor_result = meteor.compute(
            predictions=decoded_preds,
            references=decoded_labels
        )

        return {
            "rouge1": round(rouge_result.get("rouge1", 0), 4),
            "rouge2": round(rouge_result.get("rouge2", 0), 4),
            "rougeL": round(rouge_result.get("rougeL", 0), 4),
            "meteor": round(meteor_result.get("meteor", 0), 4)
        }
    except Exception as e:
        print(f" Metric computation failed: {e}")
        return {"rouge1": 0, "rouge2": 0, "rougeL": 0, "meteor": 0}

# --------------------------------------------
# 7. Training Configuration
# --------------------------------------------
print("\n Configuring training...")

# Mixed precision is only requested when a CUDA device is present: bf16 when
# the device supports it, fp16 otherwise. On a CPU-only host both stay off
# instead of fp16 being switched on by "not bf16".
use_bf16 = bool(torch.cuda.is_available() and torch.cuda.is_bf16_supported())
use_fp16 = torch.cuda.is_available() and not use_bf16

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_summarization",
    eval_strategy="epoch",
    learning_rate=4e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    save_total_limit=2,
    predict_with_generate=True,
    # Cap evaluation-time generation at the same length as the tokenized
    # targets rather than relying on the model's default generation length.
    generation_max_length=max_target_len,
    bf16=use_bf16,
    fp16=use_fp16,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # Save and evaluate on the same schedule so the best checkpoint
    # (by rougeL) can be restored at the end.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    gradient_accumulation_steps=2,
    gradient_checkpointing=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics
)

# --------------------------------------------
# 8. Execute Training
# --------------------------------------------
print("\n Starting training...")
try:
    # Only the training call is guarded; reporting happens after it succeeds.
    train_result = trainer.train()
except Exception as e:
    # Chain the original exception so its traceback stays attached.
    raise RuntimeError(f"Training failed: {e}") from e
print(f"\n Training completed! Final metrics: {train_result.metrics}")

# --------------------------------------------
# 9. Save and Evaluate
# --------------------------------------------
print("\n Saving model...")
# Model and tokenizer are written to the same directory.
trainer.save_model("./t5_summarization_final")
tokenizer.save_pretrained("./t5_summarization_final")

print("\n Evaluating on test set...")
test_results = trainer.evaluate(
    eval_dataset=tokenized_dataset["test"],
    metric_key_prefix="test"
)
print("\n Test Results:")
for key, value in test_results.items():
    # Report every numeric metric. Integer values (for example the 0 that
    # compute_metrics returns when scoring fails) used to be skipped by a
    # float-only check and silently vanished from the report.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        print(f"{key}: {value:.4f}")

# --------------------------------------------
# 10. Generate Sample Summaries
# --------------------------------------------
def generate_summary(text, max_length=256, num_beams=4):
    """Summarize ``text`` with the module-level model using beam search.

    Args:
        text: Article text; "summarize: " is prepended and the input is
            truncated to ``max_input_len`` tokens.
        max_length: ``max_length`` passed to ``model.generate``.
        num_beams: Number of beams passed to ``model.generate``.

    Returns:
        The decoded summary string. On any failure the function does not
        raise; it returns a string starting with "[ERROR] Generation failed:".
    """
    try:
        # Follow the model's actual device instead of assuming it still
        # matches the module-level `device`.
        inputs = tokenizer(
            "summarize: " + text,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_len
        ).to(model.device)

        # Generate in eval mode, then restore the previous mode so a caller
        # in the middle of training is not affected.
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_length=max_length,
                    num_beams=num_beams,
                    early_stopping=True
                )
        finally:
            if was_training:
                model.train()
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"[ERROR] Generation failed: {e}"

print("\n Sample Summaries:")
separator = "\n" + "="*80
# Show the first three test articles next to their reference and generated
# summaries.
for idx in range(3):
    record = dataset["test"][idx]
    article_text = record["article"]
    reference_summary = record["highlights"]

    print(f"\n Article {idx+1} (excerpt):")
    print(article_text[:200] + "...")
    print("\n Reference Summary:")
    print(reference_summary)
    print("\n Generated Summary:")
    print(generate_summary(article_text))
    print(separator)

print("\n Pipeline executed successfully!")

 Setting up environment...
 Using device: cuda
BF16 Supported: True
GPU Memory: 42.5GB

 Loading dataset...
 Dataset samples:
Train: 1000 | Val: 200 | Test: 100

 Loading model...

⏳ Tokenizing dataset...

 Setting up metrics...
 Failed to load ROUGE from evaluate, trying alternative approach...
 METEOR not available, using dummy metric

 Configuring training...

 Starting training...


  trainer = Seq2SeqTrainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Meteor
1,0.3116,0.300751,0,0,0,0


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


 Metric computation failed: 'RougeWrapper' object has no attribute 'score'


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



 Training completed! Final metrics: {'train_runtime': 276.9959, 'train_samples_per_second': 3.61, 'train_steps_per_second': 0.451, 'total_flos': 4330094592000000.0, 'train_loss': 0.5249381093978882, 'epoch': 1.0}

 Saving model...

 Evaluating on test set...


 Metric computation failed: 'RougeWrapper' object has no attribute 'score'

 Test Results:
test_loss: 0.3205
test_runtime: 37.5028
test_samples_per_second: 2.6660
test_steps_per_second: 0.6670
epoch: 1.0000

 Sample Summaries:

 Article 1 (excerpt):
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territor...

 Reference Summary:
Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

 Generated Summary:
Palestinian Authority officially becomes 123rd member of International Criminal Court . Court has jurisdiction over alleged crimes committed in Palestinian territories . Israel and the United States, neither of which is an ICC member, opposed the move .


 Article 2 (ex