In [None]:
from datasets import load_dataset
# Load the SAMSum dataset; without a `split` argument every available split is returned.
data = load_dataset(path="knkarthick/samsum")

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Checkpoint shared by the tokenizer and the seq2seq model.
model_path = "google/pegasus-cnn_dailymail"
summarize_tokenizer = AutoTokenizer.from_pretrained(model_path)

# Use the GPU when one is visible, otherwise stay on the CPU.
device = "cpu" if not torch.cuda.is_available() else "cuda"

# Load the weights and place them on the selected device in one step.
summrize_model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

In [None]:
def convert_examples_to_features(batch, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of dialogue/summary records into seq2seq features.

    Args:
        batch: Mapping with parallel "dialogue" and "summary" lists, as handed
            over by ``Dataset.map(..., batched=True)``.
        max_input_length: Token budget per dialogue; longer inputs are truncated.
        max_target_length: Token budget per summary; longer targets are truncated.

    Returns:
        The tokenizer output for the dialogues (plain Python lists) with an
        additional "labels" entry holding the summary token ids.

    Sequences are deliberately left unpadded. Padding here to a fixed length
    would store pad-token ids inside "labels" (so the loss would be computed on
    padding) and would force every batch to the maximum length. Padding is left
    to the data collator, which pads per batch and is expected to fill labels
    with the ignore index (-100).
    """
    # Coerce every field to str; missing values become "" so the tokenizer never sees None.
    dialogues = ["" if dialogue is None else str(dialogue) for dialogue in batch["dialogue"]]
    summaries = ["" if summary is None else str(summary) for summary in batch["summary"]]

    # Encode the source dialogues (truncate only, no padding).
    model_inputs = summarize_tokenizer(
        dialogues,
        max_length=max_input_length,
        truncation=True,
    )

    # Encode the reference summaries through the tokenizer's target-side entry point.
    labels = summarize_tokenizer(
        text_target=summaries,
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
# Apply the same batched tokenization to each split, dropping the raw text columns afterwards.
tokenized_dataset_train, tokenized_dataset_test, tokenized_dataset_validation = (
    data[split_name].map(
        convert_examples_to_features,
        batched=True,
        remove_columns=["id", "dialogue", "summary"],
    )
    for split_name in ("train", "test", "validation")
)


In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
# `evaluate` is imported by the metrics cell; `rouge_score` backs its ROUGE metric.
%pip install -q evaluate rouge_score

In [None]:
import evaluate
import numpy as np
# ROUGE metric object from the `evaluate` library; compute_metrics calls its compute() method.
rouge = evaluate.load(path="rouge")
def compute_metrics(eval_pred):
    """Score one evaluation pass with ROUGE.

    Args:
        eval_pred: ``(predictions, labels)`` pair as supplied by the Trainer.
            ``predictions`` may be token ids (expected shape: batch x seq),
            raw logits (expected shape: batch x seq x vocab), or a tuple whose
            first element is one of those. ``labels`` holds token ids where
            -100 marks positions to ignore.

    Returns:
        dict mapping every key reported by the ROUGE metric to its score
        scaled to 0-100, plus "gen_len" (mean token count of the decoded
        predictions); all values are rounded to 4 decimals.

    Note:
        When logits are received they are reduced with arg-max. With a plain
        ``Trainer`` those logits come from a teacher-forced forward pass, so
        the resulting scores are a proxy rather than true generation quality.
    """
    predictions, labels = eval_pred
    pad_id = summarize_tokenizer.pad_token_id

    # Model outputs can arrive as a tuple; the first entry carries the logits / ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # Raw logits -> greedy token ids.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 is used as filler both in labels and when batches of different
    # lengths are gathered; it is not a real token id and cannot be decoded.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = summarize_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = summarize_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip surrounding whitespace so it does not affect the scores.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

    # Report ROUGE as percentages.
    result = {key: value * 100 for key, value in result.items()}

    # Average length of the predictions, measured by re-encoding the decoded text.
    prediction_lens = [len(summarize_tokenizer.encode(pred)) for pred in decoded_preds]
    result["gen_len"] = float(np.mean(prediction_lens)) if prediction_lens else 0.0

    return {k: round(float(v), 4) for k, v in result.items()}

In [None]:
# Data collator for dynamic padding
from transformers import DataCollatorForSeq2Seq
# Batch-level collator: pads each batch on the fly and returns PyTorch tensors.
# The model is passed in as well so the collator can use it when assembling batches.
data_collator = DataCollatorForSeq2Seq(
    summarize_tokenizer,
    model=summrize_model,
    return_tensors="pt",
    padding=True,
)

In [None]:
from transformers import TrainingArguments,Trainer
training_args = TrainingArguments(
    output_dir="./pegasus-samsum",           # Output directory
    eval_strategy="steps",                   # Evaluate every N steps
    eval_steps=500,                          # Evaluation frequency
    logging_strategy="steps",                # Log every N steps
    logging_steps=100,                       # Logging frequency
    save_strategy="steps",                   # Save every N steps (kept aligned with eval_steps)
    save_steps=500,                          # Save frequency
    save_total_limit=3,                      # Keep only 3 checkpoints
    load_best_model_at_end=True,             # Load best model at end
    metric_for_best_model="rouge1",          # Metric to determine best model
    greater_is_better=True,                  # Higher ROUGE is better

    # Training hyperparameters - CUSTOMIZE THESE
    num_train_epochs=3,                      # Start with 1-2 epochs for testing
    per_device_train_batch_size=2,           # Reduce to 1-2 if GPU memory is limited
    per_device_eval_batch_size=2,            # Match train batch size
    gradient_accumulation_steps=8,           # Increase if reducing batch size
    warmup_steps=500,                        # Fixed warm-up length; use warmup_ratio=0.1 for 10% of total steps
    learning_rate=5e-5,                      # Try 3e-5 or 1e-4 if convergence is slow
    weight_decay=0.01,                       # Regularization

    # Memory optimization - IMPORTANT FOR LIMITED GPU MEMORY
    # Mixed precision needs a CUDA device. Enabling it unconditionally is rejected
    # on CPU-only machines, which the model-loading cell explicitly supports.
    # NOTE(review): Pegasus checkpoints have reportedly produced NaN/inf losses
    # under fp16 - watch the training loss and disable if that happens.
    fp16=torch.cuda.is_available(),          # Use mixed precision only when a GPU is present
    gradient_checkpointing=True,             # Trade compute for memory
    dataloader_pin_memory=False,             # Disable if having memory issues
    remove_unused_columns=False,             # Keep all columns

    # Logging and reporting
    report_to="none",                        # Change to "wandb" if you want tracking
    run_name="pegasus-samsum-finetuning",    # Run name
)

In [None]:
def _greedy_token_ids(logits, labels):
    """Reduce evaluation logits to arg-max token ids before they are accumulated.

    Without this hook the Trainer keeps the full-vocabulary logits of every
    evaluation example in memory until metrics are computed.

    Args:
        logits: Model outputs for one evaluation batch; for seq2seq models this
            may be a tuple whose first element is expected to be the LM logits.
        labels: Labels of the batch (unused; required by the hook signature).

    Returns:
        Tensor of token ids obtained by arg-max over the last dimension.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Train on the training split (the test split must stay unseen during training)
# and monitor quality on the validation split.
# NOTE(review): newer transformers releases rename `tokenizer=` to
# `processing_class=` - confirm against the installed version.
summarize_trainer = Trainer(
    model=summrize_model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_validation,
    tokenizer=summarize_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_greedy_token_ids,
)

In [None]:
# Run fine-tuning with the Trainer configured above. As the last expression of
# the cell, the value returned by train() is shown as the cell output.
summarize_trainer.train()

In [None]:
from transformers import pipeline
# Wrap the fine-tuned model and its tokenizer in a summarization pipeline for a quick smoke test.
tester = pipeline(
    "summarization",
    model=summrize_model,
    tokenizer=summarize_tokenizer,
)

# Summarize a short hand-written dialogue; the result is displayed as the cell output.
tester.predict(
    "Olivia: Who are you voting for in this election? \n"
    "Oliver: Liberals as always.\n"
    "Olivia: Me too!!\n"
    "Oliver: Great"
)

In [None]:
# Peek at the dialogue text of the second training example (row index 1).
data["train"][1]["dialogue"]