In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer  
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# A bare `device` expression is only echoed when it is the last line of a
# cell, so report the choice explicitly instead of leaving a silent no-op.
print(f"Using device: {device}")

# Hub checkpoint id. The tokenizer and the model weights are loaded from the
# same checkpoint so that token IDs line up with the model's embeddings.
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
from datasets import load_dataset

# Download (or reuse the locally cached copy of) the SAMSum dataset from the
# Hugging Face Hub. If the Hub asks for authentication, log in first,
# e.g. with `huggingface-cli login`.
ds = load_dataset(path="knkarthick/samsum")
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14731
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
})

In [3]:
# Number of examples in each split, in the DatasetDict's iteration order.
split_lengths = [len(ds[split]) for split in ds]
print(f"Split sizes: {split_lengths}")

# Use single quotes inside the replacement field: reusing the f-string's own
# double quotes is a SyntaxError on Python versions before 3.12 (PEP 701).
print(f"Features: {ds['train'].column_names}")

# Show one test example (dialogue plus its reference summary) as a sanity check.
sample = ds["test"][1]
print("\nDialogue:")
print(sample["dialogue"])
print("\nSummary:")
print(sample["summary"])

Split sizes: [14731, 818, 819]
Features: ['id', 'dialogue', 'summary']

Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric and Rob are going to watch a stand-up on youtube.


In [4]:
def convert_example_to_features(example_batch, max_input_length=1024, max_target_length=128):
    """
    Tokenizes a batch of raw dialogue + summary text into the features
    a Seq2Seq transformer model is trained on.

    Input:
        example_batch -> A dictionary containing at least:
            {
                "dialogue": [list of dialogue strings],
                "summary": [list of summary strings]
            }
        max_input_length  -> Dialogues are truncated to this many tokens.
        max_target_length -> Summaries are truncated to this many tokens.

    Output:
        A dictionary of UN-PADDED token-ID lists (one entry per example):
            {
                "input_ids": token IDs for encoder,
                "attention_mask": all ones here (no padding is added yet),
                "labels": token IDs for decoder target
            }

    Padding is deliberately left to the batch collator:
      - Padding every dialogue to max_input_length here would make every
        training batch full-length, even for short chats.
      - Padding labels with the tokenizer's pad id makes the loss count
        padding positions as real targets. Hugging Face models only skip
        label positions equal to -100, which is what DataCollatorForSeq2Seq
        pads labels with by default.
    """

    # -------------------------
    # TOKENIZE THE INPUT TEXT
    # -------------------------

    # Split the dialogues into subword tokens and map them to vocabulary
    # indices. Only truncation is applied; no padding, no tensors, because
    # `datasets.map` stores plain lists anyway.
    input_encodings = tokenizer(
        example_batch['dialogue'],    # Raw dialogue text
        truncation=True,              # Cut off dialogues longer than max_length
        max_length=max_input_length,  # Maximum number of tokens for encoder input
    )

    # -------------------------
    # TOKENIZE THE TARGET TEXT (SUMMARY)
    # -------------------------

    # `text_target=` is the supported replacement for the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    target_encodings = tokenizer(
        text_target=example_batch['summary'],  # Ground-truth summaries
        truncation=True,                       # Truncate if summary too long
        max_length=max_target_length,          # Max length for summary
    )

    # -------------------------
    # RETURN MODEL-READY FEATURES
    # -------------------------

    # input_ids       -> Encoder input (dialogue tokens)
    # attention_mask  -> Marks real tokens (1); padding (0) is added by the collator
    # labels          -> Decoder target tokens (what the model must learn to predict)
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
    }


In [5]:
# Tokenize every split. With batched=True the function receives a dict of
# column lists (many rows at once) instead of one example per call.
ds_samsum_pt = ds.map(function=convert_example_to_features, batched=True)

Map: 100%|██████████| 818/818 [00:00<00:00, 970.15 examples/s]


In [6]:
# Inspect the mapped DatasetDict: each split should now list the tokenized
# columns (input_ids, attention_mask, labels) next to the raw text columns.
ds_samsum_pt

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14731
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
})

In [7]:
# Training

from transformers import DataCollatorForSeq2Seq

# Collator that assembles individual tokenized examples into padded training
# batches for the Seq2Seq model; it is handed to the Trainer below.
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model_pegasus,
)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="pegasus-samsum",        # Folder where model checkpoints, logs, and outputs will be saved
    num_train_epochs=1,                 # Number of full passes over the training dataset
    warmup_steps=500,                   # Ramp the learning rate up over the first 500 optimizer steps
    per_device_train_batch_size=1,      # Training samples processed per device (GPU/CPU) at once
    per_device_eval_batch_size=1,       # Batch size used during evaluation
    weight_decay=0.01,                  # Weight decay to reduce overfitting by penalizing large weights
    logging_steps=10,                   # Log training metrics (like loss) every 10 steps
    eval_strategy="steps",              # Run evaluation during training at fixed step intervals
    eval_steps=500,                     # Perform evaluation every 500 training steps
    save_steps=1_000_000,               # Integer step count; far beyond the run length, so no intermediate checkpoints
    gradient_accumulation_steps=16      # Accumulate gradients over 16 steps before each weight update
)


trainer = Trainer(
    model=model_pegasus,                      # The pretrained PEGASUS Seq2Seq model being fine-tuned
    args=training_args,                       # Training configuration defined above
    processing_class=tokenizer,               # Replaces the deprecated `tokenizer=` argument of Trainer
    data_collator=seq2seq_data_collator,      # Handles dynamic padding and proper batch formatting
    # Train on the 'train' split. Fitting on 'test' leaks the held-out data,
    # and its 819 rows give only ~51 optimizer steps at an effective batch of
    # 16 -- fewer than warmup_steps and eval_steps, so warm-up would never
    # finish and evaluation would never run.
    train_dataset=ds_samsum_pt['train'],
    eval_dataset=ds_samsum_pt['validation']   # Dataset used for evaluation during training
)


  trainer = Trainer(


In [10]:
# Start fine-tuning. Per training_args, the training loss is logged every
# 10 steps and the validation split is evaluated every 500 steps.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 