<a href="https://colab.research.google.com/github/saurabhhumane125/newssumm-summarization/blob/main/pegasus_multidoc_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PEGASUS Fine-Tuning for Multi-Document News Summarization

This notebook fine-tunes a PEGASUS model on the NewsSumm multi-document dataset
and evaluates performance using ROUGE metrics.


In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [None]:
!ls /content/drive/MyDrive


'Colab Notebooks'		     lv_0_20250115194547.mp4
'Contacts-2024-11-07 (3).vcf'	     new.engagement
'Contacts-2024-11-07 (4).vcf'	     newssumm
'Datereveal5 .mp4'		    'Presentation '
'Document from saurabh humane.pdf'   saurabh
 Edits


In [None]:
!ls /content/drive/MyDrive/newssumm


bart_multidoc_v1  multidoc_v1.csv


In [None]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/newssumm/multidoc_v1.csv")
print("Number of samples:", len(df))
df.head(1)


Number of samples: 306


Unnamed: 0,input_text,target_summary,num_docs,category,date
0,[DOC_1]\nNew Delhi: Bajaj Auto and TVS Motor C...,TVS Motor Company and Bajaj Auto have agreed t...,3,Automotive,2019-10-31


In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset


Dataset({
    features: ['input_text', 'target_summary', 'num_docs', 'category', 'date'],
    num_rows: 306
})

In [None]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

model_name = "google/pegasus-cnn_dailymail"

tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

print("PEGASUS loaded")




Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

PEGASUS loaded


In [None]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "NO GPU")


True
Tesla T4


In [None]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

device = "cuda"
model_name = "google/pegasus-cnn_dailymail"

tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

print("PEGASUS loaded on", device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

PEGASUS loaded on cuda


In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [None]:
!ls /content



drive  sample_data


In [None]:
!ls /content/drive


MyDrive


In [None]:
!find /content/drive -name "multidoc_v1.csv"


/content/drive/MyDrive/newssumm/multidoc_v1.csv


In [None]:
import pandas as pd

csv_path = "/content/drive/MyDrive/newssumm/multidoc_v1.csv"
df = pd.read_csv(csv_path)

print("Rows:", len(df))
df.head(1)


Rows: 306


Unnamed: 0,input_text,target_summary,num_docs,category,date
0,[DOC_1]\nNew Delhi: Bajaj Auto and TVS Motor C...,TVS Motor Company and Bajaj Auto have agreed t...,3,Automotive,2019-10-31


In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)

MAX_INPUT_LEN = 1024   # PEGASUS hard-safe limit
MAX_TARGET_LEN = 256


In [None]:
def preprocess(batch):
    inputs = tokenizer(
        batch["input_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_INPUT_LEN
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["target_summary"],
            truncation=True,
            padding="max_length",
            max_length=MAX_TARGET_LEN
        )

    inputs["labels"] = labels["input_ids"]
    return inputs


In [None]:
MAX_INPUT_LEN = 1024
MAX_TARGET_LEN = 256

def preprocess(batch):
    inputs = tokenizer(
        batch["input_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_INPUT_LEN
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["target_summary"],
            truncation=True,
            padding="max_length",
            max_length=MAX_TARGET_LEN
        )

    inputs["labels"] = labels["input_ids"]
    return inputs


In [None]:
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset.column_names
)


Map:   0%|          | 0/306 [00:00<?, ? examples/s]



In [None]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

train_ds = tokenized_dataset["train"]
val_ds = tokenized_dataset["test"]

print(len(train_ds), len(val_ds))


275 31


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./pegasus_multidoc",

    do_eval=True,          # ← THIS replaces evaluation_strategy
    save_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,

    num_train_epochs=4,
    fp16=True,

    logging_steps=20,
    save_total_limit=2,
    report_to="none",
)


In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)


In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(


In [None]:
trainer.train()


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Step,Training Loss
20,6.7573
40,6.1244
60,5.9223
80,5.9054
100,5.7876
120,5.7
140,5.8474




TrainOutput(global_step=140, training_loss=6.006358555385044, metrics={'train_runtime': 1851.7481, 'train_samples_per_second': 0.594, 'train_steps_per_second': 0.076, 'total_flos': 3178410855628800.0, 'train_loss': 6.006358555385044, 'epoch': 4.0})

In [None]:
save_path = "/content/drive/MyDrive/newssumm/pegasus_multidoc_v1"

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print("PEGASUS model saved to:", save_path)


PEGASUS model saved to: /content/drive/MyDrive/newssumm/pegasus_multidoc_v1


In [1]:
import torch

def generate_summary_pegasus(text, max_len=128):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="longest"
    ).to(device)

    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            max_length=max_len,
            num_beams=4,
            early_stopping=True
        )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [2]:
preds = []
refs = []

for i in range(len(df_eval)):
    preds.append(generate_summary_pegasus(df_eval.iloc[i]["input_text"]))
    refs.append(df_eval.iloc[i]["target_summary"])


NameError: name 'df_eval' is not defined

In [1]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/newssumm/multidoc_v1.csv")
print(len(df))
df.head(1)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/newssumm/multidoc_v1.csv'

In [2]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)


Mounted at /content/drive


In [1]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/newssumm/multidoc_v1.csv")
print(len(df))
df.head(1)


306


Unnamed: 0,input_text,target_summary,num_docs,category,date
0,[DOC_1]\nNew Delhi: Bajaj Auto and TVS Motor C...,TVS Motor Company and Bajaj Auto have agreed t...,3,Automotive,2019-10-31


In [2]:
from sklearn.model_selection import train_test_split

df_train, df_eval = train_test_split(
    df,
    test_size=0.2,
    random_state=42
)

print(len(df_train), len(df_eval))



244 62


In [3]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

model_path = "/content/drive/MyDrive/newssumm/pegasus_multidoc_v1"

tokenizer = PegasusTokenizer.from_pretrained(model_path)
model = PegasusForConditionalGeneration.from_pretrained(model_path).to(device)

print("PEGASUS loaded from Drive")


PEGASUS loaded from Drive


In [4]:
def generate_summary_pegasus(text):
    inputs = tokenizer(
        text,
        truncation=True,
        padding="longest",
        max_length=1024,
        return_tensors="pt"
    ).to(device)

    summary_ids = model.generate(
        inputs["input_ids"],
        num_beams=4,
        max_length=128,
        early_stopping=True
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [5]:
preds = []
refs = []

for i in range(20):   # only 20 samples
    preds.append(generate_summary_pegasus(df_eval.iloc[i]["input_text"]))
    refs.append(df_eval.iloc[i]["target_summary"])

len(preds), len(refs)


(20, 20)

In [9]:
!pip install -q evaluate rouge-score



  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [6]:
import evaluate

rouge = evaluate.load("rouge")

pegasus_results = rouge.compute(
    predictions=preds,
    references=refs
)

pegasus_results


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

{'rouge1': np.float64(0.23547148981219562),
 'rouge2': np.float64(0.06706526033982646),
 'rougeL': np.float64(0.15965317793402728),
 'rougeLsum': np.float64(0.16014554490615143)}