In [1]:
! pip install transformers

[0m

In [2]:
! pip install datasets transformers[sentencepiece]

[0m

In [1]:
# ====== Import Required Libraries ======
import os

# Important: prevent TensorFlow crash
os.environ["USE_TF"] = "0"

from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq

from datasets import load_dataset, DatasetDict



  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# 1. Load the CSV
dataset = load_dataset('csv', data_files=r'D:\Datascience\DL\DL Projects\End_to_End_Text_Summarizer\research\artifacts\data_ingestion\samsum-test.csv')

# 2. Split into train and test
dataset_split = dataset['train'].train_test_split(test_size=0.1, seed=42)

dataset_split = DatasetDict({
    "train": dataset_split["train"],
    "test": dataset_split["test"]
})

In [3]:


# 3. Load tokenizer and model
model_name = "google/pegasus-cnn_dailymail"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:

# 4. Preprocessing function
def preprocess_function(examples):
    inputs = examples["dialogue"]  # ðŸ‘ˆ make sure your CSV has 'dialogue' and 'summary' columns
    targets = examples["summary"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [5]:

# 5. Tokenize datasets
tokenized_datasets = dataset_split.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_split["train"].column_names
)

In [None]:

# 6. Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 7. Training arguments
training_args = TrainingArguments(
   output_dir="./pegasus_samsum_model",
    # evaluation_strategy="steps",  # This is correct for version 4.5
    eval_steps=500,  # Evaluate every 500 steps
    save_steps=1000,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    fp16=True,  # <--- this saves 50% memory
    dataloader_pin_memory=False,  # <--- small trick to reduce GPU RAM
    gradient_accumulation_steps=8,
    report_to="none"
)

# 8. Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 9. Start training
trainer.train()


In [None]:
#10. Evaluate the model
metrics = trainer.evaluate()
print("Evaluation metrics:", metrics)

# Save the model and tokenizer
model.save_pretrained("./pegasus_samsum_model/final_model")
tokenizer.save_pretrained("./pegasus_samsum_model/final_model")


# prediction 

In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_path = "./pegasus_samsum_model/final_model"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [13]:
# 2. Create a prediction function
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="longest")
    summary_ids = model.generate(**inputs, max_length=100, num_beams=4, length_penalty=2.0, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [26]:
# 3. Predict for a single text

text = "Gandhi was not just a political leader; he was also a social reformer. He fought against untouchability, promoted village industries and self-reliance, and emphasized the importance of simplicity and humility. His belief in the power of individual action and the importance of moral conduct made him a powerful force for social change"

summary = generate_summary(text)
print("Summary:", summary)


Summary: Gandhi's belief in the power of individual action and the importance of moral conduct made him a powerful force for social change.<n>He fought against untouchability, promoted village industries and self-reliance.


In [None]:
#4. Predict for a full dataset (like a CSV file)

import pandas as pd
from tqdm import tqdm

# Load your new dataset
df = pd.read_csv('new_data.csv')

# Create a new column for summaries
summaries = []

for text in tqdm(df['text']):
    summary = generate_summary(text)
    summaries.append(summary)

df['generated_summary'] = summaries

# Save to a new CSV
df.to_csv('new_data_with_summaries.csv', index=False)
