<a href="https://colab.research.google.com/github/ravinnd3/Generative-AI-Full-Course/blob/main/Text_summarisation_using_HuggingFace_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run NVIDIA's `nvidia-smi` tool to report the GPU(s) visible to this runtime.
!nvidia-smi

In [None]:
# Quietly (-q) install the notebook's dependencies: transformers with the
# sentencepiece extra, datasets, sacrebleu, rouge_score and py7zr.
!pip install -q transformers[sentencepiece] datasets sacrebleu rouge_score py7zr

In [None]:
# Reinstall transformers and accelerate from scratch and add evaluate.
# NOTE(review): the upgrade performed by the first command is undone by the
# uninstall in the second one; a single
# `pip install --upgrade transformers accelerate evaluate` looks equivalent
# -- confirm before simplifying. Versions are not pinned.
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate evaluate

In [None]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
# NOTE: `evaluate` has no `load_metric`; metrics are loaded with `evaluate.load(...)`.
import torch
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

from tqdm import tqdm
import torch

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Identifier of the pretrained checkpoint; the tokenizer here and the model in
# the next cell are both loaded from it.
model_ckpt = "google/pegasus-cnn_dailymail"

# Load the tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Load the seq2seq model for the same checkpoint and move it to `device`.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [None]:
# Load the "knkarthick/samsum" dataset and display the returned object.
dataset_samsum = load_dataset("knkarthick/samsum")
dataset_samsum

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of dialogue/summary pairs into model-ready features.

    Args:
        example_batch: Mapping with a "dialogue" entry (source texts) and a
            "summary" entry (reference summaries), one item per example.

    Returns:
        dict with "input_ids" and "attention_mask" taken from the tokenized
        dialogues, and "labels" holding the token ids of the tokenized
        summaries.
    """
    # Source side: dialogues are truncated to at most 1024 tokens.
    input_encodings = tokenizer(
        example_batch["dialogue"], max_length=1024, truncation=True
    )

    # Target side: `text_target=` is the supported replacement for the
    # deprecated `tokenizer.as_target_tokenizer()` context manager and
    # tokenizes the summaries as labels (at most 128 tokens).
    target_encodings = tokenizer(
        text_target=example_batch["summary"], max_length=128, truncation=True
    )

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"],
    }

In [None]:
# Tokenize the dataset by mapping convert_examples_to_features over it in batches.
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched=True)

In [None]:
# Display the tokenized training split.
dataset_samsum_pt['train']

In [None]:
# Display the "input_ids" of the first example in the training split.
dataset_samsum_pt['train']['input_ids'][0]


In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator built from the tokenizer and the model; it is passed to the Trainer
# as `data_collator` further down.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)


In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration for a single fine-tuning epoch.
trainer_args = TrainingArguments(
    output_dir="pegasus-samsum", num_train_epochs=1, warmup_steps=500,
    # One example per step on each device; together with
    # gradient_accumulation_steps=16 this gives 16 examples per optimizer
    # update on each device.
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    do_eval=True,
    # `do_eval=True` alone does not schedule evaluation inside
    # `Trainer.train()`; the schedule comes from the evaluation strategy.
    # The old `evaluation_strategy` keyword (which raised the TypeError) was
    # renamed to `eval_strategy`, so periodic evaluation is enabled under the
    # new name.
    eval_strategy="steps",
    eval_steps=500,
    # Must be an int. The value is very large, presumably so that no
    # intermediate checkpoint is written during this run -- confirm.
    save_steps=1000000,
    gradient_accumulation_steps=16,
)

In [None]:
# Assemble the Trainer from the model, the training arguments, the tokenizer
# (passed as `processing_class`), the seq2seq collator, and the tokenized
# "train" / "validation" splits.
trainer = Trainer(model=model_pegasus, args=trainer_args,
                  processing_class=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["train"],
                  eval_dataset=dataset_samsum_pt["validation"])

In [None]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
#Evaluation

def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily cut ``list_of_elements`` into consecutive slices.

    Every yielded slice holds ``batch_size`` items, except possibly the last
    one, which holds whatever remains. The sequence must support ``len()``
    and slicing.
    """
    total = len(list_of_elements)
    yield from (
        list_of_elements[start : start + batch_size]
        for start in range(0, total, batch_size)
    )

def calculate_meric_on_train_eval_data(dataset, metric, model, tokenizer,
                                       batch_size=16, device=device,
                                       column_text="article",
                                       column_summary="highlights"):
    """Generate summaries for ``dataset`` in batches and score them with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` must be equally long sequences.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Model exposing ``generate(...)``.
        tokenizer: Tokenizer used to encode the inputs and decode the outputs.
        batch_size: Number of examples summarized per ``generate`` call.
        device: Device the input tensors are moved to before generation.
        column_text: Name of the column holding the source texts. The default
            does not match the "dialogue" column used elsewhere in this
            notebook, so pass it explicitly for that data.
        column_summary: Name of the column holding the reference summaries
            (the notebook's data uses "summary").

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                           padding="max_length", return_tensors="pt")

        # The tokenizer stores the mask under "attention_mask" (singular);
        # the former "attention_masks" lookup raised a KeyError.
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        # Decode the generated ids, then blank out leftover marker tokens.
        # "<n>" is the newline marker Pegasus emits between sentences; the
        # previous code replaced "?" instead, which stripped real question
        # marks from the predictions and left the markers in place.
        decoded_summaries = tokenizer.batch_decode(summaries, skip_special_tokens=True)
        decoded_summaries = [
            d.replace("<pad>", " ").replace("<n>", " ") for d in decoded_summaries
        ]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    return metric.compute()

In [None]:
import evaluate

# ROUGE variants reported for the summaries.
rouge_names = ["rouge1","rouge2","rougeL","rougeLsum"]

# `load_metric` was never imported in this notebook and is no longer provided
# by `datasets`; the `evaluate` package (installed in the setup cells) loads
# the metric instead.
rouge_metric = evaluate.load("rouge")