In [None]:
# Install the notebook's dependencies with `%pip`, which targets the environment
# of the running kernel (a bare `!pip` can resolve to a different interpreter).
%pip install -q "transformers[sentencepiece]" datasets sacrebleu rouge_score py7zr
# Bring transformers and accelerate up to their latest releases in one step;
# this replaces the former upgrade -> uninstall -> reinstall sequence.
%pip install -q --upgrade transformers accelerate

In [None]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk, load_metric
import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
import torch
nltk.download("punkt")

from transformers import DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer

In [None]:
# Name of the pretrained checkpoint that both the tokenizer and the model load from.
model = "google/pegasus-cnn_dailymail"

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the tokenizer and the seq2seq model, then move the model onto `device`.
tokenizer = AutoTokenizer.from_pretrained(model)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model)
model_pegasus = model_pegasus.to(device)

In [None]:
from datasets import load_dataset

# Load the `samsum` dataset; `trust_remote_code=True` permits running the
# dataset's own loading script.
dataset_samsum = load_dataset(path="samsum", trust_remote_code=True)

# Print the loaded object for a quick look at what was returned.
print(dataset_samsum)


In [8]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of dialogue/summary pairs into model-ready features.

    Relies on the notebook-level ``tokenizer``.

    Parameters
    ----------
    example_batch : mapping
        Batch holding a ``'dialogue'`` entry (source texts) and a ``'summary'``
        entry (target texts), each a list of strings.

    Returns
    -------
    dict
        ``'input_ids'`` and ``'attention_mask'`` for the dialogues (truncated
        to 1024 tokens) and ``'labels'`` with the token ids of the summaries
        (truncated to 128 tokens).
    """
    input_encodings = tokenizer(
        example_batch['dialogue'], max_length=1024, truncation=True
    )

    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager. The summaries are
    # tokenized in their own call because they use a different `max_length`.
    target_encodings = tokenizer(
        text_target=example_batch['summary'], max_length=128, truncation=True
    )

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
    }

In [None]:
# Run the tokenization function over the dataset in batches.
dataset_samsum_pt = dataset_samsum.map(
    function=convert_examples_to_features,
    batched=True,
)


In [None]:
# Inspect the object returned by `.map(...)` after tokenization.
print(dataset_samsum_pt)


In [None]:
# Peek at the tokenized training split: the first record on its own, then the
# first five records (so the first record is printed twice).
train_data = dataset_samsum_pt["train"]

print(train_data[0])

print(*(train_data[row_idx] for row_idx in range(5)), sep="\n")


In [13]:
from transformers import DataCollatorForSeq2Seq
# Batch collator for the Trainer, built from the notebook's tokenizer and model.
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model_pegasus,
)

In [None]:
from transformers import TrainingArguments, Trainer
import os


# Configuration for a single fine-tuning epoch.
trainer_args = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy='steps',
    eval_steps=500,
    # Integer step count (was the float 1e6); far beyond the length of this
    # run, so no intermediate checkpoints are expected.
    save_steps=1_000_000,
    # 16 accumulated micro-batches of size 1 per optimizer step.
    gradient_accumulation_steps=16,
)

trainer = Trainer(
    model=model_pegasus,
    args=trainer_args,
    # `processing_class` supersedes the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_samsum_pt["train"],
    eval_dataset=dataset_samsum_pt["validation"],
)

trainer.train()

# Persist the fine-tuned weights and the tokenizer next to the notebook
# (a single-argument os.path.join returned the same string, so it was dropped).
model_pegasus.save_pretrained("pegasus-samsum-model")
tokenizer.save_pretrained("tokenizer")


In [None]:
def generate_batch_sized_chunks(self, list_of_elements, batch_size):
    """Lazily yield consecutive slices of ``list_of_elements``.

    Every slice holds ``batch_size`` items, except possibly the last one,
    which holds whatever remains. ``self`` is accepted but never used.
    """
    chunk_starts = range(0, len(list_of_elements), batch_size)
    yield from (
        list_of_elements[start : start + batch_size] for start in chunk_starts
    )



def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer,
                                batch_size=16, device="cuda" if torch.cuda.is_available() else "cpu",
                                column_text="article",
                                column_summary="highlights"):
    """Generate summaries for ``dataset`` batch by batch and score them.

    Parameters
    ----------
    dataset : mapping
        Indexed by column name; ``dataset[column_text]`` and
        ``dataset[column_summary]`` must be sliceable sequences of strings.
    metric : object
        Provides ``add_batch(predictions=..., references=...)`` and ``compute()``.
    model : object
        Provides ``generate(input_ids=..., attention_mask=..., ...)``.
    tokenizer : callable
        Encodes a list of texts and provides ``decode``.
    batch_size : int
        Number of examples summarized per ``generate`` call.
    device : str
        Device the encoded inputs are moved to before generation.
    column_text, column_summary : str
        Names of the source-text and reference-summary columns.

    Returns
    -------
    The value returned by ``metric.compute()`` once all batches were added.
    """
    article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                           padding="max_length", return_tensors="pt")

        # Original author's note: the length penalty is meant to keep the
        # model from generating sequences that are too long.
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True)
                             for s in summaries]

        # Replace the literal "<n>" newline marker with a space. The previous
        # code replaced the empty string, which inserts a space between every
        # character and corrupts the predictions handed to the metric.
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    score = metric.compute()
    return score

def evaluation(self):
    """Score the saved model with ROUGE on the first ten test examples.

    Loads the tokenizer, model and dataset from the paths held in
    ``self.config`` (``tokenizer_path``, ``model_path``, ``data_path``),
    delegates the scoring to ``self.calculate_metric_on_test_ds`` and returns
    a dict mapping each ROUGE variant name to ``score[name].mid.fmeasure``.
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    tok = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
    summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path)
    summarizer_model = summarizer_model.to(device)

    tokenized_dataset = load_from_disk(self.config.data_path)
    rouge_metric = load_metric('rouge')

    # Only the first ten test rows are scored, two at a time.
    test_subset = tokenized_dataset['test'][0:10]
    score = self.calculate_metric_on_test_ds(
        test_subset, rouge_metric, summarizer_model, tok,
        batch_size=2, column_text='dialogue', column_summary='summary',
    )

    return {
        variant: score[variant].mid.fmeasure
        for variant in ("rouge1", "rouge2", "rougeL", "rougeLsum")
    }

In [28]:
from transformers import AutoTokenizer
from transformers import pipeline

# Defaults kept from the original notebook; they point at one specific machine,
# so pass `model_path` / `tokenizer_path` explicitly when running elsewhere.
_DEFAULT_MODEL_PATH = r'C:\Users\musta\OneDrive\Desktop\Abstractive-text summarizer\pegasus-samsum-model'
_DEFAULT_TOKENIZER_PATH = r'C:\Users\musta\OneDrive\Desktop\Abstractive-text summarizer\tokenizer'


def predict(text, model_path=_DEFAULT_MODEL_PATH, tokenizer_path=_DEFAULT_TOKENIZER_PATH):
    """Summarize ``text`` with a saved summarization model.

    Parameters
    ----------
    text : str
        The passage to summarize.
    model_path : str
        Directory (or model identifier) handed to ``pipeline`` as the model.
    tokenizer_path : str
        Directory (or identifier) the tokenizer is loaded from.

    Returns
    -------
    The ``"summary_text"`` field of the first result returned by the pipeline.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Generation settings forwarded to the pipeline call.
    kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 50}

    summarizer = pipeline("summarization", model=model_path, tokenizer=tokenizer)

    return summarizer(text, **kwargs)[0]["summary_text"]

In [None]:
# Short passage used below as input to `predict`.
sample_text = """
In today's fast-paced world, the importance of time management cannot be overstated. 
As we juggle multiple responsibilities, effectively managing our time allows us to prioritize tasks, reduce stress, 
and increase productivity. Good time management involves planning, setting clear goals, and allocating appropriate time to each task. 
Many people struggle with procrastination, but overcoming this challenge requires discipline and focus. 
By being mindful of how we spend our time, we can achieve a better work-life balance and pursue our long-term goals with greater efficiency.
"""

# Summarize the sample passage and print the result under a heading.
summary = predict(sample_text)

print("Summary:", summary, sep="\n")