In [None]:
!nvidia-smi

In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [None]:
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

In [None]:
!pip install evaluate

In [None]:
import evaluate
from transformers import pipeline,set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
import pandas as pd
# from evaluate import load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [None]:
model_ckpt = "google/pegasus-cnn_dailymail"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
model_progress = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [None]:
dataset_samsum = load_dataset("knkarthick/samsum")

In [None]:
dataset_samsum

In [None]:
dataset_samsum['train']['dialogue'][0]

"Amanda: I baked  cookies. Do you want some?\nJerry: Sure!\nAmanda: I'll bring you tomorrow :-)"

In [None]:
dataset_samsum['train']['summary'][0]


'Amanda baked cookies and will bring Jerry some tomorrow.'

In [None]:
# Quick look at the corpus: number of rows in every split, the column names,
# and one test record (index 1) together with its reference summary.
split_lengths = list(map(len, dataset_samsum.values()))
test_example = dataset_samsum["test"][1]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")

print("\nDialogue: ")
print(test_example["dialogue"])

print("\nSummary: ")
print(test_example["summary"])

In [None]:
# Step 1: Filter out non-string or empty dialogues/summaries
def filter_bad_examples(example):
    """Return True only for records whose dialogue and summary are usable text.

    A record passes when both the 'dialogue' and the 'summary' field are
    ``str`` instances that still contain characters after stripping
    surrounding whitespace; anything else is rejected.
    """
    dialogue = example['dialogue']
    if not isinstance(dialogue, str):
        return False

    summary = example['summary']
    if not isinstance(summary, str):
        return False

    # Whitespace-only fields count as empty.
    if len(dialogue.strip()) == 0:
        return False
    return len(summary.strip()) > 0

dataset_samsum_clean = dataset_samsum.filter(filter_bad_examples)

# Step 2: Define tokenization function (no need to filter inside)
def convert_examples_to_features(example_batch):
    """Tokenize a batch of dialogues and their summaries for seq2seq training.

    Sequences are truncated here but intentionally NOT padded. Padding to a
    fixed length at this stage would store real pad token ids inside
    ``labels``, so the loss would also be computed on padding positions, and
    every dialogue would be blown up to 1024 tokens regardless of its length.
    Padding is instead left to the batch collator (this notebook hands a
    ``DataCollatorForSeq2Seq`` to the Trainer), which pads each batch to its
    longest member and fills label padding with the ignore index.

    Args:
        example_batch: Batch mapping with a 'dialogue' and a 'summary' entry,
            each a list of strings (as produced by ``map(..., batched=True)``).

    Returns:
        The tokenizer output for the dialogues with an extra 'labels' entry
        holding the token ids of the summaries.
    """
    # Encoder side: dialogues, capped at 1024 tokens.
    model_inputs = tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)

    # Decoder side: summaries become the labels, capped at 128 tokens.
    labels = tokenizer(text_target=example_batch['summary'], max_length=128, truncation=True)
    model_inputs['labels'] = labels['input_ids']

    return model_inputs

# Step 3: Map over the cleaned dataset
dataset_samsum_pt = dataset_samsum_clean.map(convert_examples_to_features, batched=True)


In [None]:
dataset_samsum_pt['train']['dialogue'][0]

"Amanda: I baked  cookies. Do you want some?\nJerry: Sure!\nAmanda: I'll bring you tomorrow :-)"

In [None]:
dataset_samsum_pt['train']

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14731
})

In [None]:
dataset_samsum_pt['train']['attention_mask'][1]

In [None]:
# Training

from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_progress)

In [None]:
import transformers
print(transformers.__file__)

/usr/local/lib/python3.11/dist-packages/transformers/__init__.py


In [None]:
from transformers import TrainingArguments, Trainer
import transformers
# Fine-tuning hyper-parameters. One sample per device step combined with 16
# accumulation steps gives 16 samples per optimizer update on each device.
training_args = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    eval_strategy='steps',
    eval_steps=500,
    save_steps=1_000_000,
)

In [None]:
# Fine-tune on the training split. The test split has to stay unseen because
# the ROUGE evaluation later in this notebook is computed on it; training on
# it would leak the evaluation data into the model. The schedule configured in
# training_args (warmup_steps=500, eval_steps=500) also needs a run longer
# than 500 optimizer steps to take effect.
# For a quick smoke run, pass dataset_samsum_pt['train'].select(range(N)).
trainer = Trainer(
    model=model_progress,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_samsum_pt['train'],
    eval_dataset=dataset_samsum_pt['validation'],
)

  trainer = Trainer (model = model_progress, args = training_args ,


In [None]:
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mriyanjamil220[0m ([33mriyanjamil220-rj[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss




TrainOutput(global_step=52, training_loss=10.502341490525465, metrics={'train_runtime': 950.4835, 'train_samples_per_second': 0.862, 'train_steps_per_second': 0.055, 'total_flos': 2366471355236352.0, 'train_loss': 10.502341490525465, 'epoch': 1.0})

In [None]:
# Evaluation

def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily cut a sequence into consecutive slices of ``batch_size`` items.

    Slices are yielded in order; the final one carries whatever remainder is
    left and may therefore be shorter than ``batch_size``.
    """
    chunk_starts = range(0, len(list_of_elements), batch_size)
    yield from (
        list_of_elements[start : start + batch_size] for start in chunk_starts
    )

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    """Generate summaries batch by batch and score them against references.

    Args:
        dataset: Object for which ``dataset[column_text]`` and
            ``dataset[column_summary]`` return sliceable sequences of source
            texts and reference summaries.
        metric: Metric object exposing ``add_batch(predictions=, references=)``
            and ``compute()``.
        model: Model whose ``generate`` method produces the summaries.
        tokenizer: Tokenizer used to encode the texts and decode the output.
        batch_size: Number of texts summarized per ``generate`` call.
        device: Device the input tensors are moved to before generation.
        column_text: Key of the source texts in ``dataset``.
        column_summary: Key of the reference summaries in ``dataset``.

    Returns:
        The result of ``metric.compute()`` after every batch has been added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        # Pad only up to the longest text of this batch; padding every input
        # to the full 1024 tokens just adds masked positions to beam search.
        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                           padding=True, return_tensors="pt")

        # length_penalty is meant to keep the model from generating
        # sequences that are too long.
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        # Decode the generated ids back to text.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True)
                             for s in summaries]

        # The model writes line breaks as the literal marker "<n>"; turn it
        # into a space. (Replacing the empty string here would insert a space
        # between every single character and wreck the ROUGE scores.)
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    # Finally compute and return the scores accumulated over all batches.
    score = metric.compute()
    return score


In [None]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_metric = evaluate.load('rouge')

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Score the fine-tuned model on the first ten test dialogues.
score = calculate_metric_on_test_ds(
    dataset_samsum['test'][0:10],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size=2,
    column_text='dialogue',
    column_summary='summary',
)

# Keep only the ROUGE variants listed in rouge_names, in that order.
rouge_dict = dict((rn, score[rn]) for rn in rouge_names)

pd.DataFrame(rouge_dict, index=['pegasus'])

100%|██████████| 5/5 [00:30<00:00,  6.01s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.018969,0.0,0.018464,0.018721


In [None]:
## Save the model
model_progress.save_pretrained("pegasus-samsum-model")


In [None]:
## Save the tokenizer

tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [None]:
# load

# Reload from the same relative directory that tokenizer.save_pretrained("tokenizer")
# wrote to, instead of an absolute path that only exists when the working
# directory happens to be /content.
tokenizer = AutoTokenizer.from_pretrained("tokenizer")


In [None]:
# Prediction
# Summarize the first test dialogue with the saved checkpoint and print it
# next to the human-written reference summary.
gen_kwargs = dict(length_penalty=0.8, num_beams=8, max_length=128)

first_test_example = dataset_samsum["test"][0]
sample_text = first_test_example["dialogue"]
reference = first_test_example["summary"]

pipe = pipeline(
    "summarization",
    model="pegasus-samsum-model",
    tokenizer=tokenizer,
)

print("Dialogue:")
print(sample_text)

print("\nReference Summary:")
print(reference)

print("\nModel Summary:")
generated = pipe(sample_text, **gen_kwargs)
print(generated[0]["summary_text"])

Device set to use cuda:0
Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Reference Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

Model Summary:
Amanda: Ask Larry Amanda: He called her last time we were at the park together .<n>Hannah: I'd rather you texted him .<n>Amanda: Just text him .
