In [3]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [6]:
# !pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [7]:
!pip install -U accelerate datasets transformers sacrebleu rouge_score py7zr -q

In [10]:
import torch
from transformers import pipeline, set_seed, AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Basic HF model infarence

In [None]:
# from transformers import AutoTokenizer, PegasusForConditionalGeneration
# model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
# tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")

# ARTICLE_TO_SUMMARIZE = "PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through Wednesday."\

# inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt', truncation=True)
# # summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
# summary_ids = model.generate(inputs['input_ids'])
# tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


# Fine Tuning

In [None]:
model_ckpt = 'google/pegasus-cnn_dailymail'

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [12]:
# !wget https://github.com/krishnaik06/datasets/raw/refs/heads/main/summarizer-data.zip
# !unzip summarizer-data.zip

--2025-02-01 17:52:18--  https://github.com/krishnaik06/datasets/raw/refs/heads/main/summarizer-data.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/krishnaik06/datasets/refs/heads/main/summarizer-data.zip [following]
--2025-02-01 17:52:18--  https://raw.githubusercontent.com/krishnaik06/datasets/refs/heads/main/summarizer-data.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7903594 (7.5M) [application/zip]
Saving to: ‘summarizer-data.zip’


2025-02-01 17:52:19 (244 MB/s) - ‘summarizer-data.zip’ saved [7903594/7903594]

Archive:  summarizer-data.zip
  inflating: samsum-test.csv         


In [13]:
dataset_samsum = load_from_disk('../dataset/samsum_dataset')
dataset_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [15]:
split_lengths = [len(dataset_samsum[split]) for split in dataset_samsum.keys()]

print(f"Train: {split_lengths[0]}, Validation: {split_lengths[1]}, Test: {split_lengths[2]}")
print(f"Feature Names: {dataset_samsum['train'].column_names}")

print(f"Dialogue: \n{dataset_samsum['train'][1]['dialogue']}")
print(f"Summary: \n{dataset_samsum['train'][1]['summary']}")

Train: 14732, Validation: 819, Test: 818
Feature Names: ['id', 'dialogue', 'summary']
Dialogue: 
Olivia: Who are you voting for in this election? 
Oliver: Liberals as always.
Olivia: Me too!!
Oliver: Great
Summary: 
Olivia and Olivier are voting for liberals in this election. 


### Expected input from for Seq2Seq model fine-tuning

```
{
    'input_ids' : [123, 456, 789, ...]  # token ids for the dialogue (input features)
    'attention_mask': [1, 1, 1, 1, ...] # attention mask for the input
    'labels': [321, 654, 987, ...]      # Token ids for the summary (tagets)
}
```

In [16]:
def conver_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['dialogue'], truncation=True, padding='max_length', max_length=1024)
    
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], truncation=True, padding='max_length', max_length=128)
        
    encodings = {
        'input_ids': input_encodings.input_ids,
        'attention_mask': input_encodings.attention_mask,
        'labels': target_encodings.input_ids
    }
    return encodings

In [17]:
dataset_samsum_pt = dataset_samsum.map(conver_examples_to_features, batched=True)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

NameError: name 'tokenizer' is not defined

In [None]:
dataset_samsum_pt['train']

In [None]:
## training the model

from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [None]:
# from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='pegasus_samsum',
    num_train_epochs=1,
    warmpup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps = 10,
    logging_dir='logs',
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=999,
    gradent_accumulation_steps=16,
)

In [None]:
trainer = Trainer(
    model=model_pegasus,
    args= training_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    # train_dataset=dataset_samsum_pt['train'],
    train_dataset=dataset_samsum_pt['test'],
    eval_dataset=dataset_samsum_pt['validation'],
)

In [None]:
trainer.train()

## Evaluation

In [1]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i:i+batch_size]
        
def calculate_metric_on_test_data(dataset, metric, model, tokenizers,
                                batch_size=16, device=device,column_text='article', column_summary="summary"):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    summary_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))
    
    for article_batch, summary_batch in tqdm(zip(article_batches, summary_batches), total=len(article_batches)):
        # inputs = tokenizers(article_batch, max_length=1024, return_tensors='pt', truncation=True, padding=True).to(device)
        # outputs = model.generate(**inputs)
        # predictions = tokenizers.batch_decode(outputs, skip_special_tokens=True)
        # metric.add_batch(predictions=predictions, references=summary_batch)
        
        inputs = tokenizer(article_batch, max_length=1024, truncation=True, padding=True, return_tensors='pt').to(device)
        
        summaries = model.generate(input_ids = inputs["input_ids"].to(device),
                                   attention_mask=inputs['attention_mask'].to(device),
                                   length_penalty=0.8, num_beams=9, max_length=128)
        """ parameter for length penalty to avoid the too long sequence to generate the model """
        
        decode_summaries = [tokenizer.decode(s, skip_special_tokens=True,clean_up_tokenization_spaces=True)
                            for s in summaries]
        decode_summaries = [d.replace('', ' ') for d in decode_summaries]
        metric.add_batch(predictions=decode_summaries, references=summary_batch)
        
    score = metric.compute()
    return score    

NameError: name 'device' is not defined

In [2]:
!pip install evaluate -q

In [None]:
import evaluate

rouge_metric = evaluate.load('rouge')
rouge_names = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']

In [None]:
score = calculate_metric_on_test_data(
    dataset_samsum['train'][0:20], 
    rouge_metric, model_pegasus, tokenizer, batch_size=16, device=device, column_text='dialogue', column_summary='summary')

rouge_dict = {rn: score[rn] for rn in rouge_names}
import pandas as pd
pd.DataFrame(rouge_dict, index=['pegasus'])

In [None]:
model_pegasus.save_pretrained('pegasus-samsum-finetuned')
tokenizer.save_pretrained('pegasus-samsum-finetuned')

In [None]:
tokenizer = AutoTokenizer.from_pretrained('pegasus-samsum-finetuned')

gen_kwargs = {'length_penalty': 0.8, 'num_beams': 9, 'max_length': 128}
sample_text = dataset_samsum['test'][0]['dialogue']
reference = dataset_samsum['test'][0]['summary']

pipe = pipeline('summarization', model='./pegasus-samsum-finetuned', tokenizer=tokenizer )
print("Dialogue: ", sample_text)
print("Summary: ", reference)

out = pipe(sample_text, **gen_kwargs)[0]['summary_text']
print("Model Summary: ", out\)