In [27]:
!nvidia-smi

Thu Sep 19 15:50:12 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0              31W /  70W |  12909MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [28]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [1]:
!pip install datasets



In [2]:
# load_metric is not there anymore in datasets package, its in evaluate package
!pip install evaluate



In [3]:
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

Found existing installation: transformers 4.44.2
Uninstalling transformers-4.44.2:
  Successfully uninstalled transformers-4.44.2
Found existing installation: accelerate 0.34.2
Uninstalling accelerate-0.34.2:
  Successfully uninstalled accelerate-0.34.2
Collecting transformers
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting accelerate
  Using cached accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Using cached transformers-4.44.2-py3-none-any.whl (9.5 MB)
Using cached accelerate-0.34.2-py3-none-any.whl (324 kB)
Installing collected packages: accelerate, transformers
Successfully installed accelerate-0.34.2 transformers-4.44.2


In [1]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
model = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
!p7zip -d /content/Samsum_dataset.7z


/usr/bin/p7zip: cannot read /content/Samsum_dataset.7z


In [5]:
from datasets import load_dataset

In [6]:
dataset_samsum = load_dataset("json", data_files={
    "train": "/content/Samsum_dataset/train/train.json",
    "test": "/content/Samsum_dataset/test/test.json",
    "validation": "/content/Samsum_dataset/validation/val.json"
})

In [7]:
dataset_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'summary', 'dialogue'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'summary', 'dialogue'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'summary', 'dialogue'],
        num_rows: 818
    })
})

In [8]:
splits = [len(dataset_samsum[split]) for split in dataset_samsum]
print(f"Splits are : {splits}")

print(f"Features : {dataset_samsum['train'].column_names}")

print(f" Checking 1 dialog : {dataset_samsum['test'][1]['dialogue']}")
print("Printing the summary --------")
print(dataset_samsum['test'][1]['summary'])

Splits are : [14732, 819, 818]
Features : ['id', 'summary', 'dialogue']
 Checking 1 dialog : Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)
Printing the summary --------
Eric and Rob are going to watch a stand-up on youtube.


In [9]:
def convert_examples_to_features(example_batch):
  input_encodings = tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)

  with tokenizer.as_target_tokenizer():
    target_encodings = tokenizer(example_batch['summary'], max_length=128, truncation=True)

  return {
      'input_ids': input_encodings['input_ids'],
      'attention_mask': input_encodings['attention_mask'],
      'labels': target_encodings['input_ids']
  }

In [10]:
dataset_samsum_pt = dataset_samsum.map( convert_examples_to_features, batched=True)

Map:   0%|          | 0/819 [00:00<?, ? examples/s]



In [11]:
dataset_samsum_pt['train']

Dataset({
    features: ['id', 'summary', 'dialogue', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})

In [12]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [13]:
from transformers import TrainingArguments, Trainer

trainer_args = TrainingArguments(
    output_dir='pegasus-samsum', num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10, evaluation_strategy='steps',
    eval_steps=500, save_steps=1e6, gradient_accumulation_steps=16
)



In [14]:
trainer = Trainer(args=trainer_args, model=model_pegasus, tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt['test'],
                  eval_dataset = dataset_samsum_pt['validation'])

In [15]:
trainer.train()

Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


TrainOutput(global_step=51, training_loss=3.0044142264945832, metrics={'train_runtime': 255.0282, 'train_samples_per_second': 3.211, 'train_steps_per_second': 0.2, 'total_flos': 313450454089728.0, 'train_loss': 3.0044142264945832, 'epoch': 0.9963369963369964})

In [16]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
  """split the dataset into smaller batches that we can process simultaneously
  Yield successive batch-sized chunks from list_of_elements."""
  for i in range(0, len(list_of_elements), batch_size):
    yield list_of_elements[i : i + batch_size]

In [17]:
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
  article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
  target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

  for article_batch, target_batch in tqdm(
      zip(article_batches, target_batches), total=len(article_batches)):

      inputs = tokenizer(article_batch, max_length=1024,  truncation=True,
                        padding="max_length", return_tensors="pt")

      summaries = model.generate(input_ids=inputs["input_ids"].to(device),attention_mask=inputs["attention_mask"].to(device),
                         length_penalty=0.8, num_beams=8, max_length=128)
      ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''

      decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
                for s in summaries]
      decoded_summaries = [d.replace("", " ") for d in decoded_summaries]

      metric.add_batch(predictions=decoded_summaries, references=target_batch)

  score = metric.compute()
  return score

In [21]:
import evaluate

In [22]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_metric = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [24]:
score = calculate_metric_on_test_ds(
    dataset_samsum['test'][0:10], rouge_metric, trainer.model, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary'
)

rouge_dict = dict((rn, score[rn]) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'pegasus'] )

100%|██████████| 5/5 [00:14<00:00,  2.98s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.021067,0.0,0.021022,0.021051


In [25]:
## Save the model
model_pegasus.save_pretrained("pegasus-samsum-model")

Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


In [26]:
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [27]:
## Loading Tokenizer
tokenizer = AutoTokenizer.from_pretrained("/content/tokenizer")

In [None]:
#Prediction

gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}

sample_text = dataset_samsum["test"][0]["dialogue"]
reference = dataset_samsum["test"][0]["summary"]

pipe = pipeline("summarization", model="pegasus-samsum-model",tokenizer=tokenizer)

print("Dialogue:")
print(sample_text)


print("\nReference Summary:")
print(reference)


print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])