In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
!pip install -qU torch transformers datasets tqdm sentencepiece accelerate>=0.20.1 rouge rouge_score

In [11]:
import torch
from torch.utils.data import DataLoader, random_split
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_metric
from tqdm import tqdm

In [12]:
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-cnn_dailymail')
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-cnn_dailymail')

In [13]:
model.resize_token_embeddings(len(tokenizer))

Embedding(96103, 1024, padding_idx=0)

## Loading the Dataset

In [14]:
from torch.utils.data import Dataset
class NewsDataset(Dataset):
  def __init__(self, input_ids, attention_masks):
    super().__init__()
    self.input_ids = input_ids
    self.attention_masks = attention_masks
  def __len__(self):
    return len(self.input_ids)
  def __getitem__(self, index):
    return self.input_ids[index], self.attention_masks[index]

In [15]:
path = '/content/drive/MyDrive/NewsNebula/my_dataset.pt'
dataset = torch.load(path)

## Train/Test Split data

In [16]:
train_size = int(0.8 * len(dataset))

train_data, val_data = random_split(dataset, [train_size, len(dataset) - train_size])

In [56]:
len(train_data[12][0])

500

In [25]:
ans = tokenizer.batch_decode(val_data[1][0], attention_mask=val_data[1][1])
' '.join(ans)

"senior culture reporter, huffpostharry styles may be in the midst of a hollywood publicity tour and a grueling concert residency, but he still found time to honor his homeland’s late monarch, queen elizabeth ii, with an emotional tribute. the british pop star paused his concert at new york’s madison square garden thursday to acknowledge the queen, who died earlier that day at age 96. “there’s some very sad news today: the passing of queen elizabeth ii,” styles told the crowd, as seen in footage shared by iheart radio. “please join me in a round of applause for 70 years of service.”a post shared by iheartradio (@iheartradio)styles, who was born in redditch, england, has dominated global headlines this week. in addition to his sold-out “harry’s house” concert residency, he’s set to appear on the big screen alongside chris pine and florence pugh in “don’t worry darling,” which hits theaters later this month. “don’t worry darling” premiered to major fanfare at the 2022 venice film festiva

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to('device')

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_no

In [27]:
# Here I will pass the output directory where
# the model predictions and checkpoints will be stored,
# batch sizes for the training and validation steps,
# and warmup_steps to gradually increase the learning rate


torch.backends.cudnn.enabled = True
learning_rate = 5e-5



training_args = TrainingArguments(output_dir=f'./results_{learning_rate}',
                                  num_train_epochs=10,
                                  logging_steps=1000,
                                  save_steps=1000,
                                  evaluation_strategy='steps',
                                  eval_steps=1000,
                                  per_device_train_batch_size=1,
                                  per_device_eval_batch_size=1,
                                  warmup_steps=100,
                                  learning_rate=learning_rate,
                                  weight_decay=0.01,
                                  gradient_accumulation_steps=500,
                                  logging_dir=f'./logs_{learning_rate}')

trainer = Trainer(model=model, args=training_args,
                  train_dataset=train_data,
                  eval_dataset=val_data,
                  # This custom collate function is necessary
                  # to built batches of data
                  data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
              'attention_mask': torch.stack([f[1] for f in data]),
              'labels': torch.stack([f[0] for f in data])})

# Start training process!
print(f"Training result for learning rate: {learning_rate}")
trainer.train()
print("\n\n")

Training result for learning rate: 5e-05




Step,Training Loss,Validation Loss







## Evaluate Model : Rougue Metric

In [59]:
import nltk
nltk.download('punkt')
from rouge import Rouge
import numpy as np
import os
from nltk import word_tokenize
def rougue_score(input_ids, attention_mask):
  text = tokenizer.batch_decode([input_ids], attention_mask=[attention_mask], skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
  words = word_tokenize(text)
  length_of_text = len(words)
  summary_ids = model.generate(torch.tensor([input_ids]).to(device), min_length=int(0.20*length_of_text), max_length=int(0.5*length_of_text))
  summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
  print(f"Summary -> {summary}")
  rouge = Rouge()
  score = rouge.get_scores(summary, text)
  return(score[0]['rouge-l']['f'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [60]:
training_rouge_score = []
for item in train_data:
  input_ids = item[0].tolist()
  attention_mask = item[1].tolist()
  r_score = rougue_score(input_ids, attention_mask)
  training_rouge_score.append(r_score)

Summary -> Major league baseball players association plans to join the afl-cio labor federation.<n>The afl-cio includes 57 other unions representing more than 12 million workers.<n>The baseball players union joins the federation at a time when it’s trying to organize workers.<n>The mlbpa hopes to “strengthen our player fraternity” by organizing minor leaguers and improving their working conditions.<n>The potential bargaining unit in the minor leagues would include more than 5,000 players, a large organizing effort for the mlbpa.
Summary -> Class action lawsuit says the city has violated the americans with disabilities act by allowing homeless people’s tents to block city sidewalks.<n>plaintiff steve Jackson, 47, is legally blind and uses a cane to walk. he said tents prevent him from navigating the sidewalk and accessing bus stops.<n>Plaintiffs include nine people with disabilities and a caretaker.<n>Suit seeks to require the city to clear all sidewalks of tent encampments and debris.


In [61]:
avg_training_score = np.mean(training_rouge_score)
avg_training_score

0.4443830344511001

In [62]:
testing_rouge_score = []
for item in val_data:
  input_ids = item[0].tolist()
  attention_mask = item[1].tolist()
  r_score = rougue_score(input_ids, attention_mask)
  testing_rouge_score.append(r_score)

Summary -> The Atlantic hurricane season’s sixth named storm, was predicted to bring 5 to 10 inches (13 to 25 centimeters) of rain in eastern and southern puerto rico, with as much as 16 inches (41 centimeters) in isolated spots.<n>Life-threatening surf also was possible from fiona’s winds, forecasters said.<n>Authorities in the eastern caribbean islands canceled school and prohibited people from practicing aquatic sports as fiona battered the region.in the caribbean island of guadeloupe, authorities said they recorded wind gusts of up to 74 mph (120 kph), which would be considered a category 1 hurricane. they also said 9 inches (23 centimeters) of rain fell in three hours in the gros morne area.fiona is the atlantic hurricane season’s sixth named storm, was predicted to bring 5 to 10 inches (13 to 25 centimeters) of rain in eastern and southern puerto rico, with as much as 16 inches (41 centimeters) in isolated spots.
Summary -> British pop star paused his concert at new york’s madiso

In [63]:
avg_testing_score = np.mean(testing_rouge_score)
avg_testing_score

0.4008727803608208

In [64]:
path = '/content/drive/MyDrive/NewsNebula/model'
trainer.save_model(path)
tokenizer.save_pretrained(path)

('/content/drive/MyDrive/NewsNebula/model/tokenizer_config.json',
 '/content/drive/MyDrive/NewsNebula/model/special_tokens_map.json',
 '/content/drive/MyDrive/NewsNebula/model/spiece.model',
 '/content/drive/MyDrive/NewsNebula/model/added_tokens.json')