In [1]:
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim import AdamW

import numpy as np

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
import pytorch_lightning as pl

from transformers import AutoTokenizer
from transformers import GPT2Tokenizer
from transformers import get_linear_schedule_with_warmup
from transformers import GPT2LMHeadModel
from datasets import load_dataset, Dataset, DatasetDict


## Data

In [2]:
class XSumPreprocessor:
    """Tokenise XSum document/summary batches into model inputs and loss labels.

    Args:
        tokenizer: Callable invoked as ``tokenizer(list_of_str, max_length=...,
            padding="max_length", truncation=True)``. Its result must expose
            ``input_ids`` and ``attention_mask`` by key and accept item
            assignment (a Hugging Face ``BatchEncoding`` does).
        max_input_length: Length every tokenised document is padded/truncated to.
        max_target_length: Length every tokenised summary is padded/truncated to.
        prefix: String prepended verbatim to each document. No separator is
            inserted, so include trailing punctuation/whitespace in the prefix
            itself if one is wanted.
    """

    # Label value that the cross-entropy loss skips.
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, max_input_length, max_target_length, prefix='summarize'):
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        self.prefix = prefix

    def preprocess(self, examples):
        """Encode one batch of examples.

        Args:
            examples: Mapping with a ``'document'`` list and a ``'summary'`` list.

        Returns:
            The tokenizer output for the prefixed documents, with an added
            ``"labels"`` entry: the summary token ids where every padding
            position is replaced by ``IGNORE_INDEX``.
        """
        # encode the document/summary pairs
        texts = examples['document']
        summaries = examples['summary']

        inputs = [self.prefix + text for text in texts]
        model_inputs = self.tokenizer(inputs, max_length=self.max_input_length, padding="max_length", truncation=True)

        # encode the summaries
        targets = self.tokenizer(summaries, max_length=self.max_target_length, padding="max_length", truncation=True)

        # Padding is located through the attention mask instead of a hard-coded
        # token id: the pad id depends on the tokenizer (it is not 0 when the
        # pad token is aliased to another token), and a mask lookup never hides
        # a genuine token that happens to share the pad id.
        model_inputs["labels"] = [
            [token_id if is_real else self.IGNORE_INDEX for token_id, is_real in zip(ids, mask)]
            for ids, mask in zip(targets["input_ids"], targets["attention_mask"])
        ]

        return model_inputs

# Percentage of every split to load; keeps experiments quick.
use_percent = 1

# Load the same leading slice of each XSum split and collect them in one DatasetDict.
dataset = DatasetDict({
    split_name: load_dataset("xsum", split=f"{split_name}[:{use_percent}%]")
    for split_name in ("train", "validation", "test")
})
dataset_train = dataset["train"]
dataset_val = dataset["validation"]
dataset_test = dataset["test"]
# dataset = load_dataset("xsum")

# Quick look at the first training example and the size of the training slice.
first_train_example = dataset["train"][0]
print(first_train_example)
print(len(dataset["train"]))

  

Found cached dataset xsum (/home/studio-lab-user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)
Found cached dataset xsum (/home/studio-lab-user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)
Found cached dataset xsum (/home/studio-lab-user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)


2040


In [3]:
# Tokenisation settings shared by documents and summaries.
max_input_length = 512
max_target_length = 512
prefix = "summarize"
model_name = "gpt2"

# GPT-2 ships without a padding token, so the end-of-sequence token is reused for padding.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Build the preprocessor and apply it batch-wise to every split.
preprocessor = XSumPreprocessor(tokenizer, max_input_length, max_target_length, prefix)
processor = preprocessor.preprocess
dataset = dataset.map(processor, batched=True)

Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71/cache-399a9bff0dfdd941.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71/cache-7f0779d27b28a014.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71/cache-0810c884188984ff.arrow


## Model

In [4]:
class GPT2PreTrained(pl.LightningModule):
    """Lightning wrapper around the pretrained ``gpt2`` language-model head.

    Dataloaders are injected with the ``set_*_dataloader`` methods and handed
    back to the trainer through the ``*_dataloader`` hooks. The training
    dataloader must be set before fitting because the learning-rate schedule
    length is derived from it.

    Args:
        lr: Learning rate passed to AdamW.
        max_epochs: Number of epochs used to size the linear LR schedule;
            should match the trainer's ``max_epochs``.
        warmup_steps: Number of warm-up steps of the linear schedule.
    """

    def __init__(self, lr=5e-5, max_epochs=10, warmup_steps=1000):
        super().__init__()
        self.model = GPT2LMHeadModel.from_pretrained("gpt2")
        self.save_hyperparameters()
        self.val_dataloader_ = None
        self.test_dataloader_ = None
        self.train_dataloader_ = None

    def forward(self, input_ids, labels=None, attention_mask=None):
        """Run the wrapped model.

        ``labels`` and ``attention_mask`` are optional so the module can also
        be called for inference with ``input_ids`` alone; both are forwarded
        unchanged to the wrapped model.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs

    def common_step(self, batch, batch_idx):
        """Forward a batch (a mapping of keyword arguments for ``forward``) and return its loss."""
        outputs = self(**batch)
        loss = outputs.loss

        return loss

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self.common_step(batch, batch_idx)
        # logged under "training_loss" with Lightning's default settings for a training step
        self.log("training_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and log it with epoch-level aggregation."""
        loss = self.common_step(batch, batch_idx)
        self.log("validation_loss", loss, on_epoch=True)

        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss for one batch."""
        loss = self.common_step(batch, batch_idx)
        # Without this call a test run would compute the loss but report nothing.
        self.log("test_loss", loss)

        return loss

    def configure_optimizers(self):
        """Create AdamW plus a per-step linear warm-up/decay schedule.

        Raises:
            RuntimeError: If no training dataloader was set, since the
                schedule length cannot be computed.
        """
        if self.train_dataloader_ is None:
            raise RuntimeError(
                "set_train_dataloader() must be called before fitting: the LR "
                "schedule length is derived from the training dataloader."
            )

        # create optimizer
        optimizer = AdamW(self.model.parameters(), lr=self.hparams.lr)
        # create learning rate scheduler
        training_steps = self.hparams.max_epochs * len(self.train_dataloader_)
        lr_scheduler = {'scheduler': get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=self.hparams.warmup_steps,
                                                    num_training_steps=training_steps),
                        'name': 'learning_rate',
                        'interval':'step',
                        'frequency': 1}

        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

    def set_train_dataloader(self, train_dataloader):
        """Store the dataloader returned by ``train_dataloader``."""
        self.train_dataloader_ = train_dataloader

    def set_valid_dataloader(self, valid_dataloader):
        """Store the dataloader returned by ``val_dataloader``."""
        self.val_dataloader_ = valid_dataloader

    def set_test_dataloader(self, test_dataloader):
        """Store the dataloader returned by ``test_dataloader``."""
        self.test_dataloader_ = test_dataloader

    def train_dataloader(self):
        """Return the injected training dataloader (``None`` if never set)."""
        return self.train_dataloader_

    def val_dataloader(self):
        """Return the injected validation dataloader (``None`` if never set)."""
        return self.val_dataloader_

    def test_dataloader(self):
        """Return the injected test dataloader (``None`` if never set)."""
        return self.test_dataloader_


## Training

In [5]:
# hyperparameters
batch_size = 4
learning_rate = 5e-5
max_epochs = 1
warmup_fraction = 0.1  # share of all optimisation steps spent warming the LR up
seed = 42

# The training loader shuffles, so seed the RNGs to make runs repeatable.
pl.seed_everything(seed)

# Expose only the tensors the model consumes, as torch tensors.
dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])

num_workers = 4
train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=batch_size, num_workers=num_workers)
valid_dataloader = DataLoader(dataset['validation'], batch_size=batch_size, num_workers=num_workers)
test_dataloader = DataLoader(dataset['test'], batch_size=batch_size, num_workers=num_workers)

early_stop_callback = EarlyStopping(
    monitor='validation_loss',
    patience=3,
    strict=False,
    verbose=False,
    mode='min'
)
lr_monitor = LearningRateMonitor()
checkpoint_callback = ModelCheckpoint(dirpath='./saved/checkpoints/', monitor='validation_loss', mode='min')
accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
trainer = Trainer(accelerator=accelerator,
                  callbacks=[early_stop_callback, lr_monitor, checkpoint_callback], max_epochs=max_epochs)

# The model sizes its LR schedule from max_epochs and warmup_steps, so both are
# derived from the actual run here. Relying on the class defaults (10 epochs,
# 1000 warm-up steps) would plan a schedule far longer than this run, leaving
# the learning rate stuck in warm-up for the whole training.
total_training_steps = max_epochs * len(train_dataloader)
warmup_steps = max(1, int(warmup_fraction * total_training_steps))

model = GPT2PreTrained(lr=learning_rate, max_epochs=max_epochs, warmup_steps=warmup_steps)
model.set_train_dataloader(train_dataloader)
model.set_valid_dataloader(valid_dataloader)
model.set_test_dataloader(test_dataloader)

trainer.fit(model)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type            | Params
------------------------------------------
0 | model | GPT2LMHeadModel | 124 M 
------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
497.759   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [6]:
# Save only the wrapped Hugging Face model (model.model), not the Lightning module around it.
model.model.save_pretrained('./saved/models/')

In [7]:
# Load whatever was saved under ./saved/models/ as a plain GPT2LMHeadModel.
gpt2_summarizer = GPT2LMHeadModel.from_pretrained('./saved/models/')

In [12]:
# Run the fine-tuned model on one preprocessed test example.
text = dataset['test'][0]
encoded_input = text['input_ids']
label = text['labels']

# Each example is a single sequence; add a leading batch dimension before the call.
model.eval()
with torch.no_grad():
    output = model(
        input_ids=encoded_input.unsqueeze(0),
        attention_mask=text['attention_mask'].unsqueeze(0),
        labels=label.unsqueeze(0),
    )