In [3]:
# Load IPython's autoreload extension and select mode 2 so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import sys
# Put the parent directory first on the import path.
# NOTE(review): presumably this is what makes the `mllib` and `runs` imports
# in the next cell resolvable — confirm the notebook's working directory.
sys.path.insert(0,'../')

In [5]:
from mllib.new_bert import *
from runs.callbacks import *
from runs.data_utils import *

# BERT example run

In [6]:
import string
import random
from sklearn.model_selection import train_test_split


def random_examples(n_examples, n_largest):
    """Yield random (source, target) string pairs for the string-reversal task.

    Each source is a string of lowercase ASCII letters whose length is drawn
    uniformly from ``1..n_largest``; the target is ``':'`` followed by the
    source reversed.

    Args:
        n_examples: number of pairs to generate.
        n_largest: maximum source length (inclusive); must be at least 1.

    Yields:
        Tuples ``(x, y)`` where ``y == ':' + x[::-1]``.
    """
    letters = string.ascii_lowercase
    for _ in range(n_examples):
        length = random.choice(range(1, n_largest + 1))
        # Use a throwaway name so the inner loop no longer shadows the outer one.
        x = ''.join(random.choice(letters) for _ in range(length))
        y = ':' + x[::-1]
        yield x, y
        
# Seed Python's global RNG so the generated strings are the same on every
# Restart & Run All. train_test_split already pins its own random_state, but
# without this seed the data it splits would still differ from run to run.
random.seed(42)

# 10,000 [source, target] pairs with source lengths between 1 and 10.
data = [[x, y] for x, y in random_examples(10000, 10)]

# Hold out 33% of the pairs as the test split.
raw_data = {}
raw_data['train'], raw_data['test'] = train_test_split(data, test_size=0.33, random_state=42)

In [7]:
# Character-level tokenization: calling `list` on a string splits it into
# its individual characters.
# NOTE(review): the pair is presumably (source tokenizer, target tokenizer) — confirm
# against ReversedString.
# Alternative kept for reference (split tokenizer):
#   tokenizer = get_tokenizer(tokenizer=None), get_tokenizer(tokenizer=None)
tokenizer = (list, list)

ds = ReversedString(
    data=raw_data,
    tokenizer=tokenizer,
    split_=('train', 'test'),
)

100%|██████████| 6700/6700 [00:00<00:00, 318578.81lines/s]
100%|██████████| 6700/6700 [00:00<00:00, 314482.44lines/s]


# PyTorch Trainer

In [18]:
import torch
from torch.optim.lr_scheduler import StepLR, ExponentialLR
from torch.optim.sgd import SGD
from torch.utils.data import DataLoader, random_split
from warmup_scheduler import GradualWarmupScheduler

class LitTransformer(pl.LightningModule):
    """LightningModule that trains an encoder-decoder on the string-reversal task.

    The module generates its own data (``prepare_data``), builds the datasets,
    vocabulary, model and loss (``setup``), and exposes train / val / test
    dataloaders.

    Args:
        learning_rate: learning rate passed to the Adam optimizer.
        batch_size: batch size used by all three dataloaders.
        num_workers: number of worker processes used by all three dataloaders.
    """

    def __init__(self, learning_rate=0.001, batch_size=4, num_workers=0):
        super().__init__()
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.num_workers = num_workers

        # NOTE(review): ignore_value=1 is presumably the padding index (PAD_IDX) — confirm.
        self.loss_crit = LabelSmoothingLoss2(ignore_value=1, label_smoothing=0.1)
        self.save_hyperparameters()

    def make_src_mask(self, src):
        """Return a boolean mask that is False wherever ``src`` equals ``PAD_IDX``.

        Two singleton dimensions are inserted after the first one, so a
        ``(N, src_len)`` input gives a ``(N, 1, 1, src_len)`` mask.
        """
        src_mask = (src != PAD_IDX).unsqueeze(1).unsqueeze(2)
        # (N , 1, 1, src_len)
        return src_mask

    def make_trg_mask(self, trg):
        """Return a lower-triangular mask of shape ``(N, 1, trg_len, trg_len)``.

        The mask is created on the same device as ``trg`` so it can be combined
        with tensors on an accelerator without a device mismatch.
        NOTE(review): padding positions in ``trg`` are not masked here — confirm
        that this is intended.
        """
        N, trg_len = trg.shape
        causal = torch.tril(torch.ones((trg_len, trg_len), device=trg.device))
        return causal.expand(N, 1, trg_len, trg_len)

    def forward(self, src, trg):
        """Build both masks and run the wrapped model on ``(src, trg)``."""
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        return self.model.forward(src, src_mask, trg, trg_mask)

    def _make_raw_data(self):
        """Generate 10,000 random pairs and split them 67/33 into train/test lists."""
        data = [[x, y] for x, y in random_examples(10000, 10)]
        train, test = train_test_split(data, test_size=0.33, random_state=42)
        return {'train': train, 'test': test}

    def prepare_data(self):
        """Generate the raw train/test pairs and store them on ``self.raw_data``."""
        self.raw_data = self._make_raw_data()

    def setup(self, stage=None):
        """Build datasets, vocabularies, the model and the loss.

        Runs its body only once: a later call (for example for another stage)
        returns immediately so that an already-trained model is not replaced by
        a freshly initialised one.
        """
        if getattr(self, '_setup_done', False):
            return

        # Lightning may skip prepare_data on some processes; fall back to
        # generating the data here rather than reaching for a notebook global.
        if not hasattr(self, 'raw_data'):
            self.raw_data = self._make_raw_data()

        tokenizer = list, list
        reversed_train, reversed_test = ReversedString(data=self.raw_data, tokenizer=tokenizer)

        # save the vocab
        self.src_vocab, self.trg_vocab = reversed_train.get_vocab()

        # define the model based on trg vocab. Note: We don't use src_vocab here.
        self.model = make_model(len(self.trg_vocab), len(self.trg_vocab),
                                N=4, d_model=128, d_ff=128, h=4, dropout=0.2)

        # NOTE(review): the third argument is presumably an optimizer slot that
        # Lightning makes unnecessary — confirm against SimpleLossCompute.
        self.criterion = SimpleLossCompute(self.model.generator, self.loss_crit, None)

        # train / val split (80% / 20% of the training dataset)
        n = len(reversed_train)
        p = int(0.8 * n)
        train_subset, val_subset = random_split(reversed_train, [p, n - p])

        # assign to use in dataloaders; training uses only the 80% subset so
        # that it does not overlap with the validation subset.
        self.train_ds = train_subset
        self.test_ds = reversed_test
        self.val_ds = val_subset

        self._setup_done = True

    def configure_optimizers(self):
        """Return Adam plus a warm-up scheduler that hands over to StepLR."""
        optim = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # scheduler_warmup is chained with scheduler_steplr
        scheduler_steplr = StepLR(optim, step_size=10, gamma=0.1)
        scheduler_warmup = GradualWarmupScheduler(optim, multiplier=1, total_epoch=5, after_scheduler=scheduler_steplr)

        return [optim], [scheduler_warmup]

    def _compute_loss(self, batch):
        """Run the seq2seq model on one batch and return the loss.

        The decoder input is the target without its last token and the loss is
        computed against the target without its first token.
        """
        src, trg = batch
        # Swap the two axes; forward() treats the first axis as the batch.
        # NOTE(review): assumes the collate function yields (seq_len, batch) — confirm.
        src = src.permute(1, 0)
        trg = trg.permute(1, 0)

        out = self.forward(src, trg[:, :-1])
        return self.criterion(out, trg[:, 1:])

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._compute_loss(batch)
        self.log('loss', loss)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch.

        Only ``'val_loss'`` is logged, so validation batches no longer leak
        into the ``'loss'`` metric.
        """
        loss = self._compute_loss(batch)
        self.log('val_loss', loss)
        return {'val_loss': loss}

    def train_dataloader(self):
        """Dataloader over the training subset."""
        dl = DataLoader(self.train_ds, self.batch_size,
                        collate_fn=generate_batch_new, num_workers=self.num_workers)
        return dl

    def val_dataloader(self):
        """Dataloader over the validation subset."""
        return DataLoader(self.val_ds, self.batch_size,
                          collate_fn=generate_batch_new, num_workers=self.num_workers)

    def test_dataloader(self):
        """Dataloader over the held-out test dataset."""
        return DataLoader(self.test_ds, self.batch_size,
                          collate_fn=generate_batch_new, num_workers=self.num_workers)
        

# Run Training

In [19]:
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

# TensorBoard logs go under the 'tb_logs' directory, experiment name 'bert'.
logger = TensorBoardLogger(save_dir='tb_logs', name='bert')

# Module with its default hyperparameters.
model = LitTransformer()

# NOTE(review): auto_lr_find presumably only has an effect when
# trainer.tune(model) is called before fitting — confirm whether that is intended.
trainer = Trainer(
    fast_dev_run=False,
    progress_bar_refresh_rate=5,
    max_epochs=10,
    enable_pl_optimizer=False,
    callbacks=[
        ModelTestCallback(test='puneet'),
        LogHistogramCallback(),
        # Checkpoints are selected on the 'val_loss' metric.
        ModelCheckpoint(dirpath='.checkpoints/', monitor='val_loss'),
    ],
    logger=logger,
    auto_lr_find=True,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


In [20]:
# Start training; the dataloaders come from the module itself, so none are passed here.
trainer.fit(model)


100%|██████████| 6700/6700 [00:00<00:00, 370204.28lines/s]

100%|██████████| 6700/6700 [00:00<00:00, 330081.71lines/s]

  | Name      | Type                | Params
--------------------------------------------------
0 | loss_crit | LabelSmoothingLoss2 | 0     
1 | model     | EncoderDecoder      | 1.1 M 
--------------------------------------------------
1.1 M     Trainable params
0         Non-trainable params
1.1 M     Total params
4.294     Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…






1

In [None]:
# mt = ModelTestCallback()
# mt.on_fit_start(trainer, trainer.model)
# mt.on_train_epoch_end(trainer,trainer.model, outputs=None)