# Pytorch Lightning


In this kernel, I use PyTorch Lightning for the whole pipeline. PyTorch Lightning is just reorganized PyTorch code, and on top of that you get extra benefits such as TPU training, Distributed Data Parallel training, and many more with minimal changes to your code. The best part is that you don't need to learn a completely new framework to use Lightning. See the documentation for more info: https://github.com/PyTorchLightning/pytorch-lightning


The kernel has 6 sections:

1. Import
2. Load and Multilabel Stratify
3. Create Dataset class
4. Create LightningDataModule 
    * Select the fold to be used for validation
    * Use sklearn ColumnTransformer for OneHot Encoding of categorical variables
    * Use sklearn ColumnTransformer for MinMaxScaling of numerical variables
    * Create instance of Dataset class for training and validation
    * Define methods for train, val and test dataloaders
5. Create LightningModule
    * Define model
    * Define Shared Step for training and validation
    * Log results to wandb
6. Trainer
    * Use GPU or TPU for training
    * Use Wandb/Neptune/Tensorboard for logging
    * Set Epochs
    * The Trainer can also be used for debugging and many more things. Please check the documentation.


# Wandb

In machine learning, it is very important not only to carry out many experiments, but also to log and compare them systematically. Manually creating and maintaining plots of losses, metrics, etc. is cumbersome. That is why I use wandb. Wandb lets you log values on the fly. You can easily compare various experiments, write reports, and much more. Check it out if you are interested: https://www.wandb.com/


Use the cfg dictionary to change variable values and experiment further. All the best!

# TPU 

There is some bug in the TPU version. I am eager to see someone make it work. Set TPU=True to try training with TPU 

In [None]:
# Run-level switches read by the cells below.
# TPU:   when True, installs PyTorch/XLA here, imports torch_xla later, and makes
#        the training loop build a TPU Trainer instead of a GPU one.
# DEBUG: when True, cfg['epochs'] becomes 1 and the Trainer limits train/val batches to 5.
# NAME:  experiment name, used for the wandb run name and the checkpoint file names.
TPU = False
DEBUG = False
NAME= 'exp01-baseline'

if TPU:
    # Download and run the PyTorch/XLA environment setup script (notebook shell magics).
    !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
    !python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev

In [None]:
import sys
# Put the iterative-stratification wheel on the import path so that `iterstrat`
# can be imported in the IMPORT cell without a pip install.
sys.path.append('../input/iterativestratification/iterative_stratification-0.1.6-py3-none-any.whl')

# Third-party packages imported in the IMPORT cell: pytorch_lightning,
# warmup_scheduler (from the gradual-warmup repo) and torchcontrib.
# NOTE(review): versions are not pinned — confirm the installed releases still
# provide the APIs used further down in this notebook.
!pip install pytorch_lightning
!pip install git+https://github.com/ildoonet/pytorch-gradual-warmup-lr.git
!pip install torchcontrib

# IMPORT

In [None]:
import pandas as pd
import os
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
import torch.optim.lr_scheduler as lr_scheduler

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from sklearn.compose import ColumnTransformer

from warmup_scheduler import GradualWarmupScheduler
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import random
import wandb
import torchcontrib
from torchcontrib.optim import SWA
from tqdm.notebook import tqdm
import warnings

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

if TPU:
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.parallel_loader as pll
    import torch_xla.distributed.xla_multiprocessing as xmp

warnings.simplefilter("ignore", UserWarning)

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for reproducible runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for processes started after this call; the hash seed of the
    # already-running interpreter is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN autotune and possibly pick different kernels
    # from run to run, which would defeat the deterministic flag above.
    torch.backends.cudnn.benchmark = False

# LOAD AND STRATIFY

In [None]:
# Read the three competition tables: train features, test features and scored targets.
train = pd.read_csv('../input/lish-moa/train_features.csv')
test = pd.read_csv('../input/lish-moa/test_features.csv')
targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')

# Store cp_time with object dtype in both feature tables.
train['cp_time'] = train['cp_time'].astype('object')
test['cp_time'] = test['cp_time'].astype('object')

# Every column of the targets table except the sig_id key is a label column.
target_cols = [col for col in targets.columns if col != 'sig_id']

In [None]:
# Central experiment configuration read by the dataset, data module, model and
# training loop. Built before `train` is merged with `targets`, so train.shape
# still refers to the feature table alone.
cfg = {
    'seed': 42,
    'epochs': 1 if DEBUG else 10,
    'folds': 5,
    # Column count of the feature table minus 4 (the data module drops the
    # first column and one-hot encodes the next three).
    'num_features': train.shape[1]-4,
    # Every column of the targets table except one.
    'num_targets': targets.shape[1] - 1,
    # Width of the one-hot block that MoADataset slices off the front of each row.
    'cat_feats_ohe': 7,
    'hidden_size': 512,
    'dropout': 0.3,
    'bs': 128,
    'init_lr': 1e-4,
    'lr_min': 1e-6,
    'warmup_epoch': 1,
    'warmup_multiplier': 10,
    'cosine_epoch': 49,
}

seed_everything(cfg['seed'])

In [None]:
# Attach the label columns to the feature table, then give every row a
# validation-fold id using multilabel-stratified K-fold splitting.
train = train.merge(targets, on = 'sig_id')

# shuffle=True is required for random_state to have any effect; recent
# scikit-learn versions raise a ValueError when random_state is set on a
# non-shuffling K-fold splitter.
Fold = MultilabelStratifiedKFold(n_splits=cfg['folds'], shuffle=True, random_state=cfg['seed'])
for n, (train_index, val_index) in enumerate(Fold.split(train, train[target_cols])):
    # Rows that form the validation part of split n belong to fold n.
    train.loc[val_index, 'fold'] = int(n)
# The column is created with missing values first, so cast it back to int.
train['fold'] = train['fold'].astype(int)


print(train.shape)

# Dataset

In [None]:
class MoADataset(Dataset):
    """Row-wise dataset over a preprocessed 2-D array.

    Each row is laid out as ``cfg['cat_feats_ohe']`` one-hot columns, then
    ``cfg['num_features']`` numerical columns and, in ``'train'`` mode, the
    targets in the last ``cfg['num_targets']`` columns.

    Args:
        df: 2-D array supporting ``df[:, a:b]`` slicing.
        mode: ``'train'`` yields ``(cat_feats, num_feats, targets)``; any
            other value yields ``(cat_feats, num_feats)``.
    """

    def __init__(self, df, mode = 'train'):
        n_cat = cfg['cat_feats_ohe']
        n_num = cfg['num_features']

        self.cat_feats = df[:, :n_cat]
        self.num_feats = df[:, n_cat:n_cat + n_num]
        self.mode = mode

        # Targets exist only for labelled data.
        if self.mode == 'train':
            self.targets = df[:, -cfg['num_targets']:]

    def __len__(self):
        return len(self.num_feats)

    def __getitem__(self, idx):
        cat_feats = torch.tensor(self.cat_feats[idx], dtype = torch.long)
        num_feats = torch.tensor(self.num_feats[idx], dtype = torch.float)

        # Only touch self.targets when it was actually created in __init__;
        # a bare truthiness check on the mode string would also fire for 'test'.
        if self.mode != 'train':
            return cat_feats, num_feats

        targets = torch.tensor(self.targets[idx], dtype = torch.float)
        return cat_feats, num_feats, targets
        

# LightningDataModule

In [None]:
class MoADataModule(pl.LightningDataModule):
    """Data module that preprocesses one cross-validation fold.

    The first column of both frames is dropped. Of the remaining columns the
    first three are one-hot encoded and the next ``cfg['num_features']`` are
    min-max scaled. The training frame must also carry a ``'fold'`` column and
    have the targets as its last ``cfg['num_targets']`` columns once ``'fold'``
    is removed.

    Args:
        train: Training frame (features, targets and ``'fold'``).
        test: Test frame (features only).
        fold: Fold id held out for validation.
    """

    def __init__(self, train, test, fold):
        super(MoADataModule, self).__init__()

        self.data = train.iloc[:, 1:]
        self.test = test.iloc[:, 1:]
        self.fold = fold
        # Fitted ColumnTransformer; set by setup('fit') and reused by setup('test').
        self.preprocessor = None

    def setup(self, stage=None):
        """Build the datasets for ``stage`` (``'fit'``, ``'test'`` or ``None``).

        Raises:
            RuntimeError: If ``'test'`` is requested before the preprocessor
                has been fitted by a ``'fit'`` setup.
        """
        # Three categorical columns followed by the numerical ones.
        n_inputs = 3 + cfg['num_features']

        if stage=='fit' or stage is None:

            train_X = self.data[self.data['fold']!=self.fold].reset_index(drop=True)
            val_X = self.data[self.data['fold']==self.fold].reset_index(drop=True)

            train_X = train_X.drop(columns='fold')
            val_X = val_X.drop(columns='fold')

            # sparse_threshold=0 forces a dense array, which MoADataset slices directly.
            ct = ColumnTransformer(
                [('onehot', OneHotEncoder(), [0, 1, 2]),
                 ('minmax', MinMaxScaler(), list(range(3, n_inputs)))],
                sparse_threshold=0,
            )

            # Fit on the input columns only. The transformer drops every column
            # it is not told about, so the targets are re-attached afterwards;
            # this also lets the same transformer run on the target-less test frame.
            train_feats = ct.fit_transform(train_X.iloc[:, :n_inputs])
            val_feats = ct.transform(val_X.iloc[:, :n_inputs])
            self.preprocessor = ct

            self.traindataset = MoADataset(self._append_targets(train_feats, train_X))
            self.valdataset = MoADataset(self._append_targets(val_feats, val_X))

        if stage=='test':

            if self.preprocessor is None:
                raise RuntimeError("setup('fit') must run before setup('test') so the preprocessor is fitted")

            test = self.preprocessor.transform(self.test.iloc[:, :n_inputs])

            self.testdataset = MoADataset(test, mode='test')

    @staticmethod
    def _append_targets(features, frame):
        """Return ``features`` with the last ``cfg['num_targets']`` columns of ``frame`` appended."""
        targets = frame.iloc[:, -cfg['num_targets']:].values.astype(float)
        return np.hstack([features, targets])

    @staticmethod
    def _loader(dataset, shuffle):
        """Create a DataLoader with the batch size and worker settings shared by all splits."""
        return DataLoader(dataset, batch_size = cfg['bs'], shuffle = shuffle, num_workers =  4, pin_memory = True)

    def train_dataloader(self):
        return self._loader(self.traindataset, shuffle=True)

    def val_dataloader(self):
        return self._loader(self.valdataset, shuffle=False)

    def test_dataloader(self):
        return self._loader(self.testdataset, shuffle=False)
            
            

In [None]:
# Fix Warmup Bug
class GradualWarmupSchedulerV2(GradualWarmupScheduler):
    """GradualWarmupScheduler with a replacement ``get_lr``.

    While ``last_epoch <= total_epoch`` the learning rate is interpolated
    linearly; afterwards it is either held at ``base_lr * multiplier`` or
    delegated to ``after_scheduler`` when one was supplied.
    """

    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        super().__init__(optimizer, multiplier, total_epoch, after_scheduler)

    def get_lr(self):
        """Return the list of learning rates for the current epoch."""
        epoch = self.last_epoch

        # Warm-up phase.
        if epoch <= self.total_epoch:
            if self.multiplier == 1.0:
                # Ramp from 0 up to base_lr.
                scale = float(epoch) / self.total_epoch
            else:
                # Ramp from base_lr up to base_lr * multiplier.
                scale = (self.multiplier - 1.) * epoch / self.total_epoch + 1.
            return [base_lr * scale for base_lr in self.base_lrs]

        # Past warm-up with nothing to hand over to: stay at the peak rate.
        follow_up = self.after_scheduler
        if not follow_up:
            return [base_lr * self.multiplier for base_lr in self.base_lrs]

        # First epoch after warm-up: the follow-up scheduler starts from the peak rate.
        if not self.finished:
            follow_up.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
            self.finished = True
        return follow_up.get_lr()

# LightningModule

In [None]:
class MoAModule(pl.LightningModule):
    """Two-hidden-layer MLP trained with binary cross-entropy on logits.

    Only the numerical features of a batch are fed to the network; the
    one-hot categorical block is unpacked but not used.
    """

    def __init__(self):
        super(MoAModule, self).__init__()

        self.mlp = nn.Sequential(
                          nn.Linear(cfg['num_features'], cfg['hidden_size']),
                          nn.BatchNorm1d(cfg['hidden_size']),
                          nn.Dropout(cfg['dropout']),
                          nn.ReLU(),
                          nn.Linear(cfg['hidden_size'], cfg['hidden_size']),
                          nn.BatchNorm1d(cfg['hidden_size']),
                          nn.Dropout(cfg['dropout']),
                          nn.ReLU(),
                          nn.Linear(cfg['hidden_size'], cfg['num_targets'])
                          )

    def shared_step(self, batch, batch_nb):
        """Compute the loss of one labelled batch (used by training and validation)."""
        cat_feats, num_feats, targets = batch
        logits = self.mlp(num_feats)
        loss = self.loss_func(logits, targets)

        return loss

    def training_step(self, batch, batch_nb):
        """Return a TrainResult that minimizes and logs the batch loss."""
        loss = self.shared_step(batch, batch_nb)
        result = pl.TrainResult(minimize = loss)
        result.log('train loss', loss, on_epoch = True, prog_bar = True, logger = True)

        return result

    def validation_step(self, batch, batch_nb):
        """Return an EvalResult whose loss drives checkpointing and early stopping."""
        loss = self.shared_step(batch, batch_nb)
        result = pl.EvalResult(checkpoint_on = loss, early_stop_on = loss)
        result.log('val loss', loss, on_epoch = True, prog_bar = True, logger = True)

        return result

    def test_step(self, batch, batch_nb):
        """Return sigmoid probabilities for one batch."""
        # Test batches are (cat_feats, num_feats) with no targets, so index
        # the numerical features instead of unpacking a fixed number of items.
        num_feats = batch[1]
        logits = self.mlp(num_feats)
        preds = logits.sigmoid()

        return preds

    def loss_func(self, pred, target):
        """Binary cross-entropy between raw logits and targets."""
        return nn.BCEWithLogitsLoss()(pred,target)

    def configure_optimizers(self):
        """Adam with a warm-up phase followed by cosine annealing."""
        optimizer = Adam(self.mlp.parameters(), lr = cfg['init_lr'])

        cosine = lr_scheduler.CosineAnnealingLR(optimizer, T_max = cfg['cosine_epoch'], eta_min = cfg['lr_min'])
        scheduler = GradualWarmupSchedulerV2(optimizer, total_epoch = cfg['warmup_epoch'], multiplier = cfg['warmup_multiplier'], after_scheduler = cosine)

        return [optimizer], [scheduler]
        

# Trainer

In [None]:
# One wandb run shared by every fold; the configuration is logged once up front.
wandblogger = WandbLogger(name = f'{NAME}', project = 'MoA')
wandblogger.log_hyperparams(cfg)

for i in range(cfg['folds']):
    data = MoADataModule(train, test, fold = i)
    data.setup(stage='fit')
    model = MoAModule()

    # Options common to every configuration. max_epochs is always passed so
    # that cfg['epochs'] is honoured on TPU runs as well.
    trainer_kwargs = dict(logger = wandblogger,
                          max_epochs = cfg['epochs'],
                          default_root_dir = '/kaggle/working')

    # Accelerator: one GPU, or TPU cores (a single core while debugging).
    if TPU:
        trainer_kwargs['tpu_cores'] = 1 if DEBUG else 8
    else:
        trainer_kwargs['gpus'] = 1

    # Debug runs only see a handful of batches per epoch.
    if DEBUG:
        trainer_kwargs.update(limit_train_batches = 5, limit_val_batches = 5)

    trainer = Trainer(**trainer_kwargs)

    wandblogger.watch(model, log = 'all')

    trainer.fit(model, data)

    # Save the fold's checkpoint locally and register the file with wandb.
    trainer.save_checkpoint(f'{NAME}_fold{i}.pt')
    wandb.save(f'{NAME}_fold{i}.pt')