### About

I use PyTorch Lightning to build the model:
* [Triple Stratified 192x192 JPEG images from Chris Deotte](#dataloading)
* [simple augmentations](#augmentations)
* [Uses Efficientnet](#efficientnet)
* [Uses BinaryCrossEntropyWithLogits as the Loss function](#lossfunction)
* [5 fold CV](#folding)
* [AdamW optimizer with ReduceLROnPlateau scheduler](#optimscheduler)
* [GPU training with 12 epochs per fold, lr=1e-4](#training)
* [Simple average of the 5 fold outputs for submission](#submission)


In [None]:
# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev

In [None]:
# import torch_xla.core.xla_model as xm
# import torch_xla.distributed.parallel_loader as pl
# import torch_xla.distributed.xla_multiprocessing as xmp

In [None]:
!pip install pytorch_lightning
# !pip uninstall -q typing --yes
# !pip install https://github.com/PytorchLightning/pytorch-lightning/archive/master.zip --upgrade
# !pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@master --upgrade

# Install pytorcuh-Efficientnet
!pip install git+https://github.com/krisho007/EfficientNet-PyTorch

# !pip install efficientnet-pytorch

!pip install https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset,DataLoader
import pytorch_lightning as ptl
from efficientnet_pytorch import EfficientNet
from pytorch_lightning.metrics.classification import AUROC
from pytorch_lightning.callbacks import EarlyStopping
import torch.nn.functional as Functional
from PIL import Image
import random
import os
import shutil
from glob import glob
import cv2
from torch.optim.lr_scheduler import ExponentialLR
from pytorch_lightning import loggers
from pytorch_lightning import _logger as log
import albumentations as A
import math
from albumentations.pytorch.transforms import ToTensorV2

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable, seeds Python's
    ``random`` module, NumPy and PyTorch (CPU and CUDA), and switches cuDNN
    to deterministic mode with benchmarking turned off.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python, NumPy and the PyTorch CPU generator all accept the seed directly.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA generators: the current device first, then every device.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Deterministic cuDNN kernels, auto-tuner disabled.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything()

### Load Data <a id='dataloading' />

In [None]:
# Metadata tables shipped with the 192x192 JPEG version of the dataset.
train = pd.read_csv("../input/jpeg-melanoma-192x192/train.csv")

# The test table gets a placeholder `target` column up front so that it can
# be overwritten with predictions when a submission is produced.
test = pd.read_csv("../input/jpeg-melanoma-192x192/test.csv").assign(target=0)

submission = pd.read_csv("../input/jpeg-melanoma-192x192/sample_submission.csv")

In [None]:
# Rows with tfrecord == -1 are duplicates: drop them and renumber the index.
train_data = train.loc[train["tfrecord"].ne(-1)].reset_index(drop=True)

### Augmentations <a id="augmentations"></a>

In [None]:
# Per-channel normalization statistics passed to A.Normalize by the
# transform builders in this cell.
mean, std = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)

def get_train_transforms():
    """Build the augmentation pipeline applied to training images.

    Random horizontal and vertical flips (p=0.5 each) and a Gaussian blur
    (p=0.3) are followed by normalization with the module-level ``mean`` /
    ``std`` and conversion via ``ToTensorV2``.
    """
    augmentations = [
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.GaussianBlur(p=0.3),
    ]
    finalize = [
        A.Normalize(mean, std, max_pixel_value=255, always_apply=True),
        ToTensorV2(),
    ]
    return A.Compose(augmentations + finalize, p=1.0)

def get_valid_transforms():
    """Build the augmentation-free pipeline applied to validation images.

    Images are only normalized with the module-level ``mean`` / ``std`` and
    then passed through ``ToTensorV2``.
    """
    normalize = A.Normalize(mean, std, max_pixel_value=255, always_apply=True)
    to_tensor = ToTensorV2()
    return A.Compose([normalize, to_tensor], p=1.0)

def get_tta_transforms():
    """Build the pipeline applied to test images.

    Random horizontal and vertical flips (p=0.5 each) are followed by
    normalization with the module-level ``mean`` / ``std`` and conversion
    via ``ToTensorV2``.
    """
    random_flips = [
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
    ]
    finalize = [
        A.Normalize(mean, std, max_pixel_value=255, always_apply=True),
        ToTensorV2(),
    ]
    return A.Compose(random_flips + finalize, p=1.0)

### Dataset


In [None]:

class melanomaDataset(Dataset):
    """Dataset serving melanoma JPEG images described by a DataFrame.

    Every row of ``data`` must provide an ``image_name``; rows served with
    ``is_testing=False`` must also provide a ``target``. Images are read from
    ``{image_folder}/{image_name}.jpg``.

    Args:
        data: DataFrame with one row per image.
        is_testing: When True, samples carry no ``"target"`` entry and the
            ``target`` column is never read.
        image_folder: Directory that holds the ``.jpg`` files.
        transforms: Optional callable invoked as ``transforms(image=...)``
            that returns a mapping with an ``"image"`` key.
    """

    def __init__(self, data, is_testing = False, image_folder = '../input/jpeg-melanoma-192x192/train', transforms=None):
        self.data = data
        self.is_testing = is_testing
        self.image_folder = image_folder
        self.transforms = transforms

    def __len__(self):
        """Return the number of rows (images) in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Load, colour-convert and transform the image at position ``index``.

        Returns:
            A dict with ``"image_name"`` and ``"image"``; when ``is_testing``
            is False it also has ``"target"`` as a float32 tensor.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        row = self.data.iloc[index]
        image_name = row['image_name']
        image_path = f"{self.image_folder}/{image_name}.jpg"

        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread reports an unreadable file by returning None instead
            # of raising; fail here with the offending path.
            raise FileNotFoundError(f"Could not read image: {image_path}")

        # cv2.imread yields channels in BGR order. Convert to RGB, the order
        # assumed by the ImageNet-style mean/std and the pretrained
        # EfficientNet weights used in this notebook.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transforms:
            image = self.transforms(image=image)['image']

        sample = {"image_name": image_name, "image": image}
        if not self.is_testing:
            # Only touch the label column when a label is actually wanted, so
            # unlabeled frames work with is_testing=True.
            sample["target"] = torch.tensor(row['target'], dtype = torch.float32)

        return sample
        

### Model <a id='efficientnet'/><a id='lossfunction'/><a id='optimscheduler'/><a id='folding'/>

In [None]:
class melanomaModel(ptl.LightningModule):
    """EfficientNet-B5 binary classifier trained on one cross-validation fold.

    Args:
        hparams: Mapping that provides ``fold`` (index of the validation
            fold) and ``lr`` (learning rate handed to AdamW).
    """

    def __init__(self, hparams):
        super().__init__()
        self.hparams = hparams
        self.model = EfficientNet.from_pretrained('efficientnet-b5', num_classes=1)

    def forward(self, x):
        """Return one logit per sample for a batch dict with an ``"image"`` entry."""
        # Drop only the trailing class dimension. A bare torch.squeeze would
        # also collapse the batch dimension of a single-sample batch into a
        # 0-dim tensor, which breaks torch.cat and the loss shape check.
        return self.model(x["image"]).squeeze(-1)

    def getLoss(self, prediction, actual):
        """Binary cross-entropy between logits ``prediction`` and labels ``actual``."""
        return Functional.binary_cross_entropy_with_logits(prediction, actual)

    def prepare_data(self):
        """Split ``train_data`` by ``tfrecord`` and build the three datasets.

        The tfrecord ids 0-14 are taken in groups of three; group number
        ``fold`` is held out for validation and the rest is used for training.
        """
        fold = self.hparams['fold']
        validation_start_index = fold * 3
        validation_range = list(range(15))[validation_start_index:validation_start_index + 3]

        in_validation = train_data.tfrecord.isin(validation_range)
        df_train = train_data[~in_validation].reset_index(drop=True)
        df_valid = train_data[in_validation].reset_index(drop=True)
        df_test = test

        # Datasets
        self.train_dataset = melanomaDataset(df_train, transforms=get_train_transforms())
        self.valid_dataset = melanomaDataset(df_valid, transforms=get_valid_transforms())
        self.test_dataset = melanomaDataset(df_test, image_folder = '../input/jpeg-melanoma-192x192/test', transforms=get_tta_transforms())

    def train_dataloader(self):
        """Shuffled loader over the training split (batch size 32)."""
        training_loader = DataLoader(
            self.train_dataset, batch_size=32, num_workers=4, shuffle=True
        )
        log.info("Training data loaded.")
        return training_loader

    def training_step(self, batch, batch_index):
        """Compute the training loss for one batch."""
        batch_prediction = self(batch)
        loss = self.getLoss(batch_prediction, batch["target"])
        return {"loss": loss}

    def val_dataloader(self):
        """Unshuffled loader over the validation split (batch size 16)."""
        valid_loader = DataLoader(
            self.valid_dataset, batch_size=16, num_workers=4, shuffle=False
        )
        log.info("Validation data loaded.")
        return valid_loader

    def validation_step(self, batch, batch_index):
        """Return the loss, labels and logits of one validation batch."""
        batch_prediction = self(batch)
        loss = self.getLoss(batch_prediction, batch["target"])
        return {"val_loss": loss,
                "y" : batch["target"].detach(),
                "y_hat": batch_prediction.detach()}

    def validation_epoch_end(self, outputs):
        """Aggregate the validation batches into mean loss, AUC and accuracy."""
        val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean()
        y = torch.cat([x['y'] for x in outputs])
        y_hat = torch.cat([x['y_hat'] for x in outputs])
        # AUC is undefined without positives (e.g. during the sanity check),
        # so fall back to the chance value.
        auc = AUROC()(pred=y_hat, target=y) if y.float().mean() > 0 else 0.5

        # y_hat holds logits, not probabilities: a probability threshold of
        # 0.5 corresponds to a logit threshold of 0.
        acc = ((y_hat > 0).float() == y).float().mean().item()

        print(f"Fold: {self.hparams['fold']} Epoch {self.current_epoch} auc:{auc}")
        return {'avg_val_loss': val_loss_mean,
                'val_auc': auc, 'val_acc': acc}

    def test_dataloader(self):
        """Unshuffled loader over the test set (batch size 16, nothing dropped)."""
        return DataLoader(self.test_dataset, batch_size=16, num_workers=4,
                          drop_last=False, shuffle=False, pin_memory=False)

    def test_step(self, batch, batch_nb):
        """Return the flattened logits of one test batch."""
        y_hat = self(batch).flatten()
        return {'y_hat': y_hat}

    def test_epoch_end(self, outputs):
        """Write ``submission{fold}.csv`` with one probability per test image.

        Raises:
            ValueError: If the number of predictions differs from the number
                of test rows (e.g. with ``fast_dev_run=True``, where only one
                batch is evaluated).
        """
        y_hat = torch.cat([x['y_hat'] for x in outputs])
        # The submission column is a probability, so map the logits through
        # a sigmoid before writing them out.
        probabilities = torch.sigmoid(y_hat).cpu().tolist()

        # Same DataFrame object that prepare_data handed to the test dataset.
        test_frame = self.test_dataset.data
        if len(probabilities) != len(test_frame):
            raise ValueError(
                f"Got {len(probabilities)} predictions for {len(test_frame)} test rows"
            )
        test_frame['target'] = probabilities

        # Two required columns into submission csv
        header = ["image_name","target"]
        fold = self.hparams['fold']
        test_frame.to_csv(f'submission{fold}.csv', columns = header, index=False)

    def configure_optimizers(self):
        """AdamW plus a ReduceLROnPlateau scheduler driven by ``val_auc``."""
        optim = torch.optim.AdamW(self.parameters(), lr=self.hparams['lr'])
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optim,
            patience=3,
            threshold=0.001,
            mode="max"
        )

        # Lightning looks for the scheduler under "lr_scheduler". The plateau
        # scheduler needs a monitored metric, and val_auc only exists once
        # per epoch, so it is stepped per epoch rather than per batch.
        gen_sched = {
            "scheduler": scheduler,
            "monitor": "val_auc",
            "interval": "epoch",
            "frequency": 1,
        }
        return {"optimizer": optim, "lr_scheduler": gen_sched}

### Train & Test <a id='training' />

In [None]:
# Define a function to initialize and train a model

def train(fold, lr=1e-3, max_epochs=5):
    """Train one cross-validation fold, run the test pass and return the trainer.

    Note that this definition rebinds the module-level name ``train``, which
    earlier held the training DataFrame; the model reads ``train_data``
    instead, so it is unaffected.

    Args:
        fold: Integer index of the validation fold, forwarded to the model.
        lr: Learning rate forwarded to the model.
        max_epochs: Upper bound on training epochs.

    Returns:
        The ``ptl.Trainer`` after fitting and testing. Early stopping is not
        enabled.
    """
    # Checkpoints
    os.makedirs('Checkpoints', exist_ok=True)

    # Hyper parameters
    hparams = {"fold": fold, "lr": lr}
    model = melanomaModel(hparams)

    # `fold` is not a metric Lightning knows about, so bake it into the file
    # name here; the doubled braces leave epoch / val_auc as template fields
    # for the checkpoint callback to fill in.
    checkpoint_callback = ptl.callbacks.ModelCheckpoint(
        f"Checkpoints/fold{fold:02d}_{{epoch:02d}}_{{val_auc:.4f}}",
        save_top_k=1, monitor='val_auc', mode='max')

    trainer = ptl.Trainer(gpus=-1, max_epochs=max_epochs, fast_dev_run=False,
                          checkpoint_callback=checkpoint_callback)

    trainer.fit(model)
    trainer.test()
    return trainer

In [None]:
# Run the five cross-validation folds in order; `trainer` ends up holding
# whatever the last call returned.
for fold_index in range(5):
    trainer = train(fold_index)

### Submission file <a id='submission' />

In [None]:
# Average the per-fold predictions into the final submission file.
import pandas as pd

fold_paths = (
    './submission0.csv',
    './submission1.csv',
    './submission2.csv',
    './submission3.csv',
    './submission4.csv',
)
Submission0, Submission1, Submission2, Submission3, Submission4 = map(pd.read_csv, fold_paths)

# Stack the five folds, then take the per-image mean of the predictions.
stacked = pd.concat([Submission0, Submission1, Submission2, Submission3, Submission4])
Submission = stacked.groupby('image_name').mean().reset_index()

header = ["image_name","target"]
Submission.to_csv('submission.csv', columns = header, index=False)