### Tabular Convolution
In Tabular convolution, an innovative approach is used to create an image from a tabular sample. Here is the summarised approach.

1. Choose a sample image (you can consider this as a seed and experiment with various images)
2. Arrange the input row/sample as a kernel. 
3. Do a Conv2D on the sample image using this kernel. (In this notebook, I do this operation within the PyTorch model itself)
4. Use the resulting image as the input sample for your vision model.

It has started showing promising results, but they are not yet close to the top results. I have not had enough time to run various experiments.

### Original paper

https://www.biorxiv.org/content/10.1101/2020.05.02.074203v1.full

In [None]:
from pathlib import Path
import subprocess

# Install PyTorch Lightning 1.0.2 from the wheel shipped in the attached
# Kaggle dataset directory.
PL_PATH = Path("/kaggle/input/pytorch-lightning")
wheel_file = PL_PATH / "pytorch_lightning-1.0.2-py3-none-any.whl"
subprocess.call(["pip", "install", wheel_file])

In [None]:
import sys
# Make the source trees of the attached datasets importable
# (used for the `iterstrat` and `efficientnet_pytorch` imports).
sys.path.extend([
    '../input/iterative-stratification/iterative-stratification-master',
    '../input/efficientnetpytorch',
])

In [None]:

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import os
from PIL import Image
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from matplotlib import pyplot as plt
import seaborn as sns
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
from efficientnet_pytorch.utils import MemoryEfficientSwish
from efficientnet_pytorch import EfficientNet

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy and PyTorch (CPU and CUDA), and switches cuDNN to its deterministic,
    non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # All of these take the seed as their only argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything()

In [None]:
# Load the competition tables: features, scored targets, drug ids,
# test features and the sample submission.
train = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
train_drug = pd.read_csv('/kaggle/input/lish-moa/train_drug.csv')
# Drop the sig_id identifier column from the test frame right after loading.
test = pd.read_csv('../input/lish-moa/test_features.csv').drop(columns=["sig_id"])
submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
remove_vehicle = True

if remove_vehicle:
    # Keep only the 'trt_cp' rows. The same boolean mask is applied to the
    # targets, which assumes both frames share the same row index.
    kept_index = train['cp_type'] == 'trt_cp'
    train = train.loc[kept_index].reset_index(drop=True)
    train_targets_scored = train_targets_scored.loc[kept_index].reset_index(drop=True)

# Encode the two string columns as 0/1 integers in both frames
# (adding 0 turns the boolean comparison result into integers).
for frame in (train, test):
    frame["cp_type"] = (frame["cp_type"] == "trt_cp") + 0
    frame["cp_dose"] = (frame["cp_dose"] == "D1") + 0


### K-Fold stratification including drug_id

In [None]:
def create_folds(seed_count, fold_count):
    """Build drug-aware multilabel-stratified fold assignments.

    The competition CSVs are re-read from disk, rows whose ``cp_type`` is not
    ``'trt_cp'`` are removed, and every remaining row gets a fold number:

    * drugs with at most 18 rows are assigned to a fold as a whole (all rows
      of such a drug share one fold), stratified on the drug's mean targets;
    * rows of drugs with more than 18 rows are stratified row by row, so
      those drugs are spread over the folds.

    Args:
        seed_count: Number of fold assignments to create; seeds
            ``0 .. seed_count - 1`` are used as ``random_state``.
        fold_count: Number of folds (``n_splits``) per assignment.

    Returns:
        The per-seed ``sig_id`` / ``kfold`` frames combined with ``np.stack``
        along a new leading axis (one entry per seed).
    """

    folds = []

    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    train_targets_scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
    train_drug = pd.read_csv('/kaggle/input/lish-moa/train_drug.csv')

    # Get rid of "ctl_vehicle" from training.
    # You may comment out the lines below if you do not want to do it.
    # The targets are filtered first because the mask is computed from the
    # still-unfiltered train_features.
    train_targets_scored = train_targets_scored.loc[train_features['cp_type'] == 'trt_cp', :]
    train_features = train_features[train_features['cp_type'] == 'trt_cp']
    
    train_features_drug = train_features.merge(train_drug, on="sig_id", how='left')
    
    # Remember the target column names (everything after the first column),
    # then add drug_id to the targets frame (for stratifying later)
    targets = train_targets_scored.columns[1:]
    train_targets_scored = train_targets_scored.merge(train_drug, on='sig_id', how='left') 

    # Within the training data, split the drug ids into those present in
    # at most 18 rows (vc1) and those present in more than 18 rows (vc2)
    vc = train_targets_scored.drug_id.value_counts()
    vc1 = vc.loc[vc <= 18].index
    vc2 = vc.loc[vc > 18].index

    # tmp is a dataframe derived from scored targets, where targets are
    # averaged by drug id (one row per drug id), restricted to the vc1 drugs
    tmp = train_targets_scored.groupby('drug_id')[targets].mean().loc[vc1]
    tmp = tmp.reset_index()    
    # Only has an effect if reset_index() produced a column called "index"
    tmp = tmp.rename(columns={"index":"drug_id"})
    
    # tmp1 is a dataframe with targets and drug_id for all rows of drugs that
    # are repeated more than 18 times in the train dataset.
    # We are stratifying these drugs row by row among all folds.
    # Thought here is that such drugs might repeat in public/private test sets as well
    tmp1 = train_targets_scored[train_targets_scored['drug_id'].isin(vc2)]
    tmp1 = tmp1.reset_index(drop=True)

    for seed in range(seed_count):

        skf = MultilabelStratifiedKFold(n_splits = fold_count, shuffle = True, random_state = seed)
        tmp_copy = tmp.copy()
        tmp1_copy = tmp1.copy()
        train_indices = train_features_drug[['sig_id', 'drug_id']].copy()
        
        # Fold per drug (vc1 drugs), propagated to every row of that drug
        for fold,(idxT,idxV) in enumerate(skf.split(X=tmp_copy,y=tmp_copy[targets])):
            tmp_copy.loc[idxV,"kfold"] = fold
        train_indices = train_indices.merge(tmp_copy[['drug_id', 'kfold']], on='drug_id', how="left")

        # Fold per row (vc2 drugs); this second merge yields kfold_x / kfold_y
        for fold,(idxT,idxV) in enumerate(skf.split(X=tmp1_copy,y=tmp1_copy[targets])):
            tmp1_copy.loc[idxV,"kfold"] = fold        
        train_indices = train_indices.merge(tmp1_copy[['sig_id', 'kfold']], on='sig_id', how="left")

        # Take the per-drug fold where present, otherwise the per-row fold
        train_indices['kfold'] = train_indices['kfold_x'].combine_first(train_indices['kfold_y'])        
        train_indices.drop(['drug_id', 'kfold_x', 'kfold_y'], inplace=True, axis=1) 
        
        # Add this to the output
        folds.append(train_indices)       

    return np.stack(folds)

In [None]:
# 1 seed, 5 folds (arguments are seed_count, fold_count)
folded = create_folds(1, 5)

In [None]:
# Attach the fold assignment stored at seed index 0 to the features and the
# targets, then drop the sig_id key that was only needed for the join.
folded_data = pd.DataFrame(folded[0], columns=["sig_id", "kfold"])
train = train.merge(folded_data, on="sig_id", how="left").drop(columns=["sig_id"])
train_targets_scored = (
    train_targets_scored
    .merge(folded_data, on="sig_id", how="left")
    .drop(columns=["sig_id"])
)

### Normalise features

In [None]:
from sklearn import preprocessing

# Min-max scale every column of `train` and keep the result in a new frame `df`.
# NOTE(review): `train` still carries the `kfold` column here, so it is scaled
# as well, and `df` is not read by any other code visible in this notebook --
# the datasets are built from the unscaled `train`. Confirm whether the scaled
# values were meant to be fed to the model.
x = train.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)

### Convert a sample (row of data) into a kernel

In [None]:
def getKernel(data, kernel_size=31):
    """Turn one tabular sample into a square, single-channel convolution kernel.

    The sample is mean-centred, zero-padded up to ``kernel_size ** 2`` values
    and reshaped row by row into a square.

    Args:
        data: 1-D array-like with the numeric feature values of a single row.
        kernel_size: Side length of the square kernel. The default of 31
            holds the 875 features used in this notebook
            (31 * 31 = 961 = 875 values + 86 padding zeros).

    Returns:
        A float64 ``torch`` tensor of shape ``(1, kernel_size, kernel_size)``.

    Raises:
        ValueError: If ``data`` holds more than ``kernel_size ** 2`` values.
    """
    values = np.asarray(data, dtype=np.float64).ravel()
    capacity = kernel_size * kernel_size
    if values.size > capacity:
        raise ValueError(
            f"Cannot fit {values.size} values into a "
            f"{kernel_size}x{kernel_size} kernel ({capacity} cells)"
        )

    # Centre the sample, then fill the remaining cells with zeros so the
    # padding is neutral with respect to the centred values.
    demeaned = values - np.mean(values)
    padded = np.concatenate((demeaned, np.zeros(capacity - values.size)))

    # Leading axis of size 1 is the (single) channel dimension.
    kernel = torch.from_numpy(padded.reshape(kernel_size, kernel_size))
    return kernel.unsqueeze(0)

### Dataset

In [None]:
class MoAImageDataset(Dataset):
    """Dataset that serves each tabular row as a convolution kernel.

    Args:
        features: 2-D array with one sample per row.
        targets: Optional 2-D array of labels aligned row-wise with
            ``features``. When omitted, items carry no ``"y"`` entry, so the
            dataset can also be used for unlabeled data.
        transforms: Stored on the instance but not applied to the samples.
    """

    def __init__(self, features, targets=None, transforms=None):
        self.features = features
        self.targets = targets
        self.transforms = transforms

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, index):
        """Return ``{"x": kernel}`` plus ``"y"`` when targets are available."""
        sample = {"x": getKernel(self.features[index])}
        # targets defaults to None; indexing it unconditionally would raise.
        if self.targets is not None:
            sample["y"] = torch.tensor(self.targets[index, :], dtype=torch.float)
        return sample

In [None]:
# Single-channel normalisation statistics. They are the first entries of the
# three-channel values kept here for reference:
#   mean = (0.485, 0.456, 0.406)
#   std = (0.229, 0.224, 0.225)
# Both are plain floats; parentheses without a trailing comma do not make a tuple.
mean = 0.485
std = 0.229

def get_train_transforms():
    """Build the training pipeline: normalise, then convert to a tensor.

    No random augmentation is active; horizontal/vertical flips and Gaussian
    blur were previously listed here but are disabled.
    """
    steps = [
        A.Normalize(mean, std, max_pixel_value=1, always_apply=True),
        ToTensorV2(),
    ]
    return A.Compose(steps, p=1.0)

def get_valid_transforms():
    """Build the validation pipeline: normalise, then convert to a tensor.

    A resize to 224x224 was previously listed here but is disabled.
    """
    steps = [
        A.Normalize(mean, std, max_pixel_value=1, always_apply=True),
        ToTensorV2(),
    ]
    return A.Compose(steps, p=1.0)

def get_tta_transforms():
    """Build the test-time-augmentation pipeline.

    Applies a horizontal and a vertical flip, each with probability 0.5,
    followed by normalisation and tensor conversion. A resize to 224x224 was
    previously listed here but is disabled.
    """
    random_flips = [
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
    ]
    finalise = [
        A.Normalize(mean, std, max_pixel_value=1, always_apply=True),
        ToTensorV2(),
    ]
    return A.Compose(random_flips + finalise, p=1.0)

### Data module

In [None]:
class MoADataModule(pl.LightningDataModule):
    """Lightning data module that serves one train/validation fold split.

    Reads the module-level ``train`` and ``train_targets_scored`` frames, both
    of which must carry a ``kfold`` column.

    Args:
        batch_size: Batch size used by both dataloaders.
        fold: Fold number held out for validation; all other rows are used
            for training.
    """

    def __init__(self, batch_size=64, fold=0):
        super().__init__()
        self.batch_size = batch_size
        self.fold = fold

    def setup(self, stage=None):
        """Create the train/validation datasets for ``self.fold``."""
        # In multi-GPU training, this method is run on each GPU.
        # So ideal for each training/valid split

        # Use the fold this module was constructed with rather than whatever
        # global variable named `fold` happens to exist at call time.
        fold = self.fold

        X_train = train[train['kfold'] != fold]
        y_train = train_targets_scored[train_targets_scored['kfold'] != fold]
        X_val = train[train['kfold'] == fold]
        y_val = train_targets_scored[train_targets_scored['kfold'] == fold]

        # The fold marker is bookkeeping only; drop it by name so the split
        # does not depend on its column position.
        self.train_dataset = MoAImageDataset(
            X_train.drop(columns=["kfold"]).values,
            y_train.drop(columns=["kfold"]).values,
            transforms=get_train_transforms(),
        )
        self.valid_dataset = MoAImageDataset(
            X_val.drop(columns=["kfold"]).values,
            y_val.drop(columns=["kfold"]).values,
            transforms=get_valid_transforms(),
        )

    def train_dataloader(self):
        return DataLoader(self.train_dataset, self.batch_size, num_workers=4, shuffle=True, pin_memory=True)

    def val_dataloader(self):
        return DataLoader(self.valid_dataset, self.batch_size, num_workers=4, shuffle=False, pin_memory=True)

### CNN Model (transfer learning)

This is the place we are creating the images required for learning. (i.e. converting tabular data to images)

In [None]:
# https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html
class Model(nn.Module):
    """EfficientNet-B0 classifier fed with images synthesised from tabular rows.

    A fixed base image is convolved with each sample's kernel inside
    ``forward``; the resulting single-channel image is resized, normalised and
    passed to the EfficientNet backbone.

    Args:
        num_features: Number of tabular input features. Currently unused; kept
            so existing ``Model(num_features, num_targets)`` calls still work.
        num_targets: Number of output logits produced by the classifier head.
    """

    def __init__(self, num_features, num_targets):
        super().__init__()

        # Get the base image ready for convolution
        base_image = plt.imread('../input/global-wheat-detection-512x512/train/026b6f389.jpg')

        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
        # Scale to [0, 1] assuming 8-bit pixel values, then move channels first.
        normalised_image = self.normalize(torch.from_numpy(base_image/255).permute(2, 0, 1))
        # NOTE(review): this is a plain attribute, not a registered buffer, so
        # it is placed on its device here and will not follow module.to(...).
        self.batched_image = normalised_image.unsqueeze(0).to('cuda' if torch.cuda.is_available() else 'cpu')
        self.grey_normalize = transforms.Normalize([0.485], [0.225])

        # Size the classifier head from the argument instead of a literal 206.
        self.model = EfficientNet.from_pretrained('efficientnet-b0', in_channels=1, num_classes=num_targets)
        # Freeze BN layers
        for name, parameters in self.model.named_parameters():
            if '_bn' in name:
                parameters.requires_grad = False

    def forward(self, x):
        """Synthesise one image per kernel in ``x`` and classify it.

        Args:
            x: Batch of kernels; presumably ``(batch, 1, k, k)`` as produced by
                the dataset -- TODO confirm.

        Returns:
            The backbone output for the batch.
        """
        # Get the image ready: every sample's kernel (repeated over the three
        # colour channels) acts as one output filter over the single base
        # image, so the sample axis comes out as dim 1 and is swapped to dim 0.
        target_image = F.conv2d(self.batched_image, x.repeat(1,3,1,1))
        target_image = target_image.permute(1,0,2,3).float()

        # Resize to 224
        target_image = F.interpolate(target_image, 224)
        # Normalise with the channel axis removed, then restore it.
        target_image = self.grey_normalize(target_image.squeeze(1))
        target_image = target_image.unsqueeze(1)

        # Actual forward
        x = self.model(target_image)
        return x

### Pytorch Lightning Model

In [None]:
class PLitMoAModule(pl.LightningModule):
    """Lightning wrapper that trains ``model`` with a BCE-with-logits loss.

    Args:
        hparams: Mapping of hyper-parameters; ``hparams["lr"]`` is the Adam
            learning rate.
        model: The network whose parameters are optimised.
    """

    def __init__(self, hparams, model):
        super().__init__()
        self.hparams = hparams
        self.model = model
        self.criterion = nn.BCEWithLogitsLoss()

    def forward(self, x):
        return self.model(x)

    def configure_optimizers(self):
        """Adam plus a plateau scheduler driven by ``val_loss`` once per epoch."""
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.hparams["lr"])
        plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.5,
            patience=2,
            threshold=0.0003,
            verbose=True,
        )
        scheduler = {
            "scheduler": plateau,
            "interval": "epoch",
            "monitor": "val_loss",
        }
        return [optimizer], [scheduler]

    def _batch_loss(self, batch):
        """Run the model on ``batch['x']`` and score it against ``batch['y']``."""
        predictions = self(batch['x'])
        return self.criterion(predictions, batch['y'])

    @staticmethod
    def _epoch_summary(outputs, metric_name):
        """Average the per-step ``"loss"`` values into a log/progress-bar dict."""
        mean_loss = torch.stack([step["loss"] for step in outputs]).mean()
        logs = {metric_name: mean_loss}
        return {"log": logs, "progress_bar": logs}

    def training_step(self, batch, batch_index):
        loss = self._batch_loss(batch)
        logs = {"train_loss": loss}
        return {"loss": loss, "log": logs, "progress_bar": logs}

    def training_epoch_end(self, outputs):
        return self._epoch_summary(outputs, "train_loss")

    def validation_step(self, batch, batch_index):
        loss = self._batch_loss(batch)
        logs = {"val_loss": loss}
        return {"loss": loss, "log": logs, "progress_bar": logs}

    def validation_epoch_end(self, outputs):
        return self._epoch_summary(outputs, "val_loss")

### Fold training

In [None]:
LR = 0.001
for fold in range(5):

    # Keep only the best checkpoint (lowest val_loss) of this fold.
    # NOTE(review): the file name pattern does not include the fold number, so
    # every fold writes into the same './models/model_...' name space --
    # confirm that checkpoints of different folds cannot clash.
    checkpoint_callback = ModelCheckpoint(
        filepath='./models/model_{epoch:02d}',
        monitor='val_loss',
        mode='min',
        save_top_k=1,
        save_last=False,
        save_weights_only=False,
        period=1,
        prefix='',
        verbose=False,
    )

    # Stop when val_loss has not improved by at least 0.0001 for 5 epochs.
    early_stop_callback = EarlyStopping(
        monitor='val_loss',
        mode='min',
        min_delta=0.0001,
        patience=5,
        verbose=True,
    )

    # Use every visible GPU when CUDA is available, otherwise run on CPU.
    trainer = pl.Trainer(
        gpus=-1 if torch.cuda.is_available() else None,
        max_epochs=15,
        checkpoint_callback=checkpoint_callback,
        callbacks=[early_stop_callback],
    )
    dm = MoADataModule(fold=fold, batch_size=128)

    # Input Features, Output Targets
    net = Model(875, 206)
    pylitModel = PLitMoAModule(hparams={"lr": LR}, model=net)
    trainer.fit(pylitModel, dm)

    print(checkpoint_callback.best_model_path)

### Prediction

### Submission

#### Yet to try

- Do not freeze BatchNorm
- Try different base images
- Check if the base image is within 0 to 1
- Try other optimizers like AdamW
- Try creating base images in a single go
- Try lower/higher batch size
- LR scheduling seems to have some good effect. Instead of the default 0.1, try 0.5, 0.3, etc.