In [None]:
# Debug switch. NOTE(review): not referenced by any other cell visible in this notebook.
DEBUG = False

<a class="anchor" id="0"></a>
# [Mechanisms of Action (MoA) Prediction](https://www.kaggle.com/c/lish-moa)

Forked from [MoA: Pytorch-RankGauss-PCA-NN upgrade & 3D visual](https://www.kaggle.com/vbmokin/moa-pytorch-rankgauss-pca-nn-upgrade-3d-visual) (and butchered beyond all recognition, probably).

## 1. Import libraries<a class="anchor" id="1"></a>

In [None]:
import sys
sys.path.append('../input/iterativestratification')

import numpy as np
import random
import pandas as pd
import os
import copy
import gc

import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import scipy.stats as stats
from scipy.stats import kurtosis

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.modules.loss import _WeightedLoss

from tqdm import tqdm, trange
from pprint import pprint
import warnings
#warnings.filterwarnings('ignore')

!cp -r ../input/pytorchtabnet/tabnet/* ./
from pytorch_tabnet.tab_model import TabNetRegressor

os.listdir('../input/lish-moa')

#pd.set_option('max_columns', 2000)

In [None]:
from sklearn.metrics import log_loss as _log_loss

def log_loss(y_true, y_pred):
    """Mean of the per-target binary log losses.

    Parameters
    ----------
    y_true : 2-D array indexed as [sample, target] holding the ground-truth labels.
    y_pred : 2-D array of predicted probabilities, indexed the same way as `y_true`.

    Returns
    -------
    float
        Sum of sklearn's log loss over every target column, divided by the
        number of columns in `y_true`.
    """
    # The column count must come from the argument, not from a notebook-global `y`.
    n_targets = y_true.shape[1]
    # NOTE(review): `eps` is deprecated/removed in recent scikit-learn releases;
    # kept because the installed version cannot be determined from this notebook.
    losses = [
        _log_loss(y_true[:, col], y_pred[:, col], eps=1e-15)
        for col in range(n_targets)
    ]

    return sum(losses) / n_targets

## 2. Global parameters <a class="anchor" id="2"></a>

In [None]:
# NOTE(review): n_comp is not read by the visible PCA cell, which hard-codes its component count.
n_comp = 80
# Dropout probability used by every Chunk layer.
Dropout_Model = 0.25
# NOTE(review): the QT_* bounds are not referenced by any visible cell.
# A stray trailing comma previously turned the minimum into the tuple (10,).
QT_n_quantile_min = 10
QT_n_quantile_max = 200

# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')

## 3. Download data<a class="anchor" id="3"></a>

In [None]:
# Training inputs plus the two target tables read from the competition directory.
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')

# Test inputs and the submission template that is filled in at the end of the notebook.
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
# Features and scored targets must list the same sig_id values in the same order.
assert train_features['sig_id'].eq(train_targets_scored['sig_id']).all()

# Strip the identifier column from every frame, in place.
for frame in (train_features, train_targets_scored, train_targets_nonscored, test_features):
    frame.drop('sig_id', axis=1, inplace=True)

In [None]:
# Boolean row masks: treated samples in train, control-vehicle samples in test.
train_cp = train_features['cp_type'] == 'trt_cp'
test_ctl = test_features['cp_type'] == 'ctl_vehicle'

# All-zero target rows for the control-vehicle test samples (first column = sig_id is excluded).
# Take an explicit copy and assign through .loc: writing through `.values` is not
# guaranteed to reach the frame (it may be a copy, and is read-only under Copy-on-Write).
ctl_df = sample_submission.loc[test_ctl, sample_submission.columns[1:]].copy()
ctl_df.loc[:, :] = 0

#train_features = train_features[train_cp].drop('cp_type', axis=1)
#train_targets_scored = train_targets_scored[train_cp]
#test_features = test_features[~test_ctl].drop('cp_type', axis=1)

## 4. FE & Data Preprocessing <a class="anchor" id="4"></a>

In [None]:
# Column groups, selected by name prefix: 'g-' columns, 'c-' columns, and the categorical inputs.
_feature_names = train_features.columns
GENES = _feature_names[_feature_names.str.startswith('g-')].tolist()
CELLS = _feature_names[_feature_names.str.startswith('c-')].tolist()
CATEGORICAL = ['cp_type', 'cp_time', 'cp_dose']

In [None]:
def encode_categorical(data):
    """One-hot encode the CATEGORICAL columns of `data`.

    Parameters
    ----------
    data : pandas.DataFrame containing every column named in CATEGORICAL.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each categorical column is replaced by its
        indicator columns; all other columns are passed through unchanged.
    """
    # Force float indicators: newer pandas defaults to bool columns, which makes
    # `.values` an object array that cannot be converted to a float tensor.
    return pd.get_dummies(data, columns=CATEGORICAL, dtype=float)

# NOTE(review): the encoded frames are stored as ad-hoc attributes on the raw DataFrames
# (not as columns); pandas may warn about this and the attribute is lost if the frame is copied.
# NOTE(review): train and test are encoded independently, so their columns only line up
# if both contain the same category values — confirm.
train_features.encoded = encode_categorical(train_features)
test_features.encoded  = encode_categorical(test_features)

### 4.2 Seed<a class="anchor" id="4.2"></a>

In [None]:
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and current CUDA device) with `seed`.

    Also exports the seed as PYTHONHASHSEED and switches cuDNN to
    deterministic mode.

    NOTE(review): no visible cell calls this function.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

### 4.3 PCA features<a class="anchor" id="4.3"></a>

In [None]:
# PCA

# Reports the column count of the raw (pre-encoding) training frame.
print(f"Feature count: {len(train_features.columns)}")
# Stack the encoded train rows on top of the encoded test rows; the transforms below are fitted on both.
all_features = pd.concat([train_features.encoded, test_features.encoded])

def reduce_dimensions(df, n_components, whiten=False): # randomized
    '''Handles numerical columns. Thin wrapper for sklearn.decomposition.PCA().

    Parameters
    ----------
    df : array-like of numeric features to fit and project.
    n_components : forwarded to PCA(n_components=...).
    whiten : forwarded to PCA(whiten=...); previously accepted but ignored.

    Returns
    -------
    (reduced, explained_variance) : the projected data and PCA's
        `explained_variance_` array, which is also printed.
    '''
    pca = PCA(n_components=n_components, whiten=whiten)

    reduced = pca.fit_transform(df)
    print("\nPrincipal variances\n-------------------\n" + str(pca.explained_variance_))
    return reduced, pca.explained_variance_

# NOTE(review): the component count is hard-coded to 130 here; the global `n_comp` is not used.
temp_, variance = reduce_dimensions(all_features, n_components=130)

# all_features was stacked train-first, so train rows are sliced from the front and test rows from the back.
train_features.reduced = temp_[:train_features.shape[0]]
test_features.reduced = temp_[-test_features.shape[0]:]

In [None]:
# Remove low-variance columns from the stacked train+test matrix (threshold 0.9).
vt = VarianceThreshold(.9)
# NOTE(review): the filtered array is attached as an ad-hoc attribute on the DataFrame, not as a column.
all_features.threshold = vt.fit_transform(all_features.values)
# all_features was stacked train-first: leading rows are train, trailing rows are test.
train_features.threshold = all_features.threshold[:train_features.shape[0]]
test_features.threshold = all_features.threshold[-test_features.shape[0]:]

In [None]:
# "Enhanced" matrix = variance-filtered columns followed by the PCA components (column-wise concat).
train_features.enhanced = np.concatenate((train_features.threshold, train_features.reduced), axis=1)
test_features.enhanced = np.concatenate((test_features.threshold, test_features.reduced), axis=1)

### 4.5 CV folds<a class="anchor" id="4.5"></a>

[Go to Top](#0)

In [None]:
def get_folds(X, y, n_splits=7):
    """Return the (train_idx, valid_idx) split iterator of a MultilabelStratifiedKFold.

    The splitter is built with only `n_splits` set; every other option keeps
    the library default.
    """
    splitter = MultilabelStratifiedKFold(n_splits=n_splits)
    fold_iterator = splitter.split(X, y)
    return fold_iterator

### 4.6 Dataset Classes<a class="anchor" id="4.6"></a>

[Go to Top](#0)

In [None]:
class MoADataset:
    """Map-style dataset of (features, targets) pairs.

    Both arrays are converted once, up front, to float tensors living on DEVICE.
    Indexing returns a dict with keys 'x' (features) and 'y' (targets).
    """

    def __init__(self, features, targets):
        tensor_kwargs = dict(dtype=torch.float, device=DEVICE)
        self.features = torch.tensor(features, **tensor_kwargs)
        self.targets = torch.tensor(targets, **tensor_kwargs)

    def __len__(self):
        # Number of samples = size of the leading feature dimension.
        n_samples = self.features.shape[0]
        return n_samples

    def __getitem__(self, idx):
        return {'x': self.features[idx], 'y': self.targets[idx]}
    
class TestDataset:
    """Map-style dataset of features only (no targets).

    The array is converted once to a float tensor on DEVICE; indexing returns
    a dict with the single key 'x'.
    """

    def __init__(self, features):
        self.features = torch.tensor(features, device=DEVICE, dtype=torch.float)

    def __len__(self):
        # Number of samples = size of the leading feature dimension.
        n_samples = self.features.shape[0]
        return n_samples

    def __getitem__(self, idx):
        return {'x': self.features[idx]}

### 4.7 Smoothing<a class="anchor" id="4.7"></a>

[Go to Top](#0)

In [None]:
class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are pulled towards 0.5 before the loss is computed:
    ``t * (1 - smoothing) + 0.5 * smoothing``.

    Parameters
    ----------
    weight : optional rescaling weight forwarded to
        ``F.binary_cross_entropy_with_logits``.
    reduction : 'mean' (default), 'sum', or anything else for the
        unreduced element-wise loss.
    smoothing : float in [0, 1); 0 disables smoothing.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        # The parent constructor already stores `weight` and `reduction`.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return the smoothed targets. `n_labels` is accepted but not used."""
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Ask for the element-wise loss; otherwise the functional call averages
        # first and the 'sum' / unreduced modes below silently return the mean.
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight,
                                                  reduction='none')

        if self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()

        return loss

## 5. Modelling<a class="anchor" id="5"></a>

[Go to Top](#0)

In [None]:
# HyperParameters

# Defaults picked up by the training helpers below (bound as default argument values).
EPOCHS = 200
BATCH_SIZE = 1024
LEARNING_RATE = .1e-2  # i.e. 1e-3; initial lr handed to Adam
WEIGHT_DECAY = 1e-5
NFOLDS = 7  # NOTE(review): not referenced by visible code; get_folds() has its own default of 7
EARLY_STOPPING_STEPS = 100
EARLY_STOP = True
Dropout_model = 0.7  # NOTE(review): differs only in case from `Dropout_Model` (0.25), which is the one Chunk reads

# NOTE(review): the visible training cells pass these sizes explicitly rather than reading these globals.
num_features = train_features.reduced.shape[1]
num_targets = len(train_targets_scored.columns)
hidden_size=200

In [None]:
class Chunk(nn.Module):
    """BatchNorm -> Dropout -> Linear building block.

    `bn=False` / `dropout=False` replace the respective stage with an identity.
    The dropout probability is taken from the global `Dropout_Model`.
    """

    def __init__(self, in_size, out_size, bn=True, dropout=True):
        super().__init__()
        self.in_size, self.out_size = in_size, out_size
        # Sub-modules are registered in application order: bn, dropout, dense.
        self.bn = nn.BatchNorm1d(in_size) if bn else nn.Identity()
        self.dropout = nn.Dropout(Dropout_Model) if dropout else nn.Identity()
        self.dense = nn.Linear(in_size, out_size) #nn.utils.weight_norm

    def forward(self, x):
        normalised = self.bn(x)
        return self.dense(self.dropout(normalised))

class Bottleneck(nn.Module):
    """Squeeze-and-expand block whose output is concatenated with its input.

    The input of width `size` is squeezed to `size // squeeze_factor`, expanded
    back to `size`, and concatenated with the untouched input along dim 1, so
    the output width is `size * 2`. Extra keyword arguments are accepted and
    ignored.
    """

    def __init__(self, size, squeeze_factor=4, **kwargs):
        super().__init__()
        narrow = size // squeeze_factor
        self.size = size
        self.bottleneck_size = narrow
        self.chunk1 = Chunk(size, narrow)
        self.chunk2 = Chunk(narrow, size, dropout=False)

    def forward(self, x):
        squeezed = F.leaky_relu(self.chunk1(x))
        restored = self.chunk2(squeezed)
        # Skip connection by concatenation: out_size = size*2
        return torch.cat([restored, x], dim=1)

class Model(nn.Module):
    """Three-Chunk MLP: features -> hidden -> hidden -> target logits.

    Leaky-ReLU follows the first two chunks; the last chunk's output is
    returned as raw logits.
    """

    def __init__(self, num_features, num_targets, hidden_size):
        super().__init__()
        self.chunk1 = Chunk(num_features, hidden_size)
        self.chunk2 = Chunk(hidden_size, hidden_size)
        self.chunk3 = Chunk(hidden_size, num_targets)

    def forward(self, x):
        for hidden_layer in (self.chunk1, self.chunk2):
            x = F.leaky_relu(hidden_layer(x))
        return self.chunk3(x)
    
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed one-hot distribution.

    The true class receives probability `1 - smoothing`; every other class
    receives `smoothing / (classes - 1)`. `target` holds class indices.
    """

    def __init__(self, classes, smoothing=0.0, dim=-1):
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        log_probs = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # Start from the off-class mass everywhere, then write the confidence
            # into the true-class position (class axis fixed at 1).
            off_value = self.smoothing / (self.cls - 1)
            true_dist = torch.full_like(log_probs, off_value)
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample = torch.sum(-true_dist * log_probs, dim=self.dim)
        return torch.mean(per_sample)

## TABNET

    class TabNet(torch.nn.Module):
        def __init__(self, input_dim, output_dim, n_d=8, n_a=8,
                     n_steps=3, gamma=1.3, cat_idxs=[], cat_dims=[], cat_emb_dim=1,
                     n_independent=2, n_shared=2, epsilon=1e-15,
                     virtual_batch_size=128, momentum=0.02, device_name='auto',
                     mask_type="sparsemax"):

In [None]:
from pytorch_tabnet.tab_network import TabNet as _TabNet

class TabNet(_TabNet):
    """pytorch_tabnet network whose forward returns only element 0 of the upstream result.

    NOTE(review): presumably element 0 is the prediction and the rest is an
    auxiliary loss term that is discarded here — confirm against pytorch_tabnet.
    """

    def forward(self, x):
        upstream_result = super().forward(x)
        return upstream_result[0]

In [None]:
# Column count of the one-hot-encoded training frame and of the scored-target table.
# Both are captured as default argument values by get_tabnet() at definition time.
input_dim = train_features.encoded.shape[1]
output_dim = train_targets_scored.values.shape[1]

def get_tabnet(input_dim=input_dim, output_dim=output_dim):
    """Create a TabNet on DEVICE together with its Adam optimizer and BCE-with-logits loss.

    Returns
    -------
    (model, optimizer, loss)
    """
    architecture = dict(n_d=24, n_a=24, n_steps=1, n_independent=2, n_shared=0)
    model = TabNet(input_dim, output_dim, **architecture)
    model.to(DEVICE)

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    return model, optimizer, nn.BCEWithLogitsLoss()

In [None]:
def get_scheduler(optimizer, loader, epochs=EPOCHS):
    """Build a OneCycleLR schedule spanning `epochs` passes over `loader`.

    The peak learning rate is read from ``optimizer.defaults['lr']``, so callers
    can change it by overwriting that entry before calling this function.
    """
    schedule_kwargs = {
        'max_lr': optimizer.defaults['lr'],
        'epochs': epochs,
        'steps_per_epoch': len(loader),
        'pct_start': 0.1,
        'div_factor': 40,
    }
    return optim.lr_scheduler.OneCycleLR(optimizer=optimizer, **schedule_kwargs)

In [None]:
def get_model(num_features, num_targets, hidden_size):
    """Create the MLP on DEVICE together with its Adam optimizer and loss.

    Parameters
    ----------
    num_features : width of the input feature vector.
    num_targets : number of output logits.
    hidden_size : width of the two hidden layers.

    Returns
    -------
    (model, optimizer, loss)
    """
    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    ).to(DEVICE)

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    # Plain BCE-with-logits is the loss actually returned and trained with.
    # (A label-smoothed SmoothBCEwLogits instance used to be built here and discarded.)
    loss = nn.BCEWithLogitsLoss()

    return model, optimizer, loss

def get_dataloader(*args, batch_size=BATCH_SIZE, shuffle=False):
    """Wrap arrays in a DataLoader.

    Parameters
    ----------
    *args : either ``(features,)`` -> TestDataset, or ``(features, targets)`` -> MoADataset.
    batch_size : forwarded to DataLoader.
    shuffle : forwarded to DataLoader.

    Raises
    ------
    TypeError
        If the number of positional arrays is neither 1 nor 2.
    """
    if len(args) == 1:
        dataset = TestDataset(*args)
    elif len(args) == 2:
        dataset = MoADataset(*args)
    else:
        # Previously fell through and failed later with an UnboundLocalError.
        raise TypeError(
            f"get_dataloader() expects (features,) or (features, targets); got {len(args)} positional arguments"
        )
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return loader

In [None]:
def train_epoch(model, optimizer, scheduler, loss_fn, dataloader, device=DEVICE, output=None):
    """Run one optimisation pass over `dataloader`.

    Parameters
    ----------
    model : module to train; switched to train mode.
    optimizer : stepped once per batch.
    scheduler : stepped once per batch after the optimizer; may be None to skip scheduling.
    loss_fn : callable ``loss_fn(outputs, targets)``.
    dataloader : yields dicts with keys 'x' and 'y'.
    device : batches are moved here before the forward pass (a no-op when they already live there).
    output : if not None, the model result is indexed with it before the loss is computed.

    Returns
    -------
    float
        Mean of the per-batch losses.
    """
    model.train()
    final_loss = 0

    for data in dataloader:
        optimizer.zero_grad()
        # Mirror valid_epoch: honour `device` instead of silently ignoring it.
        inputs, targets = data['x'].to(device), data['y'].to(device)
        outputs = model(inputs)
        if output is not None:
            outputs = outputs[output]
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        final_loss += loss.item()

    final_loss /= len(dataloader)

    return final_loss


@torch.no_grad()
def valid_epoch(model, loss_fn, dataloader, device=DEVICE, output=None):
    """Evaluate `model` over `dataloader` without gradients.

    Returns
    -------
    (mean_loss, predictions)
        The mean of the per-batch losses and a NumPy array holding the
        sigmoid of the model outputs for every batch, concatenated in order.
    """
    model.eval()
    running_loss = 0
    batch_preds = []

    for batch in dataloader:
        features = batch['x'].to(device)
        labels = batch['y'].to(device)

        logits = model(features)
        if output is not None:
            logits = logits[output]

        running_loss += loss_fn(logits, labels).item()
        batch_preds.append(logits.sigmoid().detach().cpu().numpy())

    return running_loss / len(dataloader), np.concatenate(batch_preds)

In [None]:
def train_session(model, train_data, valid_data, epochs=EPOCHS, scheduler=None, optimizer=None,
                  loss=None, session_name=None, patience=EARLY_STOPPING_STEPS, output=None):
    """Train `model` for up to `epochs` epochs, checkpointing the best validation loss.

    After every epoch a one-line summary is printed. Whenever the validation
    loss improves, the model state is saved to ``f"{session_name}.pt"``. When
    the function returns, that path has been appended to ``model.saved_states``
    (the list is created if the model does not have one yet).

    Parameters
    ----------
    model, optimizer, loss : the network, its optimizer and the loss callable.
    train_data, valid_data : dataloaders passed to train_epoch / valid_epoch.
    epochs : maximum number of epochs.
    scheduler : learning-rate scheduler; its ``get_last_lr()`` is read for logging,
        so in practice a scheduler must be supplied.
    session_name : label used in the log line and as the checkpoint file stem.
    patience : early-stopping threshold for the non-improvement counter
        (only active when the global EARLY_STOP is true).
    output : forwarded to train_epoch / valid_epoch.
    """
    early_step = 0
    best_loss = np.inf
    session_stats = {'Session': session_name}
    checkpoint_path = f"{session_name}.pt"
    checkpoint_written = False

    if not hasattr(model, 'saved_states'):
        model.saved_states = []

    for epoch in range(epochs):

        train_loss = train_epoch(model, optimizer, scheduler, loss, train_data, output=output)
        valid_loss, _ = valid_epoch(model, loss, valid_data, output=output)

        session_stats.update(
                                Epoch=epoch+1,
                                train_loss=f"{train_loss:.5f}",
                                valid_loss=f"{valid_loss:.5f}",
                                lr=f"{scheduler.get_last_lr()[0]:.2}"
                            )

        print(", ".join(f"{key}: {value}" for key, value in session_stats.items()))

        if valid_loss < best_loss:
            # NOTE(review): the counter is decremented rather than reset to 0, so
            # a run of improvements banks extra patience — confirm this is intended.
            early_step -= 1
            best_loss = valid_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_written = True

        elif EARLY_STOP:

            early_step += 1
            if early_step >= patience:
                break

    if not checkpoint_written:
        # No epoch improved on the initial best (e.g. NaN losses or epochs == 0).
        # Save the current state so the recorded path never points at a missing
        # file or at a stale checkpoint left over from an earlier run.
        torch.save(model.state_dict(), checkpoint_path)

    model.saved_states.append(checkpoint_path)
    
# Averaging on folds

def train_kfold(model, optimizer, loss,
                x, y,
                initialization,
                epochs=EPOCHS,
                prefix="",
                batch_size=BATCH_SIZE,
                patience=EARLY_STOPPING_STEPS):
    """Cross-validated training that returns out-of-fold predictions.

    For every fold produced by ``get_folds(x, y)`` the model is reset to
    `initialization`, trained with a fresh scheduler via train_session, reloaded
    from the checkpoint that session recorded, and used to predict the held-out rows.

    Parameters
    ----------
    model, optimizer, loss : network, optimizer and loss callable, reused across folds.
    x, y : feature and target arrays, indexable by integer index arrays.
    initialization : state dict loaded into `model` at the start of each fold.
    epochs, batch_size, patience : forwarded to the scheduler, dataloaders and train_session.
    prefix : optional label prepended to each fold's session name.

    Returns
    -------
    numpy.ndarray
        Float array shaped like `y`, with each row replaced by the sigmoid
        prediction made while that row was in the validation fold.
    """
    # `np.float` was removed from NumPy; the builtin float is the same dtype.
    y_pred = np.array(y, dtype=float)

    for n, (train_idx, valid_idx) in enumerate(get_folds(x, y)):

        session_name = f"{prefix} Fold {n+1}" if prefix else f"Fold {n+1}"

        trainloader = get_dataloader(x[train_idx], y[train_idx], shuffle=True, batch_size=batch_size)
        validloader = get_dataloader(x[valid_idx], y[valid_idx], shuffle=False, batch_size=batch_size)

        # Every fold starts from the same weights ...
        model.load_state_dict(initialization)
        # ... and from a clean optimizer: drop the per-parameter state accumulated
        # on the previous fold so it cannot leak into this one.
        optimizer.state.clear()

        scheduler = get_scheduler(optimizer, trainloader, epochs)

        train_session(model, trainloader, validloader,
                      optimizer=optimizer, loss=loss,
                      session_name=session_name,
                      epochs=epochs,
                      scheduler=scheduler,
                      patience=patience)

        # Predict the held-out rows with the checkpoint recorded for this fold.
        model.load_state_dict(torch.load(model.saved_states[-1]))
        model.eval()
        with torch.no_grad():
            fold_logits = model(validloader.dataset.features)
        y_pred[valid_idx] = fold_logits.sigmoid().cpu().numpy()

    return y_pred

# Training

    def train_session(model, train_data, valid_data, epochs=EPOCHS, scheduler=None, optimizer=None,
                      loss=None, session_name=None, patience=EARLY_STOPPING_STEPS, output=None):

    def train_kfold(model, optimizer, loss, x, y, initialization, epochs=EPOCHS, prefix="",
                    batch_size=BATCH_SIZE, patience=EARLY_STOPPING_STEPS):


### Tabnet pretraining

In [None]:
# Optional pretraining of a TabNet on the non-scored targets; disabled by default.
RUN_PRETRAINING=False

if RUN_PRETRAINING:
    x, y = train_features.encoded.values, train_targets_nonscored.values
    
    trainloader = get_dataloader(x, y, shuffle=True, batch_size=2048)
    
    pretrain_tabnet, optimizer, loss = get_tabnet(input_dim=x.shape[1], output_dim=y.shape[1])
    # get_scheduler() reads optimizer.defaults['lr'] as the OneCycleLR peak, so this sets max_lr to .01.
    optimizer.defaults['lr'] = .01
    scheduler = get_scheduler(optimizer=optimizer, loader=trainloader, epochs=40)
    
    # The same loader is passed as both training and validation data.
    train_session(pretrain_tabnet, trainloader, trainloader,
                  optimizer=optimizer, 
                  loss=loss, 
                  scheduler=scheduler,
                  session_name="pretrain",
                  epochs=40,
                  patience=1)
    
    # Re-save the checkpoint without the final mapping weight.
    # NOTE(review): presumably because its shape depends on the number of targets — confirm.
    pretrain = torch.load("pretrain.pt")
    pretrain.pop("tabnet.final_mapping.weight")
    torch.save(pretrain, "pretrain.pt")

In [None]:
# Switches for starting from pretrained weights and for training only a subset of layers.
LOAD_PRETRAIN = False
FREEZE_LAYERS = False

tabnet, optimizer, loss = get_tabnet(input_dim=train_features.encoded.shape[1])

# Round-trip the fresh weights through disk to obtain an independent copy;
# train_kfold() reloads this `initialization` at the start of every fold.
torch.save(tabnet.state_dict(), "init.pt")
initialization = torch.load("init.pt")

if LOAD_PRETRAIN:
    # Overlay the pretrained tensors onto the fresh initialisation, key by key.
    pretrain = torch.load("pretrain.pt")
    #pretrain = torch.load("../input/tabnet-pretrain/pretrain.pt")
    
    for key in pretrain.keys():
        initialization[key] = pretrain[key]

if FREEZE_LAYERS:
    # Rebuild the optimizer over the listed sub-modules only; parameters outside them are never stepped.
    modules = dict(tabnet.tabnet.named_modules())
    unfreeze_modules = ['final_mapping', 'feat_transformers.1', 'att_transformers.1']
    unfreeze_params = [param for name in unfreeze_modules for param in modules[name].parameters()]
    
    optimizer = optim.Adam(unfreeze_params, lr=2e-02, weight_decay=WEIGHT_DECAY)

# get_scheduler() reads optimizer.defaults['lr'] as the OneCycleLR peak, so this sets max_lr to 2e-02.
optimizer.defaults['lr'] = 2e-02

x, y = train_features.encoded.values, train_targets_scored.values

# Out-of-fold TabNet predictions for every training row.
y_cv = train_kfold(tabnet, optimizer, loss, x, y, initialization=initialization, 
                   prefix="TabNet", epochs=200, patience=1, batch_size=2048)

In [None]:
# Score the out-of-fold predictions held in y_cv against the scored targets.
y_true = train_targets_scored.values

print(f"CV Score: {log_loss(y_true,y_cv)}")

In [None]:
# Train the MLP on the "enhanced" features (variance-filtered columns + PCA components).
TRAIN_MLP = True

if TRAIN_MLP:
    mlp, optimizer, loss = get_model(num_features=train_features.enhanced.shape[1], 
                                     num_targets=train_targets_scored.shape[1], hidden_size=200)
    mlp.saved_states = []
    # Overwrites the "init.pt" file written by the TabNet cell.
    torch.save(mlp.state_dict(), "init.pt")
    # get_scheduler() reads optimizer.defaults['lr'] as the OneCycleLR peak, so this sets max_lr to 0.04.
    optimizer.defaults['lr'] = 0.04
    
    initialization = torch.load("init.pt")
    
    x, y = train_features.enhanced, train_targets_scored.values
    # Replaces the TabNet out-of-fold predictions previously stored in y_cv.
    y_cv = train_kfold(mlp, optimizer, loss, x, y, initialization=initialization, 
                       prefix="MLP", epochs=100, patience=1)

In [None]:
# Score whatever out-of-fold predictions y_cv currently holds
# (the MLP's when TRAIN_MLP ran, otherwise the earlier TabNet ones).
y_true = train_targets_scored.values

print(log_loss(y_true,y_cv))

## 6. Prediction & Submission <a class="anchor" id="6"></a>

[Go to Top](#0)

In [None]:
def predict(x, model):
    """Average the sigmoid predictions of every checkpoint recorded on `model`.

    Parameters
    ----------
    x : input tensor fed to the model in a single forward pass per checkpoint.
    model : module carrying a ``saved_states`` list of checkpoint paths; each
        one is loaded into `model` in turn (the last one stays loaded).

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the per-checkpoint probabilities.

    Raises
    ------
    ValueError
        If ``model.saved_states`` is empty.
    """
    if not model.saved_states:
        raise ValueError("predict() needs at least one checkpoint in model.saved_states")

    # Inference must not depend on whatever mode the model was left in:
    # disable dropout / batch-norm updates and skip autograd bookkeeping.
    model.eval()
    fold_preds = []
    with torch.no_grad():
        for checkpoint in model.saved_states:
            model.load_state_dict(torch.load(checkpoint))
            fold_preds.append(model(x).sigmoid().cpu().numpy())

    return sum(fold_preds) / len(fold_preds)

In [None]:
# When True, average the MLP and TabNet predictions; otherwise submit TabNet alone.
ENSEMBLE = True

target_cols = list(sample_submission.columns[1:])
test_idx = sample_submission.index  # NOTE(review): not used again in the visible notebook

# Each model was trained on a different feature representation, so each gets its own test tensor.
x = {'TabNet': torch.tensor(test_features.encoded.values, dtype=torch.float, device=DEVICE),
     'MLP': torch.tensor(test_features.enhanced, dtype=torch.float, device=DEVICE)}

if ENSEMBLE:
    # NOTE(review): `mlp` only exists if the TRAIN_MLP cell actually ran its training branch.
    models = {'MLP': mlp, 'TabNet': tabnet}
    y_preds = []
    for name in models:
        y_preds.append(predict(x[name], models[name]))
    # Unweighted mean of the per-model predictions.
    y_pred = sum(y_preds) / len(models)
else:
    y_pred = predict(x['TabNet'], tabnet)

predictions_df = pd.DataFrame(y_pred, columns=target_cols)

# Write the model predictions into the template, then overwrite the
# control-vehicle rows with the values held in ctl_df, and save.
sample_submission.update(predictions_df)
sample_submission.update(ctl_df)
sample_submission.to_csv('submission.csv', index=False)

[Go to Top](#0)