In [None]:
import sys
sys.path.append("../input/iterative-stratification/iterative-stratification-master")

import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import copy
import seaborn as sns
 
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

 
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Folder that holds the competition CSV files; the listing is shown as the cell output.
data_dir = '../input/lish-moa/'
os.listdir(data_dir)

In [None]:
# Load every competition table from `data_dir` (defined in the previous cell) instead of
# repeating a hard-coded absolute path for each file.
train_features = pd.read_csv(os.path.join(data_dir, "train_features.csv"))
train_targets_scored = pd.read_csv(os.path.join(data_dir, "train_targets_scored.csv"))
train_targets_nonscored = pd.read_csv(os.path.join(data_dir, "train_targets_nonscored.csv"))
# Keep only the non-scored targets of rows whose cp_type is not "ctl_vehicle".
# The boolean mask comes from train_features, so both tables must share the same row index.
train_targets_nonscored = train_targets_nonscored[train_features["cp_type"] != "ctl_vehicle"]
test_features = pd.read_csv(os.path.join(data_dir, "test_features.csv"))
sample_submission = pd.read_csv(os.path.join(data_dir, "sample_submission.csv"))
drug = pd.read_csv(os.path.join(data_dir, "train_drug.csv"))

# Quick sanity check of the loaded shapes.
print('train_features: {}'.format(train_features.shape))
print('train_targets_scored: {}'.format(train_targets_scored.shape))
print('train_targets_nonscored: {}'.format(train_targets_nonscored.shape))
print('train_drug: {}'.format(drug.shape))
print('test_features: {}'.format(test_features.shape))
print('sample_submission: {}'.format(sample_submission.shape))

In [None]:
# Feature-name groups, selected by column-name prefix from the training table.
GENES = train_features.columns[train_features.columns.str.startswith('g-')].tolist()
CELLS = train_features.columns[train_features.columns.str.startswith('c-')].tolist()

# Show the first few names of each group.
print('GENES: {}'.format(GENES[:10]))
print('CELLS: {}'.format(CELLS[:10]))

In [None]:
!pip install /kaggle/input/iterative-stratification/iterative-stratification-master/
!pip install --no-index --find-links /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl pytorch-tabnet
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.metrics import Metric

In [None]:
SEED = 42


def seed_everything(seed_value):
    """Seed the Python, NumPy and PyTorch random generators with ``seed_value``.

    Also exports ``PYTHONHASHSEED`` and, when CUDA is available, seeds the CUDA
    generators and sets the cuDNN flags ``deterministic=True`` / ``benchmark=False``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(seed_value=SEED)


# Encode (cp_time, cp_dose) as one numeric feature and split off the control samples
def mapping_filter(train, train_targets, test):
    """Add a ``time_dose`` code and separate control rows from treated rows.

    Side effect: a ``time_dose`` column is added in place to the ``train`` and
    ``test`` frames that were passed in.

    Returns
    -------
    (train, test, train_targets, ctl_train, ctl_test)
        ``train`` / ``test`` / ``train_targets`` hold only the rows whose
        ``cp_type`` is not ``"ctl_vehicle"``, re-indexed from 0; ``train`` and
        ``test`` no longer have the ``cp_type`` column. ``ctl_train`` /
        ``ctl_test`` are the ``"ctl_vehicle"`` rows with their original index
        and with ``cp_type`` still present.
    """
    # One numeric code per (treatment time, dose) combination.
    time_dose = {(24, "D1"): 1., (24, "D2"): 2., (48, "D1"): 3., (48, "D2"): 4., (72, "D1"): 5., (72, "D2"): 6.}
    for frame in (train, test):
        pairs = frame[["cp_time", "cp_dose"]].apply(lambda row: (row["cp_time"], row["cp_dose"]), axis=1)
        frame["time_dose"] = pairs.map(time_dose)

    # Row masks for the control samples.
    train_is_ctl = train["cp_type"] == "ctl_vehicle"
    test_is_ctl = test["cp_type"] == "ctl_vehicle"

    ctl_train = train[train_is_ctl]
    ctl_test = test[test_is_ctl]

    # Treated rows only; targets are filtered with the mask built from `train`.
    train_targets = train_targets[~train_is_ctl].reset_index(drop=True)
    train = train[~train_is_ctl].reset_index(drop=True)
    test = test[~test_is_ctl].reset_index(drop=True)

    # cp_type is constant after the filter, so it carries no information.
    train = train.drop(columns="cp_type")
    test = test.drop(columns="cp_type")
    gc.collect()
    return train, test, train_targets, ctl_train, ctl_test



# Function to extract common stat features
def fe_stats(train, test):
    """Build row-wise statistics, pairwise products and squared features.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``sig_id`` column plus gene (``g-*``) and cell (``c-*``)
        columns. The column lists are taken from ``train`` and reused for ``test``.

    Returns
    -------
    (fe_train, fe_test) : tuple of pandas.DataFrame
        One row per input row (same index), holding ``sig_id`` and, in order:
        the ``g_*`` statistics, the ``c_*`` statistics, the cell products and
        the ``*_squared`` columns.

    Notes
    -----
    * ``c_extent``, ``c_inter_q``, ``c_AV_a`` and ``c_AV_b`` are computed from the
      cell columns only. Earlier they mixed in gene-column minima/quantiles.
    * The mean absolute deviation is computed explicitly because
      ``DataFrame.mad`` was removed in pandas 2.0.
    """
    # Separate the gene-expression and cell-viability columns
    g_list = [col for col in train.columns if "g-" in col]
    c_list = [col for col in train.columns if "c-" in col]
    gsquarecols=['g-574','g-211','g-216','g-0','g-255','g-577','g-153','g-389','g-60','g-370','g-248','g-167','g-203','g-177','g-301','g-332','g-517','g-6','g-744','g-224','g-162','g-3','g-736','g-486','g-283','g-22','g-359','g-361','g-440','g-335','g-106','g-307','g-745','g-146','g-416','g-298','g-666','g-91','g-17','g-549','g-145','g-157','g-768','g-568','g-396']

    # (output column, left factor, right factor) for the cell-column products.
    c_products = [
        ('c52_c42', 'c-52', 'c-42'),
        ('c13_c73', 'c-13', 'c-73'),
        # NOTE(review): the name says c-26 but the factor is c-23; kept as-is
        # because it is unclear which of the two was intended.
        ('c26_c13', 'c-23', 'c-13'),
        ('c33_c6',  'c-33', 'c-6'),
        ('c11_c55', 'c-11', 'c-55'),
        ('c38_c63', 'c-38', 'c-63'),
        ('c38_c94', 'c-38', 'c-94'),
        ('c13_c94', 'c-13', 'c-94'),
        ('c4_c52',  'c-4',  'c-52'),
        ('c4_c42',  'c-4',  'c-42'),
        ('c13_c38', 'c-13', 'c-38'),
        ('c55_c2',  'c-55', 'c-2'),
        ('c55_c4',  'c-55', 'c-4'),
        ('c4_c13',  'c-4',  'c-13'),
        ('c82_c42', 'c-82', 'c-42'),
        ('c66_c42', 'c-66', 'c-42'),
        ('c6_c38',  'c-6',  'c-38'),
        ('c2_c13',  'c-2',  'c-13'),
        ('c62_c42', 'c-62', 'c-42'),
        ('c90_c55', 'c-90', 'c-55'),
    ]

    def _row_mad(block):
        # Mean absolute deviation around the row mean (same as the old DataFrame.mad(axis=1)).
        return block.sub(block.mean(axis=1), axis=0).abs().mean(axis=1)

    def _row_stats(block, prefix):
        # All per-row summary statistics of one feature family, keyed by output column name.
        q25 = block.quantile(0.25, axis=1)
        q75 = block.quantile(0.75, axis=1)
        inter_q = q75 - q25
        mean = block.mean(axis=1)
        return {
            f"{prefix}_mean":    mean,
            f"{prefix}_var":     block.var(axis=1),
            f"{prefix}_kurt":    block.kurtosis(axis=1),
            f"{prefix}_skew":    block.skew(axis=1),
            f"{prefix}_extent":  block.max(axis=1) - block.min(axis=1),
            f"{prefix}_mad":     _row_mad(block),
            f"{prefix}_median":  block.median(axis=1),
            f"{prefix}_inter_q": inter_q,
            # Lower / upper adjacent values (Tukey fences at 1.5 * IQR).
            f"{prefix}_AV_a":    q25 - 1.5 * inter_q,
            f"{prefix}_AV_b":    q75 + 1.5 * inter_q,
            # Relative standard deviation.
            f"{prefix}_rsd":     block.std(axis=1) / mean,
        }

    def _build(source):
        # Collect every column first and build the frame once (avoids a fragmented frame).
        columns = {"sig_id": source["sig_id"]}
        columns.update(_row_stats(source[g_list], "g"))
        columns.update(_row_stats(source[c_list], "c"))

        # Multiplication of c_cells
        for name, left, right in c_products:
            columns[name] = source[left] * source[right]

        for feature in c_list:
            columns[f'{feature}_squared'] = source[feature] ** 2

        for feature in gsquarecols:
            columns[f'{feature}_squared'] = source[feature] ** 2

        return pd.DataFrame(columns, index=source.index)

    return _build(train), _build(test)


def ctl_fe_stats(train, test, ctl_train, ctl_test):
    """Express each sample's summary statistics relative to the control samples.

    For every ``time_dose`` group of control rows a reference value is computed
    per statistic (the statistic over rows per feature, then the same statistic
    across features). Each output column ``diff_<g|c>_<stat>`` is the sample's own
    statistic minus the reference of its ``time_dose`` group.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding the ``g_*`` / ``c_*`` statistic columns named as in the
        output of ``fe_stats``. Rows must be index-aligned with the global
        ``train_features`` / ``test_features``.
    ctl_train, ctl_test : pandas.DataFrame
        Control rows with ``time_dose`` and the raw ``g-*`` / ``c-*`` columns.

    Returns
    -------
    (ctl_fe_train, ctl_fe_test) : tuple of pandas.DataFrame
        ``sig_id`` plus the ``diff_*`` columns.

    Notes
    -----
    * Reads the notebook globals ``train_features`` and ``test_features`` for the
      column lists, ``sig_id`` and ``time_dose``; they must already contain
      ``time_dose`` when this is called.
    * Fixes relative to the earlier version: the cell extent reference was
      ``max - max`` (always 0); the gene ``AV_b`` reference subtracted 1.5*IQR
      instead of adding it; ``diff_*_rsd`` subtracted the ``AV_b`` reference
      instead of the ``rsd`` one.
    * ``.mad`` is computed explicitly because it was removed in pandas 2.0.
    """
    # Separate the gene-expression and cell-viability columns
    g_list = [col for col in train_features.columns if "g-" in col]
    c_list = [col for col in train_features.columns if "c-" in col]

    # Output frames start from the ids and the group key of the (global) feature tables
    ctl_fe_train = pd.DataFrame(); ctl_fe_test = pd.DataFrame()
    ctl_fe_train["sig_id"] = train_features["sig_id"]; ctl_fe_test["sig_id"] = test_features["sig_id"]
    ctl_fe_train["time_dose"] = train_features["time_dose"]; ctl_fe_test["time_dose"] = test_features["time_dose"]

    def _column_mad(group):
        # Mean absolute deviation of every column within one time_dose group.
        return (group - group.mean()).abs().mean()

    def _row_mad(frame):
        # Mean absolute deviation across the columns of each row.
        return frame.sub(frame.mean(axis=1), axis=0).abs().mean(axis=1)

    def _ctl_reference(ctl, cols):
        # One reference Series (indexed by time_dose) per statistic.
        grouped = ctl.groupby("time_dose")[cols]
        mean = grouped.mean().mean(axis=1)
        q25 = grouped.quantile(0.25).quantile(0.25, axis=1)
        q75 = grouped.quantile(0.75).quantile(0.75, axis=1)
        inter_q = q75 - q25
        return {
            "mean":    mean,
            "var":     grouped.var().var(axis=1),
            "extent":  grouped.max().max(axis=1) - grouped.min().min(axis=1),
            "mad":     _row_mad(grouped.apply(_column_mad)),
            "median":  grouped.median().median(axis=1),
            "inter_q": inter_q,
            "AV_a":    q25 - 1.5 * inter_q,
            "AV_b":    q75 + 1.5 * inter_q,
            "rsd":     grouped.std().std(axis=1) / mean,
        }

    # Output column order: for each statistic, the gene column then the cell column.
    stat_names = ["mean", "var", "extent", "mad", "median", "inter_q", "AV_a", "AV_b", "rsd"]

    # Stat features with ctl_samples
    for out, ctl, stats in [(ctl_fe_train, ctl_train, train), (ctl_fe_test, ctl_test, test)]:
        reference = {"g": _ctl_reference(ctl, g_list), "c": _ctl_reference(ctl, c_list)}
        for stat in stat_names:
            for prefix in ("g", "c"):
                out[f"diff_{prefix}_{stat}"] = stats[f"{prefix}_{stat}"] - out["time_dose"].map(reference[prefix][stat])

    # time_dose was only needed as the lookup key
    del ctl_fe_train["time_dose"], ctl_fe_test["time_dose"]
    gc.collect()
    return ctl_fe_train, ctl_fe_test

In [None]:
# Rank-based mapping of every gene/cell column onto a normal distribution.
# The transformer is fitted on train_features and the fitted mapping is applied to test_features.
qt = QuantileTransformer(n_quantiles=100,random_state=42,output_distribution='normal')
train_features[GENES+CELLS] = qt.fit_transform(train_features[GENES+CELLS])
test_features[GENES+CELLS] = qt.transform(test_features[GENES+CELLS])

# Feature engineering and feature selection using a variance threshold

In [None]:
# Data preparation for the training phase
# 1) add time_dose, drop control rows (the filtered scored targets land in train_features_scored)
# 2) build the statistical features, 3) build the control-relative features.
# NOTE: this cell overwrites train_features / test_features, so it is not idempotent on re-run.
train_features, test_features, train_features_scored, ctl_train, ctl_test= mapping_filter(train_features, train_targets_scored, test_features)
fe_train_features, fe_test_features = fe_stats(train_features, test_features)
ctl_fe_train, ctl_fe_test = ctl_fe_stats(fe_train_features, fe_test_features, ctl_train, ctl_test)
#pca_feat_train, pca_feat_test = pca_features(train_features, test_features)
# Merge all the engineered features for train and test data
# (pd.merge defaults to an inner join on sig_id)
train_features = pd.merge(train_features, fe_train_features, on="sig_id")
train_features = pd.merge(train_features, ctl_fe_train, on="sig_id")
#train_features = pd.merge(train_features, pca_feat_train, on='sig_id')

test_features = pd.merge(test_features, fe_test_features,    on="sig_id")
test_features = pd.merge(test_features, ctl_fe_test, on="sig_id")
#test_features = pd.merge(test_features, pca_feat_test,       on="sig_id")

In [None]:
# GENES
# Number of principal components kept for the gene columns.
n_comp = 50

# PCA is fitted on the train and test rows stacked together (train first, test last),
# then the projected rows are split back by position.
data = pd.concat([pd.DataFrame(train_features[GENES]), pd.DataFrame(test_features[GENES])])
data2 = (PCA(n_components=n_comp, random_state=42).fit_transform(data[GENES]))
train2 = data2[:train_features.shape[0]]; test2 = data2[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'pca_G-{i}' for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f'pca_G-{i}' for i in range(n_comp)])

# drop_cols = [f'c-{i}' for i in range(n_comp,len(GENES))]
# Column-wise concat aligns on the index: train2/test2 carry a fresh 0..n-1 index,
# so train_features/test_features must have that same index here.
train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

In [None]:
#CELLS
# Number of principal components kept for the cell columns.
n_comp = 15

# Same procedure as for the gene columns: fit PCA on train + test stacked together,
# then split the projected rows back by position.
data = pd.concat([pd.DataFrame(train_features[CELLS]), pd.DataFrame(test_features[CELLS])])
data2 = (PCA(n_components=n_comp, random_state=42).fit_transform(data[CELLS]))
train2 = data2[:train_features.shape[0]]; test2 = data2[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'pca_C-{i}' for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f'pca_C-{i}' for i in range(n_comp)])

# drop_cols = [f'c-{i}' for i in range(n_comp,len(CELLS))]
# Column-wise concat aligns on the index (see the note in the gene PCA cell's code path:
# both sides need the same 0..n-1 index).
train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

In [None]:
# Variance-based feature selection.
# c_n: every column except the id and the cp_time / cp_dose / time_dose columns.
c_n = [f for f in list(train_features.columns) if f not in ['sig_id', 'cp_time', 'cp_dose', 'time_dose']]
# Keep the columns whose variance over the training rows is at least 0.85.
mask = (train_features[c_n].var() >= 0.85).values
tmp = train_features[c_n].loc[:, mask]
# Only sig_id, cp_dose and cp_time are re-attached, so time_dose is dropped at this point.
train_features = pd.concat([train_features[['sig_id', 'cp_dose', 'cp_time']], tmp], axis=1)
# The mask computed on train is reused for test so both keep the same columns.
tmp = test_features[c_n].loc[:, mask]
test_features = pd.concat([test_features[['sig_id', 'cp_dose', 'cp_time']], tmp], axis=1)

In [None]:
# Target column names: scored targets, non-scored (auxiliary) targets, and both together.
target_cols = [x for x in train_targets_scored.columns if x != 'sig_id']
aux_target_cols = [x for x in train_targets_nonscored.columns if x != 'sig_id']
all_target_cols = target_cols + aux_target_cols
# Attach both target tables to the features (default inner join on sig_id),
# then the drug table with a left join so every training row is kept.
train  = train_features.merge(train_targets_scored, on="sig_id")
train  = train.merge(train_targets_nonscored, on="sig_id")
train = train.merge(drug, on="sig_id", how="left")
# Independent copies of the merged table, one per model name.
train_5layers = train.copy()
train_4layers = train.copy()
train_3layers = train.copy()
train_tabnet = train.copy()
train_blend = train.copy()
# Scored targets only.
target = train[target_cols]
test = test_features.copy()

In [None]:
def process_data(data):
    """One-hot encode ``cp_time`` and ``cp_dose`` and drop one reference level of each.

    Returns a new frame; ``cp_dose_D1`` and ``cp_time_72`` are removed, so the
    remaining indicator columns are relative to those two levels.
    """
    encoded = pd.get_dummies(data, columns=['cp_time','cp_dose'])
    return encoded.drop(columns=["cp_dose_D1", "cp_time_72"])

In [None]:
# Model inputs: every column of the one-hot encoded training table that is neither a
# target nor one of the bookkeeping columns (kfold, sig_id, drug_id).
feature_cols = [c for c in process_data(train).columns if c not in all_target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold', 'sig_id', 'drug_id']]
# Sizes used when building the networks.
num_all_targets = len(all_target_cols)
num_targets = len(target_cols)
num_aux_targets = len(aux_target_cols)
num_features = len(feature_cols)
num_features

In [None]:
# Sanity check: shapes of the merged training table, the test table and the submission template.
print(train.shape)
print(test.shape)
print(sample_submission.shape)

# Dataset Classes

In [None]:
class MoADataset:
    """Indexable (features, targets) pair that yields float tensors.

    ``features`` and ``targets`` are 2-D arrays indexed as ``array[idx, :]``;
    item ``idx`` is returned as ``{'x': feature row, 'y': target row}``.
    """

    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        feature_row = self.features[idx, :]
        target_row = self.targets[idx, :]
        return {
            'x': torch.tensor(feature_row, dtype=torch.float),
            'y': torch.tensor(target_row, dtype=torch.float),
        }
    
class TestDataset:
    """Indexable feature array for inference; item ``idx`` is ``{'x': float tensor}``."""

    def __init__(self, features):
        self.features = features

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        feature_row = self.features[idx, :]
        return {'x': torch.tensor(feature_row, dtype=torch.float)}

In [None]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Run one training epoch and return the mean batch loss.

    The scheduler is stepped after every batch (not once per epoch), so it must
    be a per-iteration scheduler. Each batch is a dict with keys ``'x'`` and ``'y'``.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        features = batch['x'].to(device)
        labels = batch['y'].to(device)

        optimizer.zero_grad()
        batch_loss = loss_fn(model(features), labels)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(dataloader)

def valid_fn(model, loss_fn, dataloader, device):
    """Evaluate the model on a validation loader.

    Each batch is a dict with keys ``'x'`` and ``'y'``.

    Returns
    -------
    (final_loss, valid_preds)
        ``final_loss`` is the mean of the per-batch losses; ``valid_preds`` is a
        NumPy array with the sigmoid of the model outputs for all batches, in
        loader order.
    """
    model.eval()
    final_loss = 0
    valid_preds = []

    # No gradients are needed for validation; disabling autograd avoids building
    # a graph for every batch (same as inference_fn does).
    with torch.no_grad():
        for data in dataloader:
            inputs, targets = data['x'].to(device), data['y'].to(device)
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)

            final_loss += loss.item()
            valid_preds.append(outputs.sigmoid().detach().cpu().numpy())

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    """Return the sigmoid of the model outputs for every batch, stacked into one NumPy array.

    Each batch is a dict with key ``'x'``; the model is put in eval mode and
    run without gradient tracking.
    """
    model.eval()
    batch_preds = []

    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['x'].to(device))
            batch_preds.append(torch.sigmoid(logits).cpu().numpy())

    return np.concatenate(batch_preds)

In [None]:
import torch
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

class NormalLinear(nn.Module):
    """
    Linear layer with normalized weights.

    The effective weight is ``weights_v * weights_m``: ``weights_v`` holds one
    unit-norm direction per output row and ``weights_m`` the per-output
    magnitude. ``weights_v`` is re-normalized in place on every forward call.
    """
    def __init__(self, size_in, size_out, bias=True):
        super().__init__()
        self.size_in = size_in
        self.size_out = size_out

        # Direction tensor, initialised like nn.Linear's weight.
        direction = torch.empty(size_out, size_in)
        nn.init.kaiming_uniform_(direction, a=np.sqrt(5))
        self.weights_v = nn.Parameter(direction)
        # Magnitude starts at the row norms of the not-yet-normalized direction.
        self.weights_m = nn.Parameter(direction.norm(dim=1, keepdim=True))

        if not bias:
            self.register_parameter('bias', None)
        else:
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(direction)
            limit = 1 / np.sqrt(fan_in)
            self.bias = nn.Parameter(nn.init.uniform_(torch.empty(size_out), -limit, limit))

        self._normalize_weights()

    def _normalize_weights(self):
        # In-place row normalization, done outside of autograd.
        with torch.no_grad():
            self.weights_v.div_(self.weights_v.norm(dim=1, keepdim=True))

    def forward(self, x):
        self._normalize_weights()
        effective_weight = self.weights_v * self.weights_m
        return nn.functional.linear(x, effective_weight, self.bias)


class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Parameters
    ----------
    weight : Tensor or None
        Optional rescaling weight passed to ``binary_cross_entropy_with_logits``.
    reduction : str
        ``'mean'`` (default), ``'sum'``, or anything else for the unreduced
        element-wise loss.
    smoothing : float
        Label-smoothing strength in ``[0, 1)``; targets are moved towards 0.5.
    """
    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return ``targets * (1 - smoothing) + 0.5 * smoothing``.

        ``n_labels`` is accepted for interface compatibility and is not used.
        """
        assert 0 <= smoothing < 1

        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing

        return targets

    def forward(self, inputs, targets):
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Ask for the element-wise loss: the functional's default reduction is
        # 'mean', which would collapse the loss before the requested reduction
        # is applied and make 'sum' / 'none' return the mean as well.
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight,
                                                  reduction='none')

        if  self.reduction == 'sum':
            loss = loss.sum()
        elif  self.reduction == 'mean':
            loss = loss.mean()

        return loss

# Model

In [None]:
class NNet_5Layers(nn.Module):
    """Feed-forward net with five dense layers (four hidden, one output).

    Every stage is BatchNorm -> (Dropout from stage 2 on) -> dense layer, with
    leaky-ReLU after all but the last dense layer. The output layer is a
    ``NormalLinear`` and returns raw logits.
    """

    def __init__(self, num_features, num_targets):
        super().__init__()
        self.hidden_size = [1500, 1250, 1000, 750]
        self.dropout_value = [0.45, 0.25, 0.2, 0.15]
        width1, width2, width3, width4 = self.hidden_size
        drop2, drop3, drop4, drop5 = self.dropout_value

        # Stage 1: input normalization, no dropout
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.Linear(num_features, width1)

        # Stage 2
        self.batch_norm2 = nn.BatchNorm1d(width1)
        self.dropout2 = nn.Dropout(drop2)
        self.dense2 = nn.Linear(width1, width2)

        # Stage 3
        self.batch_norm3 = nn.BatchNorm1d(width2)
        self.dropout3 = nn.Dropout(drop3)
        self.dense3 = nn.Linear(width2, width3)

        # Stage 4
        self.batch_norm4 = nn.BatchNorm1d(width3)
        self.dropout4 = nn.Dropout(drop4)
        self.dense4 = nn.Linear(width3, width4)

        # Stage 5: output head
        self.batch_norm5 = nn.BatchNorm1d(width4)
        self.dropout5 = nn.Dropout(drop5)
        self.dense5 = NormalLinear(width4, num_targets)

    def forward(self, x):
        hidden = F.leaky_relu(self.dense1(self.batch_norm1(x)))
        hidden = F.leaky_relu(self.dense2(self.dropout2(self.batch_norm2(hidden))))
        hidden = F.leaky_relu(self.dense3(self.dropout3(self.batch_norm3(hidden))))
        hidden = F.leaky_relu(self.dense4(self.dropout4(self.batch_norm4(hidden))))
        return self.dense5(self.dropout5(self.batch_norm5(hidden)))


In [None]:
class FineTuneScheduler_5Layers:
    """Gradual-unfreezing helper for fine-tuning an ``NNet_5Layers`` model.

    ``copy_without_top`` clones a trained model, freezes every layer except
    ``last_layer`` and swaps in a fresh output head; ``step`` then unfreezes one
    frozen layer (last-frozen first) every ``epochs_per_step`` epochs.

    Layer indices are read from parameter names: the last character of the
    first name component (e.g. ``dense3.weight`` -> 3).
    """

    def __init__(self, epochs):
        self.epochs = epochs
        self.epochs_per_step = 0
        self.frozen_layers = []

    @staticmethod
    def _layer_index(param_name):
        # 'batch_norm2.weight' -> 2 ; always an int so comparisons are type-consistent.
        return int(param_name.split('.')[0][-1])

    def copy_without_top(self, model, num_features, num_targets, num_targets_new, last_layer):
        """Return a copy of ``model`` with frozen body and a new head of ``num_targets_new`` outputs.

        ``num_targets`` must match the head size of ``model`` so its state dict
        can be loaded. The copy is moved to the global ``DEVICE``.
        """
        self.frozen_layers = []

        model_new = NNet_5Layers(num_features, num_targets)
        model_new.load_state_dict(model.state_dict())

        # Freeze all weights
        for name, param in model_new.named_parameters():
            layer_index = self._layer_index(name)

            if layer_index == last_layer:
                continue

            param.requires_grad = False

            # Save frozen layer names
            if layer_index not in self.frozen_layers:
                self.frozen_layers.append(layer_index)

        # At least 1, so that `epoch % epochs_per_step` in step() never divides by zero
        # when there are more frozen layers than epochs; 0 when nothing is frozen.
        n_frozen = len(self.frozen_layers)
        self.epochs_per_step = max(1, self.epochs // n_frozen) if n_frozen else 0

        # Replace the top layers with another ones
        model_new.batch_norm5 = nn.BatchNorm1d(model_new.hidden_size[3])
        model_new.dropout5 = nn.Dropout(model_new.dropout_value[3])
        model_new.dense5 = NormalLinear(model_new.hidden_size[-1], num_targets_new)
        model_new = model_new.to(DEVICE)
        return model_new

    def step(self, epoch, model):
        """Unfreeze the most recently frozen layer when ``epoch`` hits a step boundary."""
        if len(self.frozen_layers) == 0:
            return

        if epoch % self.epochs_per_step == 0:
            last_frozen_index = self.frozen_layers[-1]

            # Unfreeze parameters of the last frozen layer.
            # The index is parsed as int: frozen_layers stores ints, and comparing
            # against the raw character would never match.
            for name, param in model.named_parameters():
                if self._layer_index(name) == last_frozen_index:
                    param.requires_grad = True

            del self.frozen_layers[-1]  # Remove the last layer as unfrozen

In [None]:
class NNet_4Layers(nn.Module):
    """Feed-forward net with four dense layers (three hidden, one output).

    Every stage is BatchNorm -> (Dropout from stage 2 on) -> dense layer, with
    leaky-ReLU after all but the last dense layer. The output layer is a
    ``NormalLinear`` and returns raw logits.
    """

    def __init__(self, num_features, num_targets):
        super().__init__()
        self.hidden_size = [1500, 1250, 1000]
        self.dropout_value = [0.35, 0.2, 0.15]
        width1, width2, width3 = self.hidden_size
        drop2, drop3, drop4 = self.dropout_value

        # Stage 1: input normalization, no dropout
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.Linear(num_features, width1)

        # Stage 2
        self.batch_norm2 = nn.BatchNorm1d(width1)
        self.dropout2 = nn.Dropout(drop2)
        self.dense2 = nn.Linear(width1, width2)

        # Stage 3
        self.batch_norm3 = nn.BatchNorm1d(width2)
        self.dropout3 = nn.Dropout(drop3)
        self.dense3 = nn.Linear(width2, width3)

        # Stage 4: output head
        self.batch_norm4 = nn.BatchNorm1d(width3)
        self.dropout4 = nn.Dropout(drop4)
        self.dense4 = NormalLinear(width3, num_targets)

    def forward(self, x):
        hidden = F.leaky_relu(self.dense1(self.batch_norm1(x)))
        hidden = F.leaky_relu(self.dense2(self.dropout2(self.batch_norm2(hidden))))
        hidden = F.leaky_relu(self.dense3(self.dropout3(self.batch_norm3(hidden))))
        return self.dense4(self.dropout4(self.batch_norm4(hidden)))

In [None]:
class FineTuneScheduler_4Layers:
    """Gradual-unfreezing helper for fine-tuning an ``NNet_4Layers`` model.

    ``copy_without_top`` clones a trained model, freezes every layer except
    ``last_layer`` and swaps in a fresh output head; ``step`` then unfreezes one
    frozen layer (last-frozen first) every ``epochs_per_step`` epochs.

    Layer indices are read from parameter names: the last character of the
    first name component (e.g. ``dense3.weight`` -> 3).
    """

    def __init__(self, epochs):
        self.epochs = epochs
        self.epochs_per_step = 0
        self.frozen_layers = []

    @staticmethod
    def _layer_index(param_name):
        # 'batch_norm2.weight' -> 2 ; always an int so comparisons are type-consistent.
        return int(param_name.split('.')[0][-1])

    def copy_without_top(self, model, num_features, num_targets, num_targets_new, last_layer):
        """Return a copy of ``model`` with frozen body and a new head of ``num_targets_new`` outputs.

        ``num_targets`` must match the head size of ``model`` so its state dict
        can be loaded. The copy is moved to the global ``DEVICE``.
        """
        self.frozen_layers = []

        model_new = NNet_4Layers(num_features, num_targets)
        model_new.load_state_dict(model.state_dict())

        # Freeze all weights
        for name, param in model_new.named_parameters():
            layer_index = self._layer_index(name)

            if layer_index == last_layer:
                continue

            param.requires_grad = False

            # Save frozen layer names
            if layer_index not in self.frozen_layers:
                self.frozen_layers.append(layer_index)

        # At least 1, so that `epoch % epochs_per_step` in step() never divides by zero
        # when there are more frozen layers than epochs; 0 when nothing is frozen.
        n_frozen = len(self.frozen_layers)
        self.epochs_per_step = max(1, self.epochs // n_frozen) if n_frozen else 0

        # Replace the top layers with another ones
        model_new.batch_norm4 = nn.BatchNorm1d(model_new.hidden_size[2])
        model_new.dropout4 = nn.Dropout(model_new.dropout_value[2])
        model_new.dense4 = NormalLinear(model_new.hidden_size[-1], num_targets_new)
        model_new.to(DEVICE)
        return model_new

    def step(self, epoch, model):
        """Unfreeze the most recently frozen layer when ``epoch`` hits a step boundary."""
        if len(self.frozen_layers) == 0:
            return

        if epoch % self.epochs_per_step == 0:
            last_frozen_index = self.frozen_layers[-1]

            # Unfreeze parameters of the last frozen layer.
            # The index is parsed as int: frozen_layers stores ints, and comparing
            # against the raw character would never match.
            for name, param in model.named_parameters():
                if self._layer_index(name) == last_frozen_index:
                    param.requires_grad = True

            del self.frozen_layers[-1]  # Remove the last layer as unfrozen

In [None]:
class NNet_3Layers(nn.Module):
    """Feed-forward net with three dense layers (two hidden, one output).

    Every stage is BatchNorm -> (Dropout from stage 2 on) -> dense layer, with
    leaky-ReLU after all but the last dense layer. The output layer is a
    ``NormalLinear`` and returns raw logits.
    """

    def __init__(self, num_features, num_targets):
        super().__init__()
        self.hidden_size = [1500, 1250]
        self.dropout_value = [0.35, 0.2]
        width1, width2 = self.hidden_size
        drop2, drop3 = self.dropout_value

        # Stage 1: input normalization, no dropout
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.Linear(num_features, width1)

        # Stage 2
        self.batch_norm2 = nn.BatchNorm1d(width1)
        self.dropout2 = nn.Dropout(drop2)
        self.dense2 = nn.Linear(width1, width2)

        # Stage 3: output head
        self.batch_norm3 = nn.BatchNorm1d(width2)
        self.dropout3 = nn.Dropout(drop3)
        self.dense3 = NormalLinear(width2, num_targets)

    def forward(self, x):
        hidden = F.leaky_relu(self.dense1(self.batch_norm1(x)))
        hidden = F.leaky_relu(self.dense2(self.dropout2(self.batch_norm2(hidden))))
        return self.dense3(self.dropout3(self.batch_norm3(hidden)))

In [None]:
class FineTuneScheduler_3Layers:
    """Gradual-unfreezing helper for fine-tuning an ``NNet_3Layers`` model.

    ``copy_without_top`` clones a trained model, freezes every layer except
    ``last_layer`` and swaps in a fresh output head; ``step`` then unfreezes one
    frozen layer (last-frozen first) every ``epochs_per_step`` epochs.

    Layer indices are read from parameter names: the last character of the
    first name component (e.g. ``dense2.weight`` -> 2).
    """

    def __init__(self, epochs):
        self.epochs = epochs
        self.epochs_per_step = 0
        self.frozen_layers = []

    @staticmethod
    def _layer_index(param_name):
        # 'batch_norm2.weight' -> 2 ; always an int so comparisons are type-consistent.
        return int(param_name.split('.')[0][-1])

    def copy_without_top(self, model, num_features, num_targets, num_targets_new, last_layer):
        """Return a copy of ``model`` with frozen body and a new head of ``num_targets_new`` outputs.

        ``num_targets`` must match the head size of ``model`` so its state dict
        can be loaded. The copy is moved to the global ``DEVICE``.
        """
        self.frozen_layers = []

        model_new = NNet_3Layers(num_features, num_targets)
        model_new.load_state_dict(model.state_dict())

        # Freeze all weights
        for name, param in model_new.named_parameters():
            layer_index = self._layer_index(name)

            if layer_index == last_layer:
                continue

            param.requires_grad = False

            # Save frozen layer names
            if layer_index not in self.frozen_layers:
                self.frozen_layers.append(layer_index)

        # At least 1, so that `epoch % epochs_per_step` in step() never divides by zero
        # when there are more frozen layers than epochs; 0 when nothing is frozen.
        n_frozen = len(self.frozen_layers)
        self.epochs_per_step = max(1, self.epochs // n_frozen) if n_frozen else 0

        # Replace the top layers with another ones
        model_new.batch_norm3 = nn.BatchNorm1d(model_new.hidden_size[1])
        model_new.dropout3 = nn.Dropout(model_new.dropout_value[1])
        model_new.dense3 = NormalLinear(model_new.hidden_size[-1], num_targets_new)
        model_new.to(DEVICE)
        return model_new

    def step(self, epoch, model):
        """Unfreeze the most recently frozen layer when ``epoch`` hits a step boundary."""
        if len(self.frozen_layers) == 0:
            return

        if epoch % self.epochs_per_step == 0:
            last_frozen_index = self.frozen_layers[-1]

            # Unfreeze parameters of the last frozen layer.
            # The index is parsed as int: frozen_layers stores ints, and comparing
            # against the raw character would never match.
            for name, param in model.named_parameters():
                if self._layer_index(name) == last_frozen_index:
                    param.requires_grad = True

            del self.frozen_layers[-1]  # Remove the last layer as unfrozen

# Preprocessing steps

In [None]:
# HyperParameters

# Device used for the models (DEVICE is read by the FineTuneScheduler_* classes).
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 25
BATCH_SIZE = 128

# Per-phase settings keyed by training phase.
# NOTE(review): 'ALL_TARGETS' / 'SCORED_ONLY' presumably mean pre-training on scored +
# non-scored targets and fine-tuning on scored targets only — confirm in the training cells.
WEIGHT_DECAY = {'ALL_TARGETS': 1e-5, 'SCORED_ONLY': 1e-6}
# NOTE(review): MAX_LR, DIV_FACTOR and PCT_START look like one-cycle LR scheduler
# arguments — confirm where the scheduler is created.
MAX_LR = {'ALL_TARGETS': 1e-2, 'SCORED_ONLY': 1e-3}
DIV_FACTOR = {'ALL_TARGETS': 1e3, 'SCORED_ONLY': 1e2}
PCT_START = 0.1

# Single fold training

In [None]:
from sklearn.model_selection import KFold

def make_cv_folds(train, SEED, NFOLDS):
    """Add an ``int8`` ``kfold`` column to ``train`` and return the frame.

    Drugs are divided by how many rows they have:

    * drugs whose row count is one of 6, 7, 8, 11, 12, 13, 14 or 18 are
      assigned to folds as whole drugs, using ``MultilabelStratifiedKFold``
      on the per-drug mean of ``target_cols``;
    * rows of all remaining drugs are assigned individually (by ``sig_id``)
      with a shuffled ``KFold``.

    ``train`` is modified in place; the module-level ``target_cols`` is read.
    Both splitters use ``n_splits=NFOLDS`` and ``random_state=SEED``.
    """
    drug_counts = train.drug_id.value_counts()
    whole_drug_mask = drug_counts.isin([6, 7, 8, 11, 12, 13, 14, 18])
    whole_drugs = drug_counts.loc[whole_drug_mask].index.sort_values()
    per_row_drugs = drug_counts.loc[~whole_drug_mask].index.sort_values()

    # Fold per drug_id: all rows of such a drug land in the same fold.
    fold_by_drug = {}
    drug_splitter = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True,
                                              random_state=SEED)
    drug_means = train.groupby('drug_id')[target_cols].mean().loc[whole_drugs]

    for fold, (_, valid_pos) in enumerate(drug_splitter.split(drug_means, drug_means[target_cols])):
        fold_by_drug.update(dict.fromkeys(drug_means.index[valid_pos].values, fold))

    # Fold per sig_id: rows of the remaining drugs are split independently.
    fold_by_sig = {}
    row_splitter = KFold(n_splits=NFOLDS, shuffle=True,
                         random_state=SEED)
    remaining_rows = train.loc[train.drug_id.isin(per_row_drugs)].reset_index(drop=True)

    for fold, (_, valid_pos) in enumerate(row_splitter.split(remaining_rows, remaining_rows[target_cols])):
        fold_by_sig.update(dict.fromkeys(remaining_rows.sig_id[valid_pos].values, fold))

    # Drug-level folds first, then fill whatever is still missing by sig_id.
    train["kfold"] = train.drug_id.map(fold_by_drug)
    unassigned = train["kfold"].isna()
    train.loc[unassigned, "kfold"] = train.loc[unassigned, 'sig_id'].map(fold_by_sig)
    train["kfold"] = train["kfold"].astype('int8')

    return train

# Number of cross-validation folds used by the training cells below.
NFOLDS = 14

# Attach the `kfold` column (make_cv_folds also modifies `train` in place).
train = make_cv_folds(train, SEED, NFOLDS)
# Separate copy of the fold-annotated frame; the TabNet cells below work on it.
folds = train.copy()
train.head()

In [None]:
def training_5layersnnet(fold, last_layer, seed):
    """Train the 5-layer network on one fold and predict the test set.

    Two stages run back to back:

    1. ``NNet_5Layers`` is trained on ``all_target_cols`` (tag
       ``'ALL_TARGETS'``); the weights with the lowest validation loss are
       written to ``ALL_TARGETS_FOLD{fold}_.pth``.
    2. Those weights are reloaded, ``FineTuneScheduler_5Layers.copy_without_top``
       builds the fine-tuning model, and it is trained on ``target_cols`` (tag
       ``'SCORED_ONLY'``) with the scheduler's ``step`` called every epoch.
       The best weights go to ``SCORED_ONLY_FOLD{fold}_.pth``.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column held out for validation.
    last_layer : int
        Forwarded to ``copy_without_top``.
    seed : int
        Forwarded to ``seed_everything``.

    Returns
    -------
    oof : np.ndarray
        Array of shape ``(len(train), len(target_cols))`` that is zero except
        at the validation rows, which hold the predictions of the best
        fine-tuning epoch.
    predictions
        Result of ``inference_fn`` on the test loader using the best
        fine-tuned weights.
    """
    seed_everything(seed)

    train_ = process_data(train)
    test_ = process_data(test)

    # Build the validation mask once and derive both splits from it.
    is_valid = train_["kfold"] == fold
    val_idx = train_[is_valid].index

    train_df = train_[~is_valid].reset_index(drop=True)
    valid_df = train_[is_valid].reset_index(drop=True)

    def train_model(model, tag_name, target_cols_now, fine_tune_scheduler=None):
        """Run ``EPOCHS`` epochs of training and checkpoint the best epoch.

        ``tag_name`` selects the entries of ``WEIGHT_DECAY``, ``MAX_LR`` and
        ``DIV_FACTOR`` and prefixes the checkpoint file name.  Returns the
        out-of-fold array for ``target_cols_now``.
        """
        x_train, y_train = train_df[feature_cols].values, train_df[target_cols_now].values
        x_valid, y_valid = valid_df[feature_cols].values, valid_df[target_cols_now].values

        train_dataset = MoADataset(x_train, y_train)
        valid_dataset = MoADataset(x_valid, y_valid)

        trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=WEIGHT_DECAY[tag_name])
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                                  steps_per_epoch=len(trainloader),
                                                  pct_start=PCT_START,
                                                  div_factor=DIV_FACTOR[tag_name],
                                                  max_lr=MAX_LR[tag_name],
                                                  epochs=EPOCHS)

        # Validation uses plain BCE-with-logits; training uses SmoothBCEwLogits.
        loss_fn = nn.BCEWithLogitsLoss()
        loss_tr = SmoothBCEwLogits(smoothing=0.0001)

        oof = np.zeros((len(train), len(target_cols_now)))
        best_loss = np.inf

        for epoch in range(EPOCHS):
            if fine_tune_scheduler is not None:
                fine_tune_scheduler.step(epoch, model)

            train_loss = train_fn(model, optimizer, scheduler, loss_tr, trainloader, DEVICE)
            valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
            print(f"FOLD: {fold}, {tag_name}, EPOCH: {epoch}, train_loss: {train_loss:.6f}, valid_loss: {valid_loss:.6f}")

            # NOTE(review): if the loss is NaN before any checkpoint has been
            # written, the torch.load further down will not find this run's file.
            if np.isnan(valid_loss):
                break

            if valid_loss < best_loss:
                best_loss = valid_loss
                oof[val_idx] = valid_preds
                torch.save(model.state_dict(), f"{tag_name}_FOLD{fold}_.pth")

        return oof

    fine_tune_scheduler = FineTuneScheduler_5Layers(EPOCHS)

    pretrained_model = NNet_5Layers(num_features, num_all_targets)
    pretrained_model.to(DEVICE)

    # Train on scored + nonscored targets
    train_model(pretrained_model, 'ALL_TARGETS', all_target_cols)

    # Load the pretrained model with the best loss
    pretrained_model = NNet_5Layers(num_features, num_all_targets)
    pretrained_model.load_state_dict(torch.load(f"ALL_TARGETS_FOLD{fold}_.pth"))
    pretrained_model.to(DEVICE)

    # Copy model without the top layer
    final_model = fine_tune_scheduler.copy_without_top(pretrained_model, num_features, num_all_targets, num_targets, last_layer)

    # Fine-tune the model on scored targets only
    oof = train_model(final_model, 'SCORED_ONLY', target_cols, fine_tune_scheduler)

    # Load the fine-tuned model with the best loss
    model = NNet_5Layers(num_features, num_targets)
    model.load_state_dict(torch.load(f"SCORED_ONLY_FOLD{fold}_.pth"))
    model.to(DEVICE)

    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)

    predictions = inference_fn(model, testloader, DEVICE)
    return oof, predictions

In [None]:
def training_4layersnnet(fold, last_layer, seed):
    """Train the 4-layer network on one fold and predict the test set.

    Two stages run back to back:

    1. ``NNet_4Layers`` is trained on ``all_target_cols`` (tag
       ``'ALL_TARGETS'``); the weights with the lowest validation loss are
       written to ``ALL_TARGETS_FOLD{fold}_.pth``.
    2. Those weights are reloaded, ``FineTuneScheduler_4Layers.copy_without_top``
       builds the fine-tuning model, and it is trained on ``target_cols`` (tag
       ``'SCORED_ONLY'``) with the scheduler's ``step`` called every epoch.
       The best weights go to ``SCORED_ONLY_FOLD{fold}_.pth``.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column held out for validation.
    last_layer : int
        Forwarded to ``copy_without_top``.
    seed : int
        Forwarded to ``seed_everything``.

    Returns
    -------
    oof : np.ndarray
        Array of shape ``(len(train), len(target_cols))`` that is zero except
        at the validation rows, which hold the predictions of the best
        fine-tuning epoch.
    predictions
        Result of ``inference_fn`` on the test loader using the best
        fine-tuned weights.
    """
    seed_everything(seed)

    train_ = process_data(train)
    test_ = process_data(test)

    # Build the validation mask once and derive both splits from it.
    is_valid = train_["kfold"] == fold
    val_idx = train_[is_valid].index

    train_df = train_[~is_valid].reset_index(drop=True)
    valid_df = train_[is_valid].reset_index(drop=True)

    def train_model(model, tag_name, target_cols_now, fine_tune_scheduler=None):
        """Run ``EPOCHS`` epochs of training and checkpoint the best epoch.

        ``tag_name`` selects the entries of ``WEIGHT_DECAY``, ``MAX_LR`` and
        ``DIV_FACTOR`` and prefixes the checkpoint file name.  Returns the
        out-of-fold array for ``target_cols_now``.
        """
        x_train, y_train = train_df[feature_cols].values, train_df[target_cols_now].values
        x_valid, y_valid = valid_df[feature_cols].values, valid_df[target_cols_now].values

        train_dataset = MoADataset(x_train, y_train)
        valid_dataset = MoADataset(x_valid, y_valid)

        trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=WEIGHT_DECAY[tag_name])
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                                  steps_per_epoch=len(trainloader),
                                                  pct_start=PCT_START,
                                                  div_factor=DIV_FACTOR[tag_name],
                                                  max_lr=MAX_LR[tag_name],
                                                  epochs=EPOCHS)

        # Validation uses plain BCE-with-logits; training uses SmoothBCEwLogits.
        loss_fn = nn.BCEWithLogitsLoss()
        loss_tr = SmoothBCEwLogits(smoothing=0.0001)

        oof = np.zeros((len(train), len(target_cols_now)))
        best_loss = np.inf

        for epoch in range(EPOCHS):
            if fine_tune_scheduler is not None:
                fine_tune_scheduler.step(epoch, model)

            train_loss = train_fn(model, optimizer, scheduler, loss_tr, trainloader, DEVICE)
            valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
            print(f"FOLD: {fold}, {tag_name}, EPOCH: {epoch}, train_loss: {train_loss:.6f}, valid_loss: {valid_loss:.6f}")

            # NOTE(review): if the loss is NaN before any checkpoint has been
            # written, the torch.load further down will not find this run's file.
            if np.isnan(valid_loss):
                break

            if valid_loss < best_loss:
                best_loss = valid_loss
                oof[val_idx] = valid_preds
                torch.save(model.state_dict(), f"{tag_name}_FOLD{fold}_.pth")

        return oof

    fine_tune_scheduler = FineTuneScheduler_4Layers(EPOCHS)

    pretrained_model = NNet_4Layers(num_features, num_all_targets)
    pretrained_model.to(DEVICE)

    # Train on scored + nonscored targets
    train_model(pretrained_model, 'ALL_TARGETS', all_target_cols)

    # Load the pretrained model with the best loss
    pretrained_model = NNet_4Layers(num_features, num_all_targets)
    pretrained_model.load_state_dict(torch.load(f"ALL_TARGETS_FOLD{fold}_.pth"))
    pretrained_model.to(DEVICE)

    # Copy model without the top layer
    final_model = fine_tune_scheduler.copy_without_top(pretrained_model, num_features, num_all_targets, num_targets, last_layer)

    # Fine-tune the model on scored targets only
    oof = train_model(final_model, 'SCORED_ONLY', target_cols, fine_tune_scheduler)

    # Load the fine-tuned model with the best loss
    model = NNet_4Layers(num_features, num_targets)
    model.load_state_dict(torch.load(f"SCORED_ONLY_FOLD{fold}_.pth"))
    model.to(DEVICE)

    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)

    predictions = inference_fn(model, testloader, DEVICE)
    return oof, predictions

In [None]:
def training_3layersnnet(fold, last_layer, seed):
    """Train the 3-layer network on one fold and predict the test set.

    Two stages run back to back:

    1. ``NNet_3Layers`` is trained on ``all_target_cols`` (tag
       ``'ALL_TARGETS'``); the weights with the lowest validation loss are
       written to ``ALL_TARGETS_FOLD{fold}_.pth``.
    2. Those weights are reloaded, ``FineTuneScheduler_3Layers.copy_without_top``
       builds the fine-tuning model, and it is trained on ``target_cols`` (tag
       ``'SCORED_ONLY'``) with the scheduler's ``step`` called every epoch.
       The best weights go to ``SCORED_ONLY_FOLD{fold}_.pth``.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column held out for validation.
    last_layer : int
        Forwarded to ``copy_without_top``.
    seed : int
        Forwarded to ``seed_everything``.

    Returns
    -------
    oof : np.ndarray
        Array of shape ``(len(train), len(target_cols))`` that is zero except
        at the validation rows, which hold the predictions of the best
        fine-tuning epoch.
    predictions
        Result of ``inference_fn`` on the test loader using the best
        fine-tuned weights.
    """
    seed_everything(seed)

    train_ = process_data(train)
    test_ = process_data(test)

    # Build the validation mask once and derive both splits from it.
    is_valid = train_["kfold"] == fold
    val_idx = train_[is_valid].index

    train_df = train_[~is_valid].reset_index(drop=True)
    valid_df = train_[is_valid].reset_index(drop=True)

    def train_model(model, tag_name, target_cols_now, fine_tune_scheduler=None):
        """Run ``EPOCHS`` epochs of training and checkpoint the best epoch.

        ``tag_name`` selects the entries of ``WEIGHT_DECAY``, ``MAX_LR`` and
        ``DIV_FACTOR`` and prefixes the checkpoint file name.  Returns the
        out-of-fold array for ``target_cols_now``.
        """
        x_train, y_train = train_df[feature_cols].values, train_df[target_cols_now].values
        x_valid, y_valid = valid_df[feature_cols].values, valid_df[target_cols_now].values

        train_dataset = MoADataset(x_train, y_train)
        valid_dataset = MoADataset(x_valid, y_valid)

        trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=WEIGHT_DECAY[tag_name])
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                                  steps_per_epoch=len(trainloader),
                                                  pct_start=PCT_START,
                                                  div_factor=DIV_FACTOR[tag_name],
                                                  max_lr=MAX_LR[tag_name],
                                                  epochs=EPOCHS)

        # Validation uses plain BCE-with-logits; training uses SmoothBCEwLogits.
        loss_fn = nn.BCEWithLogitsLoss()
        loss_tr = SmoothBCEwLogits(smoothing=0.0001)

        oof = np.zeros((len(train), len(target_cols_now)))
        best_loss = np.inf

        for epoch in range(EPOCHS):
            if fine_tune_scheduler is not None:
                fine_tune_scheduler.step(epoch, model)

            train_loss = train_fn(model, optimizer, scheduler, loss_tr, trainloader, DEVICE)
            valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
            print(f"FOLD: {fold}, {tag_name}, EPOCH: {epoch}, train_loss: {train_loss:.6f}, valid_loss: {valid_loss:.6f}")

            # NOTE(review): if the loss is NaN before any checkpoint has been
            # written, the torch.load further down will not find this run's file.
            if np.isnan(valid_loss):
                break

            if valid_loss < best_loss:
                best_loss = valid_loss
                oof[val_idx] = valid_preds
                torch.save(model.state_dict(), f"{tag_name}_FOLD{fold}_.pth")

        return oof

    fine_tune_scheduler = FineTuneScheduler_3Layers(EPOCHS)

    pretrained_model = NNet_3Layers(num_features, num_all_targets)
    pretrained_model.to(DEVICE)

    # Train on scored + nonscored targets
    train_model(pretrained_model, 'ALL_TARGETS', all_target_cols)

    # Load the pretrained model with the best loss
    pretrained_model = NNet_3Layers(num_features, num_all_targets)
    pretrained_model.load_state_dict(torch.load(f"ALL_TARGETS_FOLD{fold}_.pth"))
    pretrained_model.to(DEVICE)

    # Copy model without the top layer
    final_model = fine_tune_scheduler.copy_without_top(pretrained_model, num_features, num_all_targets, num_targets, last_layer)

    # Fine-tune the model on scored targets only
    oof = train_model(final_model, 'SCORED_ONLY', target_cols, fine_tune_scheduler)

    # Load the fine-tuned model with the best loss
    model = NNet_3Layers(num_features, num_targets)
    model.load_state_dict(torch.load(f"SCORED_ONLY_FOLD{fold}_.pth"))
    model.to(DEVICE)

    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)

    predictions = inference_fn(model, testloader, DEVICE)
    return oof, predictions

In [None]:
from time import time

# Run every fold once with a single seed (SEED) and accumulate the results:
# test predictions are averaged over the folds, out-of-fold predictions are
# summed (each call returns zeros outside its own validation rows).
oof_5layers = np.zeros((len(train), len(target_cols)))
predictions_5layers = np.zeros((len(test), len(target_cols)))

time_begin = time()

for fold in range(NFOLDS):
        oof_, pred_ = training_5layersnnet(fold, 5, SEED)
        predictions_5layers += pred_ / NFOLDS
        oof_5layers += oof_

# Wall-clock duration of the whole fold loop, in seconds.
time_diff = time() - time_begin

# NOTE(review): `train_5layers` is not created in this section — presumably a
# frame defined in an earlier cell; confirm.
train_5layers[target_cols] = oof_5layers

In [None]:
from time import time

# Run every fold once with a single seed (SEED) and accumulate the results:
# test predictions are averaged over the folds, out-of-fold predictions are
# summed (each call returns zeros outside its own validation rows).
oof_4layers = np.zeros((len(train), len(target_cols)))
predictions_4layers = np.zeros((len(test), len(target_cols)))

time_begin = time()

for fold in range(NFOLDS):
        oof_, pred_ = training_4layersnnet(fold, 4, SEED)
        predictions_4layers += pred_ / NFOLDS
        oof_4layers += oof_

# Wall-clock duration of the whole fold loop, in seconds.
time_diff = time() - time_begin

# NOTE(review): `train_4layers` is not created in this section — presumably a
# frame defined in an earlier cell; confirm.
train_4layers[target_cols] = oof_4layers

In [None]:
from time import time

# Run every fold once with a single seed (SEED) and accumulate the results:
# test predictions are averaged over the folds, out-of-fold predictions are
# summed (each call returns zeros outside its own validation rows).
oof_3layers = np.zeros((len(train), len(target_cols)))
predictions_3layers = np.zeros((len(test), len(target_cols)))

time_begin = time()

for fold in range(NFOLDS):
        oof_, pred_ = training_3layersnnet(fold, 3, SEED)
        predictions_3layers += pred_ / NFOLDS
        oof_3layers += oof_

# Wall-clock duration of the whole fold loop, in seconds.
time_diff = time() - time_begin

# NOTE(review): `train_3layers` is not created in this section — presumably a
# frame defined in an earlier cell; confirm.
train_3layers[target_cols] = oof_3layers

In [None]:
# Preprocess the fold-annotated frame and the test set for TabNet.
# NOTE(review): `test` is overwritten with its processed version here, so
# re-running this cell (or the training functions, which call
# process_data(test) themselves) feeds already-processed data back into
# process_data — confirm that it is idempotent.
folds = process_data(folds)
test  = process_data(test)
categorical_columns = ['cp_dose_D2', 'cp_time_24', 'cp_time_48']
# Positions of the categorical columns within feature_cols ...
cat_idxs = [i for i, f in enumerate(feature_cols) if f in categorical_columns]
# ... and the number of distinct values each of them takes in `folds`.
cat_dims = [len(folds[f].unique()) for f in feature_cols if f in categorical_columns]

In [None]:
# Upper bound on training epochs, passed to fit() as max_epochs.
MAX_EPOCH=200
# Keyword arguments for the TabNetRegressor constructor: network sizes,
# categorical-feature layout, Adam optimiser settings and a
# ReduceLROnPlateau scheduler that tracks a metric to be minimised.
tabnet_params = dict(n_d=25, n_a=30, cat_idxs=cat_idxs, cat_dims=cat_dims, 
                     n_steps=1, gamma=1.4239102813435471, n_shared=3,
                     lambda_sparse=0, optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                     mask_type='entmax',
                     scheduler_params=dict(mode="min",
                                           patience=10,
                                           min_lr=1e-5,
                                           factor=0.9,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=2,
                     seed=SEED,
                     )

In [None]:
class LogitsLogLoss(Metric):
    """Mean binary log loss computed from raw scores (sigmoid applied here)."""

    def __init__(self):
        self._maximize = False
        self._name = "logits_ll"

    def __call__(self, y_true, y_pred):
        """
        Compute the log loss of sigmoid-transformed scores.

        Parameters
        ----------
        y_true: np.ndarray
            Target matrix or vector
        y_pred: np.ndarray
            Raw score matrix or vector; a sigmoid is applied before scoring

        Returns
        -------
            float
            Mean log loss of predictions vs targets.
        """
        eps = 1e-15  # keeps both logarithms finite at probabilities of 0 and 1
        probs = 1 / (1 + np.exp(-y_pred))
        negative_part = (1 - y_true) * np.log(1 - probs + eps)
        positive_part = y_true * np.log(probs + eps)
        return np.mean(-(negative_part + positive_part))

In [None]:
# Test feature matrix, reused for every fold's model.
X_test = test[feature_cols].values
# Out-of-fold and test-prediction accumulators: one row per sample, one column
# per entry of target_cols.
oof_tabnet = np.zeros((len(train), len(target_cols)))
predictions_tabnet = np.zeros((len(test), len(target_cols)))
# Best validation metric reached in each fold.
scores_tabnet = []

for fold in range(NFOLDS):
    
    print("FOLD %i"%(fold+1), ' ', end='')
    # Rows whose kfold differs from `fold` train the model; the others validate it.
    X_train, y_train = folds.loc[folds.kfold != fold, feature_cols].values, folds.loc[folds.kfold != fold, target_cols].values
    # Index labels of the validation rows, used below to index into oof_tabnet.
    X_val_idx = folds.loc[folds.kfold == fold].index
    X_val, y_val = folds.loc[folds.kfold == fold, feature_cols].values, folds.loc[folds.kfold == fold, target_cols].values
    model = TabNetRegressor(**tabnet_params)
    
    model.fit(X_train=X_train,
              y_train=y_train,
              eval_set=[(X_val, y_val)],
              eval_name = ["val"],
              eval_metric = ["logits_ll"],
              max_epochs=MAX_EPOCH,
              patience=20, batch_size=1024, virtual_batch_size=32,
              num_workers=0, drop_last=False,
              # use binary cross entropy as this is not a regression problem
              loss_fn=SmoothBCEwLogits(smoothing=5e-5))
    preds_val = model.predict(X_val)
    # Apply sigmoid to the predictions
    preds =  1 / (1 + np.exp(-preds_val))
    # Lowest value of the validation metric recorded in the training history.
    score = np.min(model.history["val_logits_ll"])
#     name = cfg.save_name + f"_fold{fold_nb}"
#     model.save_model(name)
    ## save oof to compute the CV later
    oof_tabnet[X_val_idx] = preds
    scores_tabnet.append(score)

    # preds on test
    preds_test = model.predict(X_test)
    # Sigmoid of the test scores, averaged over the folds.
    predictions_tabnet += (1 / (1 + np.exp(-preds_test))) / NFOLDS

# NOTE(review): `train_tabnet` is not created in this section — presumably a
# frame defined in an earlier cell; confirm.
train_tabnet[target_cols] = oof_tabnet
print(f"Average CV : {np.mean(scores_tabnet)}")
print(f"Std CV : {np.std(scores_tabnet)}")

In [None]:
def show_weights(results):
    """Print every entry of ``results.x`` on its own line, labelled by position."""
    print("\n")
    position = 0
    for weight in results.x:
        print(f"weights_{position}: {weight}")
        position += 1
        
def sanity_check():
    """Report whether every blended prediction lies strictly between 0 and 1.

    Reads the module-level ``blend_func`` and ``res``: the blend is computed
    once as ``blend_func(res.x)`` and the outcome is printed.  Nothing is
    returned.
    """
    # Evaluate the blend a single time instead of once per comparison.
    blended = blend_func(res.x)

    # All probabilities have to be between 0 and 1.
    if ((blended > 0) & (blended < 1)).all():
        print('\nAll probabilities are between 0 and 1. \n    Good to go!')
    else:
        print('\nProbabilities are not between 0 and 1! \n    Something is wrong!')
        
# CPMP's logloss from https://www.kaggle.com/c/lish-moa/discussion/183010
def log_loss_numpy(y_pred):
    """Mean binary log loss of ``y_pred`` against the module-level ``y_true``.

    Both arrays are flattened; predictions are clipped to
    ``[1e-15, 1 - 1e-15]`` before the logarithm is taken.
    """
    labels = np.asarray(y_true).ravel()
    probs = np.clip(np.asarray(y_pred).ravel(), 1e-15, 1 - 1e-15)
    per_element = np.where(labels == 1, -np.log(probs), -np.log(1 - probs))
    return per_element.mean()

def func_numpy_metric(weights):
    """Score the weighted blend of the module-level ``oofs`` stack.

    ``weights`` is contracted against the first axis of ``oofs`` and the
    resulting blend is passed to ``log_loss_numpy``.
    """
    blended = np.tensordot(weights, oofs, axes=(0, 0))
    return log_loss_numpy(blended)

In [None]:
from scipy.optimize import minimize

#valid_results_nnet   = train_targets_scored.drop(columns=target_cols).merge(train_nnet[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
#valid_results_tabnet = train_targets_scored.drop(columns=target_cols).merge(train_tabnet[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)

# Ground truth read by log_loss_numpy through the module-level name `y_true`.
# NOTE(review): `train_features_scored` is not defined in this section, while
# the scoring cell further down uses `train_targets_scored` — confirm this is
# the intended frame and that its rows line up with the OOF arrays.
y_true        = train_features_scored[target_cols].values
#y_pred_nnet   = valid_results_nnet[target_cols].values
#y_pred_tabnet = valid_results_tabnet[target_cols].values

# Stack the OOF matrices along a new leading axis, one entry per model.
oofs = np.array([oof_5layers, oof_4layers, oof_3layers, oof_tabnet])

tol = 1e-10
# Start from equal weights; each weight is bounded to [0, 1] and the equality
# constraint forces the weights to sum to 1.
init_guess = [1 / oofs.shape[0]] * oofs.shape[0]
bnds = [(0, 1) for _ in range(oofs.shape[0])]
cons = {'type': 'eq', 
        'fun': lambda x: np.sum(x) - 1, 
        'jac': lambda x: [1] * len(x)}

print('Inital Blend OOF:', func_numpy_metric(init_guess))
start_time = time()
# Minimise the blended OOF log loss over the weights with SLSQP.
res_scipy = minimize(fun = func_numpy_metric, 
                     x0 = init_guess, 
                     method = 'SLSQP', 
                     bounds = bnds, 
                     constraints = cons,
                     options={'disp': True,
                        'maxiter': 100000},
                     tol = tol)
#print(f'[{str(datetime.timedelta(seconds = time() - start_time))[2:7]}] Optimised Blend OOF:', res_scipy.fun)
print('Optimised Weights:', res_scipy.x)

In [None]:
# Align each model's OOF predictions with train_targets_scored by sig_id; rows
# without a prediction after the left merge are filled with 0.
valid_results_5layers   = train_targets_scored.drop(columns=target_cols).merge(train_5layers[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
valid_results_4layers   = train_targets_scored.drop(columns=target_cols).merge(train_4layers[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
valid_results_3layers   = train_targets_scored.drop(columns=target_cols).merge(train_3layers[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
valid_results_tabnet    = train_targets_scored.drop(columns=target_cols).merge(train_tabnet[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)


# Blend the OOF predictions with the optimised weights (same model order as
# the `oofs` stack).
# NOTE(review): `train_blend` is not created in this section — presumably a
# frame defined in an earlier cell; confirm.
train_blend[target_cols] = res_scipy.x[0]*oof_5layers + res_scipy.x[1]*oof_4layers + res_scipy.x[2]*oof_3layers + res_scipy.x[3]*oof_tabnet 
valid_blend = train_targets_scored.drop(columns=target_cols).merge(train_blend[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)

y_true         = train_targets_scored[target_cols].values
y_pred_5layers = valid_results_5layers[target_cols].values
y_pred_4layers = valid_results_4layers[target_cols].values
y_pred_3layers = valid_results_3layers[target_cols].values
y_pred_tabnet  = valid_results_tabnet[target_cols].values
y_pred_blend   = valid_blend[target_cols].values

# Column-wise log loss, accumulated as a running mean over the targets.
# NOTE(review): the divisor is `target.shape[1]`, and `target` is not defined
# in this section — presumably it has len(target_cols) columns; confirm.
score_5layers = 0
score_4layers = 0
score_3layers = 0
score_tabnet = 0
score_blend = 0
for i in range(len(target_cols)):
    score_5layers_ = log_loss(y_true[:, i], y_pred_5layers[:, i]) 
    score_4layers_ = log_loss(y_true[:, i], y_pred_4layers[:, i]) 
    score_3layers_ = log_loss(y_true[:, i], y_pred_3layers[:, i]) 
    tabnet_score_ = log_loss(y_true[:, i], y_pred_tabnet[:, i]) 
    score_blend_ = log_loss(y_true[:, i], y_pred_blend[:, i])
    score_5layers += score_5layers_ / target.shape[1]
    score_4layers += score_4layers_ / target.shape[1]
    score_3layers += score_3layers_ / target.shape[1]
    score_tabnet += tabnet_score_ / target.shape[1]
    score_blend += score_blend_ / target.shape[1]
    
print("CV nnet_5layers log_loss: ", score_5layers)
print("CV nnet_4layers log_loss: ", score_4layers)
print("CV nnet_3layers log_loss: ", score_3layers)
print("CV tabnet log_loss: ", score_tabnet)
print("CV blend loss: ", score_blend)

In [None]:
# Blend the per-model test predictions with the weights found on the OOF data.
test[target_cols] = res_scipy.x[0]*predictions_5layers + res_scipy.x[1]*predictions_4layers + res_scipy.x[2]*predictions_3layers + res_scipy.x[3]*predictions_tabnet
# Left-merge onto the sample submission by sig_id; ids that are missing from
# `test` end up with 0 for every target.
sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
sub.to_csv('submission.csv', index=False)

In [None]:
# Display the (rows, columns) of the submission frame as a final check.
sub.shape