This notebook is based on https://www.kaggle.com/muellerzr/fastai-tabular by Zach Mueller and https://www.kaggle.com/ppicheta/lish-moa-drug-aware-multilabelstratifiedkfold by Piotr Picheta.

In [None]:
# Uninstall the preinstalled fastai (v1) first so that the fastai v2 wheels below can be installed.
!pip uninstall fastai -y

# The fast-v2-offline dataset must be added to this notebook's input before running the installs below.
!pip install -q /kaggle/input/fast-v2-offline/dataclasses-0.6-py3-none-any.whl
!pip install -q /kaggle/input/fast-v2-offline/torch-1.6.0-cp37-cp37m-manylinux1_x86_64.whl
!pip install -q /kaggle/input/fast-v2-offline/torchvision-0.7.0-cp37-cp37m-manylinux1_x86_64.whl
!pip install -q /kaggle/input/fast-v2-offline/fastcore-1.0.1-py3-none-any.whl
!pip install -q /kaggle/input/fast-v2-offline/fastai-2.0.8-py3-none-any.whl

In [None]:
from fastai.tabular.all import *

In [None]:
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.filterwarnings('ignore')

In [None]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import pandas as pd
from sklearn.model_selection._split import _BaseKFold

In [None]:
# Load the competition tables; every frame is indexed by its sig_id column.
train_features = pd.read_csv('../input/lish-moa/train_features.csv', index_col='sig_id')
train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv', index_col='sig_id')
test_features = pd.read_csv('../input/lish-moa/test_features.csv', index_col='sig_id')
drug = pd.read_csv('../input/lish-moa/train_drug.csv', index_col='sig_id')


In [None]:
# Sample submission, indexed by sig_id; its target columns are overwritten with predictions later on.
ss = pd.read_csv('../input/lish-moa/sample_submission.csv', index_col='sig_id')

In [None]:
# Attach the columns of `drug` to the training features (inner join on the sig_id index).
train = pd.merge(train_features, drug, left_index=True, right_index=True)

In [None]:
def preprocess(df):
    """Return a copy of ``df`` with ``cp_time`` and ``cp_dose`` encoded as numbers.

    ``cp_time`` is mapped 24 -> 0, 48 -> 0.5, 72 -> 1 and ``cp_dose`` is mapped
    'D1' -> 0, 'D2' -> 1. Values outside these mappings become NaN (standard
    ``Series.map`` behaviour). The input frame is left untouched.

    Args:
        df: DataFrame containing ``cp_time`` and ``cp_dose`` columns.

    Returns:
        A new DataFrame with the same columns, in the same order.
    """
    time_codes = {24: 0, 48: 0.5, 72: 1}
    dose_codes = {'D1': 0, 'D2': 1}
    # Replace the columns wholesale instead of writing through ``df.loc[:, col]``:
    # on recent pandas a ``.loc`` write happens in place and keeps the old dtype
    # (object for cp_dose, int for cp_time), which leaves non-numeric features behind.
    return df.assign(
        cp_time=df['cp_time'].map(time_codes),
        cp_dose=df['cp_dose'].map(dose_codes),
    )

In [None]:
# Encode cp_time / cp_dose numerically in both frames.
# NOTE: `train` is overwritten here, so run this cell once per kernel session;
# a second pass would feed the already-encoded values through the mapping again.
test = preprocess(test_features)
train = preprocess(train)

In [None]:
# Keep only the rows whose cp_type is 'trt_cp', in both the targets and the features.
# The mask is computed before `train` is overwritten so both selections use the same rows.
is_trt_cp = train.cp_type == 'trt_cp'
targets = train_targets.loc[is_trt_cp]
train = train.loc[is_trt_cp]

In [None]:
# Continuous features: every column of `train` except the first and the last one
# (presumably cp_type and drug_id -- verify against the CSV column order).
cont_names = train.columns[1:-1].tolist()
# Every column of the targets frame is a label to predict.
y_names = targets.columns.tolist()

In [None]:
# Features and targets have to live in one DataFrame for TabularPandas, so join them on the index.
train2 = pd.merge(train, targets, left_index=True, right_index=True)

In [None]:
# Training hyper-parameters.
# NOTE(review): the training loop in this notebook passes its own literal epoch count,
# batch sizes and loss function -- confirm whether these values are still meant to be used.
nepochs, batch_size = 50, 128
val_batch_size = 4 * batch_size
criterion = nn.BCELoss()

In [None]:
class DrugAwareMultilabelStratifiedKFold(_BaseKFold):
    """K-fold splitter that assigns folds per drug for small drugs and per sample for large ones.

    Drugs with at most ``drug_threshold`` samples are stratified as whole groups
    (on their per-drug mean target vector), so every sample of such a drug lands
    in the same fold. Samples of drugs above the threshold are stratified
    individually. Both passes use the same wrapped ``MultilabelStratifiedKFold``.
    """

    # Multiplied by ``max_experiment_cnt`` to get the per-drug sample-count threshold.
    SAMPLES_PER_EXPERIMENT = 6

    def __init__(self,
                 max_experiment_cnt=3,
                 n_splits=3,
                 shuffle=False,
                 random_state=None):
        """Configure the splitter.

        Args:
            max_experiment_cnt: scales ``SAMPLES_PER_EXPERIMENT`` into
                ``drug_threshold``, the largest sample count for which a drug
                is still assigned to a fold as a whole.
            n_splits: number of folds.
            shuffle: forwarded to the base class and the wrapped stratifier.
            random_state: forwarded to the base class and the wrapped stratifier.
        """
        super().__init__(n_splits=n_splits, 
                         shuffle=shuffle, 
                         random_state=random_state)
        # The same stratifier instance is used for both the per-drug and the per-sample pass.
        self._skf = MultilabelStratifiedKFold(n_splits=n_splits, 
                                              shuffle=shuffle, 
                                              random_state=random_state)
        self.drug_threshold = self.SAMPLES_PER_EXPERIMENT * max_experiment_cnt

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """Yield, for each fold, a boolean mask over the rows of ``X`` that belong to it.

        Args:
            X: DataFrame with a ``drug_id`` column, indexed like ``y``; the index
                must be named ``sig_id`` (it is accessed as a column after
                ``reset_index``).
            y: DataFrame of target columns sharing ``X``'s index.
            groups: unused.

        NOTE(review): despite the method name, boolean masks rather than integer
        indices are yielded -- confirm the installed scikit-learn base class
        accepts them as indexers.
        """
        # Join features and targets on the index so drug_id and the targets sit in one frame.
        drug_set = X.merge(y, left_index=True, right_index=True)
        targets = y.columns
        # Split the drugs by sample count: vc1 = at or below the threshold, vc2 = above it.
        vc = X['drug_id'].value_counts()
        vc1 = vc.loc[vc <= self.drug_threshold].index.sort_values()
        vc2 = vc.loc[vc > self.drug_threshold].index.sort_values()

        drug_id_to_fold = {}
        sig_id_to_fold = {}
        if len(vc1) > 0:
            # One row per small drug holding the mean of each target over its samples;
            # stratifying these rows assigns a whole drug to a single fold.
            tmp = drug_set.groupby('drug_id')[targets].mean().loc[vc1]
            for fold, (_, idx_val) in enumerate(self._skf.split(tmp, tmp[targets])):
                drug_id_to_fold.update({k: fold for k in tmp.index[idx_val].values})

        if len(vc2) > 0:
            # Samples of large drugs are stratified one by one; reset_index turns
            # the sig_id index into a column with positional row labels.
            tmp = drug_set.loc[drug_set.drug_id.isin(vc2)].reset_index()
            for fold, (_, idx_val) in enumerate(self._skf.split(tmp, tmp[targets])):
                sig_id_to_fold.update({k: fold for k in tmp.sig_id[idx_val].values})

        # First fill folds from the per-drug assignment, then fill the rows that are
        # still unset (samples of large drugs) from the per-sample assignment.
        drug_set['fold'] = drug_set.drug_id.map(drug_id_to_fold)
        unset_folds = drug_set.fold.isna()
        drug_set.loc[unset_folds, 'fold'] = drug_set.loc[unset_folds].index.map(sig_id_to_fold)
        # Every row is expected to have a fold by now, so the column can be cast to int.
        test_folds = drug_set.fold.astype('int8').values

        for i in range(self.n_splits):
            yield test_folds == i

In [None]:
class MoaModel(nn.Module):
    """Three-stage feed-forward network that outputs one probability per target.

    Every stage is BatchNorm1d -> Dropout -> weight-normalised Linear. The two
    hidden stages are followed by Mish and the output stage by a sigmoid, so the
    returned values lie in (0, 1).

    Args:
        num_columns: number of input features per sample.
        num_targets: number of output units. Defaults to 206, the value that
            used to be hard-coded, so existing ``MoaModel(n)`` calls are unchanged.
    """

    def __init__(self, num_columns, num_targets=206):
        super().__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(0.35016565859755877)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, 221))

        self.batch_norm2 = nn.BatchNorm1d(221)
        self.dropout2 = nn.Dropout(0.3831830207882558)
        self.dense2 = nn.utils.weight_norm(nn.Linear(221, 775))

        self.batch_norm3 = nn.BatchNorm1d(775)
        self.dropout3 = nn.Dropout(0.37312149555800084)
        self.dense3 = nn.utils.weight_norm(nn.Linear(775, num_targets))

    def forward(self, cat, x):
        """Compute per-target probabilities for a batch.

        Args:
            cat: ignored (presumably the categorical batch handed over by the
                tabular dataloader -- this model has no categorical inputs).
            x: float tensor of shape (batch, num_columns).

        Returns:
            Tensor of shape (batch, num_targets) with values in (0, 1).
        """
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = Mish()(self.dense1(x))

        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = Mish()(self.dense2(x))

        x = self.batch_norm3(x)
        x = self.dropout3(x)
        # torch.sigmoid replaces the deprecated F.sigmoid; the result is identical.
        return torch.sigmoid(self.dense3(x))

In [None]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels.

    Args:
        seed: integer seed applied to every generator.
    """
    # Local import so the function does not depend on `random` being imported elsewhere.
    import random

    # Python's own generator was previously left unseeded, so any code drawing from
    # `random` (e.g. for shuffling) differed from run to run.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Trade cuDNN auto-tuning for repeatable kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
nstarts = 1  # number of seeded restarts; the seeds used are 0 .. nstarts-1
all_preds = []
for seed in range(nstarts):
    seed_preds = []
    print(f'Train seed {seed}')
    set_seed(seed)

    splitter = DrugAwareMultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    # The splitter merges X and y on their index, so `train` and `targets`
    # must not share any column names.
    for n, (tr, te) in enumerate(splitter.split(X=train, y=targets)):
        print(f'Train fold {n+1}')
        splits = (L(list(tr)), L(list(te)))
        # train2 must hold both the feature and the target columns;
        # cont_names and y_names must be plain lists.
        to = TabularPandas(train2, [Normalize, FillMissing], cat_names=[],
                           cont_names=cont_names, y_names=y_names, splits=splits)
        trn_dl = TabDataLoader(to.train, bs=512, shuffle=True, drop_last=True)
        val_dl = TabDataLoader(to.valid, bs=512*4)
        dls = TabularDataLoaders(trn_dl, val_dl).cuda()
        # Size the input layer from the feature list instead of a hard-coded 874,
        # so the model always matches the columns fed to TabularPandas.
        model = MoaModel(len(cont_names)).cuda()
        learn = Learner(dls, model, loss_func=BCELossFlat(), opt_func=ranger)
        learn.fit_flat_cos(20, 0.039745218935223835, cbs=[EarlyStoppingCallback(), ReduceLROnPlateau()])
        # The test frame must have the same layout as the training frame,
        # i.e. cp_time / cp_dose already encoded as numbers.
        dl = learn.dls.test_dl(test.copy())
        seed_preds.append(learn.get_preds(dl=dl)[0].cpu().numpy())
    # Average the fold predictions for this seed.
    all_preds.append(np.mean(seed_preds, axis=0))
# Average over seeds.
preds = np.mean(all_preds, axis=0)

In [None]:
# Overwrite the target columns of the sample submission with the averaged predictions
# (assigned by position -- assumes `ss` and `test` list the samples in the same order).
ss[y_names] = preds
# Rows whose cp_type is 'ctl_vehicle' get every target set to 0.
is_ctl_vehicle = test.cp_type == 'ctl_vehicle'
ss.loc[is_ctl_vehicle, y_names] = 0
# Turn sig_id back into a regular column before writing the CSV.
ss = ss.reset_index()
ss.to_csv('submission.csv', index=False)