In [None]:
# for TPU
# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev

In [None]:
# for TPU
# import torch_xla
# import torch_xla.core.xla_model as xm

In [None]:
!export CUDA_LAUNCH_BLOCKING=1

In [None]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
import time
import math, random
import gc, os
import datetime
import numpy as np
import pandas as pd
from sklearn.kernel_approximation import Nystroem
from sklearn.decomposition import PCA, KernelPCA, TruncatedSVD
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import FeatureAgglomeration, AgglomerativeClustering, KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA 
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Competition inputs: feature tables, scored / non-scored targets and the
# sample submission.
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')

ss = pd.read_csv('../input/lish-moa/sample_submission.csv')

# Target names: every submission column except the row identifier.
cols = [name for name in ss.columns if name != 'sig_id']
# Feature groups, selected by column-name prefix.
GENES = [name for name in train_features if name.startswith('g-')]
CELLS = [name for name in train_features if name.startswith('c-')]

In [None]:
def preprocess(df):
    """Encode the categorical treatment columns and drop the row identifier.

    ``cp_type`` is mapped to 0 (``trt_cp``) / 1 (``ctl_vehicle``) and
    ``cp_dose`` to 0 (``D1``) / 1 (``D2``); ``sig_id`` is removed.
    ``cp_time`` is left untouched.  Values outside the two mappings become
    NaN (``Series.map`` behaviour).

    The frame is modified **in place** and the same object is returned.
    Later cells rely on this: they read the encoded ``cp_type`` back from the
    originally loaded ``train_features`` / ``test_features`` frames.
    """
    # Plain column assignment replaces the column, so it takes the numeric
    # dtype of the mapped values.  ``df.loc[:, col] = ...`` may instead write
    # into the existing object-dtype column on recent pandas versions.
    df['cp_type'] = df['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
    del df['sig_id']
    return df

# def log_loss_metric(y_true, y_pred):
#     metrics = []
#     for _target in train_targets.columns:
#         metrics.append(log_loss(y_true.loc[:, _target], y_pred.loc[:, _target].astype(float), labels = [0,1]))
#     return np.mean(metrics)

def log_loss_metric(y_true, y_pred):
    """Column-averaged binary log loss for 2-D NumPy arrays.

    For every column the mean binary cross-entropy over the rows is computed,
    with predictions clipped to ``[1e-15, 1 - 1e-15]``; the per-column values
    are then averaged over the number of columns.
    """
    n_columns = y_true.shape[1]
    clipped = np.clip(y_pred, 1e-15, 1 - 1e-15)
    total = 0
    for col in range(n_columns):
        truth = y_true[:, col]
        prob = clipped[:, col]
        total -= np.mean(truth * np.log(prob) + (1 - truth) * np.log(1 - prob))
    return total / n_columns

# Encode both feature tables. preprocess() works in place and returns its
# argument, so `train` / `test` are the same objects as
# `train_features` / `test_features`.
train, test = preprocess(train_features), preprocess(test_features)

# The target tables only need their label columns from here on.
for _target_frame in (train_targets, train_targets_nonscored):
    del _target_frame['sig_id']

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Map every gene / cell feature to a normal distribution. The quantiles are
# estimated on the train and test rows together.
gauss_features = GENES + CELLS
qt = QuantileTransformer(output_distribution = 'normal', random_state = 42)
qt.fit(pd.concat([train[gauss_features], test[gauss_features]]))
for _frame in (train, test):
    _frame[gauss_features] = qt.transform(_frame[gauss_features])

In [None]:
from sklearn.decomposition import PCA

def add_pca_features(train_df, test_df, columns, n_components, prefix, random_state=42):
    """Append PCA components of ``columns`` to the train and test frames.

    The PCA is fitted on the train and test rows stacked together (train
    first), and the projected rows are split back in that order.

    Args:
        train_df, test_df: feature frames; both must contain ``columns``.
            The new columns are joined with ``pd.concat(axis=1)``, i.e. by
            index, so both frames are expected to carry a default
            0..n-1 index.
        columns: names of the columns to decompose.
        n_components: number of principal components to keep.
        prefix: new columns are named ``f'{prefix}-{i}'``.
        random_state: seed forwarded to ``PCA``.

    Returns:
        ``(train_out, test_out, pca)``: the two widened frames and the
        fitted ``PCA`` object.
    """
    stacked = pd.concat([train_df[columns], test_df[columns]])
    pca = PCA(n_components=n_components, random_state=random_state)
    components = pca.fit_transform(stacked)

    n_train = train_df.shape[0]
    names = [f'{prefix}-{i}' for i in range(n_components)]
    train_part = pd.DataFrame(components[:n_train], columns=names)
    # Slice from the train offset rather than by negative length, so an empty
    # test frame yields an empty slice instead of the whole array.
    test_part = pd.DataFrame(components[n_train:], columns=names)

    train_out = pd.concat((train_df, train_part), axis=1)
    test_out = pd.concat((test_df, test_part), axis=1)
    return train_out, test_out, pca


# GENES
n_comp_genes = 600  #<--Update
train, test, pca_genes = add_pca_features(train, test, GENES, n_comp_genes, 'pca_G')

#CELLS
n_comp_cells = 50  #<--Update
train, test, pca_cells = add_pca_features(train, test, CELLS, n_comp_cells, 'pca_C')

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Drop low-variance features, fitting the selector on train and test together.
META_COLS = ['cp_type', 'cp_time', 'cp_dose']

var_thresh = VarianceThreshold(0.8)  #<-- Update
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
data = pd.concat([train, test])
# Assumes the first three columns are the cp_* columns re-attached below, so
# only the numeric features go through the selector.
data_transformed = var_thresh.fit_transform(data.iloc[:, 3:])

# Rows come back in stacking order: train first, then test.
train_transformed = data_transformed[ : train.shape[0]]
test_transformed = data_transformed[train.shape[0] : ]

# Rebuild each frame as [cp_* columns | surviving features]; the surviving
# features get integer column labels 0..k-1.
train = pd.concat(
    [pd.DataFrame(train[META_COLS].values.reshape(-1, 3), columns=META_COLS),
     pd.DataFrame(train_transformed)],
    axis=1)

test = pd.concat(
    [pd.DataFrame(test[META_COLS].values.reshape(-1, 3), columns=META_COLS),
     pd.DataFrame(test_transformed)],
    axis=1)

print(train.shape)
print(test.shape)

In [None]:
# Keep only treated samples (cp_type == 0) in the features and in both target
# tables, then renumber the rows. The mask is taken from `train` before
# `train` itself is filtered.
treated_rows = train['cp_type'] == 0
train_targets = train_targets.loc[treated_rows].reset_index(drop = True)
train_targets_nonscored = train_targets_nonscored.loc[treated_rows].reset_index(drop = True)
train = train.loc[treated_rows].reset_index(drop = True)

print(train.shape)

In [None]:
# Positional indices of the model inputs: every column except column 0
# (cp_type, which is constant after the control rows were removed).
n_columns = train.shape[1]
top_feats = np.arange(1, n_columns)
print(top_feats)

In [None]:
# Preview the first rows of the processed training features.
train.head()

In [None]:
import torch
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are pulled towards 0.5 before the loss is computed:
    ``t * (1 - smoothing) + 0.5 * smoothing``.

    Args:
        weight: optional rescaling weight, forwarded to
            ``F.binary_cross_entropy_with_logits``.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced element-wise loss.
        smoothing: smoothing strength, must satisfy ``0 <= smoothing < 1``.
    """
    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        # The base classes already store `weight` and `reduction`.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return the smoothed targets. ``n_labels`` is accepted but unused."""
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed loss of ``inputs`` (logits) against ``targets``."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Request the element-wise loss and reduce here. The functional's
        # default reduction is 'mean', which would make 'sum' and the
        # unreduced mode return the mean as well.
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight,
                                                  reduction='none')

        if self.reduction == 'sum':
            return loss.sum()
        if self.reduction == 'mean':
            return loss.mean()
        return loss

sbcewlogits = SmoothBCEwLogits(smoothing = 0.0008)

In [None]:
def create_folds(num_starts, num_splits):
    """Build drug-aware, multilabel-stratified fold labels for several seeds.

    Only treated rows (``cp_type == 'trt_cp'``) are considered.  Drugs seen at
    most 18 times are assigned to a fold as a whole, stratified on their mean
    target vector; rows of drugs seen more than 18 times are stratified
    individually.  Seeds ``0 .. num_starts - 1`` are used as ``random_state``.

    Args:
        num_starts: number of seeds / independent fold assignments.
        num_splits: number of folds per assignment.

    Returns:
        int8 array with one row of fold labels per seed (rows stacked with
        ``np.stack``).
    """
    # LOAD FILES
    features_raw = pd.read_csv('../input/lish-moa/train_features.csv')
    scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
    drug = pd.read_csv('/kaggle/input/lish-moa/train_drug.csv')
    is_treated = features_raw['cp_type'] == 'trt_cp'
    scored = scored.loc[is_treated, :]
    drug = drug.loc[is_treated, :]
    targets = scored.columns[1:]
    scored = scored.merge(drug, on='sig_id', how='left')

    # LOCATE DRUGS
    drug_counts = scored.drug_id.value_counts()
    rare_drugs = drug_counts.loc[drug_counts <= 18].index.sort_values()
    common_drugs = drug_counts.loc[drug_counts > 18].index.sort_values()

    fold_rows = []
    for seed in range(num_starts):

        # STRATIFY DRUGS 18X OR LESS: one fold per drug
        splitter = MultilabelStratifiedKFold(n_splits = num_splits, shuffle = True, random_state = seed)
        per_drug = scored.groupby('drug_id')[targets].mean().loc[rare_drugs]
        drug_to_fold = {}
        for fold, (_, val_idx) in enumerate(splitter.split(per_drug, per_drug[targets])):
            for drug_name in per_drug.index[val_idx].values:
                drug_to_fold[drug_name] = fold

        # STRATIFY DRUGS MORE THAN 18X: one fold per sample
        splitter = MultilabelStratifiedKFold(n_splits = num_splits, shuffle = True, random_state = seed)
        common_rows = scored.loc[scored.drug_id.isin(common_drugs)].reset_index(drop = True)
        sig_to_fold = {}
        for fold, (_, val_idx) in enumerate(splitter.split(common_rows, common_rows[targets])):
            for sig in common_rows.sig_id[val_idx].values:
                sig_to_fold[sig] = fold

        # ASSIGN FOLDS: by drug first, then by sample for the rest
        fold_of_row = scored.drug_id.map(drug_to_fold)
        unassigned = fold_of_row.isna()
        fold_of_row.loc[unassigned] = scored.loc[unassigned, 'sig_id'].map(sig_to_fold)
        fold_rows.append(fold_of_row.astype('int8').values)

    return np.stack(fold_rows)

In [None]:
def seed_everything(seed_value):
    """Seed the Python, NumPy and PyTorch RNGs and set ``PYTHONHASHSEED``.

    When CUDA is available the CUDA generators are seeded too, cuDNN is put
    into deterministic mode and its benchmark mode is switched off.
    """
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [None]:
# Cross-validation / training schedule.
nfolds, nstarts = 5, 3
nepochs = 1000
batch_size, val_batch_size = 128, 1024

# Target names and their count, taken from the scored-targets table.
targets = list(train_targets.columns)
ntargets = len(targets)

# Training uses the label-smoothed loss on logits; validation uses plain BCE
# on probabilities.
criterion_train = sbcewlogits
criterion_val = nn.BCELoss()

# for GPU/CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# for TPU
# device = xm.xla_device()
# torch.set_default_tensor_type('torch.FloatTensor')

In [None]:
class SwishFunction(torch.autograd.Function):
    """Swish activation ``x * sigmoid(x)`` with a hand-written backward pass.

    Only the input is saved for backward; the sigmoid is recomputed there.
    """
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x * torch.sigmoid(x)

    @staticmethod
    def backward(ctx, grad_output):
        # `saved_tensors` is the supported accessor; `saved_variables` is a
        # deprecated alias.
        (x,) = ctx.saved_tensors
        sigmoid = torch.sigmoid(x)
        # d/dx [x * s(x)] = s(x) * (1 + x * (1 - s(x)))
        return grad_output * (sigmoid * (1 + x * (1 - sigmoid)))
F_swish = SwishFunction.apply

class Swish(nn.Module):
    """Module wrapper around ``F_swish``."""
    def forward(self, x):
        return F_swish(x)

class GroupLinear(nn.Module):
    """Linear layer applied to ``group`` equal slices of the feature axis.

    A ``(batch, dim)`` input is reshaped to ``(batch, group, dim // group)``,
    a single ``nn.Linear(in_dim // group, out_dim // group)`` is applied to
    every slice (the weights are shared between slices), and the result is
    flattened back to ``(batch, out_dim)``.

    Args:
        in_dim: input feature width; must be divisible by ``group``.
        out_dim: output feature width; must be divisible by ``group``.
        group: number of slices.

    Raises:
        ValueError: if ``group`` is not positive or does not divide both
            ``in_dim`` and ``out_dim``.
    """
    def __init__(self, in_dim, out_dim, group=4):
        super(GroupLinear, self).__init__()
        # Without this check the floor divisions below would silently build a
        # layer with the wrong width.
        if group <= 0 or in_dim % group or out_dim % group:
            raise ValueError(
                f"in_dim ({in_dim}) and out_dim ({out_dim}) must both be "
                f"divisible by a positive group ({group})")
        self.group = group
        self.linear = nn.Linear(in_dim//group, out_dim//group)

    def forward(self, x):
        batch_size, dim = x.shape
        g = self.group
        x = x.reshape(batch_size, g, dim//g)
        x = self.linear(x)
        x = x.reshape(batch_size, -1)
        return x

class ResAdd(nn.Module):
    """Residual wrapper: ``forward(x)`` returns ``module(x) + x``."""
    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, x):
        transformed = self.module(x)
        return transformed + x

# class Net(nn.Module):
#     def __init__(self, num_feats, num_targets, 
#                  hidden_units = [512, 128, 128, 256, 512], 
#                  dropout_rates = [0.22023758845836414, 0.44095268503187185, 0.33039674088592896, 0.17879173200029594, 0.31790411449084355], 
#                  num_groups = [16, 4, 16, 8]):
#         super(Net, self).__init__()
#         self.resnet = nn.Sequential(
#             nn.BatchNorm1d(num_feats),
#             nn.Dropout(dropout_rates[0]),
#             nn.Linear(num_feats, hidden_units[0]),    # mix
#             GroupLinear(hidden_units[0], hidden_units[1], num_groups[0]),   # group
#             nn.BatchNorm1d(hidden_units[1]),
#             Swish(),
#             nn.Dropout(dropout_rates[1]),#

#             ResAdd(nn.Sequential(
#                 nn.Linear(hidden_units[1], hidden_units[2]),      # mix
#                 GroupLinear(hidden_units[2], hidden_units[1], num_groups[1]), # group
#                 nn.BatchNorm1d(hidden_units[1]),
#                 Swish(),
#                 nn.Dropout(dropout_rates[2]),
#             )),
#             ResAdd(nn.Sequential(
#                 nn.Linear(hidden_units[1], hidden_units[3]),      # mix
#                 GroupLinear(hidden_units[3], hidden_units[1], num_groups[2]), # group
#                 nn.BatchNorm1d(hidden_units[1]),
#                 Swish(),
#                 nn.Dropout(dropout_rates[3]),
#             )),
#             ResAdd(nn.Sequential(
#                 nn.Linear(hidden_units[1], hidden_units[4]),      # mix
#                 GroupLinear(hidden_units[4], hidden_units[1], num_groups[3]), # group
#                 nn.BatchNorm1d(hidden_units[1]),
#                 Swish(),
#                 nn.Dropout(dropout_rates[4]),
#             )),
#         )

#         self.logit = nn.Linear(hidden_units[1], num_targets)

#     def forward(self, x):
#         x = self.resnet(x)
#         return self.logit(x)

class Net(nn.Module):
    """Residual MLP built from dense + grouped-linear layers; outputs logits.

    Layout of ``self.resnet``:
        BatchNorm -> Dropout -> Linear(num_feats, hidden_units[0])
        -> GroupLinear(hidden_units[0], hidden_units[1]) -> BatchNorm
        -> Swish -> Dropout -> 2 x residual block of width hidden_units[1]
    followed by ``self.logit``, a Linear layer to ``num_targets``.

    Args:
        num_feats: number of input features.
        num_targets: number of output logits.
        hidden_units: ``[stem width, block width]``.
        dropout_rates: ``[input, after stem, inside residual blocks]``.
        num_groups: ``[groups in the stem, groups in the residual blocks]``.
    """
    def __init__(self, num_feats, num_targets, hidden_units = [512, 256], 
                 dropout_rates = [0.3, 0.2, 0.2], 
                 num_groups = [16, 8]):
        super().__init__()
        stem_width, width = hidden_units[0], hidden_units[1]
        input_drop, stem_drop, block_drop = dropout_rates[0], dropout_rates[1], dropout_rates[2]

        def residual_block():
            # mix (dense) -> group -> norm -> activation -> dropout, plus skip
            return ResAdd(nn.Sequential(
                nn.Linear(width, width),
                GroupLinear(width, width, num_groups[1]),
                nn.BatchNorm1d(width),
                Swish(),
                nn.Dropout(block_drop),
            ))

        # Layers are created in network order. Append further
        # residual_block() calls here to deepen the network.
        layers = [
            nn.BatchNorm1d(num_feats),
            nn.Dropout(input_drop),
            nn.Linear(num_feats, stem_width),                  # mix
            GroupLinear(stem_width, width, num_groups[0]),     # group
            nn.BatchNorm1d(width),
            Swish(),
            nn.Dropout(stem_drop),
            residual_block(),
            residual_block(),
        ]
        self.resnet = nn.Sequential(*layers)

        self.logit = nn.Linear(width, num_targets)

    def forward(self, x):
        hidden = self.resnet(x)
        return self.logit(hidden)

# Features

In [None]:
# dataset class
class MoaDataset(Dataset):
    """Array-backed dataset of feature rows and, in train mode, target rows.

    Args:
        df: 2-D array of features (indexed as ``df[:, feats_idx]``).
        targets: per-row targets; only stored when ``mode == 'train'``.
        feats_idx: column indices to keep as model inputs.
        mode: ``'train'`` yields ``(features, targets)`` float tensors,
            ``'test'`` yields ``(features, 0)``.
    """
    def __init__(self, df, targets, feats_idx, mode='train'):
        self.mode = mode
        self.feats = feats_idx
        self.data = df[:, feats_idx]
        if mode == 'train':
            self.targets = targets

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Any other mode yields None, as nothing is defined for it.
        if self.mode not in ('train', 'test'):
            return None
        features = torch.FloatTensor(self.data[idx])
        label = torch.FloatTensor(self.targets[idx]) if self.mode == 'train' else 0
        return features, label

In [None]:
# Pseudo-labelling inputs: predictions from an earlier submission serve as
# targets for the treated test rows, which are later appended to every
# training fold.
ss_pseudo = pd.read_csv('../input/drug-sub/submission_pbest.csv').drop('sig_id', axis = 1)
test_is_treated = test['cp_type'] == 0
pseudo_targets = ss_pseudo.loc[test_is_treated, cols].values
pseudo_train = test.loc[test_is_treated].values

In [None]:
# From here on the features and scored targets are plain NumPy arrays.
# This cell is not re-runnable: the names are rebound to arrays.
train, test = train.values, test.values
train_targets = train_targets.values

# Training

The model I use here differs from the one suggested in the paper. Here, I flatten the outputs of all the trees and pass them through a final dense layer. I found that this approach accelerates convergence and produces better results.

P.S. Using more trees significantly increases the training time but brings no obvious improvement on CV or LB.

In [None]:
# One row of fold labels per seed.
folds_split = create_folds(num_starts = nstarts, num_splits = nfolds)
print(folds_split)

In [None]:
# Train one model per (seed, fold). Each fold's training rows are extended
# with the pseudo-labelled test rows. The weights with the lowest validation
# loss are written to `ResDT_{seed}_Fold_{n+1}.pt`.

# Stop once the validation loss has failed to improve for more than this many
# consecutive epochs.
EARLY_STOPPING_PATIENCE = 10

for nums, seed in enumerate(range(nstarts)):
    seed_everything(seed)
    # NOTE: checkpoints are keyed by the enumeration position `n`, so the
    # inference cell must enumerate the folds with the same expression.
    for n, foldno in enumerate(set(folds_split[nums])):
        tr = folds_split[nums] != foldno
        te = folds_split[nums] == foldno
        start_time = time.time()
        xtrain, xval = train[tr], train[te]
        ytrain, yval = train_targets[tr], train_targets[te]

        # Pseudo Labeling
        xtrain = np.concatenate([xtrain, pseudo_train])
        ytrain = np.concatenate([ytrain, pseudo_targets])

        train_set = MoaDataset(xtrain, ytrain, top_feats)
        val_set = MoaDataset(xval, yval, top_feats)

        dataloaders = {
            'train': DataLoader(train_set, batch_size=batch_size, shuffle=True),
            'val': DataLoader(val_set, batch_size=val_batch_size, shuffle=False)
        }

        # Output width follows the target table instead of a hard-coded 206.
        model = Net(len(top_feats), ntargets).to(device)
        checkpoint_path = f'ResDT_{seed}_Fold_{n+1}.pt'
        optimizer = optim.Adam(model.parameters(), lr = 1e-3, weight_decay = 1e-5)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.1, 
                                                         patience = 3, eps = 1e-4, verbose = False)
        best_loss = {'train': np.inf, 'val': np.inf}

        es_count = 0
        for epoch in range(nepochs):
            epoch_loss = {'train': 0.0, 'val': 0.0}

            for phase in ['train', 'val']:
                is_train = phase == 'train'
                if is_train:
                    model.train()
                else:
                    model.eval()

                running_loss = 0.0
                n_batches = len(dataloaders[phase])

                for i, (x, y) in enumerate(dataloaders[phase]):
                    x, y = x.to(device), y.to(device)

                    with torch.set_grad_enabled(is_train):
                        if is_train:
                            # Gradients only exist in the training phase.
                            optimizer.zero_grad()
                            preds = model(x)
                            loss = criterion_train(preds, y)
                            loss.backward()
                            optimizer.step()
#                             xm.optimizer_step(optimizer, barrier = True)
                        else:
                            # criterion_val expects probabilities, not logits.
                            preds = torch.sigmoid(model(x))
                            loss = criterion_val(preds, y)

                    # Epoch loss is the plain average of the per-batch means
                    # (a smaller final batch carries the same weight).
                    running_loss += loss.item() / n_batches

                epoch_loss[phase] = running_loss

            scheduler.step(epoch_loss['val'])

            if epoch_loss['val'] < best_loss['val']:
                best_loss = epoch_loss
                torch.save(model.state_dict(), checkpoint_path)
                es_count = 0
            else:
                es_count += 1

#             print("Epoch {}/{} - loss: {:5.5f} - val_loss: {:5.5f} - es: {}".format(epoch+1, nepochs, epoch_loss['train'], epoch_loss['val'], es_count))

            if es_count > EARLY_STOPPING_PATIENCE:
                break

        print("[{}] - seed: {} - fold: {} - best val_loss: {:5.5f}".format(str(datetime.timedelta(seconds = time.time() - start_time))[0:7], seed, n, best_loss['val']))

In [None]:
# Buffers for the inference results: out-of-fold predictions per seed, their
# targets, and the averaged test predictions.
n_train_rows, n_test_rows = len(train), len(test)
oof = np.zeros((n_train_rows, nstarts, ntargets))
oof_targets = np.zeros((n_train_rows, ntargets))
preds = np.zeros((n_test_rows, ntargets))

In [None]:
def mean_log_loss(y_true, y_pred):
    """Average of ``log_loss`` computed separately for each target column.

    The number of columns is taken from the module-level ``targets`` list;
    ``y_true`` / ``y_pred`` are 2-D arrays indexed as ``[:, i]``.
    """
    per_target = [
        log_loss(y_true[:, i], y_pred[:, i].astype(float), labels=[0,1])
        for i in range(len(targets))
    ]
    return np.mean(per_target)

# Inference

In [None]:
# Out-of-fold and test inference with the saved checkpoints.
#   res         - OOF predictions averaged over seeds, in `train` row order
#   oof         - OOF predictions per seed, in `train` row order
#   oof_targets - targets matching the rows of `oof`
#   preds       - test predictions averaged over folds and seeds
res = np.zeros(train_targets.shape)
# Reset here as well, so re-running this cell does not keep accumulating.
preds = np.zeros((len(test), ntargets))

# The test inputs are identical for every model, so build their loader once.
test_loader = DataLoader(MoaDataset(test, None, top_feats, mode='test'),
                         batch_size=val_batch_size, shuffle=False)

for nums, seed in enumerate(range(nstarts)):
    print(f"Inference for seed {seed}")
    seed_preds = np.zeros((len(test), ntargets, nfolds))

    # Same enumeration as in training: checkpoints are keyed by position `n`.
    for n, foldno in enumerate(set(folds_split[nums])):
        te = folds_split[nums] == foldno
        xval, yval = train[te], train_targets[te]

        val_loader = DataLoader(MoaDataset(xval, yval, top_feats),
                                batch_size=val_batch_size, shuffle=False)

        checkpoint_path = f'ResDT_{seed}_Fold_{n+1}.pt'
        # Output width follows the target table instead of a hard-coded 206.
        model = Net(len(top_feats), ntargets).to(device)
        model.load_state_dict(torch.load(checkpoint_path, map_location = device))
        model.eval()

        with torch.no_grad():
            fold_oof = torch.cat(
                [torch.sigmoid(model(x.to(device))) for x, _ in val_loader],
                dim=0).cpu().numpy()
            fold_preds = torch.cat(
                [torch.sigmoid(model(x.to(device))) for x, _ in test_loader],
                dim=0).cpu().numpy()

        print(f'Score of seed {seed} fold {n}:\t', log_loss_metric(train_targets[te], fold_oof))
        res[te] += fold_oof / nstarts
        # Write by fold mask so every row lands at its own position in `train`;
        # appending fold after fold would store rows in fold order, which
        # differs between seeds.
        oof[te, nums, :] = fold_oof
        oof_targets[te] = yval
        seed_preds[:, :, n] = fold_preds

    # Average over folds, then accumulate the seed average.
    preds += np.mean(seed_preds, axis=2) / nstarts

In [None]:
# Score the seed-averaged out-of-fold predictions and keep them on disk.
overall_oof = log_loss_metric(train_targets, res)
print(f'Overall OOF Score:\t', overall_oof)
np.save('Net_oof.npy', res)

In [None]:
# OOF CV Score With Control Group
# Control rows keep a prediction of 0. Treated rows (cp_type == 0 in the
# frame encoded in place by preprocess) receive the OOF predictions.
tr_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv').drop('sig_id', axis = 1)
all_truth = tr_targets[cols].values
treated_mask = (train_features['cp_type'] == 0).values
res_all = np.zeros(all_truth.shape)
res_all[treated_mask] = res.copy()
overall_oof_score = log_loss_metric(all_truth, res_all)
print('OOF CV Score With Control Group:', overall_oof_score)

In [None]:
# Write the submission. Control rows (cp_type == 1) are forced to 0. The
# treated-row predictions are also saved as an array.
is_control = test_features['cp_type'] == 1
is_treated = test_features['cp_type'] == 0

ss[targets] = preds
ss.loc[is_control, targets] = 0
ss.to_csv('submission.csv', index = False)
np.save('Net_sub.npy', ss.loc[is_treated, cols].values)