In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
import sklearn.preprocessing
from sklearn.metrics import log_loss
from tqdm.notebook import tqdm
import operator

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import random
import os
import sys
from pathlib import Path
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from sklearn.decomposition import PCA

In [None]:
import numpy as np

from sklearn.utils import check_random_state
from sklearn.utils.validation import _num_samples, check_array
from sklearn.utils.multiclass import type_of_target

from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits, \
    BaseShuffleSplit, _validate_shuffle_split


In [None]:
seed = 42

def set_seed(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch, and
    exports ``PYTHONHASHSEED``.  When CUDA is available the CUDA generators
    are seeded as well and cuDNN is switched to deterministic,
    non-benchmarking mode.

    Parameters
    ----------
    seed : int
        Seed value shared by all generators.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Nothing GPU-specific to configure on a CPU-only host.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(seed)

In [None]:
import numpy as np
from joblib import Parallel, delayed
from scipy.interpolate import interp1d
from scipy.special import erf, erfinv
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted


class GaussRankScaler(BaseEstimator, TransformerMixin):
    """Transform features by scaling each feature to a normal distribution.

    For every feature, the distinct training values are ranked, the ranks are
    rescaled and clipped to ``[-(1 - epsilon), 1 - epsilon]``, and an
    interpolation function from value to scaled rank is stored.  ``transform``
    evaluates that function and applies the inverse error function.

    Parameters
    ----------
    epsilon : float, optional, default 1e-4
        A small amount added to the lower bound or subtracted
        from the upper bound. This value prevents infinite number
        from occurring when applying the inverse error function.
    copy : boolean, optional, default True
        If False, try to avoid a copy and do inplace scaling instead.
        This is not guaranteed to always work inplace; e.g. if the data is
        not a NumPy array, a copy may still be returned.
    n_jobs : int or None, optional, default None
        Number of jobs to run in parallel.
        ``None`` means 1 and ``-1`` means using all processors.
    interp_kind : str or int, optional, default 'linear'
        Specifies the kind of interpolation as a string
        ('linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
        'previous', 'next', where 'zero', 'slinear', 'quadratic' and 'cubic'
        refer to a spline interpolation of zeroth, first, second or third
        order; 'previous' and 'next' simply return the previous or next value
        of the point) or as an integer specifying the order of the spline
        interpolator to use.
    interp_copy : bool, optional, default False
        If True, the interpolation function makes internal copies of x and y.
        If False, references to `x` and `y` are used.

    Attributes
    ----------
    interp_func_ : list
        The interpolation function for each feature in the training set.
    """

    def __init__(self, epsilon=1e-4, copy=True, n_jobs=None, interp_kind='linear', interp_copy=False):
        self.epsilon = epsilon
        self.copy = copy
        self.interp_kind = interp_kind
        self.interp_copy = interp_copy
        # Values outside the fitted range are extrapolated rather than rejected.
        self.fill_value = 'extrapolate'
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Fit interpolation function to link rank with original data for future scaling

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to fit interpolation function for later scaling along the features axis.
        y
            Ignored

        Returns
        -------
        self
        """
        X = check_array(X, copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        # One interpolation function per column of X.
        self.interp_func_ = Parallel(n_jobs=self.n_jobs)(delayed(self._fit)(x) for x in X.T)
        return self

    def _fit(self, x):
        """Build the value -> scaled-rank interpolator for a single feature column."""
        x = self.drop_duplicates(x)
        # argsort of argsort yields the rank of each (now distinct) value.
        rank = np.argsort(np.argsort(x))
        bound = 1.0 - self.epsilon
        factor = np.max(rank) / 2.0 * bound
        # Clipping keeps every knot strictly inside (-1, 1) so erfinv stays finite.
        scaled_rank = np.clip(rank / factor - bound, -bound, bound)
        return interp1d(
            x, scaled_rank, kind=self.interp_kind, copy=self.interp_copy, fill_value=self.fill_value)

    def transform(self, X, copy=None):
        """Scale the data with the Gauss Rank algorithm

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not.

        Returns
        -------
        ndarray, shape (n_samples, n_features)
            The transformed data.
        """
        check_is_fitted(self, 'interp_func_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._transform)(i, x) for i, x in enumerate(X.T))).T
        return X

    def _transform(self, i, x):
        """Map one feature column through its interpolator and the inverse error function."""
        # Extrapolation beyond the fitted range (or a higher-order spline
        # overshooting between knots) can produce values at or beyond +/-1,
        # where erfinv returns inf/NaN.  Clamp to the same bound used in _fit.
        bound = 1.0 - self.epsilon
        scaled = np.clip(self.interp_func_[i](x), -bound, bound)
        return erfinv(scaled)

    def inverse_transform(self, X, copy=None):
        """Scale back the data to the original representation

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data used to scale along the features axis.
        copy : bool, optional (default: None)
            Copy the input X or not.

        Returns
        -------
        ndarray, shape (n_samples, n_features)
            The data mapped back through the inverse interpolation.
        """
        check_is_fitted(self, 'interp_func_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite=True)

        X = np.array(Parallel(n_jobs=self.n_jobs)(delayed(self._inverse_transform)(i, x) for i, x in enumerate(X.T))).T
        return X

    def _inverse_transform(self, i, x):
        """Invert one feature column by interpolating scaled rank -> original value."""
        # Swap the fitted interpolator's knots to get the inverse mapping.
        inv_interp_func = interp1d(self.interp_func_[i].y, self.interp_func_[i].x, kind=self.interp_kind,
                                   copy=self.interp_copy, fill_value=self.fill_value)
        return inv_interp_func(erf(x))

    @staticmethod
    def drop_duplicates(x):
        """Return the first occurrence of each distinct value of ``x``, in original order."""
        is_unique = np.zeros_like(x, dtype=bool)
        is_unique[np.unique(x, return_index=True)[1]] = True
        return x[is_unique]

import ctypes

# 'caffe2_nvrtc.dll' is a Windows DLL; trying to load it on any other platform
# raises OSError and aborts a Run All.  Only attempt the preload on Windows,
# where a failure still propagates exactly as before.
if sys.platform == 'win32':
    ctypes.cdll.LoadLibrary('caffe2_nvrtc.dll')

In [None]:
p_min = 1e-15
p_max = 1 - p_min

def score(y_true, y_pred):
    """Mean binary cross-entropy over every element of ``y_true`` / ``y_pred``.

    Predictions are clipped to ``[p_min, p_max]`` before taking logarithms so
    that exact 0 or 1 predictions yield a finite loss.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted probabilities, broadcastable against ``y_true``.

    Returns
    -------
    float
        The averaged negative log-likelihood.
    """
    truth = np.asarray(y_true)
    prob = np.clip(np.asarray(y_pred), p_min, p_max)
    log_likelihood = truth * np.log(prob) + (1 - truth) * np.log(1 - prob)
    return -log_likelihood.mean()

In [None]:
# The sample submission is read first: its column dtypes are reused when
# parsing the scored training targets.
test_Y = pd.read_csv('../input/lish-moa/sample_submission.csv', index_col='sig_id')
target_dtypes = {column: test_Y.dtypes[column] for column in test_Y}

train_X = pd.read_csv('../input/lish-moa/train_features.csv', index_col='sig_id')
train_Y = pd.read_csv(
    '../input/lish-moa/train_targets_scored.csv',
    index_col='sig_id',
    dtype=target_dtypes,
)
test_X = pd.read_csv('../input/lish-moa/test_features.csv', index_col='sig_id')
drug_ids = pd.read_csv('../input/lish-moa/train_drug.csv', index_col='sig_id')

In [None]:
# Apply the same in-place encoding to both feature frames:
#   * cp_time is divided by 24,
#   * real_drug flags rows whose cp_type is 'trt_cp',
#   * cp_type is dropped and cp_dose is re-added as 1, or 2 for 'D2'
#     (re-adding moves it to the last column).
for frame in (train_X, test_X):
    frame.cp_time = frame.cp_time / 24
    frame['real_drug'] = frame.cp_type == 'trt_cp'

    # Keep the raw dose labels; the column itself is dropped just below.
    t = frame.cp_dose.copy()
    frame.drop(columns=['cp_dose', 'cp_type'], inplace=True)
    frame['cp_dose'] = 1
    frame.loc[(t == 'D2'), 'cp_dose'] = 2

In [None]:
# Cross-validation and training hyper-parameters shared by the cells below.
nfolds, nstarts, nepochs = 12, 1, 50
batch_size = 128
val_batch_size = 4 * batch_size  # evaluation batches are four times larger

criterion = nn.BCELoss()
kfold = GroupKFold(n_splits=nfolds)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
class Dataset_my(Dataset):
    """Map-style dataset over a row-indexable feature matrix and optional targets.

    Parameters
    ----------
    df : indexable
        Feature rows; ``df[idx]`` is converted with ``torch.FloatTensor``.
    targets : indexable or None
        Target rows; used (and required) only when ``mode == 'train'``.
    mode : {'train', 'test'}, default 'train'
        In ``'train'`` mode items are ``(features, targets)`` tensors; in
        ``'test'`` mode items are ``(features, 0)``.

    Raises
    ------
    ValueError
        If ``mode`` is not ``'train'`` or ``'test'``, or if ``mode`` is
        ``'train'`` and ``targets`` is ``None``.
    """

    _MODES = ('train', 'test')

    def __init__(self, df, targets, mode='train'):
        # Fail early: an unknown mode used to make __getitem__ return None,
        # which only surfaced later as an opaque collate error.
        if mode not in self._MODES:
            raise ValueError(f"mode must be one of {self._MODES}, got {mode!r}")
        if mode == 'train' and targets is None:
            raise ValueError("targets must be provided when mode == 'train'")

        self.mode = mode
        self.data = df
        self.targets = targets if mode == 'train' else None

    def __getitem__(self, idx):
        features = torch.FloatTensor(self.data[idx])
        if self.mode == 'train':
            return features, torch.FloatTensor(self.targets[idx])
        # Test mode: a placeholder target keeps the (x, y) item shape.
        return features, 0

    def __len__(self):
        return len(self.data)

In [None]:
def run_CV_for_model(cur_model, model_num, train_X_loc, train_Y_loc, test_X_loc, labels_loc):
    """Train one model per cross-validation fold and checkpoint each fold's best epoch.

    Relies on the notebook-level globals ``seed``, ``kfold``, ``device``,
    ``criterion``, ``nepochs``, ``batch_size``, ``val_batch_size`` and
    ``placebo_label``.

    Parameters
    ----------
    cur_model : callable
        Called as ``cur_model(n_features)`` to build a fresh model per fold.
    model_num : int or str
        Used only to name the checkpoint directory ``./saved_params/model{model_num}``.
    train_X_loc : indexable with ``.shape``
        Feature rows (presumably a 2-D NumPy array -- it is indexed with the
        fold index arrays; confirm against the caller).
    train_Y_loc : indexable
        Target rows aligned with ``train_X_loc``.
    test_X_loc
        Not used by this function.
    labels_loc : array-like
        Group label per row, passed to ``kfold.split`` as ``groups``.

    Returns
    -------
    dict
        ``{'train': ..., 'val': ...}`` losses of the best epoch of the LAST
        fold only (``best_loss`` is re-initialised at the start of every fold).
    """
    set_seed(seed)
    # If there are more feature rows than group labels, extend the labels to
    # the same length and assign `placebo_label` to all of the extra rows.
    if len(train_X_loc) > len(labels_loc):
        t = len(labels_loc)
        labels_loc = np.resize(labels_loc, train_X_loc.shape[0])
        labels_loc[t:] = placebo_label
        
    for n, (tr, te) in enumerate(kfold.split(train_Y_loc, train_Y_loc, groups=labels_loc)):
        print(f'Train fold {n+1}')
        xtrain, xval = train_X_loc[tr], train_X_loc[te]
        ytrain, yval = train_Y_loc[tr], train_Y_loc[te]

        train_set = Dataset_my(xtrain, ytrain)
        val_set = Dataset_my(xval, yval)

        dataloaders = {
            'train': DataLoader(train_set, batch_size=batch_size, shuffle=True),
            'val': DataLoader(val_set, batch_size=val_batch_size, shuffle=False)
        }

        # Fresh model, optimizer and scheduler for every fold.
        model = cur_model(train_X_loc.shape[1]).to(device)
        Path(f'./saved_params/model{model_num}').mkdir(parents=True, exist_ok=True)
        checkpoint_path = f'./saved_params/model{model_num}/repeat_{1}_Fold_{n+1}.pt'
        optimizer = optim.Adam(model.parameters(), weight_decay=1e-5)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, eps=1e-4, verbose=True)
        best_loss = {'train': np.inf, 'val': np.inf}

        for epoch in range(nepochs):
            epoch_loss = {'train': 0.0, 'val': 0.0}

            for phase in ['train', 'val']:
                if phase == 'train':
                    model.train()
                else:
                    model.eval()

                running_loss = 0.0

                for i, (x, y) in enumerate(dataloaders[phase]):
                    x, y = x.to(device), y.to(device)

                    optimizer.zero_grad()

                    # Gradients are tracked only while training.
                    with torch.set_grad_enabled(phase=='train'):
                        preds = model(x)
                        loss = criterion(preds, y)

                        if phase=='train':
                            loss.backward()
                            optimizer.step()

                    # Accumulates the unweighted mean of the per-batch losses.
                    running_loss += loss.item() / len(dataloaders[phase])

                epoch_loss[phase] = running_loss

            print("Epoch {}/{}   -   loss: {:5.5f}   -   val_loss: {:5.5f}".format(epoch+1, nepochs, epoch_loss['train'], epoch_loss['val']))

            # The learning-rate schedule follows the validation loss.
            scheduler.step(epoch_loss['val'])

            # Keep only the weights of the epoch with the lowest validation loss.
            if epoch_loss['val'] < best_loss['val']:
                best_loss = epoch_loss
                torch.save(model.state_dict(), checkpoint_path)
    return best_loss

In [None]:
def run_predict_for_model(cur_model, model_num, test_X_loc,
                          checkpoint_dir='../input/notebook83e8d90dce/saved_params'):
    """Average the predictions of every fold checkpoint of one model.

    Relies on the notebook-level globals ``nfolds``, ``test_Y`` (for the
    number of target columns), ``val_batch_size`` and ``device``.

    Parameters
    ----------
    cur_model : callable
        Called as ``cur_model(n_features)`` to build the network that the
        checkpoint weights are loaded into.
    model_num : int or str
        Selects the ``model{model_num}`` sub-directory of ``checkpoint_dir``.
    test_X_loc : indexable with ``.shape``
        Feature rows to predict on.
    checkpoint_dir : str, optional
        Root directory holding ``model{model_num}/repeat_1_Fold_{k}.pt``.
        The default is the previously hard-coded location; pass
        ``'./saved_params'`` to use checkpoints written by
        ``run_CV_for_model`` in the current session.

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_targets)
        Mean of the per-fold predictions.
    """
    preds = np.zeros((test_X_loc.shape[0], test_Y.shape[1], nfolds))

    # The dataset and loader do not depend on the fold, so build them once.
    test_set = Dataset_my(test_X_loc, None, mode='test')
    dataloader = DataLoader(test_set, batch_size=val_batch_size, shuffle=False)

    for n in range(nfolds):
        checkpoint_path = f'{checkpoint_dir}/model{model_num}/repeat_{1}_Fold_{n+1}.pt'
        model = cur_model(test_X_loc.shape[1]).to(device)
        # map_location lets checkpoints saved on a GPU load on a CPU-only host.
        # NOTE(review): torch.load unpickles the file -- only load checkpoints
        # from a trusted source.
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        model.eval()

        fold_preds = []
        with torch.no_grad():
            for x, _ in dataloader:
                fold_preds.append(model(x.to(device)))

        preds[:, :, n] = torch.cat(fold_preds, dim=0).cpu().numpy()

    return preds.mean(axis=2)

In [None]:
class Model4(nn.Module):
    """Three-layer fully connected network with sigmoid outputs.

    Every dense layer is preceded by batch normalisation and dropout and is
    wrapped in weight normalisation.  Layer widths are
    ``num_columns -> 2048 -> 1024 -> 206``.

    Parameters
    ----------
    num_columns : int
        Number of input features.
    """

    def __init__(self, num_columns):
        super().__init__()
        # Input stage.
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, 2048))

        # Hidden stage.
        self.batch_norm2 = nn.BatchNorm1d(2048)
        self.dropout2 = nn.Dropout(0.5)
        self.dense2 = nn.utils.weight_norm(nn.Linear(2048, 1024))

        # Output stage.
        self.batch_norm3 = nn.BatchNorm1d(1024)
        self.dropout3 = nn.Dropout(0.5)
        self.dense3 = nn.utils.weight_norm(nn.Linear(1024, 206))

    def forward(self, x):
        """Return per-target probabilities for a batch of feature rows."""
        hidden = F.leaky_relu(self.dense1(self.dropout1(self.batch_norm1(x))))
        hidden = F.leaky_relu(self.dense2(self.dropout2(self.batch_norm2(hidden))))
        logits = self.dense3(self.dropout3(self.batch_norm3(hidden)))
        return torch.sigmoid(logits)

In [None]:
# Test rows whose cp_type is not 'trt_cp' are appended to the training targets
# with every target set to 0.
t = test_Y[test_X['real_drug'] == False].copy()
for f in t:
    t[f] = 0
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way (original indices kept, columns aligned by name).
train_Y4 = pd.concat([train_Y.reset_index(drop=True), t])
train_X4 = train_X.reset_index(drop=True)
test_X4 = test_X

# Train and test features are stacked so the statistics and later transforms
# are computed over both; the helper flag column is not a model feature.
all_X4 = pd.concat([train_X4, test_X4]).drop(columns=['real_drug'])

features_g = [col for col in train_X4.columns if 'g-' in col]
features_c = [col for col in train_X4.columns if 'c-' in col]

# Row-wise summary statistics for the 'g-' columns, the 'c-' columns and both
# together.  Columns are added in the order g_*, c_*, gc_* with
# sum, mean, std, kurt, skew inside each group.
for prefix, cols in (('g', features_g), ('c', features_c), ('gc', features_g + features_c)):
    group = all_X4[cols]
    all_X4[f'{prefix}_sum'] = group.sum(axis = 1)
    all_X4[f'{prefix}_mean'] = group.mean(axis = 1)
    all_X4[f'{prefix}_std'] = group.std(axis = 1)
    all_X4[f'{prefix}_kurt'] = group.kurtosis(axis = 1)
    all_X4[f'{prefix}_skew'] = group.skew(axis = 1)



In [None]:
# Gauss-rank scale every column; the scaler is fitted on the stacked
# train+test matrix and returns a NumPy array.
scaler = GaussRankScaler()
all_X4 = scaler.fit_transform(all_X4)

In [None]:
# Project the scaled features onto 687 principal components, again fitted on
# the stacked train+test matrix.
# NOTE(review): no random_state is passed -- confirm the chosen SVD solver is
# deterministic, otherwise results depend on the global NumPy seed.
pca_transformer = PCA(687)
all_X4 = pca_transformer.fit_transform(all_X4)

In [None]:
# Split the stacked matrix back into its train and test parts.  The split
# point comes from train_X (train_X4 started as train_X.reset_index(), so the
# row count is the same) rather than from train_X4, which is rebound here and
# extended in a later cell -- this keeps the cell correct when re-run.
n_train_rows = train_X.shape[0]
train_X4 = all_X4[:n_train_rows]
test_X4 = all_X4[n_train_rows:]

In [None]:
# Append the test rows that are not real treatments to the training features
# (their targets were already appended as all-zero rows).
control_rows = test_X['real_drug'] == False
train_X4 = np.concatenate([train_X4, test_X4[control_rows]], axis=0)

In [None]:
# Blend every target row with its own row mean (weight alpha_smoothing).
alpha_smoothing = 0.001
train_Y4 = train_Y4.values
row_mean = train_Y4.mean(axis=1, keepdims=True)
train_Y4 = (1 - alpha_smoothing) * train_Y4 + alpha_smoothing * row_mean

In [None]:
# LabelEncoder expects a 1-D array; drug_ids is a DataFrame, so flatten its
# values instead of relying on implicit column-vector conversion.
drug_id_values = drug_ids.values.ravel()
encoder = sklearn.preprocessing.LabelEncoder().fit(drug_id_values)
drug_labels = encoder.transform(drug_id_values)
# Encoded label of drug id 'cacb2b860', used as the group for appended rows
# (presumably the control vehicle -- confirm against the data description).
placebo_label = encoder.transform(['cacb2b860'])[0]

In [None]:
# Train Model4 with grouped cross-validation; checkpoints are written under
# ./saved_params/model4/ and the returned loss dict is shown as the cell output.
run_CV_for_model(Model4, 4, train_X4, train_Y4, test_X4, drug_labels)

In [None]:
# Fold-averaged predictions on the training matrix.
# NOTE(review): every fold model is applied to every row, so this is not an
# out-of-fold estimate.
train_P4 = run_predict_for_model(Model4, 4, train_X4)

In [None]:
# Sanity check: predictions and targets should have the same shape.
train_P4.shape, train_Y4.shape

In [None]:
# Log loss of the raw fold-averaged predictions against the smoothed targets.
score(train_Y4, train_P4)

In [None]:
# Rows that are not real treatments get all-zero predictions.
n_train_rows = train_X.shape[0]
# Basic slicing returns a view, so the masked assignment writes into train_P4.
train_P4[:n_train_rows][(train_X['real_drug'] == False).to_numpy()] = 0
# Every row after the original training rows is an appended non-treatment test
# row, so the whole tail is zeroed.  (The original indexed with
# `train_X.shape[0]` instead of `train_X.shape[0]:`, zeroing only one row.)
train_P4[n_train_rows:] = 0

In [None]:
# Log loss again, after forcing the non-treatment rows to zero.
score(train_Y4, train_P4)

In [None]:
# Fold-averaged predictions for the test feature matrix.
test_P = run_predict_for_model(Model4, 4, test_X4)

In [None]:
# Copy the predictions into the submission frame, then force the rows that are
# not real treatments to zero.
target_columns = list(test_Y.columns)
test_Y[target_columns] = test_P
control_rows = test_X['real_drug'] == False
test_Y[control_rows] = 0

In [None]:
# Display the submission frame.
test_Y

In [None]:
# Mean predicted probability over all test rows and targets.
test_Y.values.mean()

In [None]:
# Mean after raising predictions to the power 0.5 (the lower end of the
# exponent search below).
(test_Y.values ** 0.5).mean()

In [None]:
# Bisection over the exponent in [0.5, 1]: narrow the bracket until it is at
# most 1e-6 wide, keeping `l` where the mean of test_Y ** exponent is still
# above 0.003785 and `r` where it is not.
l, r = 0.5, 1
while r - l > 0.000001:
    m = (l + r) / 2
    above_target = (test_Y.values ** m).mean() > 0.003785
    l, r = (m, r) if above_target else (l, m)

In [None]:
# Mean prediction at the exponent found by the search.
(test_Y.values ** r).mean()

In [None]:
# Replace every prediction by itself raised to the power r.
# Not idempotent: re-running this cell applies the power again.
test_Y[list(test_Y.columns)] = test_Y ** r

In [None]:
# Mean prediction after the power transform.
test_Y.values.mean()

In [None]:
# Largest prediction after the power transform.
test_Y.values.max()

In [None]:
# Write the submission file; the sig_id index is written as the first column.
test_Y.to_csv('submission.csv')