In [None]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

sys.path.append('../input/autograd')
from autograd import grad
import autograd.numpy as anp
from scipy.optimize import fsolve
import datetime

import os
import gc
import math
from time import time
import random

import numpy as np
import pandas as pd
from sklearn.metrics import log_loss

from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.tensorboard import SummaryWriter       
# Create the TensorBoard log directory *before* the writer is opened; the
# previous existence check ran only after the writer had been constructed.
os.makedirs('./log', exist_ok=True)
writer = SummaryWriter('./log')

import warnings
warnings.filterwarnings("ignore")

# torch.device handle for the accelerator if one is visible, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    every CUDA device), and configures cuDNN for repeatable kernel selection.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for child processes; it does not change hashing in the
    # interpreter that is already running.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without CUDA; covers every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False


seed_everything(42)

In [None]:
def load_data(path = '../input/lish-moa/'):
    """Read the competition CSV files and strip control rows from the training side.

    Args:
        path: directory holding ``train_features.csv``, ``test_features.csv``,
            ``train_targets_nonscored.csv``, ``train_targets_scored.csv`` and
            ``sample_submission.csv``.

    Returns:
        Tuple ``(train, test_features, Y0, Y, Y0_smooth, Y_smooth, submission,
        origin_Y)``:
        train features, nonscored targets (``Y0``) and scored targets (``Y``)
        restricted to rows whose ``cp_type`` is not ``'ctl_vehicle'``; the
        label-smoothed versions of both target frames; the sample submission
        with every column after the first set to 0; and the unfiltered scored
        targets without ``sig_id`` (``origin_Y``).
    """
    def _read(filename):
        return pd.read_csv(os.path.join(path, filename))

    train_features = _read('train_features.csv')
    test_features = _read('test_features.csv')
    nonscored = _read('train_targets_nonscored.csv')
    scored = _read('train_targets_scored.csv')
    submission = _read('sample_submission.csv')

    # Boolean mask of the non-control rows, shared by features and targets.
    is_treated = train_features['cp_type'] != 'ctl_vehicle'

    def _treated_targets(frame):
        return frame[is_treated].reset_index(drop=True).drop('sig_id', axis=1)

    train = train_features[is_treated].reset_index(drop=True)
    origin_Y = scored.drop('sig_id', axis=1)
    Y0 = _treated_targets(nonscored)  # nonscored pretrain
    Y = _treated_targets(scored)
    submission.iloc[:, 1:] = 0

    # label smoothing
    Y_smooth = smooth_one_hot(Y, classes=2, smoothing=0.001)
    Y0_smooth = smooth_one_hot(Y0, classes=2, smoothing=0.001)
    return train, test_features, Y0, Y, Y0_smooth, Y_smooth, submission, origin_Y

In [None]:
def smooth_one_hot(Y, classes: int, smoothing=0.001):
    """Apply label smoothing to a numeric target frame.

    Every value ``y`` is mapped to ``y * (1 - smoothing) + smoothing / classes``.
    For hard 0/1 labels this gives ``smoothing / classes`` and
    ``1 - smoothing + smoothing / classes``; values strictly between 0 and 1
    are smoothed by the same formula instead of being left untouched.

    Args:
        Y: DataFrame of numeric labels (one column per target).
        classes: number of classes the smoothing mass is spread over.
        smoothing: smoothing strength, ``0 <= smoothing < 1``; 0 returns the
            labels numerically unchanged.

    Returns:
        A new DataFrame; ``Y`` itself is not modified.

    Raises:
        AssertionError: if ``smoothing`` is outside ``[0, 1)``.
    """
    assert 0 <= smoothing < 1
    confidence = 1.0 - smoothing
    # Vectorised arithmetic replaces two exact-equality ``replace`` passes.
    return Y * confidence + smoothing / classes

In [None]:
def apply_rankgauss(df1, df2, n_quantiles, ouput_distribution):
    """Quantile-transform the ``g-`` and ``c-`` columns of two frames in place.

    A single ``QuantileTransformer`` is fitted on ``df1`` and then applied to
    both frames, so ``df2`` is mapped with the quantiles learned from ``df1``.

    Args:
        df1: frame used for fitting; its feature columns are overwritten.
        df2: frame transformed with the fitted mapping; also overwritten.
        n_quantiles: forwarded to ``QuantileTransformer``.
        ouput_distribution: forwarded as ``output_distribution``.

    Returns:
        The same ``(df1, df2)`` objects that were passed in.
    """
    gene_cols = [name for name in df1.columns if name.startswith('g-')]
    cell_cols = [name for name in df1.columns if name.startswith('c-')]
    feature_cols = gene_cols + cell_cols

    transformer = QuantileTransformer(
        n_quantiles=n_quantiles,
        random_state=0,
        output_distribution=ouput_distribution,
    )
    transformer.fit(df1[feature_cols])
    df1[feature_cols] = transformer.transform(df1[feature_cols])
    df2[feature_cols] = transformer.transform(df2[feature_cols])
    return df1, df2


def rank_gauss(train_features, test_features):
    """Quantile-transform each ``g-``/``c-`` column with its own transformer.

    For every feature column a fresh ``QuantileTransformer`` (100 quantiles,
    normal output, ``random_state=0``) is fitted on the training values and
    applied to both frames. The frames are modified in place.

    Args:
        train_features: frame used for fitting; feature columns are overwritten.
        test_features: frame transformed with the per-column fits; overwritten.

    Returns:
        The same ``(train_features, test_features)`` objects.
    """
    gene_cols = [name for name in train_features.columns if name.startswith('g-')]
    cell_cols = [name for name in train_features.columns if name.startswith('c-')]

    for name in gene_cols + cell_cols:
        qt = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
        # The transformer expects 2-D input, hence the single-column reshape.
        train_column = train_features[name].values.reshape(-1, 1)
        test_column = test_features[name].values.reshape(-1, 1)
        qt.fit(train_column)

        train_features[name] = qt.transform(train_column).ravel()
        test_features[name] = qt.transform(test_column).ravel()
    return train_features, test_features

In [None]:
def pre_process_0(train_data, test_data):
    """Append row-wise summary statistics and squared cell features.

    For the gene columns (``g-*``), the cell columns (``c-*``) and their union
    this adds ``<prefix>_sum``, ``_mean``, ``_std``, ``_kurt`` and ``_skew``
    columns (prefix ``g``, ``c`` or ``gc``), followed by one
    ``squared_<cell column>`` column per cell feature. Both frames are
    modified in place; the column lists are taken from ``train_data``.

    Args:
        train_data: training frame containing the ``g-``/``c-`` columns.
        test_data: test frame containing the same feature columns.

    Returns:
        The same ``(train_data, test_data)`` objects with the new columns.
    """
    gene_cols = [name for name in train_data.columns if name.startswith('g-')]
    cell_cols = [name for name in train_data.columns if name.startswith('c-')]

    # (column prefix, source columns) in the order the output columns appear.
    groups = (
        ('g', gene_cols),
        ('c', cell_cols),
        ('gc', gene_cols + cell_cols),
    )
    # (column suffix, DataFrame reduction method) in output order.
    reductions = (
        ('sum', 'sum'),
        ('mean', 'mean'),
        ('std', 'std'),
        ('kurt', 'kurtosis'),
        ('skew', 'skew'),
    )

    frames = (train_data, test_data)

    # First pass: row-wise statistics.
    for frame in frames:
        for prefix, cols in groups:
            values = frame[cols]
            for suffix, method in reductions:
                frame[f'{prefix}_{suffix}'] = getattr(values, method)(axis = 1)

    # Second pass: squared cell-viability features.
    for frame in frames:
        for name in cell_cols:
            frame[f'squared_{name}'] = frame[name] ** 2

    return train_data, test_data

In [None]:
def pre_process(train_data, test_data):
    """Encode categoricals, rank-gauss the features and append PCA components.

    Steps, in order:
      1. drop ``cp_type`` and ``sig_id``;
      2. one-hot encode ``cp_time`` and ``cp_dose`` (first level dropped),
         using the category levels seen in ``train_data`` for both frames so
         the two outputs always have the same dummy columns;
      3. quantile-transform the ``g-``/``c-`` columns via ``apply_rankgauss``;
      4. append 45 PCA components of the gene columns and 15 of the cell
         columns, fitted on train and applied to test.

    Args:
        train_data: training frame with ``cp_type``, ``sig_id``, ``cp_time``,
            ``cp_dose`` and the ``g-``/``c-`` feature columns.
        test_data: test frame with the same columns.

    Returns:
        ``(train_data, test_data)`` as new frames; the inputs are not modified.
    """
    GENES = [col for col in train_data.columns if col.startswith('g-')]
    CELLS = [col for col in train_data.columns if col.startswith('c-')]

    # drop no use feature (``drop`` returns new frames, so the inputs stay intact)
    train_data = train_data.drop(['cp_type', 'sig_id'], axis=1)
    test_data = test_data.drop(['cp_type', 'sig_id'], axis=1)

    # One-hot encoding with a shared level set: encoding each frame on its own
    # yields different (or differently-dropped) columns whenever the test frame
    # lacks one of the levels present in train.
    categorical_cols = ['cp_time', 'cp_dose']
    for col in categorical_cols:
        levels = sorted(train_data[col].dropna().unique())
        train_data[col] = pd.Categorical(train_data[col], categories=levels)
        test_data[col] = pd.Categorical(test_data[col], categories=levels)
    train_data = pd.get_dummies(train_data, columns=categorical_cols, drop_first=True)
    test_data = pd.get_dummies(test_data, columns=categorical_cols, drop_first=True)

    #train_data, test_data = rank_gauss(train_data, test_data)
    train_data, test_data = apply_rankgauss(train_data, test_data, 100, 'normal')

    # PCA from kernel https://www.kaggle.com/ragnar123/moa-dnn-feature-engineering
    # this improve CV a little but LB doesn't change
    def create_pca(train, test, colunm, n_components, kind='g'):
        """Fit PCA on ``train[colunm]`` and append the components to both frames."""
        pca = PCA(n_components)

        # add pca to train data
        train_components = pca.fit_transform(train[colunm])
        names = [f'PCA_{kind}-{i}' for i in range(train_components.shape[1])]
        # Reuse the source index: concat aligns on the index, so a default
        # RangeIndex would misalign (and introduce NaN rows) for any frame
        # whose index is not 0..n-1.
        train = pd.concat(
            (train, pd.DataFrame(train_components, columns=names, index=train.index)),
            axis=1,
        )

        # add pca to test data
        test_components = pca.transform(test[colunm])
        test = pd.concat(
            (test, pd.DataFrame(test_components, columns=names, index=test.index)),
            axis=1,
        )
        return train, test

    train_data, test_data = create_pca(train_data, test_data, GENES, 45, kind='g')
    train_data, test_data = create_pca(train_data, test_data, CELLS, 15, kind='c')

    return train_data, test_data

In [None]:
class Model(nn.Module):
    """Three-layer fully connected network that outputs one logit per target.

    Layout: BatchNorm -> Dropout(0.2) -> Linear -> ReLU -> BatchNorm -> Linear
    -> ReLU -> BatchNorm -> Linear. No sigmoid is applied in ``forward``.

    Args:
        num_features: width of the input vector.
        num_targets: number of output logits.
        hidden_size: width of the first hidden layer; the second hidden layer
            uses ``hidden_size // 2``.
        init_bias: initial value of the output-layer bias. Either a scalar
            (broadcast to every target) or a tensor/array with one entry per
            target.
    """

    def __init__(self, num_features, num_targets, hidden_size=512, init_bias=0):
        super(Model, self).__init__()

        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.Linear(num_features, hidden_size)

        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dense2 = nn.Linear(hidden_size, hidden_size // 2)

        self.batch_norm3 = nn.BatchNorm1d(hidden_size // 2)

        self.dense3 = nn.Linear(hidden_size // 2, num_targets)
        # Copy the values into the existing parameter rather than rebinding
        # ``.data``: rebinding fails for the scalar default and makes the
        # parameter share storage with the caller's tensor, so in-place
        # optimizer updates would silently change that tensor as well.
        with torch.no_grad():
            self.dense3.bias.copy_(
                torch.as_tensor(init_bias, dtype=self.dense3.bias.dtype)
            )

    def forward(self, x):
        """Return raw logits of shape ``(batch, num_targets)`` for input ``x``."""
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.relu(self.dense1(x))

        x = self.batch_norm2(x)
        x = F.relu(self.dense2(x))

        x = self.batch_norm3(x)
        x = self.dense3(x)

        return x

In [None]:
class MoADataset:
    """Map-style dataset pairing one feature row with one target row.

    ``features`` and ``targets`` are indexed as ``array[idx, :]``, so both are
    expected to be 2-D array-likes with matching first dimension.
    """

    def __init__(self, features, targets):
        self.features, self.targets = features, targets

    def __len__(self):
        # Number of samples is the number of feature rows.
        return self.features.shape[0]

    def __getitem__(self, idx):
        """Return ``{'x': features, 'y': targets}`` for row ``idx`` as float32 tensors."""
        x = torch.tensor(self.features[idx, :], dtype=torch.float32)
        y = torch.tensor(self.targets[idx, :], dtype=torch.float32)
        return {'x': x, 'y': y}


class TestDataset:
    """Map-style dataset yielding feature rows only (no targets).

    ``features`` is indexed as ``array[idx, :]``, so it is expected to be a
    2-D array-like.
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        # Number of samples is the number of feature rows.
        return self.features.shape[0]

    def __getitem__(self, idx):
        """Return ``{'x': features}`` for row ``idx`` as a float32 tensor."""
        row = torch.tensor(self.features[idx, :], dtype=torch.float32)
        return {'x': row}

In [None]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Run one training epoch and return the mean per-batch loss.

    The scheduler is stepped once per batch, right after the optimizer.

    Args:
        model: network to train; switched to training mode.
        optimizer: optimizer updating ``model``'s parameters.
        scheduler: learning-rate scheduler exposing ``step()``.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        dataloader: iterable of batches shaped like ``{'x': ..., 'y': ...}``.
        device: device the batch tensors are moved to.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    for batch in dataloader:
        optimizer.zero_grad()
        features = batch['x'].to(device)
        labels = batch['y'].to(device)

        batch_loss = loss_fn(model(features), labels)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

def BCELoss_with_clip(preds, targets, p_min=0.0001, p_max=0.9999):
    """Binary cross-entropy on probabilities clamped to ``[p_min, p_max]``.

    Args:
        preds: predicted probabilities.
        targets: target values, same shape as ``preds``.
        p_min: lower clamp applied to ``preds``.
        p_max: upper clamp applied to ``preds``.

    Returns:
        Scalar tensor with the mean loss.
    """
    loss_fn = nn.BCELoss()
    loss = loss_fn(torch.clamp(preds, p_min, p_max), targets)
    return loss

def valid_fn(model, loss_fn, dataloader, device):
    """Evaluate ``model`` on a validation loader.

    The loss is the clipped BCE of the sigmoid outputs (``BCELoss_with_clip``);
    the ``loss_fn`` argument is accepted for signature compatibility but is not
    used.

    Args:
        model: network to evaluate; switched to eval mode.
        loss_fn: unused.
        dataloader: iterable of batches shaped like ``{'x': ..., 'y': ...}``.
        device: device the batch tensors are moved to.

    Returns:
        ``(mean per-batch loss, predictions)`` where predictions is a NumPy
        array of sigmoid outputs concatenated over all batches.
    """
    model.eval()
    final_loss = 0
    valid_preds = []

    # No gradients are needed for validation; disabling autograd avoids
    # building (and holding in memory) a graph for every forward pass.
    with torch.no_grad():
        for data in dataloader:
            inputs, targets = data['x'].to(device), data['y'].to(device)
            probs = model(inputs).sigmoid()

            loss = BCELoss_with_clip(probs, targets)
            #loss = loss_fn(outputs, targets)

            final_loss += loss.item()
            valid_preds.append(probs.cpu().numpy())

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)

    return final_loss, valid_preds


def inference_fn(model, dataloader, device):
    """Predict probabilities for every batch of a target-less loader.

    Args:
        model: network to run; switched to eval mode.
        dataloader: iterable of batches shaped like ``{'x': ...}``.
        device: device the batch tensors are moved to.

    Returns:
        NumPy array of sigmoid outputs concatenated over all batches.
    """
    model.eval()
    batch_probs = []

    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['x'].to(device))
            batch_probs.append(logits.sigmoid().cpu().numpy())

    return np.concatenate(batch_probs)

In [None]:
def load_pretrain(model, path='model.pth'):
    """Copy every compatible tensor from a saved state dict into ``model``.

    A checkpoint entry is used only when ``model`` has a parameter or buffer
    of the same name *and* the same shape; everything else keeps the value
    ``model`` already has. This lets a checkpoint trained with a different
    number of outputs initialise the shared layers without
    ``load_state_dict`` raising a size-mismatch error.

    Args:
        model: module to update in place.
        path: checkpoint file written with ``torch.save(state_dict, path)``.

    Returns:
        The same ``model`` instance.
    """
    # map_location lets a checkpoint saved on GPU be read on a CPU-only host;
    # load_state_dict copies the values onto whatever device ``model`` uses.
    pretrained_dict = torch.load(path, map_location='cpu')
    model_dict = model.state_dict()
    compatible = {
        name: tensor
        for name, tensor in pretrained_dict.items()
        if name in model_dict and tensor.shape == model_dict[name].shape
    }
    model_dict.update(compatible)
    model.load_state_dict(model_dict)
    return model

In [None]:
def run_train(seed, X_train, Y_train, Y_train_smooth, x_test, train_index, valid_index):
    """Train one fold, keep the best-epoch checkpoint and predict the test set.

    The model is trained on the smoothed labels of ``train_index`` and
    validated against the hard labels of ``valid_index``. Whenever the
    validation loss improves, the weights are written to ``model.pth`` (the
    file is overwritten on every improvement and by every call) and the
    validation predictions are stored. After the last epoch the best weights
    are reloaded and used to predict ``x_test``.

    Args:
        seed: seed passed to ``seed_everything`` before the data is split.
        X_train: feature frame of all training rows.
        Y_train: hard targets aligned with ``X_train``.
        Y_train_smooth: label-smoothed targets aligned with ``X_train``.
        x_test: feature frame to predict.
        train_index: positional row indices used for training.
        valid_index: positional row indices used for validation.

    Returns:
        ``(oof, predictions)``: ``oof`` has one row per row of ``X_train`` and
        is zero except for the ``valid_index`` rows, which hold the
        best-epoch validation predictions; ``predictions`` holds the test
        probabilities.
    """
    # Output bias starts at log(mean label) per target, computed over every
    # row of Y_train. NOTE(review): a target with no positive label gives
    # log(0) = -inf here.
    INIT_BIAS = torch.Tensor(np.log(Y_train.values.mean(axis=0)))
    seed_everything(seed)
    x_train, x_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    # Smoothed labels for training, hard labels for validation.
    y_train, y_valid = Y_train_smooth.iloc[train_index], Y_train.iloc[valid_index]

    train_dataset = MoADataset(x_train.values, y_train.values)
    valid_dataset = MoADataset(x_valid.values, y_valid.values)
    # BatchNorm1d cannot normalise a single-sample batch in training mode, so
    # drop the trailing batch only when it would contain exactly one row.
    drop_last = len(train_dataset) % BATCH_SIZE == 1
    trainloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=drop_last)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

    model = Model(
        num_features=x_train.shape[1],
        num_targets=y_train.shape[1],
        hidden_size=HIDDEN_SIZE,
        init_bias=INIT_BIAS
    )

    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    # The scheduler is stepped once per batch inside train_fn.
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.05, div_factor=1e4,
                                              max_lr=0.01, epochs=EPOCHS, steps_per_epoch=len(trainloader))
    loss_fn = nn.BCEWithLogitsLoss()

    oof = np.zeros((X_train.shape[0], Y_train.shape[1]))
    best_loss = np.inf

    for epoch in range(EPOCHS):
        train_loss = train_fn(model, optimizer, scheduler, loss_fn, trainloader, DEVICE)
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        # get_last_lr() is the supported accessor; get_lr() is meant to be
        # called only from inside scheduler.step().
        message = "EPOCH: %d \t LR: %f \t train_loss: %f \t valid_loss: %f" % (
            epoch, scheduler.get_last_lr()[0], train_loss, valid_loss)
        if valid_loss < best_loss:
            best_loss = valid_loss
            oof[valid_index] = valid_preds
            torch.save(model.state_dict(), "model.pth")
            message += ", New Best!"
        print(message)

    # --------------------- PREDICTION---------------------
    testdataset = TestDataset(x_test.values)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)

    model = Model(
        num_features=x_test.shape[1],
        num_targets=Y_train.shape[1],
        hidden_size=HIDDEN_SIZE,
        init_bias=INIT_BIAS
    )
    # Load onto the CPU first; the model is moved to DEVICE right after.
    model.load_state_dict(torch.load("model.pth", map_location='cpu'))
    model.to(DEVICE)
    predictions = inference_fn(model, testloader, DEVICE)

    return oof, predictions

In [None]:
def create_folds(df_y, FOLDS, SEED):
    """Add a drug-aware ``Fold`` column to the target frame.

    Drugs that occur exactly 6, 12 or 18 times are assigned to a fold as a
    whole (all of their rows share one fold), stratified on the per-drug mean
    of the targets. Rows of every other drug are stratified individually on
    their own targets. ``df_y`` is modified in place.

    Args:
        df_y: frame with ``sig_id``, ``drug_id`` and the target columns.
        FOLDS: number of folds.
        SEED: random state for both stratified splitters.

    Returns:
        ``df_y`` with an added ``int8`` ``Fold`` column.
    """
    # LOCATE DRUGS
    drug_counts = df_y.drug_id.value_counts()
    is_grouped = drug_counts.isin([6, 12, 18])
    grouped_drugs = drug_counts.loc[is_grouped].index.sort_values()
    other_drugs = drug_counts.loc[~is_grouped].index.sort_values()

    # TARGETS
    targets = [col for col in df_y.columns if col not in ['sig_id', 'drug_id']]

    # STRATIFY DRUGS 18X OR LESS: one row per drug, fold keyed by drug_id
    drug_to_fold = {}
    drug_means = df_y.groupby('drug_id')[targets].mean().loc[grouped_drugs]
    splitter = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True,
                                         random_state=SEED)
    for fold, (_, valid_pos) in enumerate(splitter.split(drug_means, drug_means[targets])):
        for drug in drug_means.index[valid_pos].values:
            drug_to_fold[drug] = fold

    # STRATIFY DRUGS MORE THAN 18X: one row per sample, fold keyed by sig_id
    sig_to_fold = {}
    other_rows = df_y.loc[df_y.drug_id.isin(other_drugs)].reset_index(drop=True)
    splitter = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True,
                                         random_state=SEED)
    for fold, (_, valid_pos) in enumerate(splitter.split(other_rows, other_rows[targets])):
        for sig in other_rows.sig_id[valid_pos].values:
            sig_to_fold[sig] = fold

    # ASSIGN FOLDS: drug-level assignment first, sample-level for the rest
    df_y['Fold'] = df_y.drug_id.map(drug_to_fold)
    unassigned = df_y.Fold.isna()
    df_y.loc[unassigned, 'Fold'] = df_y.loc[unassigned, 'sig_id'].map(sig_to_fold)
    df_y['Fold'] = df_y['Fold'].astype('int8')
    return df_y

In [None]:
def log_loss_numpy(y_pred, y_true):
    """Mean binary cross-entropy over all (flattened) elements.

    Predictions are clipped to ``[1e-15, 1 - 1e-15]`` before the logarithm.
    The loss is ``-(y * log(p) + (1 - y) * log(1 - p))``, which equals the
    usual 0/1 log loss for hard labels and stays correct when ``y_true``
    holds probabilities.

    Args:
        y_pred: predicted probabilities (any shape).
        y_true: targets with the same number of elements as ``y_pred``.

    Returns:
        The mean loss as a float.
    """
    y_true = np.array(y_true).ravel()
    y_pred = np.array(y_pred).ravel()
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    loss = -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return loss.mean()

In [None]:
# NEW
def run_k_fold(seed, X_train, Y_train, Y_train_smooth, NFOLDS):
    """Run one complete drug-aware K-fold training pass for a single seed.

    The fold assignment is rebuilt from the raw competition CSVs (scored
    targets, features and drug ids) on every call. The test features are read
    from the notebook-level ``X_test`` variable, which must exist before this
    function is called.

    Args:
        seed: seed for fold creation and for each fold's training run.
        X_train: processed training features.
        Y_train: hard targets aligned with ``X_train``.
        Y_train_smooth: label-smoothed targets aligned with ``X_train``.
        NFOLDS: number of folds; test predictions are divided by this value.

    Returns:
        ``(oof, predictions)``: the sum of the per-fold out-of-fold arrays and
        the sum of the per-fold test predictions divided by ``NFOLDS``.
    """
    n_targets = Y_train.shape[1]
    oof = np.zeros((X_train.shape[0], n_targets))
    predictions = np.zeros((X_test.shape[0], n_targets))

    x_develop = pd.read_csv('../input/lish-moa/train_features.csv')
    y_develop = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    drugs = pd.read_csv('../input/lish-moa/train_drug.csv')

    # Keep the treated rows only, then attach drug ids; merge() returns a
    # frame with a fresh 0..n-1 index, which is what the fold indices below
    # rely on when they are used positionally in run_train.
    treated = x_develop['cp_type'] != 'ctl_vehicle'
    y_develop = y_develop[treated].merge(drugs, how='left', on='sig_id')
    y_develop = create_folds(y_develop, NFOLDS, seed)

    fold_ids = y_develop['Fold']
    for foldno in np.sort(fold_ids.unique()):
        in_fold = fold_ids == foldno
        train_index = y_develop.index[~in_fold]
        valid_index = y_develop.index[in_fold]
        print("\nFold-%d" % foldno)
        print('Train Sample Size: %d, Validation Sample Size: %d' % (len(train_index), len(valid_index)))
        fold_oof, fold_pred = run_train(seed, X_train, Y_train, Y_train_smooth, X_test, train_index, valid_index)

        predictions += fold_pred / NFOLDS
        oof += fold_oof

    return oof, predictions

In [None]:
def generate_result_with_clip(oof, predictions, origin_Y, Y_train, ctl_idx, use_ctl=False, p_min=0.0001, p_max=0.9999):
    """Print the clipped CV log loss and write ``submission.csv``.

    The submission frame is taken from the notebook-level ``submission``
    variable; every column after the first is overwritten with the clipped
    predictions, and the rows listed in ``ctl_idx`` are set to 0.

    Args:
        oof: out-of-fold predictions aligned with ``Y_train``.
        predictions: test predictions, one row per submission row.
        origin_Y: unfiltered scored targets; only its row count is used, to
            size the all-zero control block when ``use_ctl`` is set.
        Y_train: targets the out-of-fold predictions are scored against.
        ctl_idx: test rows whose predictions are forced to 0.
        use_ctl: when True, append all-zero rows (targets and predictions)
            for the removed control samples before scoring.
        p_min: lower clipping bound.
        p_max: upper clipping bound.
    """
    # Compute the CV score on clipped out-of-fold predictions.
    oof = np.clip(oof, p_min, p_max)
    if use_ctl:
        num1 = origin_Y.shape[0] - Y_train.shape[0]
        num2 = Y_train.shape[1]
        ctl_data = np.zeros((num1, num2))
        ctl_pd = pd.DataFrame(data=ctl_data, columns=Y_train.columns)
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        Y_train = pd.concat([Y_train, ctl_pd], ignore_index=True)
        oof = np.vstack((oof, ctl_data))
    score = log_loss_numpy(oof, Y_train.values)
    print("CV log_loss: %f" % score)

    # generate submission
    # using clip
    predictions = np.clip(predictions, p_min, p_max)
    # Control rows are zeroed after clipping, so they end up as exact zeros.
    predictions[ctl_idx] = 0
    submission.iloc[:, 1:] = predictions
    submission.to_csv('submission.csv', index=False)

In [None]:
def optimize_weights(ps, labels):
    """Find blend weights for several prediction sets that sum to one.

    The weights are obtained by solving for a stationary point of the
    Lagrangian ``log_loss(blend) - lambda * (sum(w) - 1)`` with
    ``scipy.optimize.fsolve``; gradients come from ``autograd``. The code
    imposes only the sum-to-one constraint (weights are not forced to be
    non-negative).

    Args:
        ps: list or array of prediction arrays, one per model. It is indexed
            as a 3-D array (model, row, target) when the blend is computed.
        labels: object exposing ``.values`` with the true targets; it must
            have as many elements as one model's predictions.

    Returns:
        ``(ws, optimized_cv)``: the solved weights (one per model) and the log
        loss of the blended predictions.
    """
    if isinstance(ps, list):
        ps = anp.stack(ps)
    
    # Random starting weights drawn from a Dirichlet distribution, with 1 appended
    # as the starting value of the Lagrange multiplier.
    weights = anp.random.dirichlet([2]*len(ps),size=1).reshape(len(ps)).tolist() + [1]
    L = labels.values

    def log_loss_numpy(y_pred, y_true=L):
        """Log loss against ``L`` by default, written with autograd's numpy wrapper."""
        y_true = anp.array(y_true).ravel()
        y_pred = anp.array(y_pred).ravel()
        y_pred = anp.clip(y_pred, 1e-15, 1 - 1e-15)
        loss = anp.where(y_true == 1, -anp.log(y_pred), -anp.log(1 - y_pred))
        return loss.mean()


    def individual_log_loss(ps):
        """Print the log loss of each model's predictions on its own."""
        for i, p  in enumerate(ps):
            print(f'Log Loss of M%d: %.7f' % (i, log_loss_numpy(p)))


    def calc_oof_blend(ws, ps):
        """Weighted sum of the prediction sets along the model axis."""
        # ps is reordered to (row, model, target) so that the (1, 1, model) weight
        # row contracts over the model axis; squeeze drops the leftover axis.
        return anp.squeeze(anp.matmul(ws.reshape(1, 1, len(ws)), anp.transpose(ps, [1, 0, 2])))


    def Lagrange_func(params):
        """Lagrangian value: blend log loss minus lambda times (sum(w) - 1)."""
        # params = [w_0, ..., w_{n-1}, lambda]
        ws = params[:-1]
        _lambda = params[-1]
        ws = anp.array(ws)
        oof_blend = calc_oof_blend(ws, ps)
        return log_loss_numpy(oof_blend) - _lambda * (ws.sum() - 1.)


    def Lagrange_obj(params):
        """Residual vector passed to fsolve: dL/dw for each weight, then sum(w) - 1."""
        ws = params[:-1]
        grad_L = grad(Lagrange_func)
        pars = grad_L(params)
        dLdws = pars[:-1]
        # dldlam = pars[-1]
        # The constraint residual is written out directly instead of using the
        # multiplier's gradient entry.
        res = anp.append(dLdws, sum(ws) - 1.)
        return res


    individual_log_loss(ps)
    start_time = time()
    # Root of the residual = stationary point of the Lagrangian.
    pars = fsolve(Lagrange_obj, weights)
    # Last entry is the multiplier; the rest are the blend weights.
    ws = pars[:-1]
    time_elapsed = time() - start_time
    print(f'Optimized in %.2fs' % time_elapsed)
    print('Optimized Weights:', ws)
    oof_b = calc_oof_blend(ws, ps)
    optimized_cv = log_loss_numpy(oof_b)
    print('Optimised Blend OOF Score:', optimized_cv)
    return ws, optimized_cv

In [None]:
# Hyper-parameters read as globals by run_train and by the cells below.
BATCH_SIZE = 128  # mini-batch size of the train / validation / test DataLoaders
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')  # device string the model and batches are moved to
# Passed to Adam in run_train. NOTE(review): run_train also builds OneCycleLR with
# max_lr=0.01, which presumably drives the effective learning rate — confirm.
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5  # Adam weight_decay
HIDDEN_SIZE = 512  # width of Model's first hidden layer (the second uses half of it)
EPOCHS = 50  # training epochs per fold
NFOLDS = 7  # folds per seed
SEED = [14, 16, 77, 1984, 42]  # one full K-fold run is trained for each seed

In [None]:
# Load the raw tables; load_data already removes the control-vehicle rows from
# the training features and targets (the test features are returned unfiltered).
X_train, X_test, Y_pretrain, Y_train, Y_pretrain_smooth, Y_train_smooth, submission, origin_Y = load_data()
# Remember the control-vehicle test rows before cp_type is dropped by pre_process;
# their predictions are set to 0 when the submission is written.
ctl_idx = X_test[X_test['cp_type'] == 'ctl_vehicle'].index
# Row-wise statistics and squared cell features first, then encoding,
# rank-gauss scaling and PCA components.
X_train, X_test = pre_process_0(X_train, X_test)
X_train, X_test = pre_process(X_train, X_test)

In [None]:
# One slice per seed: out-of-fold predictions and test predictions.
oof = np.empty(shape=(len(SEED), Y_train.shape[0], Y_train.shape[1]))
predictions = np.empty(shape=(len(SEED), X_test.shape[0], Y_train.shape[1]))
start_time = time()
for i, seed in enumerate(SEED):
    print(f'\n\nSeed-%d (%d)' % (i, seed))
    oof_, predictions_ = run_k_fold(seed, X_train, Y_train, Y_train_smooth, NFOLDS)
    oof[i, :, :] = oof_
    predictions[i, :, :] = predictions_

# Keep the raw per-seed out-of-fold predictions on disk.
np.save('Val_pred.npy', oof)
# Solve for per-seed blend weights on the out-of-fold predictions ...
ws, optimized_cv = optimize_weights(oof, Y_train)
# ... and collapse the seed axis with them: moving the seed axis last lets the
# weight row broadcast across it before the sum.
predictions = (predictions.transpose(1, 2, 0) * ws.reshape(-1, len(ws))).sum(axis=-1)
oof = (oof.transpose(1, 2, 0) * ws.reshape(-1, len(ws))).sum(axis=-1)

# Elapsed time covers training for every seed plus the weight optimisation.
end_time = time()
print("total time consume:", end_time - start_time)

In [None]:
# Print the blended CV log loss and write submission.csv; use_ctl=False scores
# the treated rows only (no all-zero control rows are appended).
generate_result_with_clip(oof, predictions, origin_Y, Y_train, ctl_idx, use_ctl=False)