# Libraries

In [None]:
# import sys
# sys.path.append('../input/iterative-stratification/iterative-stratification-master')
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
# import warnings
# warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
import pickle
import joblib
import os, sys
import gc
import math
import random
from tqdm import tqdm
# from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.decomposition import PCA
import umap

import lightgbm as lgb

from sklearn.model_selection import KFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import log_loss

from tqdm import tqdm

import math

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib_venn import venn2
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
style.use('seaborn-colorblind')
pd.options.display.max_columns = None

import warnings
warnings.filterwarnings('ignore')

# Load data

In [None]:
# Run configuration shared by the cells below.
N_STARTS = 1  # number of seed repetitions iterated by the training loop
N_SPLITS = 4  # default number of cross-validation folds per target
SEED = 217  # base seed; the LightGBM seed is derived from it inside fit_lgb_kfold
POSTPROCESS = True  # when True, the submission is clipped and control rows are zeroed before saving

In [None]:
def preprocessing_lgb(data_dir='../input/lish-moa', var_threshold=0.7,
                      ncompo_genes=80, ncompo_cells=10, seed=42):
    """Load the competition CSVs and build the feature frames used for training.

    Steps, in order: drop ``ctl_vehicle`` rows, drop numeric columns whose
    training variance is not above ``var_threshold``, rank-Gauss transform each
    gene/cell column (fitted on train only), append PCA components, one-hot
    encode ``cp_time``/``cp_dose`` and append row-wise summary statistics.

    Parameters
    ----------
    data_dir : str
        Directory containing ``train_features.csv``, ``train_targets_scored.csv``,
        ``test_features.csv`` and ``sample_submission.csv``.
    var_threshold : float
        A numeric column is kept only if its training variance exceeds this value.
    ncompo_genes, ncompo_cells : int
        Number of PCA components computed from the ``g-`` and ``c-`` columns.
    seed : int
        ``random_state`` passed to both PCA objects.

    Returns
    -------
    tuple
        ``(train, test, train_targets, ss, control_g, test_g, feats)`` where
        ``train``/``test`` are the feature frames without ``sig_id``,
        ``train_targets`` is the label frame without ``sig_id``, ``ss`` is the
        sample submission, ``control_g``/``test_g`` are boolean Series over the
        rows of the raw test file (control / non-control) and ``feats`` is the
        list of training column names.

    Raises
    ------
    ValueError
        If the training features and targets have different row counts.
    """
    id_cols = ['sig_id', 'cp_time', 'cp_dose']

    train_features = pd.read_csv(f'{data_dir}/train_features.csv')
    train_targets = pd.read_csv(f'{data_dir}/train_targets_scored.csv')
    test_features = pd.read_csv(f'{data_dir}/test_features.csv')
    ss = pd.read_csv(f'{data_dir}/sample_submission.csv')

    if len(train_features) != len(train_targets):
        raise ValueError('train features and train targets must have the same number of rows')

    # Features and targets are matched by row position, so a single positional
    # mask filters both without depending on what their index labels are.
    train_treated = (train_features['cp_type'] != 'ctl_vehicle').to_numpy()
    control_g = test_features['cp_type'] == 'ctl_vehicle'
    test_g = test_features['cp_type'] != 'ctl_vehicle'

    # drop control rows and cp_type
    train_features = train_features[train_treated].drop('cp_type', axis=1).reset_index(drop=True)
    test_features = test_features[test_g].drop('cp_type', axis=1).reset_index(drop=True)
    train_targets = train_targets[train_treated].reset_index(drop=True)

    # Variance: the selection is decided on train only and applied to both frames.
    cols_numeric = [feat for feat in train_features.columns if feat not in id_cols]
    high_variance = train_features[cols_numeric].var() > var_threshold
    kept_cols = high_variance[high_variance].index.tolist()
    # Explicit copies so the column assignments below write into owned frames.
    train_features = train_features[id_cols + kept_cols].copy()
    test_features = test_features[id_cols + kept_cols].copy()

    GENES = [col for col in train_features.columns if col.startswith('g-')]
    CELLS = [col for col in train_features.columns if col.startswith('c-')]

    # Rank Gauss: one transformer per column, fitted on train and reused for test.
    for col in (GENES + CELLS):
        transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
        train_vec = train_features[col].to_numpy().reshape(-1, 1)
        test_vec = test_features[col].to_numpy().reshape(-1, 1)
        transformer.fit(train_vec)

        train_features[col] = transformer.transform(train_vec).ravel()
        test_features[col] = transformer.transform(test_vec).ravel()

    # PCA: fitted on train, applied to test; components are appended as new columns.
    pca_gene_cols = [f"pca_g-{i}" for i in range(ncompo_genes)]
    pca_cell_cols = [f"pca_c-{i}" for i in range(ncompo_cells)]

    pca_genes = PCA(n_components=ncompo_genes, random_state=seed)
    pca_genes_train = pd.DataFrame(pca_genes.fit_transform(train_features[GENES]), columns=pca_gene_cols)

    pca_cells = PCA(n_components=ncompo_cells, random_state=seed)
    pca_cells_train = pd.DataFrame(pca_cells.fit_transform(train_features[CELLS]), columns=pca_cell_cols)

    pca_genes_test = pd.DataFrame(pca_genes.transform(test_features[GENES]), columns=pca_gene_cols)
    pca_cells_test = pd.DataFrame(pca_cells.transform(test_features[CELLS]), columns=pca_cell_cols)

    train_features = pd.concat([train_features, pca_genes_train, pca_cells_train], axis=1)
    test_features = pd.concat([test_features, pca_genes_test, pca_cells_test], axis=1)

    # One hot
    train_features = pd.get_dummies(train_features, columns=['cp_time', 'cp_dose'])
    test_features = pd.get_dummies(test_features, columns=['cp_time', 'cp_dose'])
    # The two frames are encoded independently; force test onto the train layout
    # so a category level missing from one side cannot desynchronise the columns.
    test_features = test_features.reindex(columns=train_features.columns, fill_value=0)

    # Extra features: row-wise statistics over genes, cells and both together.
    for df in [train_features, test_features]:
        for stats in tqdm(['sum', 'mean', 'std', 'kurt', 'skew', 'max', 'min']):
            df['g-'+stats] = getattr(df[GENES], stats)(axis=1)
            df['c-'+stats] = getattr(df[CELLS], stats)(axis=1)
            df['gc-'+stats] = getattr(df[GENES+CELLS], stats)(axis=1)

    train_targets = train_targets.drop('sig_id', axis=1)
    train = train_features.drop('sig_id', axis=1)
    test = test_features.drop('sig_id', axis=1)
    feats = train.columns.to_list()
    return train, test, train_targets, ss, control_g, test_g, feats

# Run preprocessing once; later cells read these names at module level.
train, test, train_targets, ss, control_g, test_g, feats = preprocessing_lgb()

In [None]:
# Lower/upper bounds used when clipping the submission in the post-processing cell.
p_min = 0.001
p_max = 0.999

def metric(y_true, y_pred):
    """Return the mean per-column binary log loss between two frames.

    Parameters
    ----------
    y_true : pandas.DataFrame
        Ground-truth labels, one column per target.
    y_pred : pandas.DataFrame
        Predicted probabilities; must contain every column of ``y_true``.

    Returns
    -------
    float
        Average of the column-wise log losses.
    """
    # Iterate over the columns of the argument itself rather than a
    # module-level frame, so the result depends only on what was passed in.
    # labels=[0, 1] keeps log_loss defined for columns holding a single class.
    per_target = [
        log_loss(y_true[col], y_pred[col].astype(float), labels=[0, 1])
        for col in y_true.columns
    ]
    return np.mean(per_target)

In [None]:
# Parameter dictionary handed to lgb.train by fit_lgb_kfold.
# NOTE: fit_lgb_kfold writes params['seed'] on every call, so no seed is set here.
params = {
    # presumably an upper bound on boosting rounds, with early stopping ending
    # training sooner — confirm against the installed LightGBM version
    'n_estimators': 24000,
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'max_depth': 3,
    'learning_rate': 0.08,
    'subsample': 0.72,
    'subsample_freq': 4,
    'feature_fraction': 0.4,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'early_stopping_rounds': 40,
    }    

def fit_lgb_kfold(train, train_targets, test, features, target, n_splits=N_SPLITS, random_state=SEED):
    """Train one LightGBM model per stratified fold for a single target column.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames; only the ``features`` columns are used.
    train_targets : pandas.DataFrame
        Label frame; ``train_targets[target]`` is the label for this fit and is
        also used to stratify the folds.
    features : list
        Names of the feature columns.
    target : str
        Name of the target column to fit.
    n_splits : int
        Number of stratified folds.
    random_state : int
        Seed for the fold shuffling; the LightGBM seed is ``SEED * (random_state + 1)``.

    Returns
    -------
    tuple
        ``(y_preds, oof, score)``: fold-averaged test predictions, out-of-fold
        predictions for every training row, and the log loss of ``oof``.

    Side effects
    ------------
    Sets ``params['seed']`` on the module-level ``params`` dict and writes one
    ``model_{target}_seed_{seed}_fold_{k}.pkl`` file per fold to the working
    directory.
    """
    oof = np.zeros(train.shape[0])
    y_preds = np.zeros(test.shape[0])

    model_seed = SEED * (random_state + 1)
    params['seed'] = model_seed

    # Column selection does not depend on the fold, so do it once.
    x_all = train[features]
    y_all = train_targets[target]
    x_test = test[features]

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for fold_n, (train_idx, valid_idx) in enumerate(cv.split(train, y_all)):
        # train / validation split for this fold
        x_train_valid = x_all.iloc[valid_idx]
        lgb_train = lgb.Dataset(data=x_all.iloc[train_idx], label=y_all.iloc[train_idx])
        lgb_valid = lgb.Dataset(data=x_train_valid, label=y_all.iloc[valid_idx])

        # fit
        model = lgb.train(params, lgb_train, valid_sets=lgb_valid, verbose_eval=0)

        # Persist the fold model; the in-memory booster is used directly below,
        # so there is no need to read the file straight back.
        joblib.dump(model, f'model_{target}_seed_{model_seed}_fold_{fold_n}.pkl')

        # predict: both calls use the early-stopped iteration, and the test
        # average is divided by the fold count that was actually used.
        oof[valid_idx] = model.predict(x_train_valid, num_iteration=model.best_iteration)
        y_preds += model.predict(x_test, num_iteration=model.best_iteration) / n_splits

    # labels=[0, 1] keeps the score defined even if the column holds one class.
    score = log_loss(y_all, oof, labels=[0, 1])
    print('LogLoss Score:', score)

    return y_preds, oof, score

In [None]:
# Accumulators for the seed-averaged predictions. Both start as float zeros so
# that adding fractional predictions never has to change a column's dtype.
target_cols = train_targets.columns
res = pd.DataFrame(0.0, index=train_targets.index, columns=target_cols)
ss.loc[:, target_cols] = 0.0

for seed in range(N_STARTS):
    res_seed = res.copy()
    ss_seed = ss.copy()
    # Report the seed that fit_lgb_kfold actually derives for the models.
    print('Seed:', SEED * (seed + 1))
    for targ in tqdm(target_cols):
        print('Target = {}'.format(targ))
        y_pred, oof, score = fit_lgb_kfold(train, train_targets, test, feats, targ, n_splits=N_SPLITS, random_state=seed)
        res_seed[targ] = oof
        # Only non-control test rows receive predictions; control rows stay at 0.
        ss_seed.loc[test_g, targ] = y_pred

    print(f'OOF Metric For SEED {seed}: {metric(train_targets, res_seed)}')

    # Add this seed's contribution to the running averages.
    res += res_seed.to_numpy() / N_STARTS
    ss.loc[test_g, target_cols] += ss_seed.loc[test_g, target_cols].to_numpy() / N_STARTS

In [None]:
# Report the seed-averaged out-of-fold score, optionally post-process, then save.
print(f'OOF Metric: {metric(train_targets, res)}')

if POSTPROCESS:
    print('post-process...')

    # Every column except the first one is treated as a prediction column.
    pred_cols = list(ss.columns[1:])

    # clip: work on the numeric columns directly instead of the object-dtype
    # array that ss.values yields when the first column holds strings.
    ss[pred_cols] = ss[pred_cols].astype(float).clip(lower=p_min, upper=p_max)

    # Set ctl_vehicle to 0. The mask is converted to a plain boolean array so
    # the row selection is purely positional and does not rely on how pandas
    # treats a boolean Series passed to an indexer.
    ss.loc[np.asarray(control_g, dtype=bool), pred_cols] = 0.0
ss.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows only; the full frame is already saved to submission.csv.
ss.head()

This kernel is still being modified. <span style='color:red'>**Feedback**</span> is very much appreciated.
Please <span style='color:red'>**UPVOTE**</span> if you find it useful.
