# Libraries

In [None]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
import pickle
import os, sys
import gc
import math
import random
from tqdm import tqdm
from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict
from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.decomposition import PCA
import umap

import lightgbm as lgb

from sklearn.model_selection import KFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import log_loss

from tqdm import tqdm

import math

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib_venn import venn2
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
# Notebook-wide plotting / display defaults.
sns.set_context("talk")  # seaborn plotting context named "talk"
style.use('seaborn-colorblind')  # matplotlib style sheet used for all figures
pd.options.display.max_columns = None  # None = no limit on displayed DataFrame columns

import warnings
warnings.filterwarnings('ignore')

# Load data

In [None]:
# --- training schedule ---
SEED = 217               # base seed (stored in the LightGBM params and used as a default random_state)
N_STARTS = 2             # number of seeded runs whose predictions are averaged
N_SPLITS = 4             # folds used by the stratified K-fold split

# --- data preparation ---
NO_CTL = True            # drop 'ctl_vehicle' rows from train/test before modelling
VAR_THRESHOLD = 0.8      # numeric features with variance below this are discarded
N_COMPONENTS = [80, 10]  # PCA components: [for the g- features, for the c- features]

# --- submission ---
POSTPROCESS = True       # clip predictions and zero the control rows before saving

In [None]:
%%time
# Raw competition tables.
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')

# Row masks over the ORIGINAL (unfiltered) test rows. Later cells use them
# unconditionally to write predictions into the submission frame and to zero
# out control rows, so they must exist whether or not NO_CTL is set.
control_g = test_features['cp_type'] == 'ctl_vehicle'
if NO_CTL:
    # Only treated rows are modelled, so only they receive predictions.
    test_g = ~control_g
else:
    # Every test row is modelled and receives a prediction.
    test_g = pd.Series(True, index=test_features.index)

if NO_CTL:
    print('no ctl')
    # Positional mask of the training rows to keep; the targets table is
    # filtered with the same positions (assumes it is row-aligned with
    # train_features, as the original positional selection did).
    train_keep = (train_features['cp_type'] != 'ctl_vehicle').to_numpy()
    train_features = train_features[train_keep].reset_index(drop=True)
    train_targets = train_targets[train_keep].reset_index(drop=True)
    test_features = test_features[test_g].reset_index(drop=True)

ss = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
# variance threshold: computed on train and test pooled together
meta_cols = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']
data_all = pd.concat([train_features, test_features], ignore_index=True)
cols_numeric = [col for col in data_all.columns if col not in meta_cols]
# True for every numeric column whose variance reaches the threshold
mask = data_all[cols_numeric].var().ge(VAR_THRESHOLD).values
tmp = data_all[cols_numeric].loc[:, mask]
# identifier/treatment columns first, surviving numeric columns after them
data_all = pd.concat([data_all[meta_cols], tmp], axis=1)

In [None]:
def preprocess(df):
    """Encode the treatment columns as numbers and drop ``sig_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``sig_id``, ``cp_type``, ``cp_time`` and
        ``cp_dose``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without ``sig_id`` in which
        ``cp_type`` is mapped ``trt_cp -> 0``, ``ctl_vehicle -> 1``;
        ``cp_dose`` is mapped ``D1 -> 0``, ``D2 -> 1``;
        ``cp_time`` is mapped ``24 -> 0``, ``48 -> 0.5``, ``72 -> 1``.
        Values missing from a mapping become NaN.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    df = df.copy()
    # Assign whole columns (not ``df.loc[:, col] = ...``): plain assignment
    # replaces the column, so it takes the numeric dtype of the mapped values
    # instead of keeping the original object/int dtype.
    df['cp_type'] = df['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
    df['cp_time'] = df['cp_time'].map({24: 0, 48: 0.5, 72: 1})
    # Alternative (disabled): one-hot encoding
    # df = pd.get_dummies(df, columns=['cp_type', 'cp_time','cp_dose'])
    return df.drop(columns=['sig_id'])

# Encode the treatment columns of the pooled train+test frame.
data_all = preprocess(data_all)

# Remove the identifier column from the target table (in place).
train_targets.drop(columns=['sig_id'], inplace=True)

In [None]:
# categorize feats by the marker substring found in the column name
all_columns = data_all.columns.values.tolist()
g_feats = [col for col in all_columns if 'g-' in col]
c_feats = [col for col in all_columns if 'c-' in col]
cp_feats = [col for col in all_columns if 'cp_' in col]
print(len(g_feats), len(c_feats), len(cp_feats))

In [None]:
# feature engineering
def q1(x):
    """Return the 0.2 quantile of ``x`` (any object exposing ``.quantile``)."""
    lower_level = 0.2
    return x.quantile(lower_level)

def q2(x):
    """Return the 0.8 quantile of ``x`` (any object exposing ``.quantile``)."""
    upper_level = 0.8
    return x.quantile(upper_level)

def calc_stats(df):
    """Add row-wise summary statistics of the g-/c- feature groups to ``df``.

    For every statistic in ``sum, mean, std, kurt, skew, max, min`` three
    columns are written: ``g-<stat>`` (over ``g_feats``), ``c-<stat>`` (over
    ``c_feats``) and ``gc-<stat>`` (over both groups together). Each statistic
    is the DataFrame method of that name called with ``axis=1``.

    ``df`` is modified in place and also returned. ``g_feats`` and ``c_feats``
    are read from the notebook's global scope.
    """
    stat_names = ['sum', 'mean', 'std', 'kurt', 'skew', 'max', 'min']
    groups = (
        ('g-', g_feats),
        ('c-', c_feats),
        ('gc-', g_feats + c_feats),
    )
    for stat_name in tqdm(stat_names):
        for prefix, group_cols in groups:
            row_stat = getattr(df[group_cols], stat_name)
            df[prefix + stat_name] = row_stat(axis=1)
    return df

# Append the row-wise summary statistics to the pooled train+test frame.
data_all = calc_stats(data_all)

In [None]:
%%time

# dimensionality reduction
def dim_reducer(feats, n_components=N_COMPONENTS, random_state=SEED):
    """Project the columns ``feats`` of the global ``data_all`` with PCA.

    Parameters
    ----------
    feats : list of str
        Column names of ``data_all`` to reduce.
    n_components : int
        Forwarded to ``PCA(n_components=...)``.
        NOTE(review): the default is the module-level list ``N_COMPONENTS``,
        which presumably is not a valid PCA setting; the calls in this
        notebook always pass one element of it explicitly.
    random_state : int, default SEED
        Forwarded to ``PCA(random_state=...)`` so the projection is
        reproducible between runs when PCA uses a randomized solver.

    Returns
    -------
    numpy.ndarray
        Result of ``fit_transform`` on ``data_all[feats].values``
        (one row per row of ``data_all``).
    """
    trans = PCA(n_components=n_components, random_state=random_state)
    train_dist = trans.fit_transform(data_all[feats].values)

    return train_dist

# Reduce each feature group with its own number of components.
train_g = dim_reducer(g_feats, n_components=N_COMPONENTS[0])
train_c = dim_reducer(c_feats, n_components=N_COMPONENTS[1])

# Store every component as '<group>-pca<k>' (k starts at 1), g-group first.
for group_name, components in (('g', train_g), ('c', train_c)):
    for k in range(1, components.shape[1] + 1):
        data_all[f'{group_name}-pca{k}'] = components[:, k - 1]

In [None]:
# data_all was built as [train rows, test rows], so the leading
# len(train_features) rows are the training rows.
n_train_rows = len(train_features)
train = data_all.iloc[:n_train_rows]
print(train.shape)
train.head()

In [None]:
# Everything after the first len(train_features) rows of data_all
# belongs to the test set.
n_train_rows = len(train_features)
test = data_all.iloc[n_train_rows:]
print(test.shape)
test.head()

In [None]:
# Sanity check: shape of the target table and its last rows.
print(train_targets.shape)
train_targets.tail()

In [None]:
# Drop the names of the raw and pooled frames; the cells below work with
# `train`, `test` and `train_targets` only.
del train_features, test_features, data_all
gc.collect()

In [None]:
# Model inputs: every column of `test` except the excluded ones.
drops = ['sig_id', 'cp_type']
feats = [col for col in test.columns.values.tolist() if col not in drops]

print('{:,} features'.format(len(feats)))
print(feats)

In [None]:
# Lower / upper bounds used when clipping submission probabilities.
p_min, p_max = 0.001, 0.999

def metric(y_true, y_pred):
    """Mean binary log loss over all target columns.

    The columns are taken from the global ``train_targets``; both ``y_true``
    and ``y_pred`` must contain each of them. Predictions are cast to float
    and every column is scored with ``labels=[0, 1]``.
    """
    per_target_loss = [
        log_loss(
            y_true.loc[:, target_name],
            y_pred.loc[:, target_name].astype(float),
            labels=[0, 1],
        )
        for target_name in train_targets.columns
    ]
    return np.mean(per_target_loss)

In [None]:
# LightGBM settings shared by every per-target model.
params = {
    # boosting setup
    'n_estimators': 24000,
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'max_depth': 3,
    'learning_rate': 0.08,
    # row / column subsampling
    'subsample': 0.72,
    'subsample_freq': 4,
    'feature_fraction': 0.4,
    # regularisation
    'lambda_l1': 1,
    'lambda_l2': 1,
    # reproducibility / stopping
    'seed': SEED,
    'early_stopping_rounds': 40,
    # evaluation metric
    'metric': 'binary_logloss',
}
# params["is_unbalance"] = True # assume unbalanced data

def fit_lgb_kfold(train, train_targets, test, features, target, n_splits=N_SPLITS, random_state=SEED):
    """Fit one binary LightGBM model per stratified fold for a single target.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain the columns in ``features``.
    train_targets : pandas.DataFrame
        Label table; ``train_targets[target]`` is used as the label and for
        stratification. Indexed positionally alongside ``train``.
    test : pandas.DataFrame
        Rows to predict; must contain the columns in ``features``.
    features : list of str
        Feature column names.
    target : str
        Name of the label column in ``train_targets``.
    n_splits : int
        Number of folds; also the divisor used to average the fold models.
    random_state : int
        Seed for the fold shuffle; the LightGBM seed is derived from it.

    Returns
    -------
    y_preds : numpy.ndarray
        Test predictions averaged over the ``n_splits`` fold models.
    oof : numpy.ndarray
        Out-of-fold predictions, one per training row.
    score : float
        Log loss of ``oof`` against ``train_targets[target]``.
    """
    oof = np.zeros(train.shape[0])
    y_preds = np.zeros(test.shape[0])
    # Gain importance averaged over folds; kept for inspection, not returned.
    fi = pd.DataFrame()
    fi['features'] = features
    fi['importance'] = 0
    # NOTE: this writes into the module-level `params` dict.
    params['seed'] = SEED * (random_state + 1)

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for n, (train_idx, valid_idx) in enumerate(cv.split(train, train_targets[target])):
        # train / validation split for this fold
        x_train_train = train[features].iloc[train_idx]
        y_train_train = train_targets[target].iloc[train_idx]
        x_train_valid = train[features].iloc[valid_idx]
        y_train_valid = train_targets[target].iloc[valid_idx]

        # lgb dataset
        lgb_train = lgb.Dataset(data=x_train_train, label=y_train_train)
        lgb_valid = lgb.Dataset(data=x_train_valid, label=y_train_valid)

        # fit
        model = lgb.train(params, lgb_train, valid_sets=lgb_valid, verbose_eval=0)
        # Average with the fold count actually used (the `n_splits` argument),
        # not the global N_SPLITS, so a non-default n_splits stays correct.
        fi['importance'] += model.feature_importance(importance_type="gain") / n_splits

        # save model (disabled):
        # pickle.dump(model, open(f'model_{random_state}_{n}_{target}.pkl', 'wb'))

        # predict: OOF and test both use the model's best iteration
        oof[valid_idx] = model.predict(x_train_valid, num_iteration=model.best_iteration)
        y_preds += model.predict(test[features], num_iteration=model.best_iteration) / n_splits

    # labels=[0, 1] matches `metric` and keeps the score defined even when
    # the label column holds a single class.
    score = log_loss(train_targets[target], oof, labels=[0, 1])
    print('LogLoss Score:', score)

    # reload model (disabled):
    # model = pickle.load(open(f'model_{random_state}_{n}_{target}.pkl', 'rb'))
    return y_preds, oof, score

In [None]:
# Accumulators: `res` holds the averaged out-of-fold predictions, `ss` the
# averaged test predictions; both start from zero in every target column.
target_cols = train_targets.columns
res = train_targets.copy()
ss.loc[:, target_cols] = 0
res.loc[:, target_cols] = 0

for seed in range(N_STARTS):
    # per-seed scratch copies, fully overwritten target by target below
    res_seed, ss_seed = res.copy(), ss.copy()
    for targ in tqdm(target_cols):
        print(f'Target = {targ}')
        y_pred, oof, score = fit_lgb_kfold(
            train, train_targets, test, feats, targ,
            n_splits=N_SPLITS, random_state=seed,
        )
        res_seed[targ] = oof
        # only the rows selected by test_g receive predictions
        ss_seed.loc[test_g, targ] = y_pred

    print(f'OOF Metric For SEED {seed}: {metric(train_targets, res_seed)}')
    # add this seed's share to the running averages
    for targ in target_cols:
        res[targ] += res_seed[targ].values / N_STARTS
        ss.loc[test_g, targ] += ss_seed.loc[test_g, targ].values / N_STARTS

In [None]:
# if DO == 'training':
#     print(f'OOF Metric: {metric(train_targets, res)}')
    
# elif DO == 'inference':
# Overall out-of-fold score of the seed-averaged predictions.
print(f'OOF Metric: {metric(train_targets, res)}')

if POSTPROCESS:
    print('post-process...')

    # Every column after the first one (the id column) holds predictions.
    pred_cols = ss.columns[1:]

    # clip: work on the prediction columns directly instead of going through
    # `ss.values`, which would mix the id column into the array and force an
    # object dtype.
    ss[pred_cols] = ss[pred_cols].clip(p_min, p_max)

    # Set ctl_vehicle to 0. `control_g` is a boolean mask over the original
    # test rows; convert it to a plain array so it is applied by position
    # (a boolean Series is not a supported `.iloc` indexer).
    ss.loc[np.asarray(control_g), pred_cols] = 0
ss.to_csv('submission.csv', index=False)

This kernel is still under modification. <span style='color:red'>**Feedback**</span> is very much appreciated.
Please <span style='color:red'>**UPVOTE**</span> if you find it useful.
