In [None]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import category_encoders as ce 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge, Lasso, RidgeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
from collections import Counter
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import FeatureUnion
from sklearn.impute import MissingIndicator
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFECV
from sklearn.inspection import permutation_importance
from category_encoders import TargetEncoder, LeaveOneOutEncoder
import random
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer
from functools import reduce
from sklearn.linear_model import LogisticRegression
import optuna
from optuna.samplers import RandomSampler, GridSampler, TPESampler
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neural_network import MLPClassifier

## Introduction

This notebook shows how to stack the out-of-fold (OOF) predictions of your neural networks with the image meta features. Three models are trained on the stacked data and can be combined at the end: Ridge, a Multilayer Perceptron and LightGBM. You can compare this approach with feeding the meta features directly into your neural network, or even blend both approaches to get a more robust model. Be aware of data leakage. 

Input files: the out-of-fold and test predictions were obtained using Chris Deotte's kernel (https://www.kaggle.com/cdeotte/triple-stratified-kfold-with-tfrecords). The files and parameters in this notebook are only for ***demonstration purposes***. You may want to use your own.

You can certainly run more trials to find better parameters, or experiment with target mean encoding, binning your numeric outputs, etc. Please be
aware of overfitting. This notebook gives you the opportunity to explore these options. It also allows you to explore different
imputing strategies (mean, median, constant) as well as different binning strategies (kmeans, uniform, quantile). This lets you
build models on different feature sets to improve the robustness of the ensemble.

Focal loss for LightGBM is obtained at: https://github.com/jhwjhw0123/Imbalance-XGBoost/blob/master/imxgboost/focal_loss.py

If you find any bugs, please let me know. Please upvote if you find this notebook helpful and I really appreciate your support.


In [None]:
def seed_everything(seed):
    """Seed the global RNGs used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and exports ``PYTHONHASHSEED`` with the same value.

    Args:
        seed: Integer seed applied to every generator.
    """
    # NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so this
    # assignment presumably only matters for processes spawned later on.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [None]:
# Seed the global RNGs once, before any data is loaded or any model is fitted.
seed_everything(0)

## Load dataset

In [None]:
# Competition meta data for the train and test images.
dftrain = pd.read_csv('../input/siim-isic-melanoma-classification/train.csv')
dftest = pd.read_csv('../input/siim-isic-melanoma-classification/test.csv')

# Base model 1: out-of-fold predictions on train plus its test submission.
# The prediction column of each file is renamed to 'pred1' so the frames can
# later be merged side by side on 'image_name'.
oof1 = pd.read_csv('../input/stacking-files/oof1.csv')
oof1 = oof1[['image_name', 'pred']].rename(columns={'pred': 'pred1'})
sub1 = pd.read_csv('../input/stacking-files/submission1.csv')
# Positional rename: assumes the submission file has exactly two columns.
sub1.columns = ['image_name', 'pred1']

# Base model 2: same layout, prediction column renamed to 'pred2'.
oof2 = pd.read_csv('../input/stacking-files/oof2.csv')
oof2 = oof2[['image_name', 'pred']].rename(columns={'pred': 'pred2'})
sub2 = pd.read_csv('../input/stacking-files/submission2.csv')
sub2.columns = ['image_name', 'pred2']

print(oof1.shape)

In [None]:
# Keep only the meta features used for stacking and shorten two column names.
_meta_renames = {'age_approx': 'age', 'anatom_site_general_challenge': 'site'}
_meta_columns = ['image_name', 'sex', 'age_approx', 'anatom_site_general_challenge']

train_df = dftrain[_meta_columns + ['target']].rename(columns=_meta_renames)
test_df = dftest[_meta_columns].rename(columns=_meta_renames)

In [None]:
# Outer-join every base model's predictions on 'image_name' so that each row
# carries one prediction column per base model.
_outer_join = lambda left, right: pd.merge(left, right, on=['image_name'], how='outer')

oof_frames = [oof1, oof2]
sub_frames = [sub1, sub2]

oof = reduce(_outer_join, oof_frames).reset_index(drop=True)
sub = reduce(_outer_join, sub_frames).reset_index(drop=True)

In [None]:
# Attach the meta features to the base-model predictions.
X_train = pd.merge(oof, train_df, on='image_name', how='outer')
X_test = pd.merge(sub, test_df, on='image_name', how='outer').reset_index(drop=True)

# Drop training rows that lack a prediction from either base model.
# `thresh` is intentionally not passed: recent pandas versions reject
# `how` and `thresh` being supplied together.
# The index is reset right after the drop so that X_train, y_train and
# image_names all share the same 0..n-1 index; encoders that align X and y
# by index would otherwise see mismatched labels.
X_train = X_train.dropna(axis=0, how='any', subset=['pred1', 'pred2']).reset_index(drop=True)
y_train = X_train['target']
image_names = X_train['image_name']

# Final feature columns, in the same order for train and test.
X_train = X_train[['pred1', 'pred2', 'sex', 'age', 'site']]
X_test = X_test[['pred1', 'pred2', 'sex', 'age', 'site']].reset_index(drop=True)

In [None]:
# Loss function for LightGBM
def focal_binary_lgb(label, pred):
    """Focal-loss objective (gamma = 2) returning gradient and hessian.

    Args:
        label: Array of targets. The sign trick ``(-1) ** label`` below
            assumes the values are 0 or 1.
        pred: Array of raw (pre-sigmoid) scores, same shape as ``label``.

    Returns:
        Tuple ``(grad, hess)`` of arrays with the shape of ``pred``.
    """
    gamma = 2.0
    eps = 1e-9  # keeps the logarithm finite when its argument reaches 0

    def signed_pow(base, exponent):
        # NumPy refuses negative bases with fractional exponents, so raise the
        # magnitude to the power and restore the sign afterwards.
        return np.sign(base) * (np.abs(base)) ** (exponent)

    prob = 1.0 / (1.0 + np.exp(-pred))
    flip = (-1) ** label  # +1 where label == 0, -1 where label == 1

    # Building blocks shared by the first and second derivative.
    slope = prob * (1 - prob)
    focus = label + flip * prob
    resid = prob + label - 1
    log_arg = 1 - label - flip * prob
    log_term = np.log(log_arg + eps)
    focus_pow = signed_pow(focus, gamma)

    # First derivative.
    grad = gamma * resid * focus_pow * log_term + \
           flip * signed_pow(focus, (gamma + 1))

    # Second derivative, assembled from two partial terms.
    hess_a = focus_pow + \
             gamma * flip * resid * signed_pow(focus, (gamma - 1))
    hess_b = flip * resid * focus_pow / log_arg
    hess = ((hess_a * log_term - hess_b) * gamma +
            (gamma + 1) * focus_pow) * slope

    return grad, hess

## Define your feature space

This section creates the pipeline that builds a different feature set for each model, which improves the robustness
of the ensemble.

In [None]:
def get_model_Combined(info,model):
    """Build a preprocessing + classifier pipeline from a feature config.

    Args:
        info: Dict with the keys
            'nontarget_features' (categorical columns to one-hot encode),
            'target_features' (categorical columns for leave-one-out target
            encoding),
            'nonbin_features' (numeric columns that are imputed and get
            missing-value indicators),
            'bin_features' (numeric columns that are imputed then binned),
            'simple_strategy' ('mean', 'median', anything else means a
            constant fill of -1) and
            'kbin_strategy' ([encode, strategy, imputer] where imputer is
            'mean', anything else means 'median').
        model: Estimator used as the final 'classifier' step.

    Returns:
        A ``Pipeline`` with steps 'preprocessor' (a ``ColumnTransformer``
        that drops all unlisted columns) and 'classifier'.
    """
    nontarget_features = info['nontarget_features']
    target_features = info['target_features']
    nonbin_features = info['nonbin_features']
    bin_features = info['bin_features']
    simple_strategy = info['simple_strategy']
    kbin_strategy = info['kbin_strategy']

    # Imputer for the un-binned numeric columns.
    if simple_strategy in ('mean', 'median'):
        simple_imputer = SimpleImputer(strategy=simple_strategy)
    else:
        simple_imputer = SimpleImputer(strategy='constant',fill_value=-1)

    # Settings for the binned numeric columns.
    bin_encode_strategy = kbin_strategy[0]
    bin_strategy = kbin_strategy[1]
    bin_imputer = SimpleImputer(
        strategy='mean' if kbin_strategy[2] == 'mean' else 'median')

    # Imputed values side by side with missing-value indicator columns.
    impute_and_flag = FeatureUnion(transformer_list=[
        ('imputer',simple_imputer),
        ('indicator',MissingIndicator()),
    ])
    # Impute first: the discretizer is fed values without NaNs.
    impute_and_bin = Pipeline(steps=[
        ('imputer', bin_imputer),
        ('Bin',KBinsDiscretizer(n_bins=5, encode=bin_encode_strategy, strategy=bin_strategy)),
    ])
    loo_encoder = LeaveOneOutEncoder(cols=target_features,sigma=0.1,random_state=0)
    onehot_encoder = ce.OneHotEncoder(cols=nontarget_features,handle_unknown='value',handle_missing='value')

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat_onehot',onehot_encoder,nontarget_features),
            ('cat_target',loo_encoder,target_features),
            ('num_nobin',impute_and_flag,nonbin_features),
            ('num_bin',impute_and_bin,bin_features),
        ],
        remainder = 'drop')

    return Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

I am only using the following feature engineering for **demonstration purposes**.

You can apply target mean encoding and a kmeans discretizer 
directly using this code. Play around with it and see
which features work best with your stacking features.

Be careful with overfitting when using target mean encoding

The order for kbin_strategy is: encode ('onehot', 'onehot-dense', 'ordinal'); strategy ('uniform', 'quantile', 'kmeans'); imputer ('mean', otherwise median). The imputer is added
so that KBinsDiscretizer never receives missing values. (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html)


In [None]:
# Feature configuration consumed by get_model_Combined, one dict per model.
# 'kbin_strategy' is [encode, strategy, imputer] for the binned columns.

# Ridge: one-hot site, target-encoded sex, one-hot uniform bins of age.
features_RID = {
    'nontarget_features': ['site'],
    'target_features': ['sex'],
    'nonbin_features': ['pred1', 'pred2', 'age'],
    'bin_features': ['age'],
    'simple_strategy': 'mean',
    'kbin_strategy': ['onehot', 'uniform', 'mean'],
}

# LightGBM: one-hot sex and site, ordinal uniform bins of pred1.
features_LGB = {
    'nontarget_features': ['sex', 'site'],
    'target_features': [],
    'nonbin_features': ['pred1', 'pred2', 'age'],
    'bin_features': ['pred1'],
    'simple_strategy': 'constant',
    'kbin_strategy': ['ordinal', 'uniform', 'median'],
}

# MLP: one-hot sex and site, ordinal uniform bins of pred2.
features_NN = {
    'nontarget_features': ['sex', 'site'],
    'target_features': [],
    'nonbin_features': ['pred1', 'pred2', 'age'],
    'bin_features': ['pred2'],
    'simple_strategy': 'median',
    'kbin_strategy': ['ordinal', 'uniform', 'median'],
}

## Train Ridge/MLP/LGB with Optuna

### TRAIN RIDGE

In [None]:
study_name1 = 'Ridge'
# The seed must be passed by keyword: TPESampler's first positional argument
# is `consider_prior`, so `TPESampler(0)` would leave the sampler unseeded.
study_ridge = optuna.create_study(study_name=study_name1,direction='maximize',sampler=TPESampler(seed=0))

In [None]:
def opt_ridge(trial):
    """Optuna objective for the Ridge stacker.

    Samples the regularisation strength ('alpha') and the positive-class
    weight ('b', truncated to an int), then scores the resulting pipeline on
    the module-level X_train / y_train.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean ROC AUC over a 5-fold stratified cross-validation.
    """
    ridge_alpha = trial.suggest_loguniform('alpha',1e-7,10)
    pos_weight = int(trial.suggest_loguniform('b',1,32))

    estimator = RidgeClassifier(alpha=ridge_alpha,
                                class_weight={0:1,1:pos_weight},
                                random_state=0)
    pipeline = get_model_Combined(features_RID,estimator)

    splitter = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
    fold_scores = cross_val_score(
        pipeline, X_train, y_train, n_jobs=-1,scoring='roc_auc',cv=splitter)
    return fold_scores.mean()

In [None]:
# Run 50 trials of the Ridge objective; the study keeps the best one.
study_ridge.optimize(opt_ridge, n_trials=50)

In [None]:
print('Total number of trials: ',len(study_ridge.trials))
trial_ridge = study_ridge.best_trial
# The study maximises the mean ROC AUC, so the best value is reported as is
# (negating it would print a negative AUC).
print('Best score : {}'.format(trial_ridge.value))
for key, value in trial_ridge.params.items():
    print("    {}: {}".format(key, value))
# Read the tuned values by name rather than by position in the params dict.
alpha_RID = trial_ridge.params['alpha']
b = int(trial_ridge.params['b'])

### Train LightGBM

In [None]:
study_name2 = 'lgb'
# The seed must be passed by keyword: TPESampler's first positional argument
# is `consider_prior`, so `TPESampler(0)` would leave the sampler unseeded.
study_lgb = optuna.create_study(study_name=study_name2,direction='maximize',sampler=TPESampler(seed=0))

In [None]:
def opt_lgb(trial):
    """Optuna objective for the LightGBM stacker trained with focal loss.

    Samples 'num_leaves', the bagging fraction ('bfrac') and frequency
    ('bfreq'), the column sampling rate ('feature') and the L2 penalty
    ('lambda_l2'), then scores the resulting pipeline on the module-level
    X_train / y_train.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean ROC AUC over a 5-fold stratified cross-validation.
    """
    num_leaves = int(trial.suggest_loguniform("num_leaves", 3,20))
    # No trailing commas on these assignments: a trailing comma would wrap
    # each sampled value in a 1-tuple instead of passing a scalar to LightGBM.
    subsample = trial.suggest_discrete_uniform('bfrac',0.5,1.0,q=0.05)
    subsample_freq = int(trial.suggest_discrete_uniform('bfreq',1,5,q=1.0))
    colsample_bytree = trial.suggest_discrete_uniform('feature',0.5,1.0,q=0.05)
    reg_lambda = trial.suggest_loguniform("lambda_l2", 1e-7, 10)
    kFold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
    model = lgb.LGBMClassifier(objective=focal_binary_lgb,random_state=0,
                               num_leaves=num_leaves,
                               subsample=subsample,subsample_freq=subsample_freq,
                               colsample_bytree=colsample_bytree,reg_lambda=reg_lambda)
    clf = get_model_Combined(features_LGB,model)
    scoring = 'roc_auc'
    return cross_val_score(
        clf, X_train, y_train, n_jobs=-1,scoring=scoring,cv=kFold).mean()


In [None]:
# Run 50 trials of the LightGBM objective; the study keeps the best one.
study_lgb.optimize(opt_lgb, n_trials=50)

In [None]:
print('Total number of trials: ',len(study_lgb.trials))
trial_lgb = study_lgb.best_trial
# The study maximises the mean ROC AUC, so the best value is reported as is
# (negating it would print a negative AUC).
print('Best score : {}'.format(trial_lgb.value))
for key, value in trial_lgb.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Read the tuned values by the names used in opt_lgb rather than by their
# position in the params dict, so the mapping cannot silently shift.
num_leaves = int(trial_lgb.params['num_leaves'])
bfrac = trial_lgb.params['bfrac']
bfreq = int(trial_lgb.params['bfreq'])
feature = trial_lgb.params['feature']
lambda_l2 = trial_lgb.params['lambda_l2']

### Train NN

I only ran 10 trials. You can increase the number of trials and also
change the parameter ranges to get better results.

In [None]:
study_name3 = 'nn'
# The seed must be passed by keyword: TPESampler's first positional argument
# is `consider_prior`, so `TPESampler(0)` would leave the sampler unseeded.
study_nn = optuna.create_study(study_name=study_name3,direction='maximize',sampler=TPESampler(seed=0))

In [None]:
def opt_nn(trial):
    """Optuna objective for the MLP stacker.

    Samples the L2 penalty ('alpha') and the width of the single hidden layer
    ('z', truncated to an int), then scores the resulting pipeline on the
    module-level X_train / y_train.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean ROC AUC over a 5-fold stratified cross-validation.
    """
    l2_penalty = trial.suggest_loguniform('alpha',1e-6,10)
    hidden_units = int(trial.suggest_loguniform('z',4,32))

    splitter = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
    estimator = MLPClassifier(hidden_layer_sizes=[hidden_units],alpha=l2_penalty,
                              random_state=0,tol=1e-4,max_iter=200)
    pipeline = get_model_Combined(features_NN,estimator)

    fold_scores = cross_val_score(
        pipeline, X_train, y_train, n_jobs=-1,scoring='roc_auc',cv=splitter)
    return fold_scores.mean()

In [None]:
# Run 10 trials of the MLP objective; the study keeps the best one.
study_nn.optimize(opt_nn, n_trials=10)

In [None]:
print('Total number of trials: ',len(study_nn.trials))
trial_nn = study_nn.best_trial
# The study maximises the mean ROC AUC, so the best value is reported as is
# (negating it would print a negative AUC).
print('Best score : {}'.format(trial_nn.value))
for key, value in trial_nn.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Read the tuned values by the names used in opt_nn rather than by their
# position in the params dict.
alpha_nn = trial_nn.params['alpha']
z = int(trial_nn.params['z'])

## Obtain OOF Score

In [None]:
# Re-create each stacker with the best hyper-parameters found by its study.
model_final_Ridge = RidgeClassifier(
    alpha=alpha_RID,
    class_weight={0:1,1:b},
    random_state=0,
)

_lgb_best_params = dict(
    num_leaves=num_leaves,
    subsample=bfrac,
    subsample_freq=bfreq,
    colsample_bytree=feature,
    reg_lambda=lambda_l2,
)
model_final_LGB = lgb.LGBMClassifier(objective=focal_binary_lgb,random_state=0,
                                     **_lgb_best_params)

model_final_NN = MLPClassifier(
    hidden_layer_sizes=[z],
    alpha=alpha_nn,
    random_state=0,
    tol=1e-4,
    max_iter=200,
)

In [None]:
def cv_training(train_data,y_train_data,info,model):
    """Produce calibrated out-of-fold and fold-averaged test predictions.

    For every fold of a 5-fold stratified split, the preprocessor described by
    ``info`` is fitted on the training part only, a sigmoid-calibrated copy of
    ``model`` is fitted on the transformed training part, and predictions are
    made for the validation part and for the test set.

    Besides its arguments, this function reads two module-level variables:
    ``image_names`` (indexed positionally, so it must line up row by row with
    ``train_data``) and ``X_test``.

    Args:
        train_data: DataFrame of stacking features.
        y_train_data: Series of targets aligned positionally with train_data.
        info: Feature configuration dict passed to get_model_Combined.
        model: Estimator wrapped in CalibratedClassifierCV for each fold.

    Returns:
        Tuple ``(scores, oof_df, y_pred_test)``: the per-fold ROC AUC values,
        a DataFrame with columns 'image_name', 'predictions' and 'target' for
        the validation rows (in fold order), and the test-set probabilities
        averaged over the folds.
    """
    kFold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
    oof_pred = []
    oof_target = []
    oof_images = []
    scores = []
    y_pred_test = 0.0
    for fold, (trn_idx, val_idx) in enumerate(kFold.split(train_data,y_train_data)):
        print('Fold: ',fold)
        X_train_cv = train_data.iloc[trn_idx]
        X_val_cv = train_data.iloc[val_idx]
        y_train_cv = y_train_data.iloc[trn_idx]
        y_val_cv = y_train_data.iloc[val_idx]
        images_val = image_names.iloc[val_idx]

        # Fit the preprocessing on the training part only so nothing from the
        # validation rows or the test set leaks into the encoders.
        preprocessor = get_model_Combined(info,model).named_steps['preprocessor']
        X_train_cv_temp = preprocessor.fit_transform(X_train_cv,y_train_cv)
        X_val_cv_temp = preprocessor.transform(X_val_cv)
        X_test_temp = preprocessor.transform(X_test)

        # The calibrator runs its own inner CV on the training part.
        clf_sigmoid_temp = CalibratedClassifierCV(model, cv=kFold, method='sigmoid')
        clf_sigmoid_temp.fit(X_train_cv_temp, y_train_cv)
        y_pred = clf_sigmoid_temp.predict_proba(X_val_cv_temp)[:,1]
        y_pred_test = y_pred_test + clf_sigmoid_temp.predict_proba(X_test_temp)[:,1]

        score_temp = roc_auc_score(y_val_cv,y_pred)
        scores.append(score_temp)
        oof_pred.append(y_pred)
        oof_target.append(y_val_cv)
        oof_images.append(images_val)
        print(score_temp)

    # Average over however many folds the splitter produced instead of a
    # hard-coded 5, and concatenate the per-fold pieces whatever their number.
    y_pred_test = y_pred_test / kFold.get_n_splits()
    oof_df = pd.DataFrame({'image_name':np.concatenate(oof_images),
                           'predictions':np.concatenate(oof_pred),
                           'target':np.concatenate(oof_target)})
    return scores, oof_df, y_pred_test

In [None]:
# Out-of-fold scores, OOF frame and fold-averaged test predictions for Ridge.
RID_scores,RID_oof_df,y_pred_test_RID = cv_training(X_train,y_train,features_RID,model_final_Ridge)

In [None]:
# Out-of-fold scores, OOF frame and fold-averaged test predictions for LightGBM.
LGB_scores,LGB_oof_df,y_pred_test_LGB = cv_training(X_train,y_train,features_LGB,model_final_LGB)

In [None]:
# Out-of-fold scores, OOF frame and fold-averaged test predictions for the MLP.
NN_scores,NN_oof_df,y_pred_test_NN = cv_training(X_train,y_train,features_NN,model_final_NN)

In [None]:
# ROC AUC over all out-of-fold predictions pooled together, one line per model.
for _caption, _oof_frame in (
        ('Overall auc RID: ', RID_oof_df),
        ('Overall auc LGB: ', LGB_oof_df),
        ('Overall auc NN: ', NN_oof_df)):
    print(_caption, roc_auc_score(_oof_frame['target'], _oof_frame['predictions']))


In [None]:
# Persist each model's out-of-fold predictions for later blending.
for _oof_frame, _csv_path in (
        (RID_oof_df, 'RID_oof_df.csv'),
        (LGB_oof_df, 'LGB_oof_df.csv'),
        (NN_oof_df, 'NN_oof_df.csv')):
    _oof_frame.to_csv(_csv_path, index=False)

## Make Predictions

In [None]:
# Wrap every final model in a sigmoid (Platt) probability calibrator.
# https://scikit-learn.org/stable/modules/calibration.html
# All three calibrators share the same 5-fold stratified splitter.
kFold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
clf_sigmoid_Ridge, clf_sigmoid_LGB, clf_sigmoid_NN = (
    CalibratedClassifierCV(base_model, cv=kFold, method="sigmoid")
    for base_model in (model_final_Ridge, model_final_LGB, model_final_NN)
)

In [None]:
# Each model has its own preprocessor: fit it on the full training set, then
# apply the fitted transformation to the test set.
ct_RID = get_model_Combined(features_RID,model_final_Ridge)
_prep_RID = ct_RID.named_steps['preprocessor']
X_train_RID = _prep_RID.fit_transform(X_train,y_train)
X_test_RID = _prep_RID.transform(X_test)

ct_LGB = get_model_Combined(features_LGB,model_final_LGB)
_prep_LGB = ct_LGB.named_steps['preprocessor']
X_train_LGB = _prep_LGB.fit_transform(X_train,y_train)
X_test_LGB = _prep_LGB.transform(X_test)

ct_NN = get_model_Combined(features_NN,model_final_NN)
_prep_NN = ct_NN.named_steps['preprocessor']
X_train_NN = _prep_NN.fit_transform(X_train,y_train)
X_test_NN = _prep_NN.transform(X_test)

In [None]:
# Fit each calibrated model on its own transformed training set.
clf_sigmoid_Ridge.fit(X_train_RID, y_train)
clf_sigmoid_LGB.fit(X_train_LGB, y_train)
clf_sigmoid_NN.fit(X_train_NN, y_train)

In [None]:
# Calibrated positive-class probabilities on the test set, one array per model.
y_pred_Ridge, y_pred_LGB, y_pred_NN = (
    calibrated.predict_proba(test_matrix)[:,1]
    for calibrated, test_matrix in (
        (clf_sigmoid_Ridge, X_test_RID),
        (clf_sigmoid_LGB, X_test_LGB),
        (clf_sigmoid_NN, X_test_NN),
    )
)

In [None]:
# Weighted blend of the three stackers (0.4 / 0.4 / 0.2); choose your own weights.
submission = pd.read_csv('../input/siim-isic-melanoma-classification/sample_submission.csv')
_blend = 0.4 * y_pred_Ridge + 0.4 * y_pred_LGB + 0.2 * y_pred_NN

# The prediction arrays follow the row order of X_test, i.e. the outer merge
# of `sub` and `test_df`. Rebuild that order and align on 'image_name' rather
# than assuming the sample submission happens to list images in the same order.
# NOTE(review): assumes sample_submission.csv has an 'image_name' column.
_test_image_names = pd.merge(sub, test_df, on='image_name', how='outer')['image_name'].values
_pred_by_image = pd.Series(_blend, index=_test_image_names)
submission['target'] = submission['image_name'].map(_pred_by_image)
if submission['target'].isna().any():
    raise ValueError('Some images in the sample submission have no prediction')

In [None]:
# Quick sanity check: distribution of the blended test predictions.
plt.hist(submission['target'])

In [None]:
# Write the submission file and display its first rows.
submission.to_csv('submission.csv',index=False)
submission.head()

If you find this notebook helpful, you may also want to check my other notebooks:

Feature Selection: https://www.kaggle.com/chen2222/feature-selection-mdi-perm-rfe-in-depth-review

LightGBM Tuning:https://www.kaggle.com/chen2222/lightgbm-tuning-step-by-step-optuna-0-122-lb

Simple Blending: https://www.kaggle.com/chen2222/ridge-lgb-nn-on-meta-data-optuna-focal-loss