__Imbalanced binary classification - employee attrition__

1. [Import](#Import)
    1. [Tools](#Tools)
    1. [Data](#Data)    
1. [EDA](#EDA)
    1. [Categorical feature EDA](#Categorical-feature-EDA)
    1. [Numeric feature EDA](#numeric-feature-EDA)
    1. [Faceting](#Faceting)
    1. [Target variable evaluation](#Target-variable-evaluation)    
1. [Data preparation](#Data-preparation)
    1. [Missing data](#Missing-data)
    1. [Engineering](#Engineering)
    1. [Encoding](#Encoding)
    1. [Transformation](#Transformation)
        1. [Polynomial features](#Polynomial-features)
        1. [Skew](#Skew)
        1. [Scale](#Scale)
    1. [Outliers](#Outliers)
1. [Feature importance](#Feature-importance)    
1. [Modeling](#Modeling)
    1. [Data preparation](#Data-preparation-1)
    1. [Bayesian hyper-parameter optimization](#Bayesian-hyper-parameter-optimization)
    1. [Model performance evaluation - standard models](#Model-performance-evaluation-standard-models)
    1. [Validation set evaluation - standard models](#Validation-set-evaluation-standard-models)
    1. [Model explainability](#Model-explanability)
1. [Stacking](#Stacking)
    1. [Primary models](#Primary-models)
    1. [Meta model](#Meta-model)                
    1. [Model performance evaluation - stacked models](#Model-performance-evaluation-stacked-models)
    1. [Validation set evaluation - stacked models](#Validation-set-evaluation-stacked-models)


# Import

<a id = 'Import'></a>

## Tools

<a id = 'Tools'></a>

In [None]:
# standard library and settings
import os
import sys
import importlib
import itertools
from functools import reduce
# rundate is today's date formatted as YYYYMMDD
import time; rundate = time.strftime("%Y%m%d")

import warnings
warnings.simplefilter("ignore")

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# data extensions and settings
import numpy as np
np.set_printoptions(threshold=np.inf, suppress=True)

import pandas as pd
pd.set_option("display.max_rows", 500); pd.set_option("display.max_columns", 500)
pd.options.display.float_format = "{:,.6f}".format

# modeling extensions
import sklearn.base as base
import sklearn.datasets as datasets
import sklearn.ensemble as ensemble
import sklearn.impute as impute
import sklearn.linear_model as linear_model
import sklearn.neighbors as neighbors
import sklearn.pipeline as pipeline
import sklearn.preprocessing as preprocessing
import sklearn.svm as svm

import lightgbm
import xgboost

from hyperopt import hp

import eif
import shap
shap.initjs()
from eli5.sklearn import PermutationImportance
from pdpbox import pdp, get_dataset, info_plots

# visualization extensions and settings
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

%matplotlib inline

# Make local development checkouts of mlmachine and prettierplot importable.
# The paths are appended to the end of sys.path, so a pip-installed copy (found
# earlier on the path) still takes precedence over a local checkout.
for localPath in ("../../../mlmachine", "../../../prettierplot"):
    # test for the exact path being appended, so re-running this cell does not
    # keep adding duplicate entries to sys.path
    if localPath not in sys.path:
        sys.path.append(localPath)

try:
    import mlmachine as mlm
    import mlmachine.data as data
    from mlmachine.features.preprocessing import (
        DataFrameSelector,
        PlayWithPandas,
        UnprocessedColumnAdder,
        ContextImputer,
        PandasFeatureUnion,
        DualTransformer,
    )
    from prettierplot.plotter import PrettierPlot
    import prettierplot.style as style
except ModuleNotFoundError:
    # neither an installed package nor a local checkout was found: tell the user
    # how to fix it, then re-raise so later cells do not fail with NameErrors
    print(
        "This notebook relies on the libraries mlmachine and prettierplot. Please run:"
    )
    print("\tpip install mlmachine")
    print("\tpip install prettierplot")
    raise

## Data

<a id = 'Data'></a>

In [None]:
# load the attrition dataset through mlmachine.data and report its dimensions
dataset = data.attrition()
# alternative source:
# data = pd.read_csv("s3://tdp-ml-datasets/kaggle-employee-attrition/IbmEmployeeAttrition.csv")

datasetShape = dataset.shape
print(f"Training data dimensions: {datasetShape}")

In [None]:
# display info and first 5 rows
# (`data` is the mlmachine.data module; the loaded DataFrame is `dataset`)
dataset.info()
display(dataset[:5])

In [None]:
# review counts of different column types
dataset.dtypes.value_counts()

In [None]:
# split dataset into train and validation datasets
dfTrain, dfValid = mlm.trainTestCompile(data=dataset, targetCol='Attrition')

In [None]:
# Load training data into mlmachine
train = mlm.Machine(
    data=dfTrain,
    target=["Attrition"],
    removeFeatures=["EmployeeNumber","EmployeeCount","StandardHours","Over18"],
    forceToCategorical=["Education","EnvironmentSatisfaction","JobInvolvement","JobLevel","JobSatisfaction",
                    "MaritalStatus","PerformanceRating","RelationshipSatisfaction",
                   "StockOptionLevel","TrainingTimesLastYear","WorkLifeBalance"],
    targetType="categorical",
)
print(train.data.shape)

In [None]:
# Load validation data into mlmachine, using the same target, removed features
# and forced-categorical features as the training Machine
valid = mlm.Machine(
    data=dfValid,
    target=["Attrition"],
    removeFeatures=["EmployeeNumber","EmployeeCount","StandardHours","Over18"],
    forceToCategorical=["Education","EnvironmentSatisfaction","JobInvolvement","JobLevel","JobSatisfaction",
                    "MaritalStatus","PerformanceRating","RelationshipSatisfaction",
                   "StockOptionLevel","TrainingTimesLastYear","WorkLifeBalance"],
    targetType="categorical",
)
print(valid.data.shape)

# EDA

<a id = 'EDA'></a>

## Categorical feature EDA

<a id = 'Categorical-feature-EDA'></a>

##### Univariate & feature vs. target

In [None]:
# categorical features
for feature in train.featureType["categorical"]:
    train.edaCatTargetCatFeat(feature=feature)

## Numeric feature EDA

<a id = 'numeric-feature-EDA'></a>

##### Univariate & feature vs. target

In [None]:
# numeric features
for feature in train.featureType["numeric"]:
    train.edaCatTargetNumFeat(feature=feature)

##### Correlation

###### Correlation (all samples)

In [None]:
# correlation heat map
p = PrettierPlot()
ax = p.makeCanvas()
p.prettyCorrHeatmap(df=train.data, annot=False, ax=ax)

###### Correlation (top vs. target)

In [None]:
# correlation heat map with most highly correlated features relative to the target
p = PrettierPlot(plotOrientation='tall')
ax = p.makeCanvas()
p.prettyCorrHeatmapTarget(
    df=train.data, target=train.target, thresh=0.02, annot=True, ax=ax
)

##### Pair plot

In [None]:
# pair plot
p = PrettierPlot(chartProp=12)
p.prettyPairPlot(df=train.data, cols=train.featureType['numeric'], diag_kind="auto")

In [None]:
# pair plot
p = PrettierPlot(chartProp=12)
p.prettyPairPlot(
    df=train.data.dropna(),
    diag_kind="kde",
    target=train.target,
    cols=train.featureType['numeric'][:10],
    legendLabels=["Stays", "Leaves"],
    bbox=(2.0, 0.0),
)

## Faceting

<a id = 'Faceting'></a>

##### Split bars

In [None]:
# facet MaritalStatus vs. Gender
p = PrettierPlot(chartProp=12)
ax = p.makeCanvas(title="Attrition, MaritalStatus vs. Gender", yShift=0.7)
p.prettyFacetTwoCatBar(
    df=train.recombineData(train.data, train.target),
    x="MaritalStatus",
    y=train.target.name,
    split="Gender",
    yUnits="fff",
    bbox = (1.2, 0.8),
    ax=ax,
)

In [None]:
# facet BusinessTravel vs. Gender
p = PrettierPlot(chartProp=12)
ax = p.makeCanvas(title="Attrition, BusinessTravel vs. Gender", yShift=0.7)
p.prettyFacetTwoCatBar(
    df=train.recombineData(train.data, train.target),
    x="BusinessTravel",
    y=train.target.name,
    split="Gender",
    yUnits="fff",
    bbox = (1.2, 0.8),
    ax=ax,
)

In [None]:
# facet JobSatisfaction vs. Gender
p = PrettierPlot(chartProp=12)
ax = p.makeCanvas(title="Attrition, JobSatisfaction vs. Gender", yShift=0.7)
p.prettyFacetTwoCatBar(
    df=train.recombineData(train.data, train.target),
    x="JobSatisfaction",
    y=train.target.name,
    split="Gender",
    yUnits="fff",
    bbox = (1.2, 0.8),
    ax=ax,
)

In [None]:
# facet JobSatisfaction vs. Education
p = PrettierPlot(chartProp=12)
ax = p.makeCanvas(title="Attrition, JobSatisfaction by Education", yShift=0.7)
p.prettyFacetTwoCatBar(
    df=train.recombineData(train.data, train.target),
    x="JobSatisfaction",
    y=train.target.name,
    split="Education",
    yUnits="fff",
    bbox = (1.3, 0.8),
    ax=ax,
    # one label per Education level (five labels are supplied)
    legendLabels = ['Below College','College','Bachelor','Master','Doctor']
)

##### Points plots

In [None]:
# facet point plot: x=Education, y=target, split by Gender, catCol=JobSatisfaction
p = PrettierPlot()
p.prettyFacetTwoCatPoint(
    df=train.recombineData(train.data, train.target),
    x="Education",
    y=train.target.name,
    split="Gender",
    catCol="JobSatisfaction",
    height=5,
    bbox=(1.3, 1.2),
#     NOTE(review): these disabled labels look like leftovers from a Titanic notebook
#     legendLabels=["1st class", "2nd class", "3rd class"],
)

In [None]:
# facet point plot: x=BusinessTravel, y=target, split by Gender, catCol=MaritalStatus
p = PrettierPlot()
p.prettyFacetTwoCatPoint(
    df=train.recombineData(train.data, train.target),
    x="BusinessTravel",
    y=train.target.name,
    split="Gender",
    catCol="MaritalStatus",
    aspect = 1.4,
    height=5,
    bbox=(1.3, 1.2),
#     NOTE(review): these disabled labels look like leftovers from a Titanic notebook
#     legendLabels=["1st class", "2nd class", "3rd class"],
)

In [None]:
# #
# p = PrettierPlot()
# p.prettyFacetCatNumHist(
#     df=train.edaData(train.data, train.target),
#     split=train.target.name,
#     legendLabels=["Died", "Lived"],
#     catRow="Sex",
#     catCol="Embarked",
#     numCol="Age",
#     bbox=(1.9, 1.0),
#     height=4,
#     aspect=1,
# )

In [None]:
# #
# p = PrettierPlot(chartProp=15)
# p.prettyFacetCatNumScatter(
#     df=train.edaData(train.data, train.target),
#     split=train.target.name,
#     legendLabels=["Died", "Lived"],
#     catRow="Sex",
#     catCol="Embarked",
#     xNum="Fare",
#     yNum="Age",
#     bbox=(1.9, 1.0),
#     height=4,
#     aspect=1,
# )

## Target variable evaluation

<a id = 'Target-variable-evaluation'></a>

In [None]:
# null score
pd.Series(train.target).value_counts(normalize=True)

# Data preparation

<a id = 'Data-preparation'></a>

## Missing data

<a id = 'Missing-data'></a>

##### Training

In [None]:
# evaluate missing data
train.edaMissingSummary()

##### Validation

In [None]:
# evaluate missing data
valid.edaMissingSummary()

##### Training vs. validation


In [None]:
# compare feature with missing data
train.missingColCompare(train=train.data, validation=valid.data)

##### Impute

Nothing to impute

<a id = 'Impute'></a>

## Engineering

<a id = 'Engineering'></a>

##### Training

In [None]:
# evaluate additional features
for feature in train.featureType["categorical"]:
    train.edaCatTargetCatFeat(feature=feature)

##### Validation

## Encoding

<a id = 'Encoding'></a>

##### Training

In [None]:
# counts of unique values in training data string columns
train.data[train.featureType["categorical"]].apply(pd.Series.nunique, axis=0)

In [None]:
# print unique values in each categorical columns
for col in train.data[train.featureType["categorical"]]:
    print(col, np.unique(train.data[col]))

##### Validation

In [None]:
# counts of unique values in validation data string columns
valid.data[valid.featureType["categorical"]].apply(pd.Series.nunique, axis=0)

In [None]:
# print unique values in each categorical columns
for col in valid.data[valid.featureType["categorical"]]:
    if col not in [""]:
        print(col, np.unique(valid.data[col]))

##### Training vs. validation

In [None]:
# for every categorical feature, report the levels that occur in only one of the
# training / validation datasets
for col in train.featureType["categorical"]:
    trainLevels = set(train.data[col].unique())
    validLevels = set(valid.data[col].unique())

    onlyInTrain = trainLevels - validLevels
    onlyInValid = validLevels - trainLevels

    # both difference sets empty: the feature has the same levels in both datasets
    if not onlyInTrain and not onlyInValid:
        print(' {} = fully represented'.format(col))
        continue

    print("\n\n*** " + col)
    print("Value present in training data, not in validation data")
    print(onlyInTrain)
    print("Value present in validation data, not in training data")
    print(onlyInValid)

##### Encode

In [None]:
# encode pipeline
# nominal features are one-hot encoded; ordinal features are mapped to integer
# codes following the explicit level order listed in ordinalEncodings
nominalColumns = ["MaritalStatus","EducationField","Department","Gender","JobRole","OverTime"]

ordinalColumns = ["Education","EnvironmentSatisfaction","JobInvolvement","JobLevel",
                 "JobSatisfaction","PerformanceRating","RelationshipSatisfaction",
                 "StockOptionLevel","TrainingTimesLastYear","WorkLifeBalance","BusinessTravel"]
# one list of levels per entry in ordinalColumns, in the same order
ordinalEncodings = [
         [1, 2, 3, 4, 5], # Education
         [1, 2, 3, 4], # EnvironmentSatisfaction
         [1, 2, 3, 4], # JobInvolvement
         [1, 2, 3, 4, 5], # JobLevel
         [1, 2, 3, 4], # JobSatisfaction
         [3, 4], # PerformanceRating
         [1, 2, 3, 4], # RelationshipSatisfaction
         [0, 1, 2, 3], # StockOptionLevel
         [0, 1, 2, 3, 4, 5, 6], # TrainingTimesLastYear
         [1, 2, 3, 4], # WorkLifeBalance
         ['Non-Travel','Travel_Rarely','Travel_Frequently'], # BusinessTravel
    ]

# Columns that are passed through untouched. Built with a list comprehension
# rather than list(set(...)) so the original column order is kept: set ordering
# of strings changes between kernel sessions, which would make the column order
# of the transformed data non-reproducible.
encodedColumns = set(nominalColumns + ordinalColumns)
passthroughColumns = [col for col in train.data.columns if col not in encodedColumns]

encodePipe = PandasFeatureUnion([
    ("ordinal", pipeline.make_pipeline(
        DataFrameSelector(ordinalColumns),
        PlayWithPandas(preprocessing.OrdinalEncoder(categories=ordinalEncodings)),
    )),
    ("nominal", pipeline.make_pipeline(
        DataFrameSelector(nominalColumns),
        PlayWithPandas(preprocessing.OneHotEncoder()),
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthroughColumns),
    )),
])

# fit on the training data only, then apply the fitted encoders to validation
train.data = encodePipe.fit_transform(train.data)
valid.data = encodePipe.transform(valid.data)

train.featureTypeUpdate()
valid.featureTypeUpdate()

## Transformation

<a id = 'Transformation'></a>

### Polynomial features

<a id = 'Polynomial-features'></a>

##### Transformation

In [None]:
# polynomial pipe
# expand the numeric features into degree-2 polynomial terms (squares and
# pairwise products, no bias column); all other columns pass through unchanged
numericColumns = train.featureType["numeric"]
# Keep the original column order for the passthrough columns. list(set(...))
# orders strings differently between kernel sessions, which would make the
# resulting column order non-reproducible.
numericColumnSet = set(numericColumns)
passthroughColumns = [col for col in train.data.columns if col not in numericColumnSet]

polynomialPipe = PandasFeatureUnion([
    ("polynomial", pipeline.make_pipeline(
        DataFrameSelector(numericColumns),
        PlayWithPandas(preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False))
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthroughColumns),
    )),
])

# fit on the training data only, then apply to validation
train.data = polynomialPipe.fit_transform(train.data)
valid.data = polynomialPipe.transform(valid.data)

train.featureTypeUpdate()
valid.featureTypeUpdate()

### Skew

<a id = 'Skew'></a>

##### Training

In [None]:
# evaluate skew of numeric features - training data
train.skewSummary()

##### Validation

In [None]:
# evaluate skew of numeric features - validation data
valid.skewSummary()

##### Transform

In [None]:
# # skew pipeline
# skewPipe = PandasFeatureUnion([
#     ("skew", pipeline.make_pipeline(
#         DataFrameSelector(train.featureType["numeric"]),
#         DualTransformer(),
#     )),
#     ("diff", pipeline.make_pipeline(
#         DataFrameSelector(list(set(train.data.columns).difference(train.featureType["numeric"]))),
#     )),
# ])

# train.data = skewPipe.fit_transform(train.data)
# valid.data = skewPipe.transform(valid.data)

# train.featureTypeUpdate()
# valid.featureTypeUpdate()

### Scale

<a id = 'Scale'></a>

##### Transformation

In [None]:
# scale pipeline
# standardize the numeric features; all other columns pass through unchanged
numericColumns = train.featureType["numeric"]
# Keep the original column order for the passthrough columns. list(set(...))
# orders strings differently between kernel sessions, which would make the
# resulting column order non-reproducible.
numericColumnSet = set(numericColumns)
passthroughColumns = [col for col in train.data.columns if col not in numericColumnSet]

scalePipe = PandasFeatureUnion([
    ("scale", pipeline.make_pipeline(
        DataFrameSelector(numericColumns),
        PlayWithPandas(preprocessing.StandardScaler())
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthroughColumns),
    )),
])

# scaling statistics come from the training data only
train.data = scalePipe.fit_transform(train.data)
valid.data = scalePipe.transform(valid.data)

train.featureTypeUpdate()
valid.featureTypeUpdate()

## Outliers


<a id = 'Outliers'></a>

In [None]:
# identify outliers using IQR
# dropOutliers=False: the step is used to flag observations, not to remove them here
# NOTE(review): the pipeline is transformed without a prior fit; presumably
# OutlierIQR populates `outliers_` inside transform - confirm
trainPipe = pipeline.Pipeline([
    ("outlier",train.OutlierIQR(
                outlierCount=5,
                iqrStep=1.5,
                features=train.featureType["numeric"],
                dropOutliers=False,))
    ])
train.data = trainPipe.transform(train.data)

# capture outliers as a sorted array taken from the step's `outliers_` attribute
iqrOutliers = np.array(sorted(trainPipe.named_steps["outlier"].outliers_))
print(iqrOutliers)

In [None]:
# identify outliers using Isolation Forest
# every tree is built on max_samples = number of rows; contamination=0.01 sets the
# expected share of outliers
# NOTE(review): the `behaviour` argument was deprecated in scikit-learn 0.22 and
# removed in 0.24 - drop it when running on a newer release
clf = ensemble.IsolationForest(
    behaviour="new", max_samples=train.data.shape[0], random_state=0, contamination=0.01
)
clf.fit(train.data[train.data.columns])
preds = clf.predict(train.data[train.data.columns])

# evaluate index values
# predict() labels outliers as -1; collect the index labels of those rows
mask = np.isin(preds, -1)
ifOutliers = np.array(train.data[mask].index)
print(ifOutliers)

In [None]:
# identify outliers using extended isolation forest
# dropOutliers=False: the step is used to flag observations, not to remove them here
# NOTE(review): as with the IQR step, the pipeline is transformed without a prior
# fit; presumably ExtendedIsoForest populates `outliers_` inside transform - confirm
trainPipe = pipeline.Pipeline([
    ("outlier",train.ExtendedIsoForest(
                cols=train.featureType["numeric"],
                nTrees=100,
                sampleSize=256,
                ExtensionLevel=1,
                anomaliesRatio=0.03,
                dropOutliers=False,))
    ])
train.data = trainPipe.transform(train.data)

# capture outliers as a sorted array taken from the step's `outliers_` attribute
eifOutliers = np.array(sorted(trainPipe.named_steps["outlier"].outliers_))
print(eifOutliers)

In [None]:
# keep only the observations flagged by all three detectors (IQR, isolation
# forest, extended isolation forest)
iqrAndIf = np.intersect1d(iqrOutliers, ifOutliers)
outliers = np.intersect1d(iqrAndIf, eifOutliers)
# outliers = np.intersect1d(ifOutliers, eifOutliers)
print(outliers)

In [None]:
# review outlier identification summary
outlierSummary = train.outlierSummary(iqrOutliers=iqrOutliers,
                             ifOutliers=ifOutliers,
                             eifOutliers=eifOutliers
                            )
outlierSummary[outlierSummary["Count"] >= 3].index

In [None]:
# remove outliers from predictors and response
# the index labels are hard-coded here and overwrite the `outliers` array computed
# by the detection cells
# NOTE(review): presumably copied from an earlier detection run - re-check them if
# the train/validation split changes
outliers = np.array([123, 63, 976, 237, 126, 914, 473, 187, 270, 875, 1116, 427])
train.data = train.data.drop(outliers)
train.target = train.target.drop(index=outliers)

# Feature importance

<a id = 'Feature-importance'></a>

In [None]:
# generate feature importance summary
estimators = [
    lightgbm.LGBMClassifier,
    ensemble.RandomForestClassifier,
    ensemble.GradientBoostingClassifier,
    ensemble.ExtraTreesClassifier,
    ensemble.AdaBoostClassifier,
    xgboost.XGBClassifier,
]

fs = train.FeatureSelector(
    data=train.data, target=train.target, estimators=estimators, rank=True
)
featureSelectorSummary = fs.featureSelectorSuite()

In [None]:
# calculate cross-validation performance
# NOTE(review): `estimators` is rebound here but is not passed to
# featureSelectorCrossVal below; presumably `fs` keeps using the estimators it was
# constructed with - confirm whether this list should be passed in
estimators = [
    svm.SVC,
    lightgbm.LGBMClassifier,
    linear_model.LogisticRegression,
    xgboost.XGBClassifier,
    ensemble.RandomForestClassifier,
    ensemble.GradientBoostingClassifier,
    #ensemble.AdaBoostClassifier,
    ensemble.ExtraTreesClassifier,
    neighbors.KNeighborsClassifier,
]

# score with both accuracy and ROC AUC using 8 folds and step=1
cvSummary = fs.featureSelectorCrossVal(
    scoring=["accuracy","roc_auc"],
    nFolds=8,
    step=1
)

###### Accuracy

In [None]:
# visualize CV performance for diminishing feature set
fs.featureSelectorResultsPlot(
    metric="accuracy",
    titleScale=0.8,
)

In [None]:
df = fs.featuresUsedSummary(metric="accuracy")
df

In [None]:
# list feature that showed up in at least X models
df[df["count"] >= 6].index

###### ROC AUC

In [None]:
# visualize CV performance for diminishing feature set
fs.featureSelectorResultsPlot(
    metric="roc_auc",
    titleScale=0.8,
)

In [None]:
df = fs.featuresUsedSummary(metric="roc_auc")
df

In [None]:
df[df["count"] >= 6].index

# Modeling

<a id = 'Modeling'></a>

## Data preparation

<a id = 'Data-preparation-1'></a>

In [None]:
#################################################################################
# import data
dataset = data.attrition()
# alternative source (assign to `dataset`, not `data`: `data` is the mlmachine.data
# module used on the line above)
# dataset = pd.read_csv("s3://tdp-ml-datasets/kaggle-employee-attrition/IbmEmployeeAttrition.csv")

# split dataset into train and validation datasets
dfTrain, dfValid = mlm.trainTestCompile(data=dataset, targetCol='Attrition')


def buildMachine(df):
    """Wrap an attrition DataFrame in an mlmachine Machine.

    The training and validation objects must be configured identically, so the
    settings live in one place. Fresh list literals are created on every call,
    so the two Machine objects never share list instances.
    """
    return mlm.Machine(
        data=df,
        target=["Attrition"],
        removeFeatures=["EmployeeNumber","EmployeeCount","StandardHours","Over18"],
        forceToCategorical=["Education","EnvironmentSatisfaction","JobInvolvement","JobLevel","JobSatisfaction",
                            "MaritalStatus","PerformanceRating","RelationshipSatisfaction",
                            "StockOptionLevel","TrainingTimesLastYear","WorkLifeBalance"],
        targetType="categorical",
    )


# import training and validation data
train = buildMachine(dfTrain)
valid = buildMachine(dfValid)

#################################################################################
# feature transformation pipeline
nominalColumns = ["MaritalStatus","EducationField","Department","Gender","JobRole","OverTime"]

ordinalColumns = ["Education","EnvironmentSatisfaction","JobInvolvement","JobLevel",
                 "JobSatisfaction","PerformanceRating","RelationshipSatisfaction",
                 "StockOptionLevel","TrainingTimesLastYear","WorkLifeBalance","BusinessTravel"]
# one list of levels per entry in ordinalColumns, in the same order
ordinalEncodings = [
         [1, 2, 3, 4, 5], # Education
         [1, 2, 3, 4], # EnvironmentSatisfaction
         [1, 2, 3, 4], # JobInvolvement
         [1, 2, 3, 4, 5], # JobLevel
         [1, 2, 3, 4], # JobSatisfaction
         [3, 4], # PerformanceRating
         [1, 2, 3, 4], # RelationshipSatisfaction
         [0, 1, 2, 3], # StockOptionLevel
         [0, 1, 2, 3, 4, 5, 6], # TrainingTimesLastYear
         [1, 2, 3, 4], # WorkLifeBalance
         ['Non-Travel','Travel_Rarely','Travel_Frequently'], # BusinessTravel
    ]

# Columns handled by none of the three branches pass through untouched. A list
# comprehension keeps the original column order; list(set(...)) would order the
# names differently between kernel sessions.
numericColumns = train.featureType["numeric"]
handledColumns = set(nominalColumns + ordinalColumns + numericColumns)
passthroughColumns = [col for col in train.data.columns if col not in handledColumns]

transformPipe = PandasFeatureUnion([
    ("ordinal", pipeline.make_pipeline(
        DataFrameSelector(ordinalColumns),
        PlayWithPandas(preprocessing.OrdinalEncoder(categories=ordinalEncodings)),
    )),
    ("nominal", pipeline.make_pipeline(
        DataFrameSelector(nominalColumns),
        PlayWithPandas(preprocessing.OneHotEncoder()),
    )),
    ("numeric", pipeline.make_pipeline(
        DataFrameSelector(numericColumns),
        PlayWithPandas(preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)),
#         DualTransformer(),
        PlayWithPandas(preprocessing.StandardScaler())
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthroughColumns),
    )),
])

# fit on the training data only, then apply the fitted transformers to validation
train.data = transformPipe.fit_transform(train.data)
valid.data = transformPipe.transform(valid.data)

train.featureTypeUpdate()
valid.featureTypeUpdate()

#################################################################################
# remove outliers (hard-coded index labels of the training observations to drop)
outliers = np.array([123, 63, 976, 237, 126, 914, 473, 187, 270, 875, 1116, 427])
train.data = train.data.drop(outliers)
train.target = train.target.drop(index=outliers)

#################################################################################
# restrict both datasets to the selected features
bestCols = ['MonthlyIncome*PercentSalaryHike', 'EnvironmentSatisfaction',
       'Age*YearsAtCompany', 'Age*MonthlyIncome', 'JobSatisfaction',
       'StockOptionLevel', 'JobInvolvement', 'MonthlyIncome', 'OverTime_No',
       'Age*HourlyRate', 'HourlyRate*TotalWorkingYears',
       'HourlyRate*MonthlyIncome', 'DailyRate*MonthlyIncome', 'BusinessTravel',
       'OverTime_Yes', 'Age*PercentSalaryHike', 'HourlyRate*YearsAtCompany',
       'WorkLifeBalance', 'Age*DailyRate',
       'DistanceFromHome*PercentSalaryHike'] # ROC
# alternative feature set, selected on accuracy:
# bestCols = ['MonthlyIncome*PercentSalaryHike', 'EnvironmentSatisfaction',
#        'Age*YearsAtCompany', 'Age*MonthlyIncome', 'JobSatisfaction',
#        'StockOptionLevel', 'JobInvolvement', 'MonthlyIncome', 'OverTime_No',
#        'Age*HourlyRate', 'HourlyRate*TotalWorkingYears',
#        'HourlyRate*MonthlyIncome', 'DailyRate*MonthlyIncome', 'BusinessTravel',
#        'OverTime_Yes'] # accuracy
train.data = train.data[bestCols]
valid.data = valid.data[bestCols]

## Bayesian hyper-parameter optimization

<a id = 'Bayesian-hyper-parameter-optimization'></a>

In [None]:
# model/parameter space
# Maps an estimator's dotted name (as a string) to a dict of hyperopt search
# dimensions, one entry per hyper-parameter.
# NOTE(review): hyperopt labels such as "learning_rate", "max_depth" and
# "n_estimators" are reused across estimators; presumably each estimator's
# sub-dict is searched on its own - confirm.
allSpace = {
    "lightgbm.LGBMClassifier": {
        "class_weight": hp.choice("class_weight", [None, "balanced"]),
        "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
        "boosting_type": hp.choice("boosting_type", ["gbdt", "dart", "goss"])
        # disabled alternative: nested choice tying `subsample` to the boosting type
        # ,'boosting_type': hp.choice('boosting_type'
        #                    ,[{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)}
        #                    ,{'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)}
        #                    ,{'boosting_type': 'goss', 'subsample': 1.0}])
        ,
        "learning_rate": hp.uniform("learning_rate", 0.000001, 0.2),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        # NOTE(review): hp.uniform samples floats, while min_child_samples,
        # num_leaves and subsample_for_bin are integer parameters in LightGBM;
        # presumably the search routine casts them - confirm
        "min_child_samples": hp.uniform("min_child_samples", 20, 500),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "num_leaves": hp.uniform("num_leaves", 8, 150),
        "reg_alpha": hp.uniform("reg_alpha", 0.0, 1.0),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
        "subsample_for_bin": hp.uniform("subsample_for_bin", 20000, 400000),
    },
    "linear_model.LogisticRegression": {
        # C is sampled log-uniformly between 0.001 and 0.2
        "C": hp.loguniform("C", np.log(0.001), np.log(0.2)),
        "penalty": hp.choice("penalty", ["l1", "l2"]),
    },
    "xgboost.XGBClassifier": {
        "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
        "gamma": hp.uniform("gamma", 0.0, 10),
        "learning_rate": hp.uniform("learning_rate", 0.000001, 0.2),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "min_child_weight": hp.uniform("min_child_weight", 1, 20),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "subsample": hp.uniform("subsample", 0.5, 1),
    },
    "ensemble.RandomForestClassifier": {
        "bootstrap": hp.choice("bootstrap", [True, False]),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "max_features": hp.choice("max_features", ["auto", "sqrt"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    },
    "ensemble.GradientBoostingClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "max_features": hp.choice("max_features", ["auto", "sqrt"]),
        "learning_rate": hp.uniform("learning_rate", 0.000001, 0.2),
        "loss": hp.choice("loss", ["deviance", "exponential"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    },
    "ensemble.AdaBoostClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "learning_rate": hp.uniform("learning_rate", 0.000001, 0.2),
        "algorithm": hp.choice("algorithm", ["SAMME", "SAMME.R"]),
    },
    "ensemble.ExtraTreesClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
        "max_features": hp.choice("max_features", ["auto", "sqrt"]),
        "criterion": hp.choice("criterion", ["gini", "entropy"]),
    },
    "svm.SVC": {
        "C": hp.uniform("C", 0.00001, 10),
        "decision_function_shape": hp.choice("decision_function_shape", ["ovo", "ovr"]),
        "gamma": hp.uniform("gamma", 0.00001, 10),
    },
    "neighbors.KNeighborsClassifier": {
        "algorithm": hp.choice("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]),
        "n_neighbors": hp.choice("n_neighbors", np.arange(1, 20, dtype=int)),
        "weights": hp.choice("weights", ["distance", "uniform"]),
    },
}

In [None]:
# execute bayesian optimization grid search
train.execBayesOptimSearch(
    allSpace=allSpace,
    data=train.data,
    target=train.target,
    scoring="roc_auc",
    nFolds=8,
    nJobs=8,
    iters=2000,
    verbose=0,
)

##### Model loss by iteration

In [None]:
# read scores summary table
# NOTE(review): the file path is an empty string, so this call cannot succeed as
# written; presumably it should point at the CSV produced by the optimization
# search above - fill in the actual file name before running
bayesOptimSummary = pd.read_csv("", na_values="nan")
bayesOptimSummary[:5]

In [None]:
# model loss plot
for estimator in np.unique(bayesOptimSummary["estimator"]):
    train.modelLossPlot(bayesOptimSummary=bayesOptimSummary, estimator=estimator)

##### Parameter selection by iteration

In [None]:
# estimator parameter plots
for estimator in np.unique(bayesOptimSummary["estimator"]):
    train.modelParamPlot(
        bayesOptimSummary=bayesOptimSummary,
        estimator=estimator,
        allSpace=allSpace,
        nIter=100,
        chartProp=15,
    )

In [None]:
# scratch space for visualising how a hyperopt distribution samples (1000 draws)
# NOTE(review): hp.uniform is given log-transformed bounds, so it samples uniformly
# between log(0.4) and log(0.6), both negative; if a log-uniform draw between 0.4
# and 0.6 was intended, hp.loguniform is presumably the right call - confirm
sampleSpace = {
                'param': hp.uniform('param', np.log(0.4), np.log(0.6))
#     "": 0.000001 + hp.uniform("gamma", 0.000001, 10)
    #             'param2': hp.loguniform('param2', np.log(0.001), np.log(0.01))
}

train.samplePlot(sampleSpace, 1000)

## Model performance evaluation - standard models

<a id = 'Model-performance-evaluation-standard-models'></a>

In [None]:
topModels = train.topBayesOptimModels(bayesOptimSummary=bayesOptimSummary, numModels=1)
topModels

In [None]:
# classification panel, single model
estimator = "xgboost.XGBClassifier"; modelIter = 218
# estimator = 'ensemble.GradientBoostingClassifier'; modelIter = 590
# estimator = 'xgboost.XGBClassifier'; modelIter = 380

model = train.BayesOptimModelBuilder(
    bayesOptimSummary=bayesOptimSummary, estimator=estimator, modelIter=modelIter
)

train.classificationPanel(
    model=model, XTrain=train.data, yTrain=train.target, cmLabels=['Stays', 'Quits']
)

In [None]:
# create classification reports
for estimator, modelIters in topModels.items():
    for modelIter in modelIters:
        model = train.BayesOptimModelBuilder(
            bayesOptimSummary=bayesOptimSummary,
            estimator=estimator,
            modelIter=modelIter,
        )
        train.classificationPanel(
            model=model, XTrain=train.data, yTrain=train.target, cmLabels=['Stays', 'Quits']
        )

## Validation set evaluation - standard models

<a id = 'Validation-set-evaluation-standard-models'></a>

In [None]:
## standard model fit and predict
# select estimator and iteration
# NOTE(review): the other single-model cells in this notebook use modelIter = 218
# for xgboost.XGBClassifier; confirm that 418 is the intended iteration here
# estimator = "lightgbm.LGBMClassifier"; modelIter = 476
estimator = "xgboost.XGBClassifier"; modelIter = 418
# estimator = "ensemble.RandomForestClassifier"; modelIter = 382
# estimator = "ensemble.GradientBoostingClassifier"; modelIter = 238
# estimator = "svm.SVC"; modelIter = 135

# extract params and instantiate model
model = train.BayesOptimModelBuilder(
    bayesOptimSummary=bayesOptimSummary, estimator=estimator, modelIter=modelIter
)

# classification panel for validation data
# both the training and the validation data are passed to the panel
train.classificationPanel(
    model=model,
    XTrain=train.data,
    yTrain=train.target,
    XValid=valid.data,
    yValid=valid.target,
    cmLabels=['Stays', 'Quits'],
)

In [None]:
# create classification reports on the validation data for every top model
for estimator, modelIters in topModels.items():
    for modelIter in modelIters:
        # rebuild the estimator with the parameters recorded for this iteration
        model = train.BayesOptimModelBuilder(
            bayesOptimSummary=bayesOptimSummary,
            estimator=estimator,
            modelIter=modelIter,
        )
        train.classificationPanel(
            model=model,
            XTrain=train.data,
            yTrain=train.target,
            XValid=valid.data,
            yValid=valid.target,
            # same keyword and labels as every other classificationPanel call in
            # this notebook (this cell previously passed labels=[0, 1])
            cmLabels=['Stays', 'Quits'],
        )

## Model explainability


<a id = 'Model-explanability'></a>

In [None]:
# select the tuned model to explain - keep exactly one selection active
# (the ExtraTrees selection was previously assigned and then immediately
# overwritten by the XGBoost selection, so it never took effect)
# estimator = "ensemble.ExtraTreesClassifier"; modelIter = 145
estimator = "xgboost.XGBClassifier"; modelIter = 218

# extract params and instantiate model
model = train.BayesOptimModelBuilder(
    bayesOptimSummary=bayesOptimSummary, estimator=estimator, modelIter=modelIter
)

# fit on the training data so the explanation cells below work with a fitted model
model.fit(train.data.values, train.target.values)

##### Permutation importance

In [None]:
# permutation importance - how much does performance decrease when shuffling a certain feature?
perm = PermutationImportance(model.model, random_state=1).fit(train.data, train.target)

# take the feature names from the frame that was permuted rather than relying on
# a global `featureNames`, which is only assigned in later cells of this section
eli5.show_weights(perm, feature_names=train.data.columns.tolist())

##### SHAP values - training data

###### Force plots - single observations

In [None]:
# SHAP force plot for each of the first five training observations
for obsIx in train.data.index[:5]:
    train.singleShapVizTree(obsIx=obsIx, model=model, data=train.data)

###### Force plots - multiple observations

In [None]:
# SHAP force plot across every observation in the training data
visual = train.multiShapVizTree(
    obsIxs=train.data.index,
    model=model,
    data=train.data,
)
# bare expression as the last statement so the notebook displays the result
visual

###### Dependence plots

In [None]:
# SHAP values for every training observation; the second returned item is discarded
obsData, _, obsShapValues = train.multiShapValueTree(
    obsIxs=train.data.index,
    model=model,
    data=train.data,
)

In [None]:
# features shown in the SHAP dependence plot grid
gridFeatures = [
    "BusinessTravel", "Age", "WorkLifeBalance", "Education",
    "DistanceFromHome", "MonthlyIncome", "Gender_Male",
]

# draw the grid from the training-data SHAP values computed above
train.shapDependenceGrid(
    obsData=obsData,
    obsShapValues=obsShapValues,
    gridFeatures=gridFeatures,
    allFeatures=train.data.columns,
    dotSize=35,
    alpha=0.5,
)

In [None]:
# one SHAP dependence plot: "Age" on the scatter axis, colored by "BusinessTravel"
p = PrettierPlot()
ax = p.makeCanvas()

train.shapDependencePlot(
    obsData=obsData,
    obsShapValues=obsShapValues,
    scatterFeature="Age",
    colorFeature="BusinessTravel",
    featureNames=train.data.columns,
    dotSize=50,
    alpha=0.5,
    ax=ax,
)

In [None]:
# SHAP dependence plots for all features relative to one interaction feature
featureNames = train.data.columns.tolist()

# order feature positions by summed absolute SHAP value, largest first
shapMagnitude = np.sum(np.abs(obsShapValues), axis=0)
topShap = np.argsort(-shapMagnitude)

for topIx in topShap:
    # fresh canvas for every feature
    p = PrettierPlot()
    ax = p.makeCanvas()

    train.shapDependencePlot(
        obsData=obsData,
        obsShapValues=obsShapValues,
        scatterFeature=featureNames[topIx],
        colorFeature="Age",
        featureNames=featureNames,
        dotSize=50,
        alpha=0.5,
        ax=ax,
    )

###### Summary plots

In [None]:
# SHAP summary plot for the training-data SHAP values
train.shapSummaryPlot(
    obsData=obsData,
    obsShapValues=obsShapValues,
    featureNames=train.data.columns,
)

##### SHAP values - validation data

###### Force plots - single observations

In [None]:
# SHAP force plot for each of the first five validation observations
for obsIx in valid.data.index[:5]:
    valid.singleShapVizTree(obsIx=obsIx, model=model, data=valid.data)

###### Force plots - multiple observations

In [None]:
# SHAP force plot across every observation in the validation data
visual = valid.multiShapVizTree(
    obsIxs=valid.data.index,
    model=model,
    data=valid.data,
)
# bare expression as the last statement so the notebook displays the result
visual

###### Dependence plots

In [None]:
# SHAP values for every validation observation; the second returned item is discarded
obsData, _, obsShapValues = valid.multiShapValueTree(
    obsIxs=valid.data.index,
    model=model,
    data=valid.data,
)

In [None]:
# features shown in the SHAP dependence plot grid
gridFeatures = [
    "BusinessTravel", "Age", "WorkLifeBalance", "Education",
    "DistanceFromHome", "MonthlyIncome", "Gender_Male",
]

# draw the grid from the validation-data SHAP values computed above
valid.shapDependenceGrid(
    obsData=obsData,
    obsShapValues=obsShapValues,
    gridFeatures=gridFeatures,
    allFeatures=valid.data.columns,
    dotSize=35,
    alpha=0.5,
)

In [None]:
# one SHAP dependence plot: "Age" on the scatter axis, colored by "BusinessTravel"
p = PrettierPlot()
ax = p.makeCanvas()

valid.shapDependencePlot(
    obsData=obsData,
    obsShapValues=obsShapValues,
    scatterFeature="Age",
    colorFeature="BusinessTravel",
    featureNames=valid.data.columns,
    dotSize=50,
    alpha=0.5,
    ax=ax,
)

In [None]:
# SHAP dependence plots for all features relative to one interaction feature
featureNames = valid.data.columns.tolist()

# order feature positions by summed absolute SHAP value, largest first
shapMagnitude = np.sum(np.abs(obsShapValues), axis=0)
topShap = np.argsort(-shapMagnitude)

for topIx in topShap:
    # fresh canvas for every feature
    p = PrettierPlot()
    ax = p.makeCanvas()

    valid.shapDependencePlot(
        obsData=obsData,
        obsShapValues=obsShapValues,
        scatterFeature=featureNames[topIx],
        colorFeature="Age",
        featureNames=featureNames,
        dotSize=50,
        alpha=0.5,
        ax=ax,
    )

###### Summary plots

In [None]:
# SHAP summary plot for the validation-data SHAP values
valid.shapSummaryPlot(
    obsData=obsData,
    obsShapValues=obsShapValues,
    featureNames=valid.data.columns,
)

# Stacking

<a id = 'Stacking'></a>

## Primary models

<a id = 'Primary-models'></a>

In [None]:
# first-level models: collect out-of-fold predictions from the top models;
# the call returns three items, unpacked as oofTrain, oofValid and columns
oofTrain, oofValid, columns = train.modelStacker(
    models=topModels,
    bayesOptimSummary=bayesOptimSummary,
    # plain arrays (.values) are handed over rather than the pandas objects
    XTrain=train.data.values,
    yTrain=train.target.values,
    XValid=valid.data.values,
    nFolds=10,
    nJobs=10,
)

In [None]:
# annotated correlation heatmap of the out-of-fold training predictions
p = PrettierPlot()
ax = p.makeCanvas()
p.prettyCorrHeatmap(
    df=pd.DataFrame(oofTrain, columns=columns),
    annot=True,
    ax=ax,
    vmin=0,
)

## Meta model

<a id = 'Meta-model'></a>

In [None]:
# parameter space for the meta model search: one hyperopt search space per
# estimator, keyed by the estimator's dotted name
allSpace = {}

# LightGBM
allSpace["lightgbm.LGBMClassifier"] = {
    "class_weight": hp.choice("class_weight", [None]),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
    "boosting_type": hp.choice("boosting_type", ["dart"]),
    "subsample": hp.uniform("subsample", 0.5, 1),
    "learning_rate": hp.uniform("learning_rate", 0.15, 0.25),
    "max_depth": hp.choice("max_depth", np.arange(4, 20, dtype=int)),
    "min_child_samples": hp.quniform("min_child_samples", 50, 150, 5),
    "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
    "num_leaves": hp.quniform("num_leaves", 30, 70, 1),
    "reg_alpha": hp.uniform("reg_alpha", 0.75, 1.25),
    "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
    "subsample_for_bin": hp.quniform("subsample_for_bin", 100000, 350000, 20000),
}

# XGBoost
allSpace["xgboost.XGBClassifier"] = {
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
    "gamma": hp.quniform("gamma", 0.0, 10, 0.05),
    "learning_rate": hp.quniform("learning_rate", 0.01, 0.2, 0.01),
    "max_depth": hp.choice("max_depth", np.arange(2, 15, dtype=int)),
    "min_child_weight": hp.quniform("min_child_weight", 2.5, 7.5, 1),
    "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
    "subsample": hp.uniform("subsample", 0.4, 0.7),
}

# random forest
allSpace["ensemble.RandomForestClassifier"] = {
    "bootstrap": hp.choice("bootstrap", [True, False]),
    "max_depth": hp.choice("max_depth", np.arange(2, 10, dtype=int)),
    "n_estimators": hp.choice("n_estimators", np.arange(100, 8000, 10, dtype=int)),
    "max_features": hp.choice("max_features", ["sqrt"]),
    "min_samples_split": hp.choice("min_samples_split", np.arange(15, 25, dtype=int)),
    "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 20, dtype=int)),
}

# gradient boosting
allSpace["ensemble.GradientBoostingClassifier"] = {
    "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
    "max_depth": hp.choice("max_depth", np.arange(2, 11, dtype=int)),
    "max_features": hp.choice("max_features", ["sqrt"]),
    "learning_rate": hp.quniform("learning_rate", 0.01, 0.09, 0.01),
    "loss": hp.choice("loss", ["deviance", "exponential"]),
    "min_samples_split": hp.choice("min_samples_split", np.arange(2, 40, dtype=int)),
    "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
}

# support vector classifier
allSpace["svm.SVC"] = {
    "C": hp.uniform("C", 0.00000001, 15),
    "decision_function_shape": hp.choice("decision_function_shape", ["ovr", "ovo"]),
    "gamma": hp.uniform("gamma", 0.00000001, 1.5),
}

In [None]:
# run the bayesian optimization search for the meta model; the out-of-fold
# training predictions are the inputs and the training target is the response
train.execBayesOptimSearch(
    allSpace=allSpace,
    # output file name is assembled from the run date and the analysis name
    resultsDir="{}_hyperopt_meta_{}.csv".format(rundate, analysis),
    X=oofTrain,
    y=train.target,
    scoring="f1_micro",
    nFolds=8,
    nJobs=10,
    iters=1000,
    verbose=0,
)

In [None]:
# load the meta model scores summary written by a specific, hard-coded run
analysis, rundate = "attrition", "20190807"
bayesOptimSummaryMeta = pd.read_csv(
    "{}_hyperopt_meta_{}.csv".format(rundate, analysis)
)

# preview the first five rows
bayesOptimSummaryMeta.head(5)

In [None]:
# one loss plot per distinct estimator found in the meta summary
for estimator in np.unique(bayesOptimSummaryMeta["estimator"]):
    train.modelLossPlot(
        bayesOptimSummary=bayesOptimSummaryMeta,
        estimator=estimator,
    )

In [None]:
# parameter plots for each distinct estimator found in the meta summary
for estimator in np.unique(bayesOptimSummaryMeta["estimator"]):
    train.modelParamPlot(
        bayesOptimSummary=bayesOptimSummaryMeta,
        estimator=estimator,
        allSpace=allSpace,
        nIter=100,
        chartProp=15,
    )

## Model performance evaluation - stacked models

<a id = 'Model-performance-evaluation-stacked-models'></a>

In [None]:
# top meta models from the meta summary (numModels=1); note that this rebinds
# `topModels`, which earlier cells used for the standard models
topModels = train.topBayesOptimModels(
    bayesOptimSummary=bayesOptimSummaryMeta,
    numModels=1,
)
# bare expression as the last statement so the notebook displays the result
topModels

In [None]:
# second-level (meta) model - keep exactly one estimator / iteration selection active
estimator, modelIter = "lightgbm.LGBMClassifier", 668
# estimator, modelIter = "xgboost.XGBClassifier", 380
# estimator, modelIter = "ensemble.RandomForestClassifier", 411
# estimator, modelIter = "ensemble.GradientBoostingClassifier", 590
# estimator, modelIter = "svm.SVC", 135

# pull the stored parameters from the meta summary and instantiate the model
model = train.BayesOptimModelBuilder(
    bayesOptimSummary=bayesOptimSummaryMeta,
    estimator=estimator,
    modelIter=modelIter,
)

# classification panel on the out-of-fold training predictions
train.classificationPanel(
    model=model,
    XTrain=oofTrain,
    yTrain=train.target,
    labels=[0, 1],
)

In [None]:
# classification report on the out-of-fold training predictions for every top meta model
for estimator, modelIters in topModels.items():
    for modelIter in modelIters:
        # rebuild the meta model for this estimator / iteration pair
        model = train.BayesOptimModelBuilder(
            bayesOptimSummary=bayesOptimSummaryMeta, estimator=estimator, modelIter=modelIter
        )
        train.classificationPanel(
            model=model,
            XTrain=oofTrain,
            yTrain=train.target,
            labels=[0, 1],
            nFolds=4,
        )

## Validation set evaluation - stacked models


<a id = 'Validation-set-evaluation-stacked-models'></a>

In [None]:
# stacked (meta) model fit and predict
# keep exactly one estimator / iteration selection active
estimator, modelIter = "lightgbm.LGBMClassifier", 668
# estimator, modelIter = "xgboost.XGBClassifier", 380
# estimator, modelIter = "ensemble.RandomForestClassifier", 411
# estimator, modelIter = "ensemble.GradientBoostingClassifier", 590
# estimator, modelIter = "svm.SVC", 135

# pull the stored parameters from the meta summary and instantiate the model
model = train.BayesOptimModelBuilder(
    bayesOptimSummary=bayesOptimSummaryMeta,
    estimator=estimator,
    modelIter=modelIter,
)

# fit on the out-of-fold training predictions
model.fit(oofTrain, train.target.values)

# predict from the validation-set predictions of the first-level models
yPred = model.predict(oofValid)

In [None]:
# classification panel for the selected meta model, with both the training and
# the validation stacked inputs supplied
train.classificationPanel(
    model=model,
    XTrain=oofTrain,
    yTrain=train.target,
    XValid=oofValid,
    yValid=valid.target,
    labels=[0, 1],
)

In [None]:
# classification report on the validation stacked inputs for every top meta model
for estimator, modelIters in topModels.items():
    for modelIter in modelIters:
        # rebuild the meta model for this estimator / iteration pair
        model = train.BayesOptimModelBuilder(
            bayesOptimSummary=bayesOptimSummaryMeta, estimator=estimator, modelIter=modelIter
        )
        train.classificationPanel(
            model=model,
            XTrain=oofTrain,
            yTrain=train.target,
            XValid=oofValid,
            yValid=valid.target,
            labels=[0, 1],
        )