 __Project - Employee attrition__

1. [Import](#Import)
    1. [Tools](#Tools)
    1. [Data](#Data)    
1. [EDA](#EDA)
    1. [Category feature EDA](#Category-feature-EDA)
    1. [Count feature EDA](#Count-feature-EDA)
    1. [Continuous feature EDA](#Continuous-feature-EDA)
    1. [Faceting](#Faceting)
    1. [Target variable evaluation](#Target-variable-evaluation)    
1. [Data preparation](#Data-preparation)
    1. [Missing data](#Missing-data)
    1. [Feature engineering](#Feature-engineering)
        1. [Handcrafted](#Handcrafted)
        1. [Polynomial features](#Polynomial-features)
        1. [Encoding](#Encoding)
    1. [Feature transformation](#Feature-transformation)
        1. [Skew correction](#Skew-correction)
        1. [Scaling](#Scaling)     
    1. [Outliers](#Outliers)
    1. [Additional exploratory data analysis](#Additional-exploratory-data-analysis)
1. [Feature importance](#Feature-importance)    
1. [Modeling](#Modeling)
    1. [Data preparation](#Data-preparation-1)
    1. [Bayesian hyper-parameter optimization](#Bayesian-hyper-parameter-optimization)
    1. [Model performance evaluation - standard models](#Model-performance-evaluation-standard-models)
    1. [Validation set evaluation - standard models](#Validation-set-evaluation-standard-models)
    1. [Model explainability](#Model-explanability)
1. [Stacking](#Stacking)
    1. [Primary models](#Primary-models)
    1. [Meta model](#Meta-model)                
    1. [Model performance evaluation - stacked models](#Model-performance-evaluation-stacked-models)
    1. [Validation set evaluation - stacked models](#Validation-set-evaluation-stacked-models)


# Import

<a id = 'Import'></a>

## Tools

<a id = 'Tools'></a>

In [None]:
# standard library and settings
import copy
import os
import sys
import importlib
import itertools
from functools import reduce
import time

rundate = time.strftime("%Y%m%d")

import warnings

warnings.simplefilter("ignore")

# import from the public IPython.display module: importing `display` from
# IPython.core.display is deprecated and fails on recent IPython releases
from IPython.display import display, HTML

# widen the notebook container so wide tables and charts are not clipped
display(HTML("<style>.container { width:95% !important; }</style>"))

# data extensions and settings
import numpy as np

np.set_printoptions(threshold=np.inf, suppress=True)

import pandas as pd

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
# pd.options.display.float_format = "{:,.6f}".format

# modeling extensions
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.datasets import (
    load_boston,
    load_wine,
    load_iris,
    load_breast_cancer,
    make_blobs,
    make_moons,
)
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier,
    IsolationForest,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import (
    Lasso,
    Ridge,
    ElasticNet,
    LinearRegression,
    LogisticRegression,
    SGDRegressor,
)
from sklearn.model_selection import (
    KFold,
    train_test_split,
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    RandomizedSearchCV,
)
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    PolynomialFeatures,
    OrdinalEncoder,
    LabelEncoder,
    OneHotEncoder,
    KBinsDiscretizer,
    QuantileTransformer,
    PowerTransformer,
    MinMaxScaler,
)
from sklearn.svm import SVC, SVR
from category_encoders import (
    WOEEncoder,
    TargetEncoder,
    CatBoostEncoder,
    BinaryEncoder,
    CountEncoder,
)

from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor

from hyperopt import hp

import eif
import shap

shap.initjs()

# visualization extensions and settings
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import squarify

%matplotlib inline

# Make local source checkouts of mlmachine / prettierplot importable as a
# fallback. sys.path.append adds to the END of the search path, and each path is
# only added once, so re-running this cell does not keep growing sys.path.
for local_repo in ("../../../mlmachine", "../../../prettierplot"):
    if local_repo not in sys.path:
        sys.path.append(local_repo)

try:
    import mlmachine as mlm
    import mlmachine.data as data
    from mlmachine.features.preprocessing import (
        DataFrameSelector,
        PandasPipeline,
        KFoldSelectEncoder,
        ContextImputer,
        PandasFeatureUnion,
        DualTransformer,
    )
    from prettierplot.plotter import PrettierPlot
    import prettierplot.style as style
except ModuleNotFoundError:
    # neither an installed package nor a local checkout was found: tell the
    # user how to fix it, then re-raise so the notebook stops here instead of
    # failing later with a NameError
    print(
        "This notebook relies on the libraries mlmachine and prettierplot. Please run:"
    )
    print("\tpip install mlmachine")
    print("\tpip install prettierplot")
    raise

## Data

<a id = 'Data'></a>

In [None]:
# fetch the attrition dataset shipped with mlmachine and report its shape
dataset = data.attrition()
print(f"Training data dimensions: {dataset.shape}")

In [None]:
# column summary followed by a preview of the leading five rows
dataset.info()
display(dataset.head(5))

In [None]:
# review counts of different column types (number of columns per dtype)
dataset.dtypes.value_counts()

In [None]:
# carve the full dataset into a training frame and a validation frame,
# using Attrition as the target column
df_train, df_valid = mlm.train_test_df_compile(
    data=dataset,
    target_col="Attrition",
)

In [None]:
# Feature groupings that are handed to mlm.Machine below.

# passed as identify_as_continuous
continuous = [
    "Age",
    "DailyRate",
    "DistanceFromHome",
    "HourlyRate",
    "MonthlyIncome",
    "MonthlyRate",
    "PercentSalaryHike",
    "TotalWorkingYears",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
]

# passed as identify_as_count
count = [
    "NumCompaniesWorked",
    "TrainingTimesLastYear",
]

# passed as identify_as_nominal; these are also the columns one-hot encoded later
nominal = [
    "MaritalStatus",
    "EducationField",
    "Department",
    "Gender",
    "JobRole",
    "OverTime",
]

# passed as remove_features
# NOTE(review): presumably identifier / constant-valued columns — confirm against the data
remove_features = [
    "EmployeeNumber",
    "EmployeeCount",
    "StandardHours",
    "Over18",
]

# passed as identify_as_ordinal
ordinal = [
    "Education",
    "EnvironmentSatisfaction",
    "JobInvolvement",
    "JobLevel",
    "JobSatisfaction",
    "PerformanceRating",
    "RelationshipSatisfaction",
    "StockOptionLevel",
    "WorkLifeBalance",
    "BusinessTravel",
]

# Level list for every ordinal feature. The keys and values are later passed, in
# this order, to DataFrameSelector(include_columns=...) and
# OrdinalEncoder(categories=...), so the position of a level in its list is the
# code it is encoded to. BusinessTravel is the only entry with string levels.
ordinal_encodings = {
    "Education": [1, 2, 3, 4, 5],
    "EnvironmentSatisfaction": [1, 2, 3, 4],
    "JobInvolvement": [1, 2, 3, 4],
    "JobLevel": [1, 2, 3, 4, 5],
    "JobSatisfaction": [1, 2, 3, 4],
    "PerformanceRating": [3, 4],
    "RelationshipSatisfaction": [1, 2, 3, 4],
    "StockOptionLevel": [0, 1, 2, 3],
    "WorkLifeBalance": [1, 2, 3, 4],
    "BusinessTravel": ['Non-Travel','Travel_Rarely','Travel_Frequently'],
}

In [None]:
# wrap the training frame in an mlmachine Machine, declaring the target and
# how each feature group should be typed
train = mlm.Machine(
    data=df_train,
    target="Attrition",
    target_type="category",
    remove_features=remove_features,
    identify_as_continuous=continuous,
    identify_as_count=count,
    identify_as_nominal=nominal,
    identify_as_ordinal=ordinal,
    ordinal_encodings=ordinal_encodings,
)

In [None]:
# Load validation data into mlmachine
# NOTE(review): unlike the training Machine, no target_type is passed here —
# confirm that relying on the default is intentional
valid = mlm.Machine(
    data=df_valid,
    target="Attrition",
    remove_features=remove_features,
    identify_as_continuous=continuous,
    identify_as_count=count,    
    identify_as_nominal=nominal,
    identify_as_ordinal=ordinal,
    ordinal_encodings=ordinal_encodings,
)

# EDA

<a id = 'EDA'></a>

## Category feature EDA

<a id = 'Category-feature-EDA'></a>

In [None]:
# category features: one EDA panel per feature against the categorical target
for feature in train.data.mlm_dtypes["category"]:
    train.eda_cat_target_cat_feat(
        feature=feature,
        level_count_cap=20,
        legend_labels=["Stayed","Left"],
        chart_scale=15
    )

## Count feature EDA

<a id = 'Count-feature-EDA'></a>

In [None]:
# count features: handled with the same category-vs-category EDA method as the
# category features
for feature in train.data.mlm_dtypes["count"]:
    train.eda_cat_target_cat_feat(
        feature=feature,
        level_count_cap=20,
        legend_labels=["Stayed","Left"],
        chart_scale=15
    )

## Continuous feature EDA

<a id = 'Continuous-feature-EDA'></a>

In [None]:
# continuous features: one EDA panel per feature against the categorical target
for feature in train.data.mlm_dtypes["continuous"]:
    train.eda_cat_target_num_feat(
        feature=feature,
#         outliers_out_of_scope=10,
        legend_labels=["Stayed","Left"],
        chart_scale=12
    )

##### Correlation

In [None]:
# correlation heat map of the training features (cell annotations turned off)
p = PrettierPlot()
ax = p.make_canvas()
p.corr_heatmap(df=train.data, annot=False, ax=ax)

In [None]:
# correlation heat map with most highly correlated features relative to the target
# NOTE(review): thresh=0.02 is presumably the minimum correlation with the
# target for a feature to be shown — confirm against prettierplot
p = PrettierPlot(plot_orientation='tall')
ax = p.make_canvas()
p.corr_heatmap_target(
    df=train.data, target=train.target, thresh=0.02, annot=True, ax=ax
)

In [None]:
# pair plot of all continuous features (no target coloring)
p = PrettierPlot(chart_scale=12)
p.pair_plot(df=train.data, columns=train.data.mlm_dtypes['continuous'], diag_kind="auto")

In [None]:
# pair plot of the first 10 continuous features with the target passed in for
# the Stayed / Left legend; rows containing missing values are dropped first
p = PrettierPlot(chart_scale=12)
p.pair_plot(
    df=train.data.dropna(),
    diag_kind="kde",
    target=train.target,
    columns=train.data.mlm_dtypes['continuous'][:10],
    legend_labels=["Stayed","Left"],
    bbox=(2.0, 0.0),
)

## Faceting

<a id = 'Faceting'></a>

In [None]:
# facet MaritalStatus vs. Gender
# the target is recombined with the features so it can be used as the y value
p = PrettierPlot(chart_scale=12)
ax = p.make_canvas(title="Attrition, MaritalStatus vs. Gender", y_shift=0.7)
p.facet_two_cat_bar(
    df=train.recombine_data(train.data, train.target),
    x="MaritalStatus",
    y=train.target.name,
    split="Gender",
    y_units="fff",
    bbox = (1.2, 0.8),
    ax=ax,
)

In [None]:
# facet BusinessTravel vs. Gender
p = PrettierPlot(chart_scale=12)
ax = p.make_canvas(title="Attrition, BusinessTravel vs. Gender", y_shift=0.7)
p.facet_two_cat_bar(
    df=train.recombine_data(train.data, train.target),
    x="BusinessTravel",
    y=train.target.name,
    split="Gender",
    y_units="fff",
    bbox = (1.2, 0.8),
    ax=ax,
)

In [None]:
# facet JobSatisfaction vs. Gender
p = PrettierPlot(chart_scale=12)
ax = p.make_canvas(title="Attrition, JobSatisfaction vs. Gender", y_shift=0.7)
p.facet_two_cat_bar(
    df=train.recombine_data(train.data, train.target),
    x="JobSatisfaction",
    y=train.target.name,
    split="Gender",
    y_units="fff",
    bbox = (1.2, 0.8),
    ax=ax,
)

In [None]:
# facet JobSatisfaction by Education
# legend_labels supplies one display name per Education level (five labels for
# the five levels listed in ordinal_encodings)
p = PrettierPlot(chart_scale=12)
ax = p.make_canvas(title="Attrition, JobSatisfaction by Education", y_shift=0.7)
p.facet_two_cat_bar(
    df=train.recombine_data(train.data, train.target),
    x="JobSatisfaction",
    y=train.target.name,
    split="Education",
    y_units="fff",
    bbox = (1.3, 0.8),
    ax=ax,
    legend_labels = ['Below College','College','Bachelor','Master','Doctor']
)

In [None]:
# point-plot facets of the target: x = Education, split = Gender,
# faceted on JobSatisfaction (cat_col)
p = PrettierPlot()
p.facet_two_cat_point(
    df=train.recombine_data(train.data, train.target),
    x="Education",
    y=train.target.name,
    split="Gender",
    cat_col="JobSatisfaction",
    height=5,
    bbox=(1.3, 1.2),
#     legend_labels=["1st class", "2nd class", "3rd class"],
)

In [None]:
# point-plot facets of the target: x = BusinessTravel, split = Gender,
# faceted on MaritalStatus (cat_col)
p = PrettierPlot()
p.facet_two_cat_point(
    df=train.recombine_data(train.data, train.target),
    x="BusinessTravel",
    y=train.target.name,
    split="Gender",
    cat_col="MaritalStatus",
    aspect = 1.4,
    height=5,
    bbox=(1.3, 1.2),
)

In [None]:
# histogram facets of Age split by the target (Stayed / Left):
# rows = Gender, columns = Education
p = PrettierPlot()
p.facet_cat_num_hist(
    df=train.recombine_data(train.data, train.target),
    split=train.target.name,
    legend_labels=["Stayed", "Left"],
    cat_row="Gender",
    cat_col="Education",
    num_col="Age",
    bbox=(1.9, 1.0),
    height=4,
    aspect=1,
)

In [None]:
# histogram facets of Age split by the target (Stayed / Left):
# rows = Gender, columns = MaritalStatus
p = PrettierPlot()
p.facet_cat_num_hist(
    df=train.recombine_data(train.data, train.target),
    split=train.target.name,
    legend_labels=["Stayed", "Left"],
    cat_row="Gender",
    cat_col="MaritalStatus",
    num_col="Age",
    bbox=(1.9, 1.0),
    height=4,
    aspect=1.5,
)

## Target variable evaluation

<a id = 'Target-variable-evaluation'></a>

In [None]:
# null score: share of each target class; the largest share is the accuracy
# obtained by always predicting the majority class
pd.Series(train.target).value_counts(normalize=True)

# Data preparation

<a id = 'Data-preparation'></a>

## Missing data

<a id = 'Missing-data'></a>

In [None]:
# evaluate missing data - training data
train.eda_missing_summary()

In [None]:
# evaluate missing data - validation data
valid.eda_missing_summary()

In [None]:
# compare features with missing data between the training and validation sets
train.missing_col_compare(train=train.data, validation=valid.data)

<a id = 'Impute'></a>

## Feature engineering

<a id = 'Feature-engineering'></a>

### Handcrafted

<a id = 'Handcrafted'></a>

### Polynomial features

<a id = 'Polynomial-features'></a>

In [None]:
# transform pipe
#   "polynomial": continuous columns -> degree-2 PolynomialFeatures (squares and
#                 pairwise interactions, no bias column)
#   "diff":       every column that is not continuous, via the selector only
polynomial_pipe = PandasFeatureUnion([
    ("polynomial", make_pipeline(
        DataFrameSelector(include_mlm_dtypes=["continuous"]),
        PandasPipeline(PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)),
    )),
    ("diff", make_pipeline(
        DataFrameSelector(exclude_mlm_dtypes=["continuous"]),
    )),
])

# fit on the training data only, then apply the fitted pipe to the validation data
train.data = polynomial_pipe.fit_transform(train.data)
valid.data = polynomial_pipe.transform(valid.data)

# refresh the mlm_dtypes bookkeeping now that the column set has changed
train.update_dtypes()
valid.update_dtypes()

### Encoding

<a id = 'Encoding'></a>

In [None]:
# counts of unique values in each training data category column
train.data[train.data.mlm_dtypes["category"]].apply(pd.Series.nunique, axis=0)

In [None]:
# list the distinct levels of every category column in the training data
for col in train.data.mlm_dtypes["category"]:
    levels = np.unique(train.data[col])
    print(col, levels)

In [None]:
# counts of unique values in each validation data category column
valid.data[valid.data.mlm_dtypes["category"]].apply(pd.Series.nunique, axis=0)

In [None]:
# list the distinct levels of every category column in the validation data
for col in valid.data.mlm_dtypes["category"]:
    levels = np.unique(valid.data[col])
    print(col, levels)

In [None]:
# report, per category column, any level that appears in only one of the two
# datasets (training vs. validation)
for col in train.data.mlm_dtypes["category"]:
    train_values = train.data[col].unique()
    valid_values = valid.data[col].unique()

    train_diff = set(train_values).difference(valid_values)
    valid_diff = set(valid_values).difference(train_values)

    # nothing missing on either side
    if not train_diff and not valid_diff:
        print(' {} = fully represented'.format(col))
        continue

    print("\n\n*** " + col)
    print("Value present in training data, not in validation data")
    print(train_diff)
    print("Value present in validation data, not in training data")
    print(valid_diff)

In [None]:
# encode pipeline
#   "nominal": one-hot encode the nominal columns, dropping the first level
#   "ordinal": ordinal-encode using the level lists in ordinal_encodings
#   "bin":     discretize the continuous columns into ordinal bin codes
#   "diff":    every column other than the nominal and ordinal ones
# NOTE(review): "diff" does not exclude the continuous columns that "bin" also
# selects, so presumably both raw and binned versions are kept — confirm how
# PandasFeatureUnion names the resulting columns
encode_pipe = PandasFeatureUnion([
    ("nominal", make_pipeline(
        DataFrameSelector(include_columns=nominal),
        PandasPipeline(OneHotEncoder(drop="first")),
    )),
    ("ordinal", make_pipeline(
        DataFrameSelector(include_columns=list(ordinal_encodings.keys())),
        PandasPipeline(OrdinalEncoder(categories=list(ordinal_encodings.values()))),
    )),
    ("bin", make_pipeline(
        DataFrameSelector(include_columns=train.data.mlm_dtypes["continuous"]),
        PandasPipeline(KBinsDiscretizer(encode="ordinal")),
    )),
    ("diff", make_pipeline(
        DataFrameSelector(exclude_columns=nominal + list(ordinal_encodings.keys())),
    )),
])

# fit on the training data only, then apply the fitted pipe to the validation data
train.data = encode_pipe.fit_transform(train.data)
valid.data = encode_pipe.transform(valid.data)

train.update_dtypes()
valid.update_dtypes()

In [None]:
# encode_pipe = Pipeline([
#     ('nominal', OneHotEncoder()),
# ])

# encode_pipe.fit_transform(train.data[["MaritalStatus","Gender"]]).todense()[:10]



In [None]:
# train.data[["MaritalStatus","Gender"]].head(10)

In [None]:
# encode_pipe = PandasFeatureUnion([
#     ("nominal", make_pipeline(
#         PandasPipeline(OneHotEncoder()),
#     )),
# ])
# encode_pipe.fit_transform(train.data[["MaritalStatus","Gender"]]).head(10)

In [None]:
# encode_pipe = FeatureUnion([
#     ("nominal", make_pipeline(
#         OneHotEncoder(),
#     )),
# ])
# encode_pipe.fit_transform(train.data[["MaritalStatus","Gender"]]).todense()[:10]

In [None]:
# target-based encodings of the category columns
# Three branches select the same category columns and encode them with a
# different category_encoders class (TargetEncoder, WOEEncoder, CatBoostEncoder),
# each wrapped in mlmachine's KFoldSelectEncoder together with the training
# target and a 5-fold splitter. The "diff" branch selects the non-category
# columns.
#
# random_state is deliberately not passed to KFold: with shuffle=False the folds
# are already deterministic and the seed has no effect, and scikit-learn >= 0.24
# raises a ValueError when random_state is set without shuffle=True.
target_encode_pipe = PandasFeatureUnion([
    ("target", make_pipeline(
        DataFrameSelector(include_mlm_dtypes=["category"]),
        KFoldSelectEncoder(
            target=train.target,
            cv=KFold(n_splits=5, shuffle=False),
            encoder=TargetEncoder,
        ),
    )),
    ("woe", make_pipeline(
        DataFrameSelector(include_mlm_dtypes=["category"]),
        KFoldSelectEncoder(
            target=train.target,
            cv=KFold(n_splits=5, shuffle=False),
            encoder=WOEEncoder,
        ),
    )),
    ("catboost", make_pipeline(
        DataFrameSelector(include_mlm_dtypes=["category"]),
        KFoldSelectEncoder(
            target=train.target,
            cv=KFold(n_splits=5, shuffle=False),
            encoder=CatBoostEncoder,
        ),
    )),
    ("diff", make_pipeline(
        DataFrameSelector(exclude_mlm_dtypes=["category"]),
    )),
])

# fit on the training data only, then apply the fitted pipe to the validation data
train.data = target_encode_pipe.fit_transform(train.data)
valid.data = target_encode_pipe.transform(valid.data)

train.update_dtypes()
valid.update_dtypes()

## Feature transformation

<a id = 'Feature-transformation'></a>

### Skew correction

<a id = 'Skew-correction'></a>

In [None]:
# skew correction pipeline
# NOTE: the pipeline is only defined here; the fit_transform / transform calls
# below are commented out, so no skew correction is applied to the data
skew_pipe = PandasFeatureUnion([
    ("skew", make_pipeline(
        DataFrameSelector(include_mlm_dtypes=["continuous"]),
        DualTransformer(),
    )),    
    ("diff", make_pipeline(
        DataFrameSelector(exclude_mlm_dtypes=["continuous"]),
    )),
])

# train.data = skew_pipe.fit_transform(train.data)
# valid.data = skew_pipe.transform(valid.data)

# train.update_dtypes()
# valid.update_dtypes()

In [None]:
# evaluate skew of numeric features - training data
train.skew_summary()

In [None]:
# evaluate skew of numeric features - validation data
valid.skew_summary()

### Scaling

<a id = 'Scaling'></a>

In [None]:
# scale pipeline: RobustScaler applied to the output of an argument-less
# DataFrameSelector
# NOTE(review): DataFrameSelector() with no arguments presumably selects every
# column — confirm
scale_pipe = PandasFeatureUnion([
    ("scale", make_pipeline(
        DataFrameSelector(),
        PandasPipeline(RobustScaler())
    )),
])

# fit on the training data only, then apply the fitted scaler to the validation data
train.data = scale_pipe.fit_transform(train.data)
valid.data = scale_pipe.transform(valid.data)

train.update_dtypes()
valid.update_dtypes()

## Outliers


<a id = 'Outliers'></a>

In [None]:
# identify outliers using IQR
# drop_outliers=False: rows are only flagged here, not removed
# NOTE(review): transform() is called on a Pipeline that was never fit —
# presumably OutlierIQR needs no fitting; confirm this works on the installed
# scikit-learn version
train_pipe = Pipeline([
    ("outlier",train.OutlierIQR(
                outlier_count=5,
                iqr_step=1.5,
                features=train.data.mlm_dtypes["continuous"],
                drop_outliers=False,))
    ])
train.data = train_pipe.transform(train.data)

# capture outliers: sorted values collected on the step's `outliers` attribute
iqr_outliers = np.array(sorted(train_pipe.named_steps["outlier"].outliers))
print(iqr_outliers)

In [None]:
# identify outliers using Isolation Forest
# The `behaviour` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises a TypeError.
# Every tree is grown on the full training set (max_samples = number of rows)
# and 1% of the observations are treated as contamination.
clf = IsolationForest(
    max_samples=train.data.shape[0], random_state=0, contamination=0.01
)
clf.fit(train.data[train.data.columns])
preds = clf.predict(train.data[train.data.columns])

# predict() labels outliers as -1; collect the index labels of those rows
mask = preds == -1
if_outliers = np.array(train.data[mask].index)
print(if_outliers)

In [None]:
# identify outliers using extended isolation forest
# restricted to the continuous columns; drop_outliers=False means rows are only
# flagged here, not removed
train_pipe = Pipeline([
    ("outlier",train.ExtendedIsoForest(
                columns=train.data.mlm_dtypes["continuous"],
                n_trees=100,
                sample_size=256,
                extension_level=1,
                anomalies_ratio=0.03,
                drop_outliers=False,))
    ])
train.data = train_pipe.transform(train.data)

# capture outliers: sorted values collected on the step's `outliers` attribute
eif_outliers = np.array(sorted(train_pipe.named_steps["outlier"].outliers))
print(eif_outliers)

In [None]:
# keep only the observations flagged by all three detectors
# (IQR, isolation forest and extended isolation forest)
outliers = np.intersect1d(
    np.intersect1d(iqr_outliers, if_outliers),
    eif_outliers,
)
print(outliers)

In [None]:
# tabulate the detections of the three outlier methods
outlier_summary = train.outlier_summary(
    iqr_outliers=iqr_outliers,
    if_outliers=if_outliers,
    eif_outliers=eif_outliers,
)
# index labels of the rows whose "count" column is at least 3
outlier_summary.loc[outlier_summary["count"] >= 3].index

In [None]:
# remove outliers from predictors and response
# NOTE: this hard-coded list of index labels replaces the `outliers` array
# computed from the detector intersection in the earlier cell
outliers = np.array([123, 63, 976, 237, 126, 914, 473, 187, 270, 875, 1116, 427])
train.data = train.data.drop(outliers)
train.target = train.target.drop(index=outliers)
## Additional exploratory data analysis

<a id = 'Additional-exploratory-data-analysis'></a>

In [None]:
# correlation heat map with most highly correlated features relative to the target
# (re-run on the transformed training data, with a higher thresh of 0.2)
p = PrettierPlot(plot_orientation='tall',chart_scale=15)
ax = p.make_canvas()
p.corr_heatmap_target(
    df=train.data,
    target=train.target,
    thresh=0.2,
    annot=True,
    ax=ax,
)

# Feature importance

<a id = 'Feature-importance'></a>

In [None]:
# generate feature importance summary
# estimator classes (not instances) handed to the feature selector
estimators = [
    LGBMClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
    XGBClassifier,
]

fs = train.FeatureSelector(
    data=train.data,
    target=train.target,
    estimators=estimators,
)
# save_to_csv=True: presumably writes the summary to a CSV such as the
# "feature_selection_summary_<timestamp>.csv" file read in a later cell — confirm
feature_selector_summary = fs.feature_selector_suite(
    save_to_csv=True,
    n_jobs=5,
)

In [None]:
# calculate cross-validation performance
# NOTE: `estimators` is rebound here to a different list than the one used for
# the feature importance summary; AdaBoostClassifier is commented out
estimators = [
    SVC,
    LGBMClassifier,
    LogisticRegression,
    XGBClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
    #AdaBoostClassifier,
    ExtraTreesClassifier,
    KNeighborsClassifier,
]

# 5-fold CV scored on both accuracy and ROC AUC
cv_summary = fs.feature_selector_cross_val(
    feature_selector_summary=feature_selector_summary,
#     feature_selector_summary=pd.read_csv("feature_selection_summary_2001291515.csv", index_col=0),
    estimators=estimators,
    scoring=["accuracy","roc_auc"],
    n_folds=5,
    step=1,
    n_jobs=4,
    verbose=True,
)

###### Accuracy

In [None]:
# visualize CV performance for diminishing feature set
# Uses the cv_summary and feature_selector_summary objects produced by the
# preceding cells. The previous version read cv_summary from the placeholder
# path ".csv", which does not name a results file.
# To plot from previously saved results instead, pass e.g.
#     feature_selector_summary=pd.read_csv("feature_selection_summary_2001291515.csv", index_col=0)
# together with the matching saved cv_summary file.
fs.feature_selector_results_plot(
    cv_summary=cv_summary,
    feature_selector_summary=feature_selector_summary,
    scoring="accuracy",
    title_scale=0.8,
)

In [None]:
# build the cross-validation features frame for the accuracy metric from the
# in-memory summaries (the commented lines show the saved-CSV alternative)
cross_val_features_df = fs.create_cross_val_features_df(
    scoring="accuracy",
    cv_summary=cv_summary,
    feature_selector_summary=feature_selector_summary,
#     cv_summary= pd.read_csv("cv_summary_2001291517.csv", index_col=0),
#     feature_selector_summary=pd.read_csv("feature_selection_summary_2001291333.csv", index_col=0),
)

In [None]:
# derive the features dictionary from the accuracy-based features frame
cross_val_feature_dict = fs.create_cross_val_features_dict(
    cross_val_features_df=cross_val_features_df
)

###### ROC AUC

In [None]:
# visualize CV performance for diminishing feature set
fs.feature_selector_results_plot(
    metric="roc_auc",
    title_scale=0.8,
)

In [None]:
# build the cross-validation features frame for the ROC AUC metric
# NOTE: this rebinds cross_val_features_df, replacing the accuracy-based frame
cross_val_features_df = fs.create_cross_val_features_df(
    scoring="roc_auc",
    cv_summary=cv_summary,
    feature_selector_summary=feature_selector_summary,
#     cv_summary= pd.read_csv("cv_summary_2001291517.csv", index_col=0),
#     feature_selector_summary=pd.read_csv("feature_selection_summary_2001291333.csv", index_col=0),
)

In [None]:
# derive the features dictionary from the ROC AUC-based features frame
# NOTE: this rebinds cross_val_feature_dict, replacing the accuracy-based dict
cross_val_feature_dict = fs.create_cross_val_features_dict(
    cross_val_features_df=cross_val_features_df
)

# Modeling

<a id = 'Modeling'></a>

## Data preparation

<a id = 'Data-preparation-1'></a>

In [None]:
#################################################################################
# import data
# This cell rebuilds the modeling datasets from scratch, repeating the data
# preparation steps from the earlier sections of the notebook.
dataset = data.attrition()

# split dataset into train and validation datasets
df_train, df_valid = mlm.train_test_df_compile(data=dataset, target_col='Attrition')

# passed to mlm.Machine as identify_as_continuous
continuous = [
    "Age",
    "DailyRate",
    "DistanceFromHome",
    "HourlyRate",
    "MonthlyIncome",
    "MonthlyRate",
    "PercentSalaryHike",
    "TotalWorkingYears",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
]

# passed to mlm.Machine as identify_as_count
count = [
    "NumCompaniesWorked",
    "TrainingTimesLastYear",
]

# passed to mlm.Machine as identify_as_nominal; also the one-hot encoded columns
nominal = [
    "MaritalStatus",
    "EducationField",
    "Department",
    "Gender",
    "JobRole",
    "OverTime",
]

# passed to mlm.Machine as remove_features
remove_features = [
    "EmployeeNumber",
    "EmployeeCount",
    "StandardHours",
    "Over18",
]

# passed to mlm.Machine as identify_as_ordinal
ordinal = [
    "Education",
    "EnvironmentSatisfaction",
    "JobInvolvement",
    "JobLevel",
    "JobSatisfaction",
    "PerformanceRating",
    "RelationshipSatisfaction",
    "StockOptionLevel",
    "WorkLifeBalance",
    "BusinessTravel",
]

# level list per ordinal feature; keys and values are passed, in this order, to
# DataFrameSelector(include_columns=...) and OrdinalEncoder(categories=...)
ordinal_encodings = {
    "Education": [1, 2, 3, 4, 5],
    "EnvironmentSatisfaction": [1, 2, 3, 4],
    "JobInvolvement": [1, 2, 3, 4],
    "JobLevel": [1, 2, 3, 4, 5],
    "JobSatisfaction": [1, 2, 3, 4],
    "PerformanceRating": [3, 4],
    "RelationshipSatisfaction": [1, 2, 3, 4],
    "StockOptionLevel": [0, 1, 2, 3],
    "WorkLifeBalance": [1, 2, 3, 4],
    "BusinessTravel": ['Non-Travel','Travel_Rarely','Travel_Frequently'],
}

# import training data
# Load training data into mlmachine
train = mlm.Machine(
    data=df_train,
    target="Attrition",
    remove_features=remove_features,
    identify_as_continuous=continuous,
    identify_as_count=count,    
    identify_as_nominal=nominal,
    identify_as_ordinal=ordinal,
    ordinal_encodings=ordinal_encodings,
    target_type="category",
)

# import validation data
# Load validation data into mlmachine
# NOTE(review): unlike the training Machine, no target_type is passed here —
# confirm that relying on the default is intentional
valid = mlm.Machine(
    data=df_valid,
    target="Attrition",
    remove_features=remove_features,
    identify_as_continuous=continuous,
    identify_as_count=count,    
    identify_as_nominal=nominal,
    identify_as_ordinal=ordinal,
    ordinal_encodings=ordinal_encodings,
)

#################################################################################
### feature transformation pipeline
# Every pipe below is fit on the training data only and then applied to the
# validation data; update_dtypes() is called after each step because the
# column set changes.

# polynomial feature pipe: degree-2 terms (squares and pairwise interactions,
# no bias column) for the continuous columns; other columns via the selector only
polynomial_pipe = PandasFeatureUnion([
    ("polynomial", make_pipeline(
        DataFrameSelector(include_mlm_dtypes=["continuous"]),
        PandasPipeline(PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)),
    )),
    ("diff", make_pipeline(
        DataFrameSelector(exclude_mlm_dtypes=["continuous"]),
    )),
])

train.data = polynomial_pipe.fit_transform(train.data)
valid.data = polynomial_pipe.transform(valid.data)

train.update_dtypes()
valid.update_dtypes()

# encode feature pipeline: one-hot for nominal columns (first level dropped),
# ordinal codes from ordinal_encodings, ordinal bin codes for continuous columns
# NOTE(review): "diff" does not exclude the continuous columns that "bin" also
# selects, so presumably both raw and binned versions are kept — confirm
encode_pipe = PandasFeatureUnion([
    ("nominal", make_pipeline(
        DataFrameSelector(include_columns=nominal),
        PandasPipeline(OneHotEncoder(drop="first")),
    )),
    ("ordinal", make_pipeline(
        DataFrameSelector(include_columns=list(ordinal_encodings.keys())),
        PandasPipeline(OrdinalEncoder(categories=list(ordinal_encodings.values()))),
    )),
    ("bin", make_pipeline(
        DataFrameSelector(include_columns=train.data.mlm_dtypes["continuous"]),
        PandasPipeline(KBinsDiscretizer(encode="ordinal")),
    )),
    ("diff", make_pipeline(
        DataFrameSelector(exclude_columns=nominal + list(ordinal_encodings.keys())),
    )),
])

train.data = encode_pipe.fit_transform(train.data)
valid.data = encode_pipe.transform(valid.data)

train.update_dtypes()
valid.update_dtypes()

# target encoded feature pipeline
# Three branches select the same category columns and encode them with a
# different category_encoders class (TargetEncoder, WOEEncoder, CatBoostEncoder),
# each wrapped in mlmachine's KFoldSelectEncoder together with the training
# target and a 5-fold splitter. The "diff" branch selects the non-category
# columns.
#
# random_state is deliberately not passed to KFold: with shuffle=False the folds
# are already deterministic and the seed has no effect, and scikit-learn >= 0.24
# raises a ValueError when random_state is set without shuffle=True.
target_encode_pipe = PandasFeatureUnion([
    ("target", make_pipeline(
        DataFrameSelector(include_mlm_dtypes=["category"]),
        KFoldSelectEncoder(
            target=train.target,
            cv=KFold(n_splits=5, shuffle=False),
            encoder=TargetEncoder,
        ),
    )),
    ("woe", make_pipeline(
        DataFrameSelector(include_mlm_dtypes=["category"]),
        KFoldSelectEncoder(
            target=train.target,
            cv=KFold(n_splits=5, shuffle=False),
            encoder=WOEEncoder,
        ),
    )),
    ("catboost", make_pipeline(
        DataFrameSelector(include_mlm_dtypes=["category"]),
        KFoldSelectEncoder(
            target=train.target,
            cv=KFold(n_splits=5, shuffle=False),
            encoder=CatBoostEncoder,
        ),
    )),
    ("diff", make_pipeline(
        DataFrameSelector(exclude_mlm_dtypes=["category"]),
    )),
])

# fit on the training data only, then apply the fitted pipe to the validation data
train.data = target_encode_pipe.fit_transform(train.data)
valid.data = target_encode_pipe.transform(valid.data)

train.update_dtypes()
valid.update_dtypes()

# skew correction pipeline
# NOTE: only defined, not applied — the fit_transform / transform calls below
# are commented out
skew_pipe = PandasFeatureUnion([
    ("skew", make_pipeline(
        DataFrameSelector(include_mlm_dtypes=["continuous"]),
        DualTransformer(),
    )),    
    ("diff", make_pipeline(
        DataFrameSelector(exclude_mlm_dtypes=["continuous"]),
    )),
])

# train.data = skew_pipe.fit_transform(train.data)
# valid.data = skew_pipe.transform(valid.data)

# train.update_dtypes()
# valid.update_dtypes()

# scale pipeline: RobustScaler on the output of an argument-less
# DataFrameSelector (presumably every column — confirm)
scale_pipe = PandasFeatureUnion([
    ("scale", make_pipeline(
        DataFrameSelector(),
        PandasPipeline(RobustScaler())
    )),
])

train.data = scale_pipe.fit_transform(train.data)
valid.data = scale_pipe.transform(valid.data)

train.update_dtypes()
valid.update_dtypes()

#################################################################################
# remove outliers
# hard-coded index labels, the same list used in the Outliers section; they are
# dropped from both the training features and the training target
outliers = np.array([123, 63, 976, 237, 126, 914, 473, 187, 270, 875, 1116, 427])
train.data = train.data.drop(outliers)
train.target = train.target.drop(index=outliers)

# accuracy >= 7
# train.data = train.data[best_columns]
# valid.data = valid.data[best_columns]

print('completed')

## Bayesian hyper-parameter optimization

<a id = 'Bayesian-hyper-parameter-optimization'></a>

In [None]:
# model/parameter space
# Maps an estimator class name to its hyperopt search space. hp.choice samples
# one of the listed options; hp.uniform samples a float in [low, high].
# NOTE(review): the same hyperopt labels (e.g. "learning_rate", "max_depth") are
# reused across estimators; presumably each estimator's sub-space is searched on
# its own — confirm.
# NOTE(review): several integer-valued hyperparameters (min_child_samples,
# num_leaves, subsample_for_bin) are sampled with hp.uniform, which yields
# floats; presumably they are cast to int downstream — confirm.
all_space = {
    "LGBMClassifier": {
        "class_weight": hp.choice("class_weight", [None, "balanced"]),
        "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
        "boosting_type": hp.choice("boosting_type", ["gbdt", "dart", "goss"])
        # ,'boosting_type': hp.choice('boosting_type'
        #                    ,[{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)}
        #                    ,{'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)}
        #                    ,{'boosting_type': 'goss', 'subsample': 1.0}])
        ,
        "learning_rate": hp.uniform("learning_rate", 0.01, 0.3),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "min_child_samples": hp.uniform("min_child_samples", 20, 500),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 1, dtype=int)),
        "num_leaves": hp.uniform("num_leaves", 8, 150),
        "reg_alpha": hp.uniform("reg_alpha", 0.0, 1.5),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
        "subsample_for_bin": hp.uniform("subsample_for_bin", 20000, 400000),
    },
    # the LogisticRegression space is disabled
#     "LogisticRegression": {
#         "C": hp.loguniform("C", np.log(0.001), np.log(0.2)),
#         "penalty": hp.choice("penalty", ["l1", "l2"]),
#     },
    "XGBClassifier": {
        "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
        "gamma": hp.uniform("gamma", 0.0, 10),
        "learning_rate": hp.uniform("learning_rate", 0.01, 0.3),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "min_child_weight": hp.uniform("min_child_weight", 1, 20),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 1, dtype=int)),
        "subsample": hp.uniform("subsample", 0.3, 1),
    },
    "RandomForestClassifier": {
        "bootstrap": hp.choice("bootstrap", [True, False]),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 1, dtype=int)),
        "max_features": hp.choice("max_features", ["auto", "sqrt"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    },
    "GradientBoostingClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 1, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "max_features": hp.choice("max_features", ["auto", "sqrt"]),
        "learning_rate": hp.uniform("learning_rate", 0.01, 0.3),
        "loss": hp.choice("loss", ["deviance", "exponential"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    },
    "AdaBoostClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 1, dtype=int)),
        "learning_rate": hp.uniform("learning_rate", 0.01, 0.3),
        "algorithm": hp.choice("algorithm", ["SAMME", "SAMME.R"]),
    },
    "ExtraTreesClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 1, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
        "max_features": hp.choice("max_features", ["auto", "sqrt"]),
        "criterion": hp.choice("criterion", ["gini", "entropy"]),
    },
    "SVC": {
        "C": hp.uniform("C", 0.001, 15),
        "decision_function_shape": hp.choice("decision_function_shape", ["ovo", "ovr"]),
        "gamma": hp.uniform("gamma", 0.000000001, 5),
    },
    "KNeighborsClassifier": {
        "algorithm": hp.choice("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]),
        "n_neighbors": hp.choice("n_neighbors", np.arange(1, 20, dtype=int)),
        "weights": hp.choice("weights", ["distance", "uniform"]),
    },
}

In [None]:
# run the Bayesian hyper-parameter search over every estimator in all_space,
# using the training features/target and the settings passed below
train.exec_bayes_optim_search(
    data=train.data,
    target=train.target,
    all_space=all_space,
    iters=2000,
    n_folds=5,
    scoring="roc_auc",
    n_jobs=8,
    verbose=0,
)

##### Model loss by iteration

In [None]:
# load the optimization score summary; the string "nan" is parsed as missing
# NOTE(review): the path is an empty string, so this read cannot locate a file
# as written - confirm the intended results CSV before running.
bayes_optim_summary = pd.read_csv(filepath_or_buffer="", na_values="nan")
bayes_optim_summary.head(5)

In [None]:
# draw one model loss plot per distinct estimator in the summary table
estimator_names = np.unique(bayes_optim_summary["estimator"])
for estimator in estimator_names:
    train.model_loss_plot(
        estimator=estimator,
        bayes_optim_summary=bayes_optim_summary,
    )

##### Parameter selection by iteration

In [None]:
# draw the parameter plots for each distinct estimator in the summary table,
# passing the search space so the plots can be related to it
estimator_names = np.unique(bayes_optim_summary["estimator"])
for estimator in estimator_names:
    train.modelParamPlot(
        estimator=estimator,
        bayes_optim_summary=bayes_optim_summary,
        all_space=all_space,
        chart_scale=15,
        n_iter=100,
    )

In [None]:
# scratch cell: define a one-parameter space and hand it to sample_plot with
# 1000 as the second argument
# NOTE(review): hp.uniform receives log-transformed bounds here; log bounds are
# usually paired with hp.loguniform - confirm which distribution is intended.
# other entries previously tried in this space:
#     "": 0.000001 + hp.uniform("gamma", 0.000001, 10)
#     'param2': hp.loguniform('param2', np.log(0.001), np.log(0.01))
sample_space = {"param": hp.uniform("param", np.log(0.4), np.log(0.6))}

train.sample_plot(sample_space, 1000)

## Model performance evaluation - standard models

<a id = 'Model-performance-evaluation-standard-models'></a>

In [None]:
# select the top model(s) per estimator (num_models=1); later cells iterate the
# result as estimator -> model iterations
top_models = train.top_bayes_optim_models(
    num_models=1,
    bayes_optim_summary=bayes_optim_summary,
)
top_models

In [None]:
# classification panel for one hand-picked model, training data only
# alternatives:
#   estimator, model_iter = "GradientBoostingClassifier", 590
#   estimator, model_iter = "XGBClassifier", 380
estimator, model_iter = "XGBClassifier", 218

# extract params for that iteration and instantiate the model
model = train.BayesOptimModelBuilder(
    estimator=estimator,
    model_iter=model_iter,
    bayes_optim_summary=bayes_optim_summary,
)

train.classification_panel(
    model=model,
    X_train=train.data,
    y_train=train.target,
    cm_labels=["Stays", "Quits"],
)

In [None]:
# classification panel on the training data for every entry in top_models
for estimator, model_iters in top_models.items():
    for model_iter in model_iters:
        # extract params and instantiate the model for this estimator/iteration
        model = train.BayesOptimModelBuilder(
            estimator=estimator,
            model_iter=model_iter,
            bayes_optim_summary=bayes_optim_summary,
        )
        train.classification_panel(
            model=model,
            X_train=train.data,
            y_train=train.target,
            cm_labels=["Stays", "Quits"],
        )

## Validation set evaluation - standard models

<a id = 'Validation-set-evaluation-standard-models'></a>

In [None]:
# evaluate one hand-picked standard model on the validation set
# alternatives:
#   estimator, model_iter = "LGBMClassifier", 476
#   estimator, model_iter = "RandomForestClassifier", 382
#   estimator, model_iter = "GradientBoostingClassifier", 238
#   estimator, model_iter = "SVC", 135
estimator, model_iter = "XGBClassifier", 418

# extract params and instantiate model
model = train.BayesOptimModelBuilder(
    estimator=estimator,
    model_iter=model_iter,
    bayes_optim_summary=bayes_optim_summary,
)

# classification panel with both training and validation data supplied
train.classification_panel(
    model=model,
    X_valid=valid.data,
    y_valid=valid.target,
    X_train=train.data,
    y_train=train.target,
    cm_labels=["Stays", "Quits"],
)

In [None]:
# classification panel on training and validation data for every entry in top_models
for estimator, model_iters in top_models.items():
    for model_iter in model_iters:
        # extract params and instantiate the model for this estimator/iteration
        model = train.BayesOptimModelBuilder(
            bayes_optim_summary=bayes_optim_summary,
            estimator=estimator,
            model_iter=model_iter,
        )
        train.classification_panel(
            model=model,
            X_train=train.data,
            y_train=train.target,
            X_valid=valid.data,
            y_valid=valid.target,
            # same keyword and labels as the other standard-model panels,
            # which pass cm_labels rather than labels
            cm_labels=["Stays", "Quits"],
        )

## Model explainability


<a id = 'Model-explanability'></a>

In [None]:
# select the model to explain and fit it on the training data
# alternative: estimator = "ExtraTreesClassifier"; model_iter = 145
estimator = "XGBClassifier"
model_iter = 218

# extract params and instantiate model
model = train.BayesOptimModelBuilder(
    bayes_optim_summary=bayes_optim_summary, estimator=estimator, model_iter=model_iter
)

# fit on the raw arrays so the following cells work with a fitted model
model.fit(train.data.values, train.target.values)

##### Permutation importance

In [None]:
# permutation importance - how much does performance decrease when shuffling a certain feature?
# feature names are read straight from the training frame so this cell does not
# rely on a `feature_names` global that other cells assign and later overwrite
perm = PermutationImportance(model.model, random_state=1).fit(train.data, train.target)
eli5.show_weights(perm, feature_names=train.data.columns.tolist())

##### SHAP values - training data

###### Force plots - single observations

In [None]:
# SHAP force plot for each of the first five index labels of the training data
train_obs_ixs = train.data.index[:5]
for i in train_obs_ixs:
    train.single_shap_viz_tree(data=train.data, model=model, obsIx=i)

###### Force plots - multiple observations

In [None]:
# SHAP force plot built from every observation in the training data
visual = train.multi_shap_viz_tree(
    data=train.data,
    model=model,
    obs_ixs=train.data.index,
)
visual

###### Dependence plots

In [None]:
# compute SHAP values for every training observation; the second element of
# the returned triple is not needed below and is discarded
obs_data, _, obs_shap_values = train.multi_shap_value_tree(
    data=train.data,
    model=model,
    obs_ixs=train.data.index,
)

In [None]:
# SHAP dependence plot grid for a hand-picked subset of the training features
grid_features = [
    "BusinessTravel", "Age", "WorkLifeBalance", "Education",
    "DistanceFromHome", "MonthlyIncome", "Gender_Male",
]

train.shap_dependence_grid(
    grid_features=grid_features,
    all_features=train.data.columns,
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    alpha=0.5,
    dot_size=35,
)

In [None]:
# one SHAP dependence plot on a fresh canvas: Age as the scatter feature,
# BusinessTravel as the color feature
p = PrettierPlot()
ax = p.make_canvas()

train.shap_dependence_plot(
    scatter_feature="Age",
    color_feature="BusinessTravel",
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    feature_names=train.data.columns,
    alpha=0.5,
    dot_size=50,
    ax=ax,
)

In [None]:
# one SHAP dependence plot per training feature, each colored by Age; features
# are visited from the largest to the smallest summed absolute SHAP value
feature_names = train.data.columns.tolist()
total_abs_shap = np.sum(np.abs(obs_shap_values), 0)
top_shap = np.argsort(-total_abs_shap)

for top_ix in top_shap:
    # fresh canvas for every feature
    p = PrettierPlot()
    ax = p.make_canvas()

    train.shap_dependence_plot(
        scatter_feature=feature_names[top_ix],
        color_feature="Age",
        obs_data=obs_data,
        obs_shap_values=obs_shap_values,
        feature_names=feature_names,
        alpha=0.5,
        dot_size=50,
        ax=ax,
    )

###### Summary plots

In [None]:
# SHAP summary plot for the training observations computed above
train.shap_summary_plot(
    feature_names=train.data.columns,
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
)

##### SHAP values - validation data

###### Force plots - single observations

In [None]:
# SHAP force plot for each of the first five index labels of the validation data
valid_obs_ixs = valid.data.index[:5]
for i in valid_obs_ixs:
    valid.single_shap_viz_tree(data=valid.data, model=model, obsIx=i)

###### Force plots - multiple observations

In [None]:
# SHAP force plot built from every observation in the validation data
visual = valid.multi_shap_viz_tree(
    data=valid.data,
    model=model,
    obs_ixs=valid.data.index,
)
visual

###### Dependence plots

In [None]:
# compute SHAP values for every validation observation; the second element of
# the returned triple is not needed below and is discarded
obs_data, _, obs_shap_values = valid.multi_shap_value_tree(
    data=valid.data,
    model=model,
    obs_ixs=valid.data.index,
)

In [None]:
# SHAP dependence plot grid for a hand-picked subset of the validation features
grid_features = [
    "BusinessTravel", "Age", "WorkLifeBalance", "Education",
    "DistanceFromHome", "MonthlyIncome", "Gender_Male",
]

valid.shap_dependence_grid(
    grid_features=grid_features,
    all_features=valid.data.columns,
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    alpha=0.5,
    dot_size=35,
)

In [None]:
# one SHAP dependence plot on a fresh canvas: Age as the scatter feature,
# BusinessTravel as the color feature (validation data)
p = PrettierPlot()
ax = p.make_canvas()

valid.shap_dependence_plot(
    scatter_feature="Age",
    color_feature="BusinessTravel",
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    feature_names=valid.data.columns,
    alpha=0.5,
    dot_size=50,
    ax=ax,
)

In [None]:
# one SHAP dependence plot per validation feature, each colored by Age; features
# are visited from the largest to the smallest summed absolute SHAP value
feature_names = valid.data.columns.tolist()
total_abs_shap = np.sum(np.abs(obs_shap_values), 0)
top_shap = np.argsort(-total_abs_shap)

for top_ix in top_shap:
    # fresh canvas for every feature
    p = PrettierPlot()
    ax = p.make_canvas()

    valid.shap_dependence_plot(
        scatter_feature=feature_names[top_ix],
        color_feature="Age",
        obs_data=obs_data,
        obs_shap_values=obs_shap_values,
        feature_names=feature_names,
        alpha=0.5,
        dot_size=50,
        ax=ax,
    )

###### Summary plots

In [None]:
# SHAP summary plot for the validation observations computed above
valid.shap_summary_plot(
    feature_names=valid.data.columns,
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
)

# Stacking

<a id = 'Stacking'></a>

## Primary models

<a id = 'Primary-models'></a>

In [None]:
# get out-of-fold predictions from the selected primary models; the call
# returns the training-side array, the validation-side array and column labels
oof_train, oof_valid, columns = train.model_stacker(
    models=top_models,
    bayes_optim_summary=bayes_optim_summary,
    X_valid=valid.data.values,
    X_train=train.data.values,
    y_train=train.target.values,
    n_jobs=10,
    n_folds=10,
)

In [None]:
# annotated correlation heatmap of the primary models' out-of-fold predictions
p = PrettierPlot()
ax = p.make_canvas()
oof_frame = pd.DataFrame(oof_train, columns=columns)
p.corr_heatmap(df=oof_frame, ax=ax, annot=True, vmin=0)

## Meta model

<a id = 'Meta-model'></a>

In [None]:
# hyperopt parameter spaces for the candidate meta models, keyed by estimator
# class name; each inner dict maps a parameter name to its hyperopt expression
def _int_choice(label, start, stop, step=1):
    """Return an hp.choice over the integers in np.arange(start, stop, step)."""
    return hp.choice(label, np.arange(start, stop, step, dtype=int))


all_space = {
    "LGBMClassifier": {
        "class_weight": hp.choice("class_weight", [None]),
        "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
        "boosting_type": hp.choice("boosting_type", ["dart"]),
        "subsample": hp.uniform("subsample", 0.5, 1),
        "learning_rate": hp.uniform("learning_rate", 0.15, 0.25),
        "max_depth": _int_choice("max_depth", 4, 20),
        "min_child_samples": hp.quniform("min_child_samples", 50, 150, 5),
        "n_estimators": _int_choice("n_estimators", 100, 4000, 10),
        "num_leaves": hp.quniform("num_leaves", 30, 70, 1),
        "reg_alpha": hp.uniform("reg_alpha", 0.75, 1.25),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
        "subsample_for_bin": hp.quniform("subsample_for_bin", 100000, 350000, 20000),
    },
    "XGBClassifier": {
        "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
        "gamma": hp.quniform("gamma", 0.0, 10, 0.05),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.2, 0.01),
        "max_depth": _int_choice("max_depth", 2, 15),
        "min_child_weight": hp.quniform("min_child_weight", 2.5, 7.5, 1),
        "n_estimators": _int_choice("n_estimators", 100, 4000, 10),
        "subsample": hp.uniform("subsample", 0.4, 0.7),
    },
    "RandomForestClassifier": {
        "bootstrap": hp.choice("bootstrap", [True, False]),
        "max_depth": _int_choice("max_depth", 2, 10),
        "n_estimators": _int_choice("n_estimators", 100, 8000, 10),
        "max_features": hp.choice("max_features", ["sqrt"]),
        "min_samples_split": _int_choice("min_samples_split", 15, 25),
        "min_samples_leaf": _int_choice("min_samples_leaf", 2, 20),
    },
    "GradientBoostingClassifier": {
        "n_estimators": _int_choice("n_estimators", 100, 4000, 10),
        "max_depth": _int_choice("max_depth", 2, 11),
        "max_features": hp.choice("max_features", ["sqrt"]),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.09, 0.01),
        "loss": hp.choice("loss", ["deviance", "exponential"]),
        "min_samples_split": _int_choice("min_samples_split", 2, 40),
        "min_samples_leaf": _int_choice("min_samples_leaf", 2, 40),
    },
    "SVC": {
        "C": hp.uniform("C", 0.00000001, 15),
        "decision_function_shape": hp.choice("decision_function_shape", ["ovr", "ovo"]),
        "gamma": hp.uniform("gamma", 0.00000001, 1.5),
    },
}

In [None]:
# run the Bayesian search for the meta models, using the out-of-fold
# predictions as features and the training target as labels
# NOTE(review): `analysis` is assigned in the following cell; unless it is also
# defined earlier in the notebook, a top-to-bottom run fails with NameError here.
train.exec_bayes_optim_search(
    X=oof_train,
    y=train.target,
    all_space=all_space,
    results_dir="{}_hyperopt_meta_{}.csv".format(rundate, analysis),
    iters=1000,
    n_folds=8,
    scoring="f1_micro",
    n_jobs=10,
    verbose=0,
)

In [None]:
# load the meta-model score summary written for a specific analysis and run
# date; both values are pinned here, overriding any earlier assignment
analysis, rundate = "attrition", "20190807"
bayes_optim_summary_meta = pd.read_csv(
    "{}_hyperopt_meta_{}.csv".format(rundate, analysis)
)
bayes_optim_summary_meta.head(5)

In [None]:
# draw one model loss plot per distinct estimator in the meta summary table
estimator_names = np.unique(bayes_optim_summary_meta["estimator"])
for estimator in estimator_names:
    train.model_loss_plot(
        estimator=estimator,
        bayes_optim_summary=bayes_optim_summary_meta,
    )

In [None]:
# draw the parameter plots for each distinct estimator in the meta summary table
estimator_names = np.unique(bayes_optim_summary_meta["estimator"])
for estimator in estimator_names:
    train.modelParamPlot(
        estimator=estimator,
        bayes_optim_summary=bayes_optim_summary_meta,
        all_space=all_space,
        chart_scale=15,
        n_iter=100,
    )

## Model performance evaluation - stacked models

<a id = 'Model-performance-evaluation-stacked-models'></a>

In [None]:
# select the top meta model(s) per estimator (num_models=1); this rebinds
# top_models, which earlier held the standard-model selection
top_models = train.top_bayes_optim_models(
    num_models=1,
    bayes_optim_summary=bayes_optim_summary_meta,
)
top_models

In [None]:
# classification panel for the chosen second-level model on the out-of-fold
# training predictions
# alternatives:
#   estimator, model_iter = "XGBClassifier", 380
#   estimator, model_iter = "RandomForestClassifier", 411
#   estimator, model_iter = "GradientBoostingClassifier", 590
#   estimator, model_iter = "SVC", 135
estimator, model_iter = "LGBMClassifier", 668

# extract params and instantiate model
model = train.BayesOptimModelBuilder(
    estimator=estimator,
    model_iter=model_iter,
    bayes_optim_summary=bayes_optim_summary_meta,
)
# NOTE(review): the standard-model panels pass cm_labels=[...] instead of
# labels - confirm which keyword classification_panel accepts.
train.classification_panel(
    model=model,
    X_train=oof_train,
    y_train=train.target,
    labels=[0, 1],
)

In [None]:
# classification panel on the out-of-fold training predictions for every
# entry in top_models
# NOTE(review): the standard-model panels pass cm_labels=[...] instead of
# labels - confirm which keyword classification_panel accepts.
for estimator, model_iters in top_models.items():
    for model_iter in model_iters:
        # extract params and instantiate the model for this estimator/iteration
        model = train.BayesOptimModelBuilder(
            estimator=estimator,
            model_iter=model_iter,
            bayes_optim_summary=bayes_optim_summary_meta,
        )
        train.classification_panel(
            model=model,
            X_train=oof_train,
            y_train=train.target,
            n_folds=4,
            labels=[0, 1],
        )

## Validation set evaluation - stacked models


<a id = 'Validation-set-evaluation-stacked-models'></a>

In [None]:
# fit the chosen meta model on the out-of-fold training predictions, then
# predict from the out-of-fold validation predictions
# alternatives:
#   estimator, model_iter = "XGBClassifier", 380
#   estimator, model_iter = "RandomForestClassifier", 411
#   estimator, model_iter = "GradientBoostingClassifier", 590
#   estimator, model_iter = "SVC", 135
estimator, model_iter = "LGBMClassifier", 668

# extract params and instantiate model
model = train.BayesOptimModelBuilder(
    estimator=estimator,
    model_iter=model_iter,
    bayes_optim_summary=bayes_optim_summary_meta,
)

# fit model and make predictions
model.fit(oof_train, train.target.values)
y_pred = model.predict(oof_valid)

In [None]:
# classification panel for the current meta model with both the training and
# validation out-of-fold predictions supplied
# NOTE(review): the standard-model panels pass cm_labels=[...] instead of
# labels - confirm which keyword classification_panel accepts.
train.classification_panel(
    model=model,
    X_valid=oof_valid,
    y_valid=valid.target,
    X_train=oof_train,
    y_train=train.target,
    labels=[0, 1],
)

In [None]:
# classification panel on training and validation out-of-fold predictions for
# every entry in top_models
# NOTE(review): the standard-model panels pass cm_labels=[...] instead of
# labels - confirm which keyword classification_panel accepts.
for estimator, model_iters in top_models.items():
    for model_iter in model_iters:
        # extract params and instantiate the model for this estimator/iteration
        model = train.BayesOptimModelBuilder(
            estimator=estimator,
            model_iter=model_iter,
            bayes_optim_summary=bayes_optim_summary_meta,
        )
        train.classification_panel(
            model=model,
            X_valid=oof_valid,
            y_valid=valid.target,
            X_train=oof_train,
            y_train=train.target,
            labels=[0, 1],
        )