__Kaggle competition - Titanic__

1. [Import](#Import)
    1. [Tools](#Tools)
    1. [Data](#Data)    
1. [Initial EDA](#Initial-EDA)
    1. [Categorical feature EDA](#Categorical-feature-EDA)
        1. [Univariate & feature vs. target](#Univariate-&-feature-vs.-target)
    1. [Continuous feature EDA](#Continuous-feature-EDA)
        1. [Univariate & feature vs. target](#Univariate-&-feature-vs.-target2)
        1. [Correlation](#Correlation)
        1. [Pair plot](#Pair-plot)
    1. [Faceting](#Faceting)
    1. [Target variable evaluation](#Target-variable-evaluation)    
1. [Data preparation](#Data-preparation)
    1. [Outliers (preliminary)](#Outliers-preliminary)
        1. [Evaluate](#Evaluate)
        1. [Remove](#remove)
    1. [Missing data](#Missing-data)
        1. [Evaluate](#Evaluate1)
        1. [Impute](#Impute)
    1. [Engineering](#Engineering)
        1. [Evaluate](#Evaluate3)
        1. [Engineer](#Engineer)
    1. [Encoding](#Encoding)
        1. [Evaluate](#Evaluate2)
        1. [Encode](#Encode)
    1. [Transformation](#Transformation)
        1. [Evaluate](#Evaluate4)
        1. [Transform](#Transform)
    1. [Outliers (final)](#Outliers-final)
        1. [Evaluate](#Evaluate5)
        1. [Remove](#remove1)
1. [Data evaluation](#Data-evaluation)
    1. [Feature importance](#Feature-importance)    
    1. [Rationality](#Rationality)
    1. [Value override](#Value-override)
    1. [Continuous feature EDA](#Continuous-feature-EDA3)
    1. [Correlation](#Correlation3)
1. [Modeling](#Modeling)
    1. [Data preparation](#Data-preparation)
    1. [Bayesian hyper-parameter optimization](#Bayesian-hyper-parameter-optimization)
        1. [Model loss by iteration](#Model-loss-by-iteration)
        1. [Parameter selection by iteration](#Parameter-selection-by-iteration)
    1. [Model performance evaluation](#Model-performance-evaluation)
        1. [Classification report](#Classification-report)
        1. [Confusion matrix](#Confusion-matrix)
        1. [ROC curve](#ROC-curve)
    1. [Model explainability](#Model-explanability)
        1. [Permutation importance](#Permutation-importance)
        1. [Partial plots](#Partial-plots)
        1. [SHAP values](#SHAP-values)
    1. [Stacking](#Stacking)
        1. [Primary models](#Primary-models)
        1. [Meta model](#Meta-model)                
1. [Submission](#Submission)
    1. [Standard](#Standard)
    1. [Stack](#Stack)

# Import

<a id = 'Import'></a>

## Tools

<a id = 'Tools'></a>

In [None]:
# standard library and settings
import os
import sys
import importlib
import itertools
import csv
import ast
from timeit import default_timer as timer

global ITERATION
import time
from functools import reduce

rundate = time.strftime("%Y%m%d")

import warnings

warnings.simplefilter("ignore")
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:95% !important; }</style>"))

# data extensions and settings
import numpy as np

np.set_printoptions(threshold=np.inf, suppress=True)
import pandas as pd

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.options.display.float_format = "{:,.6f}".format

# modeling extensions
import sklearn.base as base
import sklearn.cluster as cluster
import sklearn.datasets as datasets
import sklearn.decomposition as decomposition
import sklearn.discriminant_analysis as discriminant_analysis
import sklearn.ensemble as ensemble
import sklearn.feature_extraction as feature_extraction
import sklearn.feature_selection as feature_selection
import sklearn.gaussian_process as gaussian_process
import sklearn.linear_model as linear_model
import sklearn.kernel_ridge as kernel_ridge
import sklearn.metrics as metrics
import sklearn.model_selection as model_selection
import sklearn.naive_bayes as naive_bayes
import sklearn.neighbors as neighbors
import sklearn.pipeline as pipeline
import sklearn.preprocessing as preprocessing
import sklearn.svm as svm
import sklearn.tree as tree
import sklearn.utils as utils

import eif

from scipy import stats, special
import xgboost
import lightgbm
import catboost

from hyperopt import hp, tpe, Trials, fmin, STATUS_OK
from hyperopt.pyll.stochastic import sample

# visualization extensions and settings
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

%matplotlib inline

# import mlmachine and prettierplot; if they are not installed, fall back to local
# source checkouts located three directories up from this notebook
try:
    import mlmachine as mlm
    from prettierplot.plotter import PrettierPlot
    import prettierplot.style as style
except ModuleNotFoundError:
    # test for the same path that is appended so re-running the cell does not add duplicates
    if "../../../mlmachine" not in sys.path:
        sys.path.append("../../../mlmachine")
    if "../../../prettierplot" not in sys.path:
        sys.path.append("../../../prettierplot")

    try:
        import mlmachine as mlm
        from prettierplot.plotter import PrettierPlot
        import prettierplot.style as style
    except ModuleNotFoundError:
        # only tell the user to install when both the installed and local imports failed
        print('This notebook relies on the libraries mlmachine and prettierplot. Please run:')
        print('\tpip install mlmachine')
        print('\tpip install prettierplot')
        raise

## Data

<a id = 'Data'></a>

In [None]:
# load data and print dimensions
# NOTE(review): this reads the complete IBM employee-attrition file; the train/validation
# split happens in a later cell, so the "Training data" label below describes the full dataset
data = pd.read_csv("../../data/projectEmployeeAttrition/IbmEmployeeAttrition.csv")

print("Training data dimensions: {}".format(data.shape))

In [None]:
# display info and first 5 rows
data.info()
display(data[:5])

In [None]:
# review counts of different column types
data.dtypes.value_counts()

In [None]:
# split dataset into train and validation datasets
dfTrain, dfValid = mlm.trainTestCompile(data=data, targetCol='Attrition')

In [None]:
# Load training data into mlmachine
# NOTE(review): the removeFeatures / overrideCat lists are repeated verbatim in the validation
# cell that follows - keep both in sync when editing either one
train = mlm.Machine(
    data=dfTrain,
    target=["Attrition"],
    # presumably columns to exclude from the analysis - confirm against mlmachine
    removeFeatures=["EmployeeNumber","EmployeeCount","StandardHours","PerformanceRating","RelationshipSatisfaction",
                   "StockOptionLevel","TrainingTimesLastYear","WorkLifeBalance"],
    # presumably numerically-coded columns to treat as categorical - confirm against mlmachine
    overrideCat=["Education","EnvironmentSatisfaction","JobInvolvement","JobLevel","JobSatisfaction",
                    "MaritalStatus"],
    targetType="categorical",
)
# dimensions of the data held by the Machine object
print(train.data.shape)

In [None]:
# Load validation data into mlmachine, using the same settings as the training object
valid = mlm.Machine(
    data=dfValid,
    target=["Attrition"],
    # presumably columns to exclude from the analysis - confirm against mlmachine
    removeFeatures=["EmployeeNumber","EmployeeCount","StandardHours","PerformanceRating","RelationshipSatisfaction",
                   "StockOptionLevel","TrainingTimesLastYear","WorkLifeBalance"],
    # presumably numerically-coded columns to treat as categorical - confirm against mlmachine
    overrideCat=["Education","EnvironmentSatisfaction","JobInvolvement","JobLevel","JobSatisfaction",
                    "MaritalStatus"],
    targetType="categorical",
)
# dimensions of the data held by the Machine object
print(valid.data.shape)

# Initial EDA

<a id = 'Initial-EDA'></a>

## Categorical feature EDA

<a id = 'Categorical-feature-EDA'></a>

### Univariate & feature vs. target

<a id = 'Univariate-&-feature-vs.-target'></a>

In [None]:
# categorical features
# skipCols is given a single empty string, i.e. a placeholder that names no real column
train.edaCatTargetCatFeat(skipCols=[""])

## Continuous feature EDA

<a id = 'Continuous-feature-EDA'></a>

### Univariate & feature vs. target

<a id = 'Univariate-&-feature-vs.-target2'></a>

In [None]:
# continuous features
train.edaCatTargetNumFeat()

### Correlation

<a id = 'Correlation'></a>

##### Correlation (all samples)

In [None]:
# correlation heat map
p = PrettierPlot()
ax = p.makeCanvas()
p.prettyCorrHeatmap(df=train.data, annot=False, ax=ax)

##### Correlation (top vs. target)

In [None]:
# correlation heat map with most highly correlated features relative to the target
p = PrettierPlot(plotOrientation='tall')
ax = p.makeCanvas()
p.prettyCorrHeatmapTarget(
    df=train.data, target=train.target, thresh=0.02, annot=True, ax=ax
)

> Remarks - There are three pairs of highly correlated features:
    - 'GarageArea' and 'GarageCars'
    - 'TotRmsAbvGrd' and 'GrLivArea'
    - '1stFlrSF' and 'TotalBsmtSF'
This makes sense, given what each feature represents and how the items in each pair relate to each other. We likely only need one feature from each pair.

### Pair plot

<a id = 'Pair-plot'></a>

In [None]:
# pair plot
p = PrettierPlot(chartProp=12)
p.prettyPairPlot(df=train.data, cols=train.featureByDtype_['continuous'], diag_kind="auto")

In [None]:
# pair plot
p = PrettierPlot(chartProp=12)
p.prettyPairPlot(
    df=train.data.dropna(),
    diag_kind="kde",
    target=train.target,
    cols=train.featureByDtype_['continuous'][:10],
    legendLabels=["Stays", "Leaves"],
    bbox=(2.0, 0.0),
)

## Faceting

<a id = 'Faceting'></a>

##### Split bars

In [None]:
# facet MaritalStatus vs. Gender
p = PrettierPlot(chartProp=12)
ax = p.makeCanvas(title="Attrition, MaritalStatus vs. Gender", yShift=0.7)
p.prettyFacetTwoCatBar(
    df=train.edaData(train.data, train.target),
    x="MaritalStatus",
    y=train.target.name,
    split="Gender",
    yUnits="fff",
    bbox = (1.2, 0.8),
    ax=ax,
)

In [None]:
# facet BusinessTravel vs. Gender
p = PrettierPlot(chartProp=12)
ax = p.makeCanvas(title="Attrition, BusinessTravel vs. Gender", yShift=0.7)
p.prettyFacetTwoCatBar(
    df=train.edaData(train.data, train.target),
    x="BusinessTravel",
    y=train.target.name,
    split="Gender",
    yUnits="fff",
    bbox = (1.2, 0.8),
    ax=ax,
)

In [None]:
# facet JobSatisfaction vs. Gender
p = PrettierPlot(chartProp=12)
ax = p.makeCanvas(title="Attrition, JobSatisfaction vs. Gender", yShift=0.7)
p.prettyFacetTwoCatBar(
    df=train.edaData(train.data, train.target),
    x="JobSatisfaction",
    y=train.target.name,
    split="Gender",
    yUnits="fff",
    bbox = (1.2, 0.8),
    ax=ax,
)

In [None]:
# facet JobSatisfaction vs. Education
p = PrettierPlot(chartProp=12)
ax = p.makeCanvas(title="Attrition, JobSatisfaction by Education", yShift=0.7)
p.prettyFacetTwoCatBar(
    df=train.edaData(train.data, train.target),
    x="JobSatisfaction",
    y=train.target.name,
    split="Education",
    yUnits="fff",
    bbox = (1.3, 0.8),
    ax=ax,
    # presumably names the Education codes in ascending order - verify against the data dictionary
    legendLabels = ['Below College','College','Bachelor','Master','Doctor']
)

##### Points plots

In [None]:
# faceted point plot of the target: x=Education, split=Gender, catRow=JobSatisfaction
p = PrettierPlot()
p.prettyFacetTwoCatPoint(
    df=train.edaData(train.data, train.target),
    x="Education",
    y=train.target.name,
    split="Gender",
    catRow="JobSatisfaction",
    height=5,
    bbox=(1.3, 1.2),
#     legendLabels=["1st class", "2nd class", "3rd class"],
)

In [None]:
# faceted point plot of the target: x=BusinessTravel, split=Gender, catRow=MaritalStatus
p = PrettierPlot()
p.prettyFacetTwoCatPoint(
    df=train.edaData(train.data, train.target),
    x="BusinessTravel",
    y=train.target.name,
    split="Gender",
    catRow="MaritalStatus",
    aspect = 1.5,
    height=5,
    bbox=(1.3, 1.2),
#     legendLabels=["1st class", "2nd class", "3rd class"],
)

##### a

In [None]:
train.featureByDtype_

In [None]:
# #
# p = PrettierPlot()
# p.prettyFacetCatNumHist(
#     df=train.edaData(train.data, train.target),
#     split=train.target.name,
#     legendLabels=["Died", "Lived"],
#     catRow="Sex",
#     catCol="Embarked",
#     numCol="Age",
#     bbox=(1.9, 1.0),
#     height=4,
#     aspect=1,
# )

In [None]:
# #
# p = PrettierPlot(chartProp=15)
# p.prettyFacetCatNumScatter(
#     df=train.edaData(train.data, train.target),
#     split=train.target.name,
#     legendLabels=["Died", "Lived"],
#     catRow="Sex",
#     catCol="Embarked",
#     xNum="Fare",
#     yNum="Age",
#     bbox=(1.9, 1.0),
#     height=4,
#     aspect=1,
# )

## Target variable evaluation

<a id = 'Target-variable-evaluation'></a>

In [None]:
# null score
pd.Series(train.target).value_counts(normalize=True)

# Data preparation

<a id = 'Data-preparation'></a>

## Outliers (preliminary)


<a id = 'Outliers-preliminary'></a>

### Evaluate

<a id = 'Evaluate'></a>

##### Training outliers

In [None]:
# identify columns that have zero missing values
nonNull = train.data.columns[train.data.isnull().sum() == 0].values.tolist()

# identify intersection between non-null columns and continuous columns
# filter nonNull instead of using set.intersection so the result keeps the DataFrame's column
# order; a set of strings is ordered by hash, which can differ between sessions and would feed
# the seeded outlier detectors a different feature order on each run
nonNullNumCol = [col for col in nonNull if col in train.featureByDtype_["continuous"]]
print(nonNullNumCol)

In [None]:
# identify outliers using IQR
# a single-step pipeline wrapping mlmachine's OutlierIQR, restricted to the complete
# continuous columns found above
trainPipe = pipeline.Pipeline([
    ("outlier",train.OutlierIQR(
                outlierCount=5,
                iqrStep=1.5,
                features=nonNullNumCol,
                # presumably flags outliers without removing rows - confirm against mlmachine
                dropOutliers=False,))
    ])
# only transform is called here; no separate fit call appears in this cell
train.data = trainPipe.transform(train.data)

# capture outliers
# read the flagged values from the fitted step and keep them as a sorted array
iqrOutliers = np.array(sorted(trainPipe.named_steps["outlier"].outliers_))
print(iqrOutliers)

In [None]:
# identify outliers using Isolation Forest
# NOTE(review): the `behaviour` argument was deprecated and later removed from scikit-learn's
# IsolationForest - confirm the installed version still accepts it
clf = ensemble.IsolationForest(
    behaviour="new", max_samples=train.data.shape[0], random_state=0, contamination=0.02
)
# fit and predict on the same complete continuous columns
clf.fit(train.data[nonNullNumCol])
preds = clf.predict(train.data[nonNullNumCol])

# evaluate index values
# rows predicted as -1 are treated as outliers; collect their index labels
mask = np.isin(preds, -1)
ifOutliers = np.array(train.data[mask].index)
print(ifOutliers)

In [None]:
# identify outliers using extended isolation forest
# a single-step pipeline wrapping mlmachine's ExtendedIsoForest on the complete continuous columns
trainPipe = pipeline.Pipeline([
    ("outlier",train.ExtendedIsoForest(
                cols=nonNullNumCol,
                nTrees=100,
                sampleSize=256,
                ExtensionLevel=1,
                anomaliesRatio=0.02,
                # presumably flags outliers without removing rows - confirm against mlmachine
                dropOutliers=False,))
    ])
train.data = trainPipe.transform(train.data)

# capture outliers
# read the flagged values from the fitted step and keep them as a sorted array
eifOutliers = np.array(sorted(trainPipe.named_steps["outlier"].outliers_))
print(eifOutliers)

In [None]:
# identify outliers that are identified in multiple algorithms
# successive np.intersect1d calls keep only values present in all three arrays
outliers = reduce(np.intersect1d, (iqrOutliers, ifOutliers, eifOutliers))
# outliers = reduce(np.intersect1d, (ifOutliers, eifOutliers))
print(outliers)

In [None]:
# review outlier identification summary
outlierSummary = train.outlierSummary(iqrOutliers=iqrOutliers,
                             ifOutliers=ifOutliers,
                             eifOutliers=eifOutliers
                            )
outlierSummary

##### Validation outliers

### Remove

<a id = 'remove'></a>

In [None]:
# # remove outliers from predictors and response
# outliers = np.array([59,121])
# train.data = train.data.drop(outliers)
# train.target = train.target.drop(index=outliers)

## Missing data


<a id = 'Missing-data'></a>

### Evaluate

<a id = 'Evaluate1'></a>

##### Training missingness

In [None]:
# evaluate missing data
train.edaMissingSummary()

##### Validation missingness

In [None]:
# evaluate missing data
valid.edaMissingSummary()


##### Training vs. validation missingness


In [None]:
# compare feature with missing data
train.missingColCompare(train.data, valid.data)

### Impute

<a id = 'Impute'></a>

##### Impute training

##### Impute validation

## Engineering

<a id = 'Engineering'></a>

### Evaluate

<a id = 'Evaluate3'></a>

### Engineer

<a id = 'Engineer'></a>

##### Engineer training

In [None]:
# list training columns that appear in neither the categorical nor the continuous feature list
for col in train.data.columns:
    # anything already registered under either dtype is not a new column
    if col in train.featureByDtype_["categorical"] or col in train.featureByDtype_["continuous"]:
        continue
    print(col)

In [None]:
# evaluate additional features
train.edaCatTargetCatFeat()

##### Engineer validation

In [None]:
# list validation columns that appear in neither the categorical nor the continuous feature list
for col in valid.data.columns:
    # anything already registered under either dtype is not a new column
    if col in valid.featureByDtype_["categorical"] or col in valid.featureByDtype_["continuous"]:
        continue
    print(col)

## Encoding

<a id = 'Encoding'></a>

### Evaluate

<a id = 'Evaluate2'></a>

##### Training feature evaluation

In [None]:
# counts of unique values in training data string columns
train.data[train.featureByDtype_["categorical"]].apply(pd.Series.nunique, axis=0)

In [None]:
# print unique values in each categorical column
for col in train.data[train.featureByDtype_["categorical"]]:
    try:
        print(col, np.unique(train.data[col]))
    except TypeError:
        # np.unique sorts its input, which fails for columns mixing incomparable types
        # (e.g. strings and NaN); fall back to pandas' unsorted unique so the column is
        # still reported instead of being silently skipped
        print(col, train.data[col].unique())

##### Validation feature evaluation

In [None]:
# counts of unique values in validation data string columns
valid.data[valid.featureByDtype_["categorical"]].apply(pd.Series.nunique, axis=0)

In [None]:
# show the distinct values found in every categorical column of the validation data
for col in valid.data[valid.featureByDtype_["categorical"]]:
    # the list names columns to leave out of the printout
    if col in [""]:
        continue
    print(col, np.unique(valid.data[col]))

##### Training vs. validation

In [None]:
# report categorical levels that occur in only one of the training / validation datasets
for col in train.featureByDtype_["categorical"]:
    # the list names columns to leave out of the comparison
    if col in [""]:
        continue

    trainValues = train.data[col].unique()
    validValues = valid.data[col].unique()

    # levels exclusive to each side
    trainDiff = set(trainValues).difference(validValues)
    validDiff = set(validValues).difference(trainValues)

    # nothing to report when both datasets share exactly the same levels
    if not (trainDiff or validDiff):
        continue

    print("\n\n*** " + col)
    print("Value present in training data, not in validation data")
    print(trainDiff)
    print("Value present in validation data, not in training data")
    print(validDiff)

### Encode

<a id = 'Encode'></a>

##### Encode training

In [None]:
# ordinal column encoding instructions
# NOTE(review): "Pclass", "Embarked", "Sex", "CabinQuarter" and "Title" are Titanic feature
# names, but at this point `train` was built from the employee-attrition file - confirm these
# columns exist before running this cell
ordCatCols = {"Pclass": {1: 1, 2: 2, 3: 3}}

# nominal columns
nomCatCols = ["Embarked", "Sex", "CabinQuarter", "Title"]

# apply encodings to training data
# two steps: custom ordinal mapping first, then dummy variables for the nominal columns
trainPipe = pipeline.Pipeline(
    [
        ("encodeOrdinal", train.CustomOrdinalEncoder(encodings=ordCatCols)),
        ("dummyNominal", train.Dummies(cols=nomCatCols, dropFirst=True)),
    ]
)
train.data = trainPipe.transform(train.data)
train.data[:5]

##### Encode validation

In [None]:
# apply encodings to validation data
# reuses ordCatCols / nomCatCols defined in the training encoding cell
validPipe = pipeline.Pipeline(
    [
        ("encodeOrdinal", valid.CustomOrdinalEncoder(encodings=ordCatCols)),
        # dropFirst is False here but True for training; presumably the "levels" step then
        # aligns the result to the training columns - confirm against mlmachine
        ("dummyNominal", valid.Dummies(cols=nomCatCols, dropFirst=False)),
        ("levels", valid.MissingDummies(trainCols=train.data.columns)),
    ]
)
valid.data = validPipe.transform(valid.data)
valid.data[:5]

## Transformation

<a id = 'Transformation'></a>

### Evaluate

<a id = 'Evaluate4'></a>

##### Training feature transformation

In [None]:
# evaluate skew of continuous features - training data
train.skewSummary()

##### Validation feature transformation

In [None]:
# evaluate skew of continuous features - validation data
valid.skewSummary()

### Transform

<a id = 'Transform'></a>

##### Transform training

In [None]:
# skew correction in the training dataset, which also learns the best lambda value for each column
trainPipe = pipeline.Pipeline([
        ("skew",train.SkewTransform(cols=train.featureByDtype_["continuous"], skewMin=0.75, pctZeroMax=1.0, verbose = True))
    ])
train.data = trainPipe.transform(train.data)
# re-run the skew summary to review the effect of the transformation
train.skewSummary()

##### Transform validation

In [None]:
# skew correction in validation dataset using lambdas learned on training data
# trainValue_ is read from the "skew" step of the training pipeline built in the previous cell
validPipe = pipeline.Pipeline([
        ("skew",valid.SkewTransform(train=False, trainValue=trainPipe.named_steps["skew"].trainValue_))
    ])
valid.data = validPipe.transform(valid.data)
# re-run the skew summary to review the effect of the transformation
valid.skewSummary()

## Outliers (final)


<a id = 'Outliers-final'></a>

### Evaluate

<a id = 'Evaluate5'></a>

In [None]:
# identify outliers using IQR
# same detector as the preliminary pass, now applied to every column of the prepared data
trainPipe = pipeline.Pipeline([
        ("outlier",train.OutlierIQR(
                outlierCount=5,
                iqrStep=1.5,
                features=train.data.columns,
                # presumably flags outliers without removing rows - confirm against mlmachine
                dropOutliers=False,
            ))
    ])
train.data = trainPipe.transform(train.data)

# capture outliers
# read the flagged values from the fitted step and keep them as a sorted array
iqrOutliers = np.array(sorted(trainPipe.named_steps["outlier"].outliers_))
print(iqrOutliers)

In [None]:
# identify outliers using Isolation Forest
# NOTE(review): the `behaviour` argument was deprecated and later removed from scikit-learn's
# IsolationForest - confirm the installed version still accepts it
clf = ensemble.IsolationForest(
    behaviour="new", max_samples=train.data.shape[0], random_state=0, contamination=0.01
)
# fit and predict on every column of the prepared training data
clf.fit(train.data[train.data.columns])
preds = clf.predict(train.data[train.data.columns])

# evaluate index values
# rows predicted as -1 are treated as outliers; collect their index labels
mask = np.isin(preds, -1)  # np.in1d if np.isin is not available
ifOutliers = np.array(train.data[mask].index)
print(ifOutliers)

In [None]:
# identify outliers using extended isolation forest
# a single-step pipeline wrapping mlmachine's ExtendedIsoForest on every prepared column
trainPipe = pipeline.Pipeline([
    ("outlier",train.ExtendedIsoForest(
                cols=train.data.columns,
                nTrees=100,
                sampleSize=256,
                ExtensionLevel=1,
                anomaliesRatio=0.02,
                dropOutliers=False,))
    ])
train.data = trainPipe.transform(train.data)

# capture outliers
# store under eifOutliers so the IQR result computed earlier is not overwritten and the
# intersection / summary cells that follow see this pass's extended-isolation-forest result
eifOutliers = np.array(sorted(trainPipe.named_steps["outlier"].outliers_))
print(eifOutliers)

In [None]:
# identify outliers that are identified in multiple algorithms
# the active line intersects only the two isolation-forest results; the commented line is the
# three-way variant that also requires the IQR result
# reduce(np.intersect1d, (iqrOutliers, ifOutliers, eifOutliers))
reduce(np.intersect1d, (ifOutliers, eifOutliers))

In [None]:
# review outlier identification summary
outlierSummary = train.outlierSummary(iqrOutliers=iqrOutliers,
                             ifOutliers=ifOutliers,
                             eifOutliers=eifOutliers
                            )
outlierSummary

### Remove

<a id = 'remove1'></a>

In [None]:
# # remove outliers from predictors and response
# outliers = np.array([59,121])
# train.data = train.data.drop(outliers)
# train.target = train.target.drop(index=outliers)

# Data evaluation

<a id = 'Data-evaluation'></a>

## Feature importance

<a id = 'Feature-importance'></a>

In [None]:
# feature importance summary table
featureImp = train.featureImportanceSummary()
featureImp

## Rationality

<a id = 'Rationality'></a>

In [None]:
# percent difference summary
# summary statistics are computed once per dataset and reused in the calculation
trainDesc = train.data.describe()
validDesc = valid.data.describe()

# absolute percent difference of each validation statistic relative to its training
# counterpart; both sides are shifted by 1 so a training statistic of 0 does not divide by zero
dfDiff = abs(((validDesc + 1) - (trainDesc + 1)) / (trainDesc + 1) * 100)

# blank out identical (0) and unavailable (NaN) entries so only real differences are shown;
# no negative-value masking is needed because abs() already made every entry non-negative
dfDiff = dfDiff.replace({0: np.nan}).fillna("")
display(dfDiff)
display(train.data[dfDiff.columns].describe())
display(valid.data[dfDiff.columns].describe())

## Value override

<a id = 'Value-override'></a>

In [None]:
# change clearly erroneous value to what it probably was
# exploreValid.data['GarageYrBlt'].replace({2207 : 2007}, inplace = True)

## Continuous feature EDA

<a id = 'Continuous-feature-EDA3'></a>

## Correlation

<a id = 'Correlation3'></a>

In [None]:
# correlation heat map with most highly correlated features relative to the target
p = PrettierPlot()
ax = p.makeCanvas()
p.prettyCorrHeatmapTarget(df=train.data, target=train.target, thresh=0.2, ax=ax)

# Modeling

<a id = 'Modeling'></a>

## Data preparation

<a id = 'Data-preparation'></a>

##### Prepare training data

In [None]:
# import training data
# NOTE(review): from here on `dfTrain` / `train` are rebuilt from the Kaggle Titanic file and
# replace the employee-attrition objects used in the earlier sections
dfTrain = pd.read_csv("../../data/kaggleTitanic/train.csv")
train = mlm.Machine(
    data=dfTrain,
    target=["Survived"],
    removeFeatures=["PassengerId", "Ticket"],
    overrideCat=["Pclass", "SibSp", "Parch"],
    targetType="categorical",
)


# remove outliers
# NOTE(review): rows are dropped from train.data by position (train.data.index[outliers]) but
# from train.target by label (index=outliers); these agree only if the index is the default
# 0..n-1 range - confirm
outliers = np.array([27, 88, 258, 311, 341, 438, 679, 737, 742])
train.data = train.data.drop(train.data.index[outliers])
train.target = train.target.drop(index=outliers)

### pipeline
# NOTE(review): customBinDict is not defined in the visible part of this notebook, and
# ordCatCols / nomCatCols come from the encoding section - run those cells first
trainPipe = pipeline.Pipeline([
        ('imputeMedian', train.ContextImputer(nullCol = 'Age', contextCol = 'Parch', strategy = 'median'))     
        ,('imputeMode', train.ModeImputer(cols = ['Embarked']))
        ,('customBin', train.CustomBinner(customBinDict = customBinDict))
        ,('percentileBin', train.PercentileBinner(cols = ['Age','Fare'], percs = [10, 25, 50, 75, 90]))    
        ,('encodeOrdinal', train.CustomOrdinalEncoder(encodings = ordCatCols))    
        ,('dummyNominal', train.Dummies(cols = nomCatCols, dropFirst = True))
        ,('skew', train.SkewTransform(cols = train.featureByDtype_['continuous'], skewMin = 0.75, pctZeroMax = 1.0))    
    ])
train.data = trainPipe.transform(train.data)

# drop features
train.featureDropper(cols=["Name", "Cabin"])
print('completed')

##### Prepare validation data

In [None]:
### import valid data
# the Kaggle test file is loaded without a target argument
dfValid = pd.read_csv("../../data/kaggleTitanic/test.csv")
valid = mlm.Machine(
    data=dfValid,
    removeFeatures=["PassengerId", "Ticket"],
    overrideCat=["Pclass", "SibSp", "Parch"],
)

### pipeline
# steps that learned values on the training data receive them through trainValue, read from
# the matching step of trainPipe (or, for imputeMedian2, from the training data itself)
# NOTE(review): unlike the training pipeline there is no Embarked mode imputation here, and
# "Name" / "Cabin" are not dropped afterwards - presumably handled by the "levels" step or
# not needed for the test file; confirm
validPipe = pipeline.Pipeline(
    [
        ("imputeMedian",valid.ContextImputer(nullCol="Age",contextCol="Parch",train=False,trainValue=trainPipe.named_steps["imputeMedian"].trainValue_)),
        ("imputeMedian2",valid.NumericalImputer(cols=["Fare", "Age"], strategy="median",train=False,trainValue=train.data)),
        ("customBin", valid.CustomBinner(customBinDict=customBinDict)),
        ("percentileBin",valid.PercentileBinner(train=False, trainValue=trainPipe.named_steps["percentileBin"].trainValue_)),
        ("encodeOrdinal", valid.CustomOrdinalEncoder(encodings=ordCatCols)),
        ("dummyNominal", valid.Dummies(cols=nomCatCols, dropFirst=False)),
        ("levels", valid.MissingDummies(trainCols=train.data.columns)),
        ("skew",valid.SkewTransform(train=False, trainValue=trainPipe.named_steps["skew"].trainValue_)),
    ]
)
valid.data = validPipe.transform(valid.data)
print('completed')

## Bayesian hyper-parameter optimization

<a id = 'Bayesian-hyper-parameter-optimization'></a>

In [None]:
# parameter space
# maps an estimator path string ("module.Class") to the hyperopt search space for that estimator.
# - hp.choice over a single-element list pins the parameter to that one value
# - the same hyperopt label (e.g. "max_depth") is reused across estimators; presumably each
#   sub-dictionary is searched on its own, so the labels do not collide - confirm
# NOTE(review): hp.quniform draws are floats; parameters such as min_child_samples and
# num_leaves expect integers - presumably cast downstream, verify
allSpace = {
    "lightgbm.LGBMClassifier": {
        "class_weight": hp.choice("class_weight", [None]),
        "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
        "boosting_type": hp.choice("boosting_type", ["dart"]),
        "subsample": hp.uniform("subsample", 0.5, 1),
        "learning_rate": hp.uniform("learning_rate", 0.15, 0.25),
        "max_depth": hp.choice("max_depth", np.arange(4, 20, dtype=int)),
        "min_child_samples": hp.quniform("min_child_samples", 50, 150, 5),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "num_leaves": hp.quniform("num_leaves", 30, 70, 1),
        "reg_alpha": hp.uniform("reg_alpha", 0.75, 1.25),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
        "subsample_for_bin": hp.quniform("subsample_for_bin", 100000, 350000, 20000),
    },
    # NOTE(review): an "l1" penalty needs a solver that supports it - confirm the default
    # solver of the installed scikit-learn does
    "linear_model.LogisticRegression": {
        "C": hp.uniform("C", 0.04, 0.1),
        "penalty": hp.choice("penalty", ["l1"]),
    },
    "xgboost.XGBClassifier": {
        "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
        "gamma": hp.quniform("gamma", 0.0, 10, 0.05),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.2, 0.01),
        "max_depth": hp.choice("max_depth", np.arange(2, 15, dtype=int)),
        "min_child_weight": hp.quniform("min_child_weight", 2.5, 7.5, 1),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "subsample": hp.uniform("subsample", 0.4, 0.7),
    },
    "ensemble.RandomForestClassifier": {
        "bootstrap": hp.choice("bootstrap", [True, False]),
        "max_depth": hp.choice("max_depth", np.arange(2, 10, dtype=int)),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 8000, 10, dtype=int)),
        "max_features": hp.choice("max_features", ["sqrt"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(15, 25, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 20, dtype=int)),
    },
    "ensemble.GradientBoostingClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 11, dtype=int)),
        "max_features": hp.choice("max_features", ["sqrt"]),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.09, 0.01),
        "loss": hp.choice("loss", ["deviance", "exponential"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    },
    "ensemble.AdaBoostClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "learning_rate": hp.quniform("learning_rate", 0.1, 0.25, 0.01),
        "algorithm": hp.choice("algorithm", ["SAMME"]),
    },
    "naive_bayes.BernoulliNB": {"alpha": hp.uniform("alpha", 0.01, 2)},
    "ensemble.BaggingClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "max_samples": hp.uniform("max_samples", 0.01, 0.3),
    },
    "ensemble.ExtraTreesClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 15, dtype=int)),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(4, 30, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 20, dtype=int)),
        "max_features": hp.choice("max_features", ["auto"]),
        "criterion": hp.choice("criterion", ["entropy"]),
    },
    "svm.SVC": {
        "C": hp.uniform("C", 4, 15),
        "decision_function_shape": hp.choice("decision_function_shape", ["ovr"]),
        "gamma": hp.uniform("gamma", 0.00000001, 1.5),
    },
    "neighbors.KNeighborsClassifier": {
        "algorithm": hp.choice("algorithm", ["ball_tree", "brute"]),
        "n_neighbors": hp.choice("n_neighbors", np.arange(1, 15, dtype=int)),
        "weights": hp.choice("weights", ["uniform"]),
    },
}

In [None]:
# execute bayesian optimization grid search
analysis = "titanic"
train.execBayesOptimSearch(
    allSpace=allSpace,
    resultsDir="{}_hyperopt_{}.csv".format(rundate, analysis),
    X=train.data,
    y=train.target,
    scoring="accuracy",
    n_folds=5,
    n_jobs=4,
    iters=100,
    verbose=0,
)

### Model loss by iteration

<a id = 'Model-loss-by-iteration'></a>

In [None]:
# read scores summary table
analysis = "titanic"

resultsRaw = pd.read_csv("{}_hyperopt_{}.csv".format(rundate, analysis), na_values="nan")
resultsDict = train.unpackRawParams(resultsRaw)

In [None]:
# loss plot
train.lossPlot(resultsAsDict=resultsDict)

### Parameter selection by iteration

<a id = 'Parameter-selection-by-iteration'></a>

In [None]:
# estimator parameter plots
train.paramPlot(resultsAsDict=resultsDict, allSpace=allSpace, nIter=100, chartProp=15)

In [None]:
# scratch cell: pass a candidate hyperopt distribution and a count of 1000 to samplePlot
# NOTE(review): the bounds are np.log(...) values passed to hp.uniform, while the commented
# alternative uses hp.loguniform - confirm which distribution is intended
sampleSpace = {
                'param': hp.uniform('param', np.log(0.4), np.log(0.6))
#     "": 0.000001 + hp.uniform("gamma", 0.000001, 10)
    #             'param2': hp.loguniform('param2', np.log(0.001), np.log(0.01))
}

train.samplePlot(sampleSpace, 1000)

## Model performance evaluation

<a id = 'Model-performance-evaluation'></a>

In [None]:
def topModels(resultsRaw, numModels):
    """Return the best-scoring iteration ids for every estimator.

    Parameters
    ----------
    resultsRaw : pandas DataFrame
        Search log with at least the columns "estimator", "iteration" and "mean".
    numModels : int
        Number of iterations to keep per estimator.

    Returns
    -------
    dict
        Maps each estimator name (in order of first appearance) to a list of its
        "iteration" values, ordered by descending "mean".
    """

    def _bestIterations(name):
        # rows belonging to this estimator, best mean score first
        subset = resultsRaw.loc[resultsRaw["estimator"] == name]
        ranked = subset.sort_values(by="mean", ascending=False)
        return ranked["iteration"].iloc[:numModels].values.tolist()

    return {name: _bestIterations(name) for name in resultsRaw["estimator"].unique()}

# keep only the single best iteration per estimator; `models` maps estimator name
# to a list of iteration ids and drives the evaluation loops below
models = topModels(resultsRaw=resultsRaw, numModels=1)
# display the selection
models

### Classification report

<a id = 'Classification-report'></a>

In [None]:
# print a classification report for the selected parameter set(s) of every estimator
# NOTE(review): each model is scored on the same rows it was fit on, so these are
# training-set metrics - confirm that is what this section is meant to show
for estimator, ixs in models.items():
    for ix in ixs:
        # rebuild the estimator from the logged hyper-parameters of this iteration
        params = train.bayesOptimModelBuilder(
            estimator=estimator, iteration=ix, resultsRaw=resultsRaw
        )
        model = eval("{0}(**{1})".format(estimator, params))

        # fit on the full training set and predict the same rows
        model.fit(train.data, train.target)
        yPred = model.predict(train.data)

        # header: separator line, then the class name (text after the module prefix)
        modelName = estimator.split('.')[1]
        print('*' * 50)
        print("Model: {}\nParameter set: {}\n".format(modelName, ix))

        report = metrics.classification_report(train.target, yPred, labels=[0, 1])
        print(report)

### Confusion matrix

<a id = 'Confusion-matrix'></a>

In [None]:
# draw a confusion matrix for the selected parameter set(s) of every estimator
for estimator, ixs in models.items():
    for ix in ixs:
        # rebuild the estimator from the logged hyper-parameters of this iteration
        params = train.bayesOptimModelBuilder(
            estimator=estimator, iteration=ix, resultsRaw=resultsRaw
        )
        model = eval("{0}(**{1})".format(estimator, params))

        # fit on the full training set and predict the same rows
        model.fit(train.data, train.target)
        yPred = model.predict(train.data)

        # one canvas per model, titled with the class name and iteration id
        p = PrettierPlot()
        canvasTitle = "Model: {}\nParameter set: {}".format(estimator.split('.')[1], ix)
        ax = p.makeCanvas(
            title=canvasTitle,
            xLabel="Predicted",
            yLabel="Actual",
            yShift=0.5,
            xShift=0.35,
        )
        # NOTE(review): the classification report uses labels=[0, 1]; verify that
        # prettyConfusionMatrix expects the tick labels in this order and not
        # ['Died', 'Survived']
        p.prettyConfusionMatrix(
            yTrue=train.target,
            yPred=yPred,
            labels=['Survived', 'Died'],
            ax=ax,
        )
        

### ROC curve

<a id = 'ROC-curve'></a>

In [None]:
# scratch cell: sets the flag on whatever `params` dict the last loop iteration of a
# previously run cell left behind; the ROC cells below set it themselves for svm.SVC
params['probability'] = True

In [None]:
# display the current contents of the params dict
params

In [None]:
# draw a ROC curve for the selected parameter set(s) of every estimator
for estimator, ixs in models.items():
    for ix in ixs:
        # rebuild the logged hyper-parameters of this iteration
        params = train.bayesOptimModelBuilder(
            estimator=estimator, iteration=ix, resultsRaw=resultsRaw
        )
        # SVC is switched to probability mode before it is instantiated
        if estimator == 'svm.SVC':
            params['probability'] = True

        model = eval("{0}(**{1})".format(estimator, params))

        # fit on the full training set and predict the same rows
        model.fit(train.data, train.target)
        yPred = model.predict(train.data)

        # square canvas titled with the class name and iteration id
        p = PrettierPlot(chartProp=12, plotOrientation="square")
        canvasTitle = "Model: {}\nParameter set: {}".format(estimator.split('.')[1], ix)
        ax = p.makeCanvas(
            title=canvasTitle,
            xLabel="false positive rate",
            yLabel="true positive rate",
            yShift=0.64,
        )

        # the curve is drawn from the training data in the first palette color
        curveColor = style.styleHexMid[0]
        p.prettyRocCurve(
            model=model,
            XTrain=train.data,
            yTrain=train.target,
            linecolor=curveColor,
            ax=ax,
        )

In [None]:
# cross-validated ROC curve: one canvas per selected model, one curve per CV fold
for estimator, ixs in models.items():
    for ix in ixs:
        # extract params and instantiate model
        params = train.bayesOptimModelBuilder(
            resultsRaw=resultsRaw, estimator=estimator, iteration=ix
        )
        # SVC only provides probability estimates when built with probability=True
        # (presumably what prettyRocCurve relies on)
        if estimator == 'svm.SVC':
            params['probability'] = True

        # NOTE(review): eval instantiates the estimator from strings read out of the
        # results CSV - only run this against results files you generated yourself
        model = eval("{0}(**{1})".format(estimator, params))

        # fit model on the full training set
        model.fit(train.data, train.target)

        # stratified 3-fold split of the training data. Without shuffling the folds
        # are already deterministic, so random_state is not passed: it had no effect
        # with shuffle=False and recent scikit-learn releases raise a ValueError for
        # that combination
        cv = list(
            model_selection.StratifiedKFold(n_splits=3).split(train.data, train.target)
        )

        # plot ROC curves
        p = PrettierPlot(chartProp=12,plotOrientation="square")
        ax = p.makeCanvas(
            title="Model: {}\nParameter set: {}".format(estimator.split('.')[1], ix),
            xLabel="false positive rate",
            yLabel="true positive rate",
            yShift=0.62,
        )
        # NOTE(review): only the training part of each fold is passed on and testIx is
        # unused - confirm whether prettyRocCurve should be scored on the held-out part
        for i, (trainIx, testIx) in enumerate(cv):
            XTrainCV = train.data.iloc[trainIx]
            yTrainCV = train.target.iloc[trainIx]

            # one color per fold
            p.prettyRocCurve(
                model=model,
                XTrain=XTrainCV,
                yTrain=yTrainCV,
                linecolor=style.styleHexMid[i],
                ax=ax,
            )

## Model explainability

https://www.kaggle.com/learn/machine-learning-explainability
https://www.kaggle.com/dansbecker/partial-dependence-plots

<a id = 'Feature-importance'></a>

### Permutation importance

<a id = 'Permutation-importance'></a>

In [None]:
# permutation importance of each feature, computed with eli5 on a validation set
# NOTE(review): my_model, val_X and val_y are not defined in the visible part of this
# notebook - presumably template code from the linked Kaggle course; bind them to a
# fitted model and a hold-out set before running
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
# label the weights with the validation frame's column names
eli5.show_weights(perm, feature_names=val_X.columns.tolist())

### Partial plots

<a id = 'Partial-plots'></a>

In [None]:
# partial dependence plot for a single feature using pdpbox
# NOTE(review): tree_model, val_X and feature_names are not defined in the visible part
# of this notebook, and "Goal Scored" is presumably not a column of this dataset -
# template code to adapt before running
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots

# Create the data that we will plot
pdp_goals = pdp.pdp_isolate(
    model=tree_model, dataset=val_X, model_features=feature_names, feature="Goal Scored"
)

# plot it
pdp.pdp_plot(pdp_goals, "Goal Scored")
plt.show()

In [None]:
# partial dependence plot for a second feature, same model and data as the cell above
# NOTE(review): the feature name is presumably a template placeholder - confirm it
# exists in val_X before running
feature_to_plot = "Distance Covered (Kms)"
pdp_dist = pdp.pdp_isolate(
    model=tree_model,
    dataset=val_X,
    model_features=feature_names,
    feature=feature_to_plot,
)

pdp.pdp_plot(pdp_dist, feature_to_plot)
plt.show()

In [None]:
# Build Random Forest model and repeat the partial dependence plot for feature_to_plot
# NOTE(review): train_X, train_y and an unqualified RandomForestClassifier are not
# defined in the visible part of this notebook (other cells use the
# "ensemble.RandomForestClassifier" path) - TODO confirm the import
rf_model = RandomForestClassifier(random_state=0).fit(train_X, train_y)

pdp_dist = pdp.pdp_isolate(
    model=rf_model, dataset=val_X, model_features=feature_names, feature=feature_to_plot
)

pdp.pdp_plot(pdp_dist, feature_to_plot)
plt.show()

In [None]:
# 2D plots
# Similar to the previous PDP plots, except pdp_interact replaces pdp_isolate and
# pdp_interact_plot replaces pdp_plot, so two features are varied together
features_to_plot = ["Goal Scored", "Distance Covered (Kms)"]
inter1 = pdp.pdp_interact(
    model=tree_model,
    dataset=val_X,
    model_features=feature_names,
    features=features_to_plot,
)

# draw the interaction as a contour plot
pdp.pdp_interact_plot(
    pdp_interact_out=inter1, feature_names=features_to_plot, plot_type="contour"
)
plt.show()

### SHAP values

<a id = 'SHAP-values'></a>

In [None]:
# pick one validation row and show the model's predicted probabilities for it
# NOTE(review): val_X and my_model are not defined in the visible part of this notebook
row_to_show = 5
data_for_prediction = val_X.iloc[
    row_to_show
]  # use 1 row of data here. Could use multiple rows if desired
# reshape the single row to 2-D (1, n_values) before calling predict_proba
data_for_prediction_array = data_for_prediction.values.reshape(1, -1)


my_model.predict_proba(data_for_prediction_array)

In [None]:
import shap  # package used to calculate Shap values

# Create object that can calculate shap values
# (TreeExplainer presumably requires my_model to be a tree-based model - confirm)
explainer = shap.TreeExplainer(my_model)

# Calculate Shap values for the single row selected in the previous cell
shap_values = explainer.shap_values(data_for_prediction)

In [None]:
# load the JavaScript needed to render SHAP plots in the notebook
shap.initjs()
# force plot for the selected row; index [1] picks the second entry of the explainer
# outputs (presumably the positive class - confirm)
shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction)

In [None]:
# use Kernel SHAP to explain the selected row, with train_X as the background data
# NOTE(review): train_X is not defined in the visible part of this notebook
k_explainer = shap.KernelExplainer(my_model.predict_proba, train_X)
k_shap_values = k_explainer.shap_values(data_for_prediction)
shap.force_plot(k_explainer.expected_value[1], k_shap_values[1], data_for_prediction)

In [None]:
# scratch cell: displays the DeepExplainer class object only; nothing is computed
shap.DeepExplainer

In [None]:
import shap  # package used to calculate Shap values

# Create object that can calculate shap values
explainer = shap.TreeExplainer(my_model)

# calculate shap values. This is what we will plot.
# Calculate shap_values for all of val_X rather than a single row, to have more data for plot.
shap_values = explainer.shap_values(val_X)

# Make plot. Index [1] picks the second entry of shap_values (presumably the positive
# class - confirm against the model's class order).
shap.summary_plot(shap_values[1], val_X)

In [None]:
import shap  # package used to calculate Shap values

# Create object that can calculate shap values
explainer = shap.TreeExplainer(my_model)

# calculate shap values. This is what we will plot.
# NOTE(review): X is not defined in the visible part of this notebook
shap_values = explainer.shap_values(X)

# make plot: SHAP value of one feature, colored by a second (interaction) feature
# NOTE(review): both feature names are presumably template placeholders - confirm
shap.dependence_plot(
    "Ball Possession %", shap_values[1], X, interaction_index="Goal Scored"
)

## Stacking

<a id = 'Stacking'></a>

### Primary models

<a id = 'Primary-models'></a>

In [None]:
# display the five XGBoost search iterations with the highest mean score
resultsRaw[resultsRaw["estimator"] == "xgboost.XGBClassifier"].sort_values(
    ["mean"], ascending=[False]
)[:5]

In [None]:
def topParamSelector(resultsRaw, num):
    """Select the top-scoring iteration ids of each estimator.

    Parameters
    ----------
    resultsRaw : pandas DataFrame
        Search log with at least the columns "estimator", "iteration" and "mean".
    num : int
        Number of iterations to keep per estimator.

    Returns
    -------
    dict
        Estimator name -> list of "iteration" values sorted by descending "mean".
        Keys follow the order in which estimators first appear in resultsRaw.
    """
    selected = {}
    estimatorCol = resultsRaw["estimator"]
    for name in estimatorCol.unique():
        # restrict to this estimator and rank its rows best-first
        ranked = resultsRaw[estimatorCol == name].sort_values(
            by="mean", ascending=False
        )
        selected[name] = ranked["iteration"].values[:num].tolist()
    return selected


# best single iteration per estimator; these become the first-level (primary) models
# of the stack
models = topParamSelector(resultsRaw=resultsRaw, num=1)
# display the selection
models

In [None]:
# get out-of-fold predictions
oofTrain, oofValid, columns = train.modelStacker(
    models=models,
    resultsRaw=resultsRaw,
    XTrain=train.data.values,
    yTrain=train.target,
    XValid=valid.data.values,
    nFolds=2,
    nJobs=16,
)

In [None]:
# view correlations of predictions
p = PrettierPlot()
ax = p.makeCanvas()
p.prettyCorrHeatmap(df=pd.DataFrame(oofTrain, columns=columns), annot=True, ax=ax, vmin=0)

### Meta model

<a id = 'Meta-model'></a>

In [None]:
# parameter space
allSpace = {
    "lightgbm.LGBMClassifier": {
        "class_weight": hp.choice("class_weight", [None]),
        "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
        "boosting_type": hp.choice("boosting_type", ["dart"]),
        "subsample": hp.uniform("subsample", 0.5, 1),
        "learning_rate": hp.uniform("learning_rate", 0.15, 0.25),
        "max_depth": hp.choice("max_depth", np.arange(4, 20, dtype=int)),
        "min_child_samples": hp.quniform("min_child_samples", 50, 150, 5),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "num_leaves": hp.quniform("num_leaves", 30, 70, 1),
        "reg_alpha": hp.uniform("reg_alpha", 0.75, 1.25),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
        "subsample_for_bin": hp.quniform("subsample_for_bin", 100000, 350000, 20000),
    },
    "linear_model.LogisticRegression": {
        "C": hp.uniform("C", 0.04, 0.1),
        "penalty": hp.choice("penalty", ["l1"]),
    },
    "xgboost.XGBClassifier": {
        "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
        "gamma": hp.quniform("gamma", 0.0, 10, 0.05),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.2, 0.01),
        "max_depth": hp.choice("max_depth", np.arange(2, 15, dtype=int)),
        "min_child_weight": hp.quniform("min_child_weight", 2.5, 7.5, 1),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "subsample": hp.uniform("subsample", 0.4, 0.7),
    },
    "ensemble.RandomForestClassifier": {
        "bootstrap": hp.choice("bootstrap", [True, False]),
        "max_depth": hp.choice("max_depth", np.arange(2, 10, dtype=int)),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 8000, 10, dtype=int)),
        "max_features": hp.choice("max_features", ["sqrt"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(15, 25, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 20, dtype=int)),
    },
    "ensemble.GradientBoostingClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 11, dtype=int)),
        "max_features": hp.choice("max_features", ["sqrt"]),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.09, 0.01),
        "loss": hp.choice("loss", ["deviance", "exponential"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    },
    "ensemble.AdaBoostClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "learning_rate": hp.quniform("learning_rate", 0.1, 0.25, 0.01),
        "algorithm": hp.choice("algorithm", ["SAMME"]),
    },
    "naive_bayes.BernoulliNB": {"alpha": hp.uniform("alpha", 0.01, 2)},
    "ensemble.BaggingClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "max_samples": hp.uniform("max_samples", 0.01, 0.3),
    },
    "ensemble.ExtraTreesClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 15, dtype=int)),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(4, 30, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 20, dtype=int)),
        "max_features": hp.choice("max_features", ["auto"]),
        "criterion": hp.choice("criterion", ["entropy"]),
    },
    "svm.SVC": {
        "C": hp.uniform("C", 0.00000001, 15),
        "decision_function_shape": hp.choice("decision_function_shape", ["ovr", "ovo"]),
        "gamma": hp.uniform("gamma", 0.00000001, 1.5),
    },
    "neighbors.KNeighborsClassifier": {
        "algorithm": hp.choice("algorithm", ["ball_tree", "brute"]),
        "n_neighbors": hp.choice("n_neighbors", np.arange(1, 15, dtype=int)),
        "weights": hp.choice("weights", ["uniform"]),
    },
}

In [None]:
# execute bayesian optimization grid search
train.execBayesOptimSearch(
    allSpace=allSpace,
    resultsDir="data/{}_hyperopt_meta_{}_2.csv".format(rundate, analysis),
    X=oofTrain,
    y=train.target,
    scoring="accuracy",
    n_folds=8,
    n_jobs=8,
    iters=3000,
    verbose=0,
)

In [None]:
# read scores summary table
resultsMetaDf = pd.read_csv(
    "data/20190423_hyperopt_meta_titanic_2.csv", na_values="nan"
)
resultsMeta = train.unpackParams(resultsMetaDf)

In [None]:
# loss plot for the meta-model search; uses the same keyword (resultsAsDict) as the
# first-level loss plot call
train.lossPlot(resultsAsDict=resultsMeta)

In [None]:
# estimator parameter plots for the meta-model search; uses the same keyword
# (resultsAsDict) as the first-level parameter plot call
train.paramPlot(resultsAsDict=resultsMeta, allSpace=allSpace, nIter=100)

# Submission

<a id = 'Submission'></a>

## Standard

<a id = 'Standard'></a>

In [None]:
## standard model fit and predict
# select estimator and iteration (alternatives kept for quick switching)
# estimator = 'ensemble.RandomForestClassifier'
# iteration = 1955
# estimator = 'xgboost.XGBClassifier'
# iteration = 2097
estimator = "lightgbm.LGBMClassifier"
iteration = 2264

# extract params and instantiate model
# The first-level search log is held in resultsRaw and the builder is called with
# resultsRaw= everywhere else in this notebook.
params = train.bayesOptimModelBuilder(
    resultsRaw=resultsRaw, estimator=estimator, iteration=iteration
)
# NOTE(review): eval instantiates the estimator from strings read out of the results
# CSV - only run this against results files you generated yourself
model = eval("{0}(**{1})".format(estimator, params))

# fit on the full training set and predict the validation (submission) set
model.fit(train.data, train.target)
yPred = model.predict(valid.data)

In [None]:
# generate prediction submission file: one row per passenger id in dfValid with the
# predicted label, written without the DataFrame index
# NOTE: the stacked submission cell further down writes to the same path
my_submission = pd.DataFrame({"PassengerId": dfValid.PassengerId, "Survived": yPred})
my_submission.to_csv("data/submission.csv", index=False)

## Stack

<a id = 'Stack'></a>

In [None]:
# display the five meta-model search iterations with the highest mean score
resultsMetaDf.sort_values(["mean"], ascending=[False])[:5]

In [None]:
# best second level learning model (alternatives kept for quick switching)
# estimator = 'xgboost.XGBClassifier'
# estimator = 'ensemble.RandomForestClassifier'
# estimator = 'ensemble.GradientBoostingClassifier'
estimator = "svm.SVC"

iteration = 2436

# extract params and instantiate model
# Uses the same builder call as the rest of the notebook
# (train.bayesOptimModelBuilder with resultsRaw=), pointed at the meta-search log.
params = train.bayesOptimModelBuilder(
    resultsRaw=resultsMetaDf, estimator=estimator, iteration=iteration
)
# NOTE(review): eval instantiates the estimator from strings read out of the results
# CSV - only run this against results files you generated yourself
model = eval("{0}(**{1})".format(estimator, params))

# fit the meta model on the out-of-fold predictions and predict the validation stack
model.fit(oofTrain, train.target)
yPred = model.predict(oofValid)
# sanity check: sum of the predicted labels (number of predicted 1s for 0/1 labels)
print(sum(yPred))

In [None]:
# generate prediction submission file from the stacked model's predictions
# NOTE: writes to the same path as the standard submission cell, replacing that file
my_submission = pd.DataFrame({"PassengerId": dfValid.PassengerId, "Survived": yPred})
my_submission.to_csv("data/submission.csv", index=False)