__Kaggle competition - Titanic__

1. [Import](#Import)
    1. [Tools](#Tools)
    1. [Data](#Data)    
1. [Initial EDA](#Initial-EDA)
    1. [object feature EDA](#object-feature-EDA)
        1. [Univariate & feature vs. target](#Univariate-&-feature-vs.-target)
    1. [number feature EDA](#number-feature-EDA)
        1. [Univariate & feature vs. target](#Univariate-&-feature-vs.-target2)
        1. [Correlation](#Correlation)
        1. [Pair plot](#Pair-plot)
    1. [Faceting](#Faceting)
    1. [Target variable evaluation](#Target-variable-evaluation)    
1. [Data preparation](#Data-preparation)
    1. [Outliers (preliminary)](#Outliers-preliminary)
        1. [Evaluate](#Evaluate)
        1. [Remove](#remove)
    1. [Missing data](#Missing-data)
        1. [Evaluate](#Evaluate1)
        1. [Impute](#Impute)
    1. [Engineering](#Engineering)
        1. [Evaluate](#Evaluate3)
        1. [Engineer](#Engineer)
    1. [Encoding](#Encoding)
        1. [Evaluate](#Evaluate2)
        1. [Encode](#Encode)
    1. [Transformation](#Transformation)
        1. [Evaluate](#Evaluate4)
        1. [Transform](#Transform)
    1. [Outliers (final)](#Outliers-final)
        1. [Evaluate](#Evaluate5)
        1. [Remove](#remove1)
1. [Data evaluation](#Data-evaluation)
    1. [Feature importance](#Feature-importance)    
    1. [Rationality](#Rationality)
    1. [Value override](#Value-override)
    1. [number feature EDA](#number-feature-EDA3)
    1. [Correlation](#Correlation3)
1. [Modeling](#Modeling)
    1. [Data preparation](#Data-preparation-1)
    1. [Bayesian hyper-parameter optimization](#Bayesian-hyper-parameter-optimization)
        1. [Model loss by iteration](#Model-loss-by-iteration)
        1. [Parameter selection by iteration](#Parameter-selection-by-iteration)
    1. [Model performance evaluation - standard models](#Model-performance-evaluation-standard-models)
    1. [Model explainability](#Model-explanability)
        1. [Permutation importance](#Permutation-importance)
        1. [SHAP values](#SHAP-values)
    1. [Submission - standard models](#Submission-standard-models)
1. [Stacking](#Stacking)
    1. [Primary models](#Primary-models)
    1. [Meta model](#Meta-model)                
    1. [Model performance evaluation - stacked models](#Model-performance-evaluation-stacked-models)
    1. [Submission - stacked models](#Submission-stacked-models)    

# Import

<a id = 'Import'></a>

## Tools

<a id = 'Tools'></a>

In [None]:
# standard library and settings
import os
import sys
import importlib
import itertools
import csv
import ast
from timeit import default_timer as timer

global ITERATION
import time
from functools import reduce

rundate = time.strftime("%Y%m%d")

import warnings

warnings.simplefilter("ignore")
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:95% !important; }</style>"))

# data extensions and settings
import numpy as np

np.set_printoptions(threshold=np.inf, suppress=True)
import pandas as pd

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.options.display.float_format = "{:,.6f}".format

# modeling extensions
import sklearn.base as base
import sklearn.cluster as cluster
import sklearn.datasets as datasets
import sklearn.decomposition as decomposition
import sklearn.discriminant_analysis as discriminant_analysis
import sklearn.ensemble as ensemble
import sklearn.feature_extraction as feature_extraction
import sklearn.feature_selection as feature_selection
import sklearn.gaussian_process as gaussian_process
import sklearn.linear_model as linear_model
import sklearn.kernel_ridge as kernel_ridge
import sklearn.metrics as metrics
import sklearn.model_selection as model_selection
import sklearn.naive_bayes as naive_bayes
import sklearn.neighbors as neighbors
import sklearn.pipeline as pipeline
import sklearn.preprocessing as preprocessing
import sklearn.svm as svm
import sklearn.tree as tree
import sklearn.utils as utils

import eif
import shap; shap.initjs()
import eli5
from eli5.sklearn import PermutationImportance
from pdpbox import pdp, get_dataset, info_plots

from scipy import stats, special
import xgboost
import lightgbm
import catboost

from hyperopt import hp, tpe, Trials, fmin, STATUS_OK
from hyperopt.pyll.stochastic import sample

# visualization extensions and settings
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

%matplotlib inline

# mlmachine and prettierplot: use whatever is already importable, otherwise fall back
# to local checkouts located relative to this notebook.
try:
    import mlmachine as mlm
    from prettierplot.plotter import PrettierPlot
    import prettierplot.style as style
except ModuleNotFoundError:
    # register the local checkouts once; the membership test uses the same string that
    # is appended, so re-running this cell does not keep growing sys.path
    for local_repo in ("../../../mlmachine", "../../../prettierplot"):
        if local_repo not in sys.path:
            sys.path.append(local_repo)

    try:
        import mlmachine as mlm
        from prettierplot.plotter import PrettierPlot
        import prettierplot.style as style
    except ModuleNotFoundError:
        # neither an installed package nor a local checkout was found: explain, then fail
        print('This notebook relies on the libraries mlmachine and prettierplot. Please run:')
        print('\tpip install mlmachine')
        print('\tpip install prettierplot')
        raise

## Data

<a id = 'Data'></a>

In [None]:
# load data and print dimensions
df_train = pd.read_csv("s3://tdp-ml-datasets/kaggle-titanic//train.csv")
df_valid = pd.read_csv("s3://tdp-ml-datasets/kaggle-titanic//test.csv")

print("Training data dimensions: {}".format(df_train.shape))
print("Validation data dimensions: {}".format(df_valid.shape))

In [None]:
# display info and first 5 rows
df_train.info()
display(df_train[:5])

In [None]:
# review counts of different column types
df_train.dtypes.value_counts()

In [None]:
# Load training data into mlmachine
train = mlm.Machine(
    data=df_train,
    target="Survived",
    remove_features=["PassengerId", "Ticket"],
    force_to_object=["Pclass", "SibSp", "Parch"],
    target_type="object",
)
print(train.data.shape)

In [None]:
# load validation data into mlmachine (no target argument is passed for this dataset)
valid = mlm.Machine(
    data=df_valid,
    remove_features=["PassengerId", "Ticket"],
    force_to_object=["Pclass", "SibSp", "Parch"],
)
print(valid.data.shape)

# Initial EDA

<a id = 'Initial-EDA'></a>

## object feature EDA

<a id = 'object-feature-EDA'></a>

### Univariate & feature vs. target

<a id = 'Univariate-&-feature-vs.-target'></a>

In [None]:
# object features
for feature in train.feature_type["object"]:
    train.eda_cat_target_cat_feat(feature=feature, level_count_cap=50)

## number feature EDA

<a id = 'number-feature-EDA'></a>

### Univariate & feature vs. target

<a id = 'Univariate-&-feature-vs.-target2'></a>

In [None]:
# number features
for feature in train.feature_type["number"]:
    train.eda_cat_target_num_feat(feature=feature)

### Correlation

<a id = 'Correlation'></a>

##### Correlation (all samples)

In [None]:
# correlation heat map
p = PrettierPlot()
ax = p.make_canvas()
p.pretty_corr_heatmap(df=train.data, annot=True, ax=ax)

##### Correlation (top vs. target)

In [None]:
# correlation heat map with most highly correlated features relative to the target
p = PrettierPlot(plot_orientation='tall',chart_prop=10)
ax = p.make_canvas()
p.pretty_corr_heatmap_target(
    df=train.data, target=train.target, thresh=0.01, annot=True, ax=ax
)

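> Remarks - There are three pairs of highly correlated features:
    - 'GarageArea' and 'GarageCars'
    - 'TotRmsAbvGrd' and 'GrLivArea'
    - '1stFlrSF' and 'TotalBsmtSF'
This makes sense, given what each feature represents and how the items in each pair relate to each other. We likely only need one feature from each pair. (Note: these feature names belong to the House Prices dataset rather than the Titanic data used in this notebook, so this remark needs to be revisited for the Titanic features.)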

### Pair plot

<a id = 'Pair-plot'></a>

In [None]:
# pair plot
p = PrettierPlot(chart_prop=12)
p.pretty_pair_plot(df=train.data, diag_kind="auto")

In [None]:
# pair plot
p = PrettierPlot(chart_prop=12)
p.pretty_pair_plot(
    df=train.data.dropna(),
    diag_kind="kde",
    target=train.target,
    columns=["Age", "Fare", "Pclass", "Parch", "SibSp"],
    legend_labels=["Died", "Survived"],
    bbox=(2.0, 0.0),
)

## Faceting

<a id = 'Faceting'></a>

##### object by object

In [None]:
# facet Pclass vs Embarked
p = PrettierPlot(chart_prop=12)
ax = p.make_canvas(title="Survivorship, embark location by passenger class", y_shift=0.7)
p.pretty_facet_two_cat_bar(
    df=train.recombine_data(train.data, train.target),
    x="Embarked",
    y=train.target.name,
    split="Pclass",
    y_units="ff",
    ax=ax,
)

In [None]:
# facet Pclass vs Sex
p = PrettierPlot(chart_prop=12)
ax = p.make_canvas(title="Survivorship, passenger class by gender", y_shift=0.7)
p.pretty_facet_two_cat_bar(
    df=train.recombine_data(train.data, train.target),
    x="Pclass",
    y=train.target.name,
    split="Sex",
    y_units="ff",
    ax=ax,
)

In [None]:
# facet Embarked vs Sex
p = PrettierPlot(chart_prop=12)
ax = p.make_canvas(title="Survivorship,embark location by gender", y_shift=0.7)
p.pretty_facet_two_cat_bar(
    df=train.recombine_data(train.data, train.target),
    x="Embarked",
    y=train.target.name,
    split="Sex",
    y_units="ff",
    ax=ax,
)

In [None]:
# faceted two-category point plot: target by Sex, split by Pclass, one facet row per Embarked value
p = PrettierPlot()
p.pretty_facet_two_cat_point(
    df=train.recombine_data(train.data, train.target),
    x="Sex",
    y=train.target.name,
    split="Pclass",
    cat_row="Embarked",
    aspect=1.0,
    height=5,
    bbox=(1.3, 1.2),
    legend_labels=["1st class", "2nd class", "3rd class"],
)

In [None]:
# faceted two-category point plot: target by Embarked, split by Pclass, one facet row per Sex value
# rows with a missing Embarked value are dropped before plotting
p = PrettierPlot()
p.pretty_facet_two_cat_point(
    df=train.recombine_data(train.data, train.target).dropna(subset=["Embarked"]),
    x="Embarked",
    y=train.target.name,
    split="Pclass",
    cat_row="Sex",
    aspect=1.0,
    height=5,
    bbox=(1.5, 0.8),
    legend_labels=["1st class", "2nd class", "3rd class"],
)

##### object by number

In [None]:
# faceted histograms of Age, split by the target, laid out by Sex (rows) and Embarked (columns)
p = PrettierPlot()
p.pretty_facet_cat_num_hist(
    df=train.recombine_data(train.data, train.target),
    split=train.target.name,
    legend_labels=["Died", "Lived"],
    cat_row="Sex",
    cat_col="Embarked",
    num_col="Age",
    bbox=(1.9, 1.0),
    height=4,
    aspect=1,
)

In [None]:
# faceted scatter plots of Fare (x) vs. Age (y), split by the target, laid out by Sex (rows) and Embarked (columns)
p = PrettierPlot(chart_prop=15)
p.pretty_facet_cat_num_scatter(
    df=train.recombine_data(train.data, train.target),
    split=train.target.name,
    legend_labels=["Died", "Lived"],
    cat_row="Sex",
    cat_col="Embarked",
    xNum="Fare",
    yNum="Age",
    bbox=(1.9, 1.0),
    height=4,
    aspect=1,
)

## Target variable evaluation

<a id = 'Target-variable-evaluation'></a>

In [None]:
# null score
pd.Series(train.target).value_counts(normalize=True)

# Data preparation

<a id = 'Data-preparation'></a>

## Outliers (preliminary)


<a id = 'Outliers-preliminary'></a>

### Evaluate

<a id = 'Evaluate'></a>

##### Training outliers

In [None]:
# identify columns that have zero missing values
nonNull = train.data.columns[train.data.isnull().sum() == 0].values.tolist()

# identify intersection between non-null columns and number columns
nonNullnum_col = list(set(nonNull).intersection(train.feature_type["number"]))
print(nonNullnum_col)

In [None]:
# identify outliers using IQR
train_pipe = pipeline.Pipeline([
    ("outlier",train.OutlierIQR(
                outlier_count=2,
                iqr_step=1.5,
                features=["Age", "SibSp", "Parch", "Fare"],
                drop_outliers=False,))
    ])
train.data = train_pipe.transform(train.data)

# capture outliers
iqr_outliers = np.array(sorted(train_pipe.named_steps["outlier"].outliers_))
print(iqr_outliers)

In [None]:
# identify outliers using Isolation Forest
# NOTE(review): the `behaviour` argument only exists in older scikit-learn releases
# (it was deprecated and later removed) - confirm against the installed version
clf = ensemble.IsolationForest(
    behaviour="new", max_samples=train.data.shape[0], random_state=0, contamination=0.02
)
# fit and predict on the same three columns of the training data
clf.fit(train.data[["SibSp", "Parch", "Fare"]])
preds = clf.predict(train.data[["SibSp", "Parch", "Fare"]])

# evaluate index values: rows predicted as -1 are the ones collected as outliers
mask = np.isin(preds, -1)
if_outliers = np.array(train.data[mask].index)
print(if_outliers)

In [None]:
# identify outliers using extended isolation forest
train_pipe = pipeline.Pipeline([
    ("outlier",train.ExtendedIsoForest(
                columns=["SibSp", "Parch", "Fare"],
                n_trees=100,
                sample_size=256,
                ExtensionLevel=1,
                anomalies_ratio=0.03,
                drop_outliers=False,))
    ])
train.data = train_pipe.transform(train.data)

# capture outliers
eif_outliers = np.array(sorted(train_pipe.named_steps["outlier"].outliers_))
print(eif_outliers)

In [None]:
# identify outliers that are identified in multiple algorithms
# reduce(np.intersect1d, (iqr_outliers, if_outliers, eif_outliers))
outliers = reduce(np.intersect1d, (if_outliers, eif_outliers))
print(outliers)

In [None]:
# review outlier identification summary
outlier_summary = train.outlier_summary(iqr_outliers=iqr_outliers,
                             if_outliers=if_outliers,
                             eif_outliers=eif_outliers
                            )
outlier_summary

##### Validation outliers

### Remove

<a id = 'remove'></a>

In [None]:
# remove outliers from predictors and response
# NOTE: the index labels below are hard-coded and replace the `outliers` array computed
# by the intersection cell above
outliers = np.array([27, 88, 258, 311, 341, 438, 679, 737, 742])
train.data = train.data.drop(outliers)
train.target = train.target.drop(index=outliers)

## Missing data


<a id = 'Missing-data'></a>

### Evaluate

<a id = 'Evaluate1'></a>

##### Training missingness

In [None]:
# evaluate missing data
train.eda_missing_summary()

In [None]:
# missingno matrix
msno.matrix(train.data)

In [None]:
# missingno bar
msno.bar(train.data)

In [None]:
# missingno heatmap
msno.heatmap(train.data)

In [None]:
# missingno dendrogram
msno.dendrogram(train.data)

##### Validation missingness

In [None]:
# evaluate missing data
valid.eda_missing_summary()

In [None]:
# missingno matrix
msno.matrix(valid.data)

In [None]:
# missingno bar
msno.bar(valid.data)

In [None]:
# missingno heatmap
msno.heatmap(valid.data)

In [None]:
# missingno dendrogram
msno.dendrogram(valid.data)


##### Training vs. validation missingness


In [None]:
# compare feature with missing data
train.missing_col_compare(train=train.data, validation=valid.data)

### Impute

<a id = 'Impute'></a>

##### Impute training

In [None]:
# apply imputations to missing data in training dataset
train_pipe = pipeline.Pipeline([
        ("imputeMedian",train.ContextImputer(null_col="Age", context_col="Parch", strategy="median")),
        ("imputeMode", train.ModeImputer(columns=["Embarked"])),
    ])
train.data = train_pipe.transform(train.data)
train.eda_missing_summary()

##### Impute validation

In [None]:
# apply imputations to missing data in validation dataset
# Age is imputed first using the values learned by the training pipeline's "imputeMedian" step
# NOTE(review): "imputeMedian2" is built without train=False / trainValue here, whereas the
# modeling-section validation pipeline passes both - presumably this version learns the
# medians from the validation data itself; confirm which behavior is intended
validPipe = pipeline.Pipeline([
        ("imputeMedian",valid.ContextImputer(null_col="Age",context_col="Parch",train=False,trainValue=train_pipe.named_steps["imputeMedian"].trainValue_,)),
        ("imputeMedian2",valid.numberalImputer(columns=["Fare", "Age"], strategy="median")),
    ])
valid.data = validPipe.transform(valid.data)
valid.eda_missing_summary()

## Engineering

<a id = 'Engineering'></a>

### Evaluate

<a id = 'Evaluate3'></a>

### Engineer

<a id = 'Engineer'></a>

##### Engineer training

In [None]:
# parse titles to learn gender, and identify rare titles which may convey status
# NOTE: new columns are assigned as plain lists (positional) instead of being wrapped in
# pd.Series. A fresh pd.Series carries its own 0..n-1 index and column assignment aligns
# on index; because outlier rows were dropped from train.data earlier, that alignment would
# place values on the wrong passengers and leave NaN in the trailing rows.
title = [i.split(",")[1].split(".")[0].strip() for i in train.data["Name"]]
train.data["Title"] = title
train.data["Title"] = train.data["Title"].replace(
    [
        "Lady",
        "the Countess",
        "Countess",
        "Capt",
        "Col",
        "Don",
        "Dr",
        "Major",
        "Rev",
        "Sir",
        "Jonkheer",
        "Dona",
    ],
    "Rare",
)
# collapse titles into four integer codes
train.data["Title"] = train.data["Title"].map(
    {"Master": 0, "Miss": 1, "Ms": 1, "Mme": 1, "Mlle": 1, "Mrs": 1, "Mr": 2, "Rare": 3}
)

# distill cabin feature: first character of the cabin value, or "X" when cabin is missing
train.data["CabinQuarter"] = [
    i[0] if not pd.isnull(i) else "X" for i in train.data["Cabin"]
]

# family size features and binning
train.data["FamilySize"] = train.data["SibSp"] + train.data["Parch"] + 1

customBinDict = {"Age": [16, 32, 48, 64], "FamilySize": [1, 2, 4]}

train_pipe = pipeline.Pipeline([
        ("customBin", train.CustomBinner(customBinDict=customBinDict)),
        ("percentileBin",train.PercentileBinner(columns=["Age", "Fare"], percs=[25, 50, 75])),
    ])
train.data = train_pipe.transform(train.data)

# drop features
train.data, train.feature_type = train.featureDropper(
    columns=["Name", "Cabin"], data=train.data, feature_type=train.feature_type
)

In [None]:
# print new columns
for col in train.data.columns:
    if (
        col not in train.feature_type["object"]
        and col not in train.feature_type["number"]
    ):
        print(col)

In [None]:
# append new number features
for col in ["FamilySize"]:
    train.feature_type["number"].append(col)

# append new object features
# NOTE(review): "FamilySize" is registered here as an object feature as well as a number
# feature above - confirm that listing it under both types is intended
for col in [
    "AgeCustomBin",
    "AgePercBin",
    "FarePercBin",
    "FamilySize",
    "FamilySizeCustomBin",
    "Title",
    "CabinQuarter",
]:
    train.feature_type["object"].append(col)

In [None]:
# evaluate additional features
for feature in train.feature_type['object']:
    train.eda_cat_target_cat_feat(feature=feature)

##### Engineer validation

In [None]:
# parse titles to learn gender, and identify rare titles which may convey status
# NOTE: new columns are assigned as plain lists (positional) instead of being wrapped in
# pd.Series, so the result does not depend on valid.data having a default 0..n-1 index
# (column assignment from a Series aligns on index labels).
title = [i.split(",")[1].split(".")[0].strip() for i in valid.data["Name"]]
valid.data["Title"] = title
valid.data["Title"] = valid.data["Title"].replace(
    [
        "Lady",
        "the Countess",
        "Countess",
        "Capt",
        "Col",
        "Don",
        "Dr",
        "Major",
        "Rev",
        "Sir",
        "Jonkheer",
        "Dona",
    ],
    "Rare",
)
# collapse titles into four integer codes
valid.data["Title"] = valid.data["Title"].map(
    {"Master": 0, "Miss": 1, "Ms": 1, "Mme": 1, "Mlle": 1, "Mrs": 1, "Mr": 2, "Rare": 3}
)

# distill cabin feature: first character of the cabin value, or "X" when cabin is missing
valid.data["CabinQuarter"] = [
    i[0] if not pd.isnull(i) else "X" for i in valid.data["Cabin"]
]

# additional features
valid.data["FamilySize"] = valid.data["SibSp"] + valid.data["Parch"] + 1

# bin with the same custom edges, and with the percentile values learned on the training data
validPipe = pipeline.Pipeline([
        ("customBin", valid.CustomBinner(customBinDict=customBinDict)),
        ("percentileBin",valid.PercentileBinner(train=False, trainValue=train_pipe.named_steps["percentileBin"].trainValue_)),
    ])
valid.data = validPipe.transform(valid.data)

# drop features
valid.data, valid.feature_type = valid.featureDropper(
    columns=["Name", "Cabin"], data=valid.data, feature_type=valid.feature_type
)

In [None]:
# print new columns
for col in valid.data.columns:
    if (
        col not in valid.feature_type["object"]
        and col not in valid.feature_type["number"]
    ):
        print(col)

In [None]:
# append new number features
for col in ["FamilySize"]:
    valid.feature_type["number"].append(col)

# append new object features
# NOTE(review): "FamilySize" is registered here as an object feature as well as a number
# feature above - confirm that listing it under both types is intended
for col in [
    "AgeCustomBin",
    "AgePercBin",
    "FarePercBin",
    "FamilySize",
    "FamilySizeCustomBin",
    "Title",
    "CabinQuarter",
]:
    valid.feature_type["object"].append(col)

## Encoding

<a id = 'Encoding'></a>

### Evaluate

<a id = 'Evaluate2'></a>

##### Training feature evaluation

In [None]:
# counts of unique values in training data string columns
train.data[train.feature_type["object"]].apply(pd.Series.nunique, axis=0)

In [None]:
# print unique values in each object column
for col in train.data[train.feature_type["object"]]:
    try:
        print(col, np.unique(train.data[col]))
    except TypeError:
        # np.unique sorts its input, which raises TypeError for columns mixing
        # incomparable types (e.g. strings and NaN); fall back to pandas' unsorted
        # unique values instead of silently hiding the column
        print(col, train.data[col].unique())

##### Validation feature evaluation

In [None]:
# counts of unique values in validation data string columns
valid.data[valid.feature_type["object"]].apply(pd.Series.nunique, axis=0)

In [None]:
# print unique values in each object columns
for col in valid.data[valid.feature_type["object"]]:
    if col not in ["Name", "Cabin"]:
        print(col, np.unique(valid.data[col]))

##### Training vs. validation

In [None]:
# report, per object feature, the levels that occur in only one of the two datasets
for col in train.feature_type["object"]:
    # skip the features excluded from this comparison
    if col in ["Name", "Cabin"]:
        continue

    train_values = train.data[col].unique()
    valid_values = valid.data[col].unique()

    # levels exclusive to each side
    train_diff = set(train_values) - set(valid_values)
    valid_diff = set(valid_values) - set(train_values)

    # nothing to report when both sides share exactly the same levels
    if not train_diff and not valid_diff:
        continue

    print("\n\n*** " + col)
    print("Value present in training data, not in validation data")
    print(train_diff)
    print("Value present in validation data, not in training data")
    print(valid_diff)

### Encode

<a id = 'Encode'></a>

##### Encode training

In [None]:
# ordinal column encoding instructions
ordobject_columns = {"Pclass": {1: 1, 2: 2, 3: 3}}

# nominal columns
nomobject_columns = ["Embarked", "Sex", "CabinQuarter", "Title"]

# apply encodings to training data
train_pipe = pipeline.Pipeline(
    [
        ("encodeOrdinal", train.CustomOrdinalEncoder(encodings=ordobject_columns)),
        ("dummyNominal", train.Dummies(columns=nomobject_columns, dropFirst=True)),
    ]
)
train.data = train_pipe.transform(train.data)
train.data[:5]

##### Encode validation

In [None]:
# apply encodings to validation data
validPipe = pipeline.Pipeline(
    [
        ("encodeOrdinal", valid.CustomOrdinalEncoder(encodings=ordobject_columns)),
        ("dummyNominal", valid.Dummies(columns=nomobject_columns, dropFirst=False)),
        ("sync", valid.FeatureSync(trainCols=train.data.columns)),
    ]
)
valid.data = validPipe.transform(valid.data)
valid.data[:5]

## Transformation

<a id = 'Transformation'></a>

### Evaluate

<a id = 'Evaluate4'></a>

##### Training feature transformation

In [None]:
# evaluate skew of number features - training data
train.skew_summary()

##### Validation feature transformation

In [None]:
# evaluate skew of number features - validation data
valid.skew_summary()

### Transform

<a id = 'Transform'></a>

##### Transform training

In [None]:
# skew correction in training dataset, which also learns the best lambda value for each column
train_pipe = pipeline.Pipeline([
        ("skew",train.SkewTransform(columns=train.feature_type["number"], skewMin=0.75, pctZeroMax=1.0, verbose = True))
    ])
train.data = train_pipe.transform(train.data)
train.skew_summary()

##### Transform validation

In [None]:
# skew correction in validation dataset using lambdas learned on training data
validPipe = pipeline.Pipeline([
        ("skew",valid.SkewTransform(train=False, trainValue=train_pipe.named_steps["skew"].trainValue_))
    ])
valid.data = validPipe.transform(valid.data)
valid.skew_summary()

## Outliers (final)


<a id = 'Outliers-final'></a>

### Evaluate

<a id = 'Evaluate5'></a>

In [None]:
# identify outliers using IQR
train_pipe = pipeline.Pipeline([
    ("outlier",train.OutlierIQR(
                outlier_count=5,
                iqr_step=1.5,
                features=train.data.columns,
                drop_outliers=False,))
    ])
train.data = train_pipe.transform(train.data)

# capture outliers
iqr_outliers = np.array(sorted(train_pipe.named_steps["outlier"].outliers_))
print(iqr_outliers)

In [None]:
# identify outliers using Isolation Forest
clf = ensemble.IsolationForest(
    behaviour="new", max_samples=train.data.shape[0], random_state=0, contamination=0.01
)
clf.fit(train.data[train.data.columns])
preds = clf.predict(train.data[train.data.columns])

# evaluate index values
mask = np.isin(preds, -1)
if_outliers = np.array(train.data[mask].index)
print(if_outliers)

In [None]:
# identify outliers using extended isolation forest
train_pipe = pipeline.Pipeline([
    ("outlier",train.ExtendedIsoForest(
                columns=train.data.columns,
                n_trees=100,
                sample_size=256,
                ExtensionLevel=1,
                anomalies_ratio=0.03,
                drop_outliers=False,))
    ])
train.data = train_pipe.transform(train.data)

# capture outliers
eif_outliers = np.array(sorted(train_pipe.named_steps["outlier"].outliers_))
print(eif_outliers)

In [None]:
# identify outliers that are identified in multiple algorithms
outliers = reduce(np.intersect1d, (iqr_outliers, if_outliers, eif_outliers))
# outliers = reduce(np.intersect1d, (if_outliers, eif_outliers))
print(outliers)

In [None]:
# review outlier identification summary
outlier_summary = train.outlier_summary(iqr_outliers=iqr_outliers,
                             if_outliers=if_outliers,
                             eif_outliers=eif_outliers
                            )
outlier_summary

### Remove

<a id = 'remove1'></a>

In [None]:
# # remove outlers from predictors and response
# outliers = np.array([59,121])
# train.data = train.data.drop(outliers)
# train.target = train.target.drop(index=outliers)

# Data evaluation

<a id = 'Data-evaluation'></a>

## Feature importance

<a id = 'Feature-importance'></a>

In [None]:
# generate feature importance summary
estimators = [
    "lightgbm.LGBMClassifier",
    "ensemble.RandomForestClassifier",
    "ensemble.GradientBoostingClassifier",
    "ensemble.ExtraTreesClassifier",
    "ensemble.AdaBoostClassifier",
    "xgboost.XGBClassifier",
]

featureSummary = train.feature_selector_suite(estimators=estimators)

In [None]:
# calculate cross-validation performance
estimators = [
    "svm.SVC",
    "lightgbm.LGBMClassifier",
    "linear_model.LogisticRegression",
    "xgboost.XGBClassifier",
    "ensemble.RandomForestClassifier",
    "ensemble.GradientBoostingClassifier",
    "ensemble.AdaBoostClassifier",
    "ensemble.ExtraTreesClassifier",
    "neighbors.KNeighborsClassifier",
]

cv_summary = train.feature_selector_cross_val(
    estimators=estimators,
    featureSummary=featureSummary,
    metrics=["accuracy","f1_macro","roc_auc"],
    n_folds=8,
)

In [None]:
# visualize CV performance for diminishing feature set
train.feature_selector_results_plot(
    cv_summary=cv_summary,
    featureSummary=featureSummary,
    metric="accuracy",
    show_features=True,
)

In [None]:
df = train.features_used_summary(
    cv_summary=cv_summary, metric="accuracy", featureSummary=featureSummary
)
df

## Rationality

<a id = 'Rationality'></a>

In [None]:
# percent difference summary: absolute percent difference between the validation and
# training descriptive statistics. Both sides are shifted by +1 before dividing.
train_stats = train.data.describe()
valid_stats = valid.data.describe()
df_diff = (((valid_stats + 1) - (train_stats + 1)) / (train_stats + 1) * 100).abs()

# blank out zero differences so only the statistics that differ stand out in the display;
# values are already non-negative after abs(), so no negative masking is needed
df_diff = df_diff.replace({0: np.nan})
df_diff = df_diff.fillna("")
display(df_diff)
display(train.data[df_diff.columns].describe())
display(valid.data[df_diff.columns].describe())

## Value override

<a id = 'Value-override'></a>

In [None]:
# change clearly erroneous value to what it probably was
# exploreValid.data['GarageYrBlt'].replace({2207 : 2007}, inplace = True)

## number feature EDA

<a id = 'number-feature-EDA3'></a>

## Correlation

<a id = 'Correlation3'></a>

In [None]:
# correlation heat map with most highly correlated features relative to the target
p = PrettierPlot(chart_prop=15)
ax = p.make_canvas()
p.pretty_corr_heatmap_target(df=train.data, target=train.target, thresh=0.2, ax=ax)

# Modeling

<a id = 'Modeling'></a>

## Data preparation

<a id = 'Data-preparation-1'></a>

##### Prepare training data

In [None]:
# import training data
df_train = pd.read_csv("s3://tdp-ml-datasets/kaggle-titanic//train.csv")
train = mlm.Machine(
    data=df_train,
    target="Survived",
    remove_features=["PassengerId", "Ticket"],
    force_to_object=["Pclass", "SibSp", "Parch"],
    target_type="object",
)

### feature engineering
# parse titles to learn gender, and identify rare titles which may convey status
title = [i.split(",")[1].split(".")[0].strip() for i in train.data["Name"]]
train.data["Title"] = pd.Series(title)
train.data["Title"] = train.data["Title"].replace(
    [
        "Lady",
        "the Countess",
        "Countess",
        "Capt",
        "Col",
        "Don",
        "Dr",
        "Major",
        "Rev",
        "Sir",
        "Jonkheer",
        "Dona",
    ],
    "Rare",
)
train.data["Title"] = train.data["Title"].map(
    {"Master": 0, "Miss": 1, "Ms": 1, "Mme": 1, "Mlle": 1, "Mrs": 1, "Mr": 2, "Rare": 3}
)

# distill cabin feature
train.data["CabinQuarter"] = pd.Series(
    [i[0] if not pd.isnull(i) else "X" for i in train.data["Cabin"]]
)

# family size features
train.data["FamilySize"] = train.data["SibSp"] + train.data["Parch"] + 1

# custom bin specifications
customBinDict = {"Age": [16, 32, 48, 64], "FamilySize": [1, 2, 4]}
# object column specifications
ordobject_columns = {"Pclass": {1: 1, 2: 2, 3: 3}}
nomobject_columns = ["Embarked", "Sex", "CabinQuarter", "Title"]

# remove outliers
outliers = np.array([27, 88, 258, 311, 341, 438, 679, 737, 742])
train.data = train.data.drop(outliers)
train.target = train.target.drop(index=outliers)

### pipeline
train_pipe = pipeline.Pipeline([
        ('imputeMedian', train.ContextImputer(null_col = 'Age', context_col = 'Parch', strategy = 'median')),
        ('imputeMode', train.ModeImputer(columns = ['Embarked'])),
        ('customBin', train.CustomBinner(customBinDict = customBinDict)),
        ('percentileBin', train.PercentileBinner(columns = ['Age','Fare'], percs = [10, 25, 50, 75, 90])),
        ('encodeOrdinal', train.CustomOrdinalEncoder(encodings = ordobject_columns)),
        ('dummyNominal', train.Dummies(columns = nomobject_columns, dropFirst = True)),
        ('skew', train.SkewTransform(columns = train.feature_type['number'], skewMin = 0.75, pctZeroMax = 1.0)),
    ])
train.data = train_pipe.transform(train.data)

# drop features
train.data, train.feature_type = train.featureDropper(
    columns=["Name", "Cabin"], data=train.data, feature_type=train.feature_type
)
print('completed')

##### Prepare validation data

In [None]:
### import valid data
df_valid = pd.read_csv("s3://tdp-ml-datasets/kaggle-titanic//test.csv")
valid = mlm.Machine(
    data=df_valid,
    remove_features=["PassengerId", "Ticket"],
    force_to_object=["Pclass", "SibSp", "Parch"],
)

### feature engineering
# parse titles to learn gender, and identify rare titles which may convey status
title = [i.split(",")[1].split(".")[0].strip() for i in valid.data["Name"]]
# NOTE(review): assigning a pd.Series aligns on index labels, so this relies on valid.data
# having a default 0..n-1 index - confirm mlm.Machine keeps the index from read_csv
valid.data["Title"] = pd.Series(title)
valid.data["Title"] = valid.data["Title"].replace(
    [
        "Lady",
        "the Countess",
        "Countess",
        "Capt",
        "Col",
        "Don",
        "Dr",
        "Major",
        "Rev",
        "Sir",
        "Jonkheer",
        "Dona",
    ],
    "Rare",
)
# collapse titles into four integer codes
valid.data["Title"] = valid.data["Title"].map(
    {"Master": 0, "Miss": 1, "Ms": 1, "Mme": 1, "Mlle": 1, "Mrs": 1, "Mr": 2, "Rare": 3}
)

# distill cabin feature: first character of the cabin value, or "X" when cabin is missing
valid.data["CabinQuarter"] = pd.Series(
    [i[0] if not pd.isnull(i) else "X" for i in valid.data["Cabin"]]
)

# additional features
valid.data["FamilySize"] = valid.data["SibSp"] + valid.data["Parch"] + 1

### pipeline
# steps that learn values reuse what the training pipeline (train_pipe) stored in trainValue_
validPipe = pipeline.Pipeline(
    [
        ("imputeMedian",valid.ContextImputer(null_col="Age",context_col="Parch",train=False,trainValue=train_pipe.named_steps["imputeMedian"].trainValue_)),
        # NOTE(review): trainValue is train.data, which by this point has already been passed
        # through train_pipe (including its skew step) - confirm the imputer is meant to
        # receive transformed rather than raw training values
        ("imputeMedian2",valid.numberalImputer(columns=["Fare", "Age"], strategy="median",train=False,trainValue=train.data)),
        ("customBin", valid.CustomBinner(customBinDict=customBinDict)),
        ("percentileBin",valid.PercentileBinner(train=False, trainValue=train_pipe.named_steps["percentileBin"].trainValue_)),
        ("encodeOrdinal", valid.CustomOrdinalEncoder(encodings=ordobject_columns)),
        ("dummyNominal", valid.Dummies(columns=nomobject_columns, dropFirst=False)),
        ("sync", valid.FeatureSync(trainCols=train.data.columns)),
        ("skew",valid.SkewTransform(train=False, trainValue=train_pipe.named_steps["skew"].trainValue_)),
    ]
)
valid.data = validPipe.transform(valid.data)
print('completed')

## Bayesian hyper-parameter optimization

<a id = 'Bayesian-hyper-parameter-optimization'></a>

In [None]:
# parameter space
# One hyperopt search space per estimator, keyed by the estimator's dotted name.
# hp.choice entries with a single option pin that parameter to a fixed value.
all_space = {
    "lightgbm.LGBMClassifier": {
        "class_weight": hp.choice("class_weight", [None]),
        "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
        "boosting_type": hp.choice("boosting_type", ["dart"]),
        "subsample": hp.uniform("subsample", 0.5, 1),
        "learning_rate": hp.uniform("learning_rate", 0.15, 0.25),
        "max_depth": hp.choice("max_depth", np.arange(4, 20, dtype=int)),
        "min_child_samples": hp.quniform("min_child_samples", 50, 150, 5),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "num_leaves": hp.quniform("num_leaves", 30, 70, 1),
        "reg_alpha": hp.uniform("reg_alpha", 0.75, 1.25),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
        "subsample_for_bin": hp.quniform("subsample_for_bin", 100000, 350000, 20000),
    },
    "linear_model.LogisticRegression": {
        "C": hp.uniform("C", 0.04, 0.1),
        "penalty": hp.choice("penalty", ["l1"]),
        # The l1 penalty is rejected by scikit-learn's current default solver (lbfgs).
        # liblinear supports l1 and was the default solver in older releases, so pinning
        # it keeps earlier behaviour while working on newer versions.
        "solver": hp.choice("solver", ["liblinear"]),
    },
    "xgboost.XGBClassifier": {
        "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
        "gamma": hp.quniform("gamma", 0.0, 10, 0.05),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.2, 0.01),
        "max_depth": hp.choice("max_depth", np.arange(2, 15, dtype=int)),
        "min_child_weight": hp.quniform("min_child_weight", 2.5, 7.5, 1),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "subsample": hp.uniform("subsample", 0.4, 0.7),
    },
    "ensemble.RandomForestClassifier": {
        "bootstrap": hp.choice("bootstrap", [True, False]),
        "max_depth": hp.choice("max_depth", np.arange(2, 10, dtype=int)),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 8000, 10, dtype=int)),
        "max_features": hp.choice("max_features", ["sqrt"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(15, 25, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 20, dtype=int)),
    },
    "ensemble.GradientBoostingClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 11, dtype=int)),
        "max_features": hp.choice("max_features", ["sqrt"]),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.09, 0.01),
        # NOTE(review): "deviance" was renamed "log_loss" in newer scikit-learn releases;
        # left unchanged because the new name is not accepted by older versions.
        "loss": hp.choice("loss", ["deviance", "exponential"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    },
    "ensemble.AdaBoostClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "learning_rate": hp.quniform("learning_rate", 0.1, 0.25, 0.01),
        "algorithm": hp.choice("algorithm", ["SAMME"]),
    },
    "ensemble.ExtraTreesClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 15, dtype=int)),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(4, 30, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 20, dtype=int)),
        # "sqrt" is what "auto" meant for classifiers; "auto" is no longer accepted by
        # recent scikit-learn, and "sqrt" matches the other tree ensembles in this space.
        "max_features": hp.choice("max_features", ["sqrt"]),
        "criterion": hp.choice("criterion", ["entropy"]),
    },
    "svm.SVC": {
        "C": hp.uniform("C", 4, 15),
        "decision_function_shape": hp.choice("decision_function_shape", ["ovr"]),
        "gamma": hp.uniform("gamma", 0.00000001, 1.5),
    },
    "neighbors.KNeighborsClassifier": {
        "algorithm": hp.choice("algorithm", ["ball_tree", "brute"]),
        "n_neighbors": hp.choice("n_neighbors", np.arange(1, 15, dtype=int)),
        "weights": hp.choice("weights", ["uniform"]),
    },
}

In [None]:
# execute bayesian optimization grid search
# Results are written to "<rundate>_hyperopt_<analysis>.csv".
# NOTE(review): `rundate` is not assigned in this cell; it must already exist in the session.
analysis = "titanic"
results_file = "{}_hyperopt_{}.csv".format(rundate, analysis)

train.exec_bayes_optim_search(
    X=train.data,
    y=train.target,
    all_space=all_space,
    results_dir=results_file,
    scoring="accuracy",
    n_folds=2,
    iters=8,
    n_jobs=4,
    verbose=0,
)

### Model loss by iteration

<a id = 'Model-loss-by-iteration'></a>

In [None]:
# read scores summary table
# Loads "<rundate>_hyperopt_<analysis>.csv" and shows its first five rows.
analysis = "titanic"
rundate = "20190807"
summary_file = "{}_hyperopt_{}.csv".format(rundate, analysis)

bayes_optim_summary = pd.read_csv(summary_file, na_values="nan")
bayes_optim_summary.head(5)

In [None]:
# model loss plot
# one plot per distinct estimator found in the summary table
estimator_names = np.unique(bayes_optim_summary["estimator"])
for estimator in estimator_names:
    train.model_loss_plot(
        estimator=estimator,
        bayes_optim_summary=bayes_optim_summary,
    )

### Parameter selection by iteration

<a id = 'Parameter-selection-by-iteration'></a>

In [None]:
# estimator parameter plots
# one set of plots per distinct estimator found in the summary table
# NOTE(review): `modelParamPlot` is camelCase while sibling calls are snake_case - confirm the
# method name against the installed library version.
estimator_names = np.unique(bayes_optim_summary["estimator"])
for estimator in estimator_names:
    train.modelParamPlot(
        estimator=estimator,
        bayes_optim_summary=bayes_optim_summary,
        all_space=all_space,
        n_iter=100,
        chart_prop=15,
    )

In [None]:
# Sandbox: hand a one-parameter hyperopt space and the number 1000 to train.sample_plot.
# NOTE(review): hp.uniform is given log-transformed bounds, so it samples between log(0.4) and
# log(0.6) (negative values). If values between 0.4 and 0.6 were intended, hp.loguniform
# with these same bounds would be the match - confirm.
# Other spaces tried here:
#     "": 0.000001 + hp.uniform("gamma", 0.000001, 10)
#     'param2': hp.loguniform('param2', np.log(0.001), np.log(0.01))
sample_space = {"param": hp.uniform("param", np.log(0.4), np.log(0.6))}

train.sample_plot(sample_space, 1000)

In [None]:
# pair-wise comparison
# NOTE(review): `df` is not defined in the visible part of this notebook; it presumably holds
# per-iteration results for a single estimator - confirm before running.
pair_columns = ["colsample_bytree", "learning_rate", "iteration", "iterLoss"]

p = PrettierPlot(chart_prop=12)
p.pretty_pair_plot_custom(
    df=df,
    columns=pair_columns,
    gradient_col="iteration",
)



## Model performance evaluation - standard models

<a id = 'Model-performance-evaluation-standard-models'></a>

In [None]:
# select top model(s) from the primary search summary (num_models=1) and display the result
top_models = train.top_bayes_optim_models(
    num_models=1,
    bayes_optim_summary=bayes_optim_summary,
)
top_models

In [None]:
# classification panel, single model
# alternatives:
# estimator = 'ensemble.GradientBoostingClassifier'; model_iter = 590
# estimator = 'xgboost.XGBClassifier'; model_iter = 380
estimator, model_iter = "svm.SVC", 135

# build the model for the chosen estimator / iteration from the search summary
model = train.BayesOptimModelBuilder(
    estimator=estimator,
    model_iter=model_iter,
    bayes_optim_summary=bayes_optim_summary,
)

# evaluate on the training data with 5 folds
train.classification_panel(
    model=model,
    X_train=train.data,
    y_train=train.target,
    n_folds=5,
    cm_labels=["Dies", "Survives"],
)

In [None]:
# create classification reports for training data
# top_models maps each estimator name to the iteration numbers selected for it
panel_kwargs = dict(
    X_train=train.data,
    y_train=train.target,
    cm_labels=["Dies", "Survives"],
    n_folds=4,
)

for estimator, model_iters in top_models.items():
    for model_iter in model_iters:
        model = train.BayesOptimModelBuilder(
            estimator=estimator,
            model_iter=model_iter,
            bayes_optim_summary=bayes_optim_summary,
        )
        train.classification_panel(model=model, **panel_kwargs)

## Model explainability

<a id = 'Model-explanability'></a>

In [None]:
# Build and fit the model used by the explainability cells.
# alternatives:
# estimator = "ensemble.ExtraTreesClassifier"; model_iter = 145
# estimator = "svm.SVC"; model_iter = 135
estimator, model_iter = "ensemble.GradientBoostingClassifier", 490

model = train.BayesOptimModelBuilder(
    estimator=estimator,
    model_iter=model_iter,
    bayes_optim_summary=bayes_optim_summary,
)

# fit on the raw arrays of the full training set
X_fit, y_fit = train.data.values, train.target.values
model.fit(X_fit, y_fit)

### Permutation importance

<a id = 'Permutation-importance'></a>

In [None]:
# permutation importance - how much does performance decrease when shuffling a certain feature?
# The underlying estimator (model.model) is wrapped, fitted on the training data, and the
# resulting weights are rendered with the training column names.
perm = PermutationImportance(model.model, random_state=1)
perm.fit(train.data, train.target)

column_names = train.data.columns.tolist()
eli5.show_weights(perm, feature_names=column_names)

### SHAP values

<a id = 'SHAP-values'></a>

##### Training

###### Force plots - single observations

In [None]:
# SHAP force plots for individual observations
# limited to the first two index labels of the training data
first_two = train.data.index[:2]
for i in first_two:
    train.single_shap_viz_tree(
        obsIx=i,
        model=model,
        data=train.data,
        target=train.target,
    )

###### Force plots - multiple observations

In [None]:
# SHAP force plot a set of data
# covers every observation in the training data; the returned object is displayed
visual = train.multi_shap_viz_tree(
    model=model,
    data=train.data,
    obs_ixs=train.data.index,
)
visual

###### Dependence plots

In [None]:
# generate SHAP values for set of observations
# all training observations are used; the middle element of the returned triple is discarded
shap_result = train.multi_shap_value_tree(
    model=model,
    data=train.data,
    obs_ixs=train.data.index,
)
obs_data, _, obs_shap_values = shap_result

In [None]:
# SHAP dependence plot grid
# features passed as the grid; all training columns are passed as the full feature list
grid_features = ["Pclass", "Age", "Fare", "SibSp", "Parch"]

train.shap_dependence_grid(
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    all_features=train.data.columns,
    grid_features=grid_features,
    alpha=0.5,
    dot_size=35,
)

In [None]:
# single SHAP dependence plot
# "Age" on the scatter axis, colored by "Parch", drawn on a fresh canvas
p = PrettierPlot()
ax = p.make_canvas()

train.shap_dependence_plot(
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    feature_names=train.data.columns,
    scatter_feature="Age",
    color_feature="Parch",
    alpha=0.5,
    dot_size=50,
    ax=ax,
)



In [None]:
# SHAP dependence plots for all feature relative to an interaction feature
# Columns are visited in descending order of summed absolute SHAP value (summed along axis 0,
# presumably the observation axis); every plot is colored by "Age".
feature_names = train.data.columns.tolist()
shap_magnitude = np.sum(np.abs(obs_shap_values), axis=0)
top_shap = np.argsort(-shap_magnitude)

for top_ix in top_shap:
    # a fresh canvas per feature
    p = PrettierPlot()
    ax = p.make_canvas()

    train.shap_dependence_plot(
        obs_data=obs_data,
        obs_shap_values=obs_shap_values,
        feature_names=feature_names,
        scatter_feature=feature_names[top_ix],
        color_feature="Age",
        alpha=0.5,
        dot_size=50,
        ax=ax,
    )

###### Summary plots

In [None]:
# SHAP summary plot
# uses the SHAP values computed above, labeled with the training column names
train.shap_summary_plot(
    feature_names=train.data.columns,
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
)

##### Validation

###### Force plots - single observations

In [None]:
# SHAP force plots for individual observations
# limited to the first two index labels of the validation data (no target is passed)
first_two = valid.data.index[:2]
for i in first_two:
    valid.single_shap_viz_tree(
        obsIx=i,
        model=model,
        data=valid.data,
    )

###### Force plots - multiple observations

In [None]:
# SHAP force plot a set of data
# covers every observation in the validation data; the returned object is displayed
visual = valid.multi_shap_viz_tree(
    model=model,
    data=valid.data,
    obs_ixs=valid.data.index,
)
visual

###### Dependence plots

In [None]:
# generate SHAP values for set of observations
# all validation observations are used; the middle element of the returned triple is discarded
shap_result = valid.multi_shap_value_tree(
    model=model,
    data=valid.data,
    obs_ixs=valid.data.index,
)
obs_data, _, obs_shap_values = shap_result

In [None]:
# SHAP dependence plot grid
# features passed as the grid; all validation columns are passed as the full feature list
grid_features = ["Pclass", "Age", "Fare", "SibSp", "Parch"]

valid.shap_dependence_grid(
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    all_features=valid.data.columns,
    grid_features=grid_features,
    alpha=0.5,
    dot_size=35,
)

In [None]:
# single SHAP dependence plot
# "Age" on the scatter axis, colored by "Parch", drawn on a fresh canvas
p = PrettierPlot()
ax = p.make_canvas()

valid.shap_dependence_plot(
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    feature_names=valid.data.columns,
    scatter_feature="Age",
    color_feature="Parch",
    alpha=0.5,
    dot_size=50,
    ax=ax,
)



In [None]:
# SHAP dependence plots for all feature relative to an interaction feature
# Columns are visited in descending order of summed absolute SHAP value (summed along axis 0,
# presumably the observation axis); every plot is colored by "Age".
feature_names = valid.data.columns.tolist()
shap_magnitude = np.sum(np.abs(obs_shap_values), axis=0)
top_shap = np.argsort(-shap_magnitude)

for top_ix in top_shap:
    # a fresh canvas per feature
    p = PrettierPlot()
    ax = p.make_canvas()

    valid.shap_dependence_plot(
        obs_data=obs_data,
        obs_shap_values=obs_shap_values,
        feature_names=feature_names,
        scatter_feature=feature_names[top_ix],
        color_feature="Age",
        alpha=0.5,
        dot_size=50,
        ax=ax,
    )

###### Summary plots

In [None]:
# SHAP summary plot
# uses the SHAP values computed above, labeled with the validation column names
valid.shap_summary_plot(
    feature_names=valid.data.columns,
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
)

## Submission - standard models

<a id = 'Submission-standard-models'></a>

In [None]:
## standard model fit and predict
# select estimator and iteration
# estimator = "lightgbm.LGBMClassifier"; model_iter = 668  #142 survived, 0.77511
# estimator = "xgboost.XGBClassifier"; model_iter = 380  #151 survived, 0.7655
# estimator = "ensemble.RandomForestClassifier"; model_iter = 405  #148 survived, 0.79425
# estimator = "ensemble.GradientBoostingClassifier"; model_iter = 590  #142 survived, 0.7655
estimator, model_iter = "svm.SVC", 135  #154 survived, 0.755

# extract params and instantiate model
model = train.BayesOptimModelBuilder(
    estimator=estimator,
    model_iter=model_iter,
    bayes_optim_summary=bayes_optim_summary,
)

# fit model on the full training set
X_fit, y_fit = train.data.values, train.target.values
model.fit(X_fit, y_fit)

# make predictions on the validation data and print their sum
y_pred = model.predict(valid.data.values)
print(sum(y_pred))

In [None]:
# generate prediction submission file
# two columns: passenger id taken from df_valid, and the predicted label
submission_columns = {"PassengerId": df_valid.PassengerId, "Survived": y_pred}
submit = pd.DataFrame(submission_columns)
submit.to_csv("submission.csv", index=False)

# Stacking

<a id = 'Stacking'></a>

## Primary models

<a id = 'Primary-models'></a>

In [None]:
# get out-of-fold predictions
# first-level models come from top_models; arrays (not frames) are passed for train and validation
stacker_output = train.model_stacker(
    models=top_models,
    bayes_optim_summary=bayes_optim_summary,
    X_train=train.data.values,
    y_train=train.target.values,
    X_valid=valid.data.values,
    n_jobs=10,
    n_folds=10,
)
oof_train, oof_valid, columns = stacker_output

In [None]:
# view correlations of predictions
# out-of-fold training predictions are framed with the stacker's column names
oof_frame = pd.DataFrame(oof_train, columns=columns)

p = PrettierPlot()
ax = p.make_canvas()
p.pretty_corr_heatmap(df=oof_frame, annot=True, ax=ax, vmin=0)

## Meta model

<a id = 'Meta-model'></a>

In [None]:
# parameter space
# Search spaces for the second-level (meta) learners, keyed by the estimator's dotted name.
# This rebinds `all_space`, replacing the space defined for the primary models.


def _int_choice(label, start, stop, step=1):
    """Return an hp.choice over the integers np.arange(start, stop, step)."""
    return hp.choice(label, np.arange(start, stop, step, dtype=int))


all_space = {
    "lightgbm.LGBMClassifier": {
        "class_weight": hp.choice("class_weight", [None]),
        "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
        "boosting_type": hp.choice("boosting_type", ["dart"]),
        "subsample": hp.uniform("subsample", 0.5, 1),
        "learning_rate": hp.uniform("learning_rate", 0.15, 0.25),
        "max_depth": _int_choice("max_depth", 4, 20),
        "min_child_samples": hp.quniform("min_child_samples", 50, 150, 5),
        "n_estimators": _int_choice("n_estimators", 100, 4000, 10),
        "num_leaves": hp.quniform("num_leaves", 30, 70, 1),
        "reg_alpha": hp.uniform("reg_alpha", 0.75, 1.25),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
        "subsample_for_bin": hp.quniform("subsample_for_bin", 100000, 350000, 20000),
    },
    "xgboost.XGBClassifier": {
        "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
        "gamma": hp.quniform("gamma", 0.0, 10, 0.05),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.2, 0.01),
        "max_depth": _int_choice("max_depth", 2, 15),
        "min_child_weight": hp.quniform("min_child_weight", 2.5, 7.5, 1),
        "n_estimators": _int_choice("n_estimators", 100, 4000, 10),
        "subsample": hp.uniform("subsample", 0.4, 0.7),
    },
    "ensemble.RandomForestClassifier": {
        "bootstrap": hp.choice("bootstrap", [True, False]),
        "max_depth": _int_choice("max_depth", 2, 10),
        "n_estimators": _int_choice("n_estimators", 100, 8000, 10),
        "max_features": hp.choice("max_features", ["sqrt"]),
        "min_samples_split": _int_choice("min_samples_split", 15, 25),
        "min_samples_leaf": _int_choice("min_samples_leaf", 2, 20),
    },
    "ensemble.GradientBoostingClassifier": {
        "n_estimators": _int_choice("n_estimators", 100, 4000, 10),
        "max_depth": _int_choice("max_depth", 2, 11),
        "max_features": hp.choice("max_features", ["sqrt"]),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.09, 0.01),
        "loss": hp.choice("loss", ["deviance", "exponential"]),
        "min_samples_split": _int_choice("min_samples_split", 2, 40),
        "min_samples_leaf": _int_choice("min_samples_leaf", 2, 40),
    },
    "svm.SVC": {
        "C": hp.uniform("C", 0.00000001, 15),
        "decision_function_shape": hp.choice("decision_function_shape", ["ovr", "ovo"]),
        "gamma": hp.uniform("gamma", 0.00000001, 1.5),
    },
}

In [None]:
# execute bayesian optimization grid search
# Meta-model search: the features are the out-of-fold predictions of the primary models.
# Results are written to "<rundate>_hyperopt_meta_<analysis>.csv", using whatever values
# `rundate` and `analysis` hold from earlier cells.
meta_results_file = "{}_hyperopt_meta_{}.csv".format(rundate, analysis)

train.exec_bayes_optim_search(
    X=oof_train,
    y=train.target,
    all_space=all_space,
    results_dir=meta_results_file,
    scoring="accuracy",
    n_folds=8,
    iters=1000,
    n_jobs=10,
    verbose=0,
)

In [None]:
# read scores summary table
# Loads the meta-model search results and shows the first five rows.
# `analysis` is lower-case to match the value used elsewhere in this notebook when the results
# file is written; a differently-cased name points at a different file on case-sensitive
# file systems and the read fails.
analysis = "titanic"
rundate = "20190807"
bayes_optim_summary_meta = pd.read_csv("{}_hyperopt_meta_{}.csv".format(rundate, analysis))
bayes_optim_summary_meta[:5]

In [None]:
# model loss plot
# one plot per distinct estimator found in the meta summary table
meta_estimator_names = np.unique(bayes_optim_summary_meta["estimator"])
for estimator in meta_estimator_names:
    train.model_loss_plot(
        estimator=estimator,
        bayes_optim_summary=bayes_optim_summary_meta,
    )

In [None]:
# estimator parameter plots
# one set of plots per distinct estimator found in the meta summary table
meta_estimator_names = np.unique(bayes_optim_summary_meta["estimator"])
for estimator in meta_estimator_names:
    train.modelParamPlot(
        estimator=estimator,
        bayes_optim_summary=bayes_optim_summary_meta,
        all_space=all_space,
        n_iter=100,
        chart_prop=15,
    )

## Model performance evaluation - stacked models

<a id = 'Model-performance-evaluation-stacked-models'></a>

In [None]:
# select top model(s) from the meta search summary (num_models=1) and display the result
# NOTE: this rebinds `top_models`, which previously held the primary-model selection
top_models = train.top_bayes_optim_models(
    num_models=1,
    bayes_optim_summary=bayes_optim_summary_meta,
)
top_models

In [None]:
# classification panel, single model
# Builds one meta model from the meta search summary and evaluates it on the out-of-fold
# training predictions with 4 folds.
estimator = "svm.SVC"; model_iter = 135
# estimator = 'ensemble.GradientBoostingClassifier'; model_iter = 590
# estimator = 'xgboost.XGBClassifier'; model_iter = 380

model = train.BayesOptimModelBuilder(
    bayes_optim_summary=bayes_optim_summary_meta, estimator=estimator, model_iter=model_iter
)

# Uses the `cm_labels` keyword, as the primary-model classification_panel calls in this
# notebook do, in place of the differing `labels=[0, 1]` this call used to pass.
# NOTE(review): confirm against the classification_panel signature.
train.classification_panel(
    model=model,
    X_train=oof_train,
    y_train=train.target,
    cm_labels=["Dies", "Survives"],
    n_folds=4,
)

In [None]:
# create classification reports for training data
# top_models maps each estimator name to the iteration numbers selected for it; every selected
# meta model is evaluated on the out-of-fold training predictions with 4 folds.
for estimator, model_iters in top_models.items():
    for model_iter in model_iters:
        model = train.BayesOptimModelBuilder(
            bayes_optim_summary=bayes_optim_summary_meta,
            estimator=estimator,
            model_iter=model_iter,
        )
        # Uses the `cm_labels` keyword, as the primary-model classification_panel calls in
        # this notebook do, in place of the differing `labels=[0, 1]`.
        # NOTE(review): confirm against the classification_panel signature.
        train.classification_panel(
            model=model,
            X_train=oof_train,
            y_train=train.target,
            cm_labels=["Dies", "Survives"],
            n_folds=4,
        )

## Submission - stacked models

<a id = 'Submission-stacked-models'></a>

In [None]:
# best second level learning model
# estimator = "lightgbm.LGBMClassifier"; model_iter = 876 #0.75119
# estimator = "xgboost.XGBClassifier"; model_iter = 821 #0.779
# estimator = "ensemble.RandomForestClassifier"; model_iter = 82
# estimator = "ensemble.GradientBoostingClassifier"; model_iter = 673 #0.77511
estimator, model_iter = "svm.SVC", 538  # 0.77511

# extract params and instantiate model
model = train.BayesOptimModelBuilder(
    estimator=estimator,
    model_iter=model_iter,
    bayes_optim_summary=bayes_optim_summary_meta,
)

# fit on the out-of-fold training predictions, predict from the validation-side
# predictions, and print the sum of the predicted labels
y_fit = train.target.values
model.fit(oof_train, y_fit)
y_pred = model.predict(oof_valid)
print(sum(y_pred))

In [None]:
# generate prediction submission file
# NOTE: writes to "submission.csv", the same path the standard-model submission cell uses,
# so running both leaves only the most recent predictions on disk.
submission_columns = {"PassengerId": df_valid.PassengerId, "Survived": y_pred}
submit = pd.DataFrame(submission_columns)
submit.to_csv("submission.csv", index=False)