__Kaggle competition - Titanic survivorship__

1. [Import](#Import)
    1. [Tools](#Tools)
    1. [Data](#Data)    
1. [EDA](#EDA)
    1. [Categorical feature EDA](#Categorical-feature-EDA)
    1. [numeric feature EDA](#numeric-feature-EDA)
    1. [Faceting](#Faceting)
    1. [Target variable evaluation](#Target-variable-evaluation)    
1. [Data preparation](#Data-preparation)
    1. [Missing data](#Missing-data)
    1. [Engineering](#Engineering)
    1. [Encoding](#Encoding)
    1. [Transformation](#Transformation)
        1. [Polynomial features](#Polynomial-features)
        1. [Skew](#Skew)
        1. [Scale](#Scale)
    1. [Outliers](#Outliers)
1. [Feature importance](#Feature-importance)    
1. [Modeling](#Modeling)
    1. [Data preparation](#Data-preparation-1)
    1. [Bayesian hyper-parameter optimization](#Bayesian-hyper-parameter-optimization)
    1. [Model performance evaluation - standard models](#Model-performance-evaluation-standard-models)
    1. [Model explainability](#Model-explanability)
    1. [Submission - standard models](#Submission-standard-models)
1. [Stacking](#Stacking)
    1. [Primary models](#Primary-models)
    1. [Meta model](#Meta-model)                
    1. [Model performance evaluation - stacked models](#Model-performance-evaluation-stacked-models)
    1. [Submission - stacked models](#Submission-stacked-models)    

# Import

<a id = 'Import'></a>

## Tools

<a id = 'Tools'></a>

In [None]:
# standard library and settings
import os
import sys
import importlib
import itertools
from functools import reduce
import time; rundate = time.strftime("%Y%m%d")

import warnings
warnings.simplefilter("ignore")

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# data extensions and settings
import numpy as np
np.set_printoptions(threshold=np.inf, suppress=True)

import pandas as pd
pd.set_option("display.max_rows", 500); pd.set_option("display.max_columns", 500)
pd.options.display.float_format = "{:,.6f}".format

# modeling extensions
import sklearn.base as base
import sklearn.datasets as datasets
import sklearn.ensemble as ensemble
import sklearn.impute as impute
import sklearn.linear_model as linear_model
import sklearn.neighbors as neighbors
import sklearn.pipeline as pipeline
import sklearn.preprocessing as preprocessing
import sklearn.svm as svm

import lightgbm
import xgboost

from hyperopt import hp

import eif
import shap
shap.initjs()
# from eli5.sklearn import PermutationImportance
# from pdpbox import pdp, get_dataset, info_plots

# visualization extensions and settings
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import category_encoders as ce

%matplotlib inline

# mlmachine and prettierplot are developed alongside this notebook. Make the
# sibling checkouts importable as a fallback. The paths are appended (not
# inserted), so pip-installed copies, if present, stay ahead of the local
# checkouts on the module search path.
for localRepo in ("../../../mlmachine", "../../../prettierplot"):
    # test for the exact path that is appended so re-running the cell does
    # not add duplicate entries
    if localRepo not in sys.path:
        sys.path.append(localRepo)

try:
    import mlmachine as mlm
    import mlmachine.data as data
    from mlmachine.features.preprocessing import (
        DataFrameSelector,
        PlayWithPandas,
        UnprocessedColumnAdder,
        KFoldTargetEncoderTrain,
        ContextImputer,
        PandasFeatureUnion,
        DualTransformer,
    )
    from prettierplot.plotter import PrettierPlot
    import prettierplot.style as style
except ModuleNotFoundError:
    # reached only when the import fails from both the installed packages and
    # the local checkouts; tell the user how to install, then surface the
    # original error rather than continuing with undefined names
    print(
        "This notebook relies on the libraries mlmachine and prettierplot. Please run:"
    )
    print("\tpip install mlmachine")
    print("\tpip install prettierplot")
    raise

## Data

<a id = 'Data'></a>

In [None]:
# load data and print dimensions
dfTrain, dfValid = data.titanic()
# dfTrain = pd.read_csv("s3://tdp-ml-datasets/kaggle-titanic/train.csv")
# dfValid = pd.read_csv("s3://tdp-ml-datasets/kaggle-titanic/test.csv")

print("Training data dimensions: {}".format(dfTrain.shape))
print("Validation data dimensions: {}".format(dfValid.shape))

In [None]:
# display info and first 5 rows
dfTrain.info()
display(dfTrain[:5])

In [None]:
# review counts of different column types
dfTrain.dtypes.value_counts()

In [None]:
# Load training data into mlmachine
train = mlm.Machine(
    data=dfTrain,
    target="Survived",
    removeFeatures=["PassengerId", "Ticket"],
    overrideCat=["Pclass"],
    targetType="categorical",
)
print(train.data.shape)

In [None]:
# load validation data into mlmachine, removing the same features and applying
# the same categorical override as for the training data; no target argument
# is passed here
valid = mlm.Machine(
    data=dfValid,
    removeFeatures=["PassengerId", "Ticket"],
    overrideCat=["Pclass"],
)
print(valid.data.shape)

# EDA

<a id = 'EDA'></a>

## Categorical feature EDA

<a id = 'Categorical-feature-EDA'></a>

##### Univariate & feature vs. target

In [None]:
# categorical features
for feature in train.featureType["categorical"]:
    train.edaCatTargetCatFeat(feature=feature, levelCountCap=50)

## numeric feature EDA

<a id = 'numeric-feature-EDA'></a>

##### Univariate & feature vs. target

In [None]:
# numeric features
for feature in train.featureType["numeric"]:
    train.edaCatTargetNumFeat(feature=feature)

##### Correlation

###### Correlation (all samples)

In [None]:
# correlation heat map
p = PrettierPlot()
ax = p.makeCanvas()
p.prettyCorrHeatmap(df=train.data, annot=True, ax=ax)

###### Correlation (top vs. target)

In [None]:
# correlation heat map with most highly correlated features relative to the target
p = PrettierPlot(plotOrientation='tall',chartProp=10)
ax = p.makeCanvas()
p.prettyCorrHeatmapTarget(
    df=train.data, target=train.target, thresh=0.01, annot=True, ax=ax
)

##### Pair plot

<a id = 'Pair-plot'></a>

In [None]:
# # pair plot
# p = PrettierPlot(chartProp=12)
# p.prettyPairPlot(df=train.data[["Age","Fare","Sex","Pclass","Embarked"]].dropna(), diag_kind="auto")

In [None]:
# # pair plot
# p = PrettierPlot(chartProp=12)
# p.prettyPairPlot(
#     df=train.data.dropna(),
#     diag_kind="kde",
#     target=train.target,
#     cols=["Age", "Fare", "Pclass", "Parch", "SibSp"],
#     legendLabels=["Died", "Survived"],
#     bbox=(2.0, 0.0),
# )

## Faceting

<a id = 'Faceting'></a>

##### Categorical by categorical

In [None]:
# facet Pclass vs Embarked
p = PrettierPlot(chartProp=12)
ax = p.makeCanvas(title="Survivorship, embark location by passenger class", yShift=0.7)
p.prettyFacetTwoCatBar(
    df=train.recombineData(train.data, train.target),
    x="Embarked",
    y=train.target.name,
    split="Pclass",
    yUnits="ff",
    ax=ax,
)

In [None]:
# facet Pclass vs Sex: target plotted by passenger class (x), split by gender
p = PrettierPlot(chartProp=12)
ax = p.makeCanvas(title="Survivorship, passenger class by gender", yShift=0.7)
p.prettyFacetTwoCatBar(
    df=train.recombineData(train.data, train.target),
    x="Pclass",
    y=train.target.name,
    split="Sex",
    yUnits="ff",
    ax=ax,
)

In [None]:
# facet Embarked vs Sex: target plotted by embark location (x), split by gender
p = PrettierPlot(chartProp=12)
ax = p.makeCanvas(title="Survivorship,embark location by gender", yShift=0.7)
p.prettyFacetTwoCatBar(
    df=train.recombineData(train.data, train.target),
    x="Embarked",
    y=train.target.name,
    split="Sex",
    yUnits="ff",
    ax=ax,
)

In [None]:
# faceted point plot of the target by Sex (x), split by Pclass, with Embarked
# passed as the facet column (catCol)
# NOTE(review): legendLabels assume the Pclass levels are ordered 1, 2, 3 - confirm
p = PrettierPlot()
p.prettyFacetTwoCatPoint(
    df=train.recombineData(train.data, train.target),
    x="Sex",
    y=train.target.name,
    split="Pclass",
    catCol="Embarked",
    aspect=1.0,
    height=5,
    bbox=(1.3, 1.2),
    legendLabels=["1st class", "2nd class", "3rd class"],
)

In [None]:
# faceted point plot of the target by Embarked (x), split by Pclass, with Sex
# passed as the facet column (catCol); rows with a missing Embarked value are
# dropped before plotting
p = PrettierPlot()
p.prettyFacetTwoCatPoint(
    df=train.recombineData(train.data, train.target).dropna(subset=["Embarked"]),
    x="Embarked",
    y=train.target.name,
    split="Pclass",
    catCol="Sex",
    aspect=1.0,
    height=5,
    bbox=(1.5, 0.8),
    legendLabels=["1st class", "2nd class", "3rd class"],
)

##### Categorical by numeric

In [None]:
# faceted histograms of Age, split by the target, with Sex as the facet row
# and Embarked as the facet column
p = PrettierPlot()
p.prettyFacetCatNumHist(
    df=train.recombineData(train.data, train.target),
    split=train.target.name,
    legendLabels=["Died", "Lived"],
    catRow="Sex",
    catCol="Embarked",
    numCol="Age",
    bbox=(1.9, 1.0),
    height=4,
    aspect=1,
)

In [None]:
# faceted scatter plots of Age (y) against Fare (x), split by the target, with
# Sex as the facet row and Embarked as the facet column
p = PrettierPlot(chartProp=15)
p.prettyFacetCatNumScatter(
    df=train.recombineData(train.data, train.target),
    split=train.target.name,
    legendLabels=["Died", "Lived"],
    catRow="Sex",
    catCol="Embarked",
    x="Fare",
    y="Age",
    bbox=(1.9, 1.0),
    height=4,
    aspect=1,
)

## Target variable evaluation

<a id = 'Target-variable-evaluation'></a>

In [None]:
# null score
pd.Series(train.target).value_counts(normalize=True)

# Data preparation

<a id = 'Data-preparation'></a>

## Missing data


<a id = 'Missing-data'></a>

##### Training

In [None]:
# evaluate missing data
train.edaMissingSummary()

In [None]:
# missingno matrix
msno.matrix(train.data)

In [None]:
# missingno bar
msno.bar(train.data)

In [None]:
# missingno heatmap
msno.heatmap(train.data)

In [None]:
# missingno dendrogram
msno.dendrogram(train.data)

##### Validation

In [None]:
# evaluate missing data
valid.edaMissingSummary()

In [None]:
# missingno matrix
msno.matrix(valid.data)

In [None]:
# missingno bar
msno.bar(valid.data)

In [None]:
# missingno heatmap
msno.heatmap(valid.data)

In [None]:
# missingno dendrogram
msno.dendrogram(valid.data)

##### Training vs. validation


In [None]:
# compare feature with missing data
train.missingColCompare(train=train.data, validation=valid.data)

##### Impute

In [None]:
# impute pipeline
# columns that get a dedicated imputation branch below
imputedColumns = ["Age", "Fare", "Embarked", "Cabin"]

# every remaining column is passed through by the "diff" branch. The list is
# built by filtering train.data.columns instead of taking a set difference:
# set iteration order for strings varies between interpreter sessions, which
# made the output column order differ from run to run.
passthroughColumns = [col for col in train.data.columns if col not in imputedColumns]

imputePipe = PandasFeatureUnion([
    # Age is imputed with Pclass as context, using the "mean" strategy
    ("age", pipeline.make_pipeline(
        DataFrameSelector(["Age","Pclass"]),
        ContextImputer(nullCol="Age", contextCol="Pclass", strategy="mean")
    )),
    # Fare is imputed with Pclass as context, using ContextImputer's default strategy
    ("fare", pipeline.make_pipeline(
        DataFrameSelector(["Fare","Pclass"]),
        ContextImputer(nullCol="Fare", contextCol="Pclass")
    )),
    # Embarked is filled with its most frequent value
    ("embarked", pipeline.make_pipeline(
        DataFrameSelector(["Embarked"]),
        PlayWithPandas(impute.SimpleImputer(strategy="most_frequent"))
    )),
    # missing Cabin values are filled with the placeholder "X"
    ("cabin", pipeline.make_pipeline(
        DataFrameSelector(["Cabin"]),
        PlayWithPandas(impute.SimpleImputer(strategy="constant", fill_value="X"))
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthroughColumns),
    )),
])

# fit on the training data only, then apply the fitted pipeline to validation
train.data = imputePipe.fit_transform(train.data)
valid.data = imputePipe.transform(valid.data)

In [None]:
train.edaMissingSummary()

In [None]:
valid.edaMissingSummary()

## Engineering

<a id = 'Engineering'></a>

##### Training

In [None]:
# Title: the text between the comma and the first period of each name.
# Uncommon titles are pooled into "Rare", then every title is mapped to an
# integer code and stored as a categorical column.
rareTitles = [
    "Lady",
    "the Countess",
    "Countess",
    "Capt",
    "Col",
    "Don",
    "Dr",
    "Major",
    "Rev",
    "Sir",
    "Jonkheer",
    "Dona",
]
titleCodes = {
    "Master": 0,
    "Miss": 1,
    "Ms": 1,
    "Mme": 1,
    "Mlle": 1,
    "Mrs": 1,
    "Mr": 2,
    "Rare": 3,
}
parsedTitles = [
    name.split(",")[1].split(".")[0].strip() for name in train.data["Name"]
]
titleSeries = pd.Series(parsedTitles, index=train.data.index, dtype="category")
titleSeries = titleSeries.replace(rareTitles, "Rare")
titleSeries = titleSeries.map(titleCodes)
train.data["Title"] = titleSeries.astype("category")

# CabinQuarter: first character of the cabin value, or "X" when it is null
cabinLetters = ["X" if pd.isnull(cabin) else cabin[0] for cabin in train.data["Cabin"]]
train.data["CabinQuarter"] = pd.Series(
    cabinLetters, index=train.data.index, dtype="category"
)

# FamilySize: siblings/spouses + parents/children + the passenger themself
familySize = train.data["SibSp"] + train.data["Parch"] + 1
train.data["FamilySize"] = familySize.astype("int64")

# the raw text columns have been distilled above; drop them and refresh featureType
train.featureTypeUpdate(columnsToDrop=["Name","Cabin"])

##### Validation

In [None]:
# Title: the text between the comma and the first period of each name.
# Uncommon titles are pooled into "Rare", then every title is mapped to an
# integer code and stored as a categorical column (same rules as training).
rareTitles = [
    "Lady",
    "the Countess",
    "Countess",
    "Capt",
    "Col",
    "Don",
    "Dr",
    "Major",
    "Rev",
    "Sir",
    "Jonkheer",
    "Dona",
]
titleCodes = {
    "Master": 0,
    "Miss": 1,
    "Ms": 1,
    "Mme": 1,
    "Mlle": 1,
    "Mrs": 1,
    "Mr": 2,
    "Rare": 3,
}
parsedTitles = [
    name.split(",")[1].split(".")[0].strip() for name in valid.data["Name"]
]
titleSeries = pd.Series(parsedTitles, index=valid.data.index, dtype="category")
titleSeries = titleSeries.replace(rareTitles, "Rare")
titleSeries = titleSeries.map(titleCodes)
valid.data["Title"] = titleSeries.astype("category")

# CabinQuarter: first character of the cabin value, or "X" when it is null
cabinLetters = ["X" if pd.isnull(cabin) else cabin[0] for cabin in valid.data["Cabin"]]
valid.data["CabinQuarter"] = pd.Series(
    cabinLetters, index=valid.data.index, dtype="category"
)

# FamilySize: siblings/spouses + parents/children + the passenger themself
familySize = valid.data["SibSp"] + valid.data["Parch"] + 1
valid.data["FamilySize"] = familySize.astype("int64")

# the raw text columns have been distilled above; drop them and refresh featureType
valid.featureTypeUpdate(columnsToDrop=["Name","Cabin"])

## Encoding

<a id = 'Encoding'></a>

##### Training

In [None]:
# counts of unique values in training data string columns
train.data[train.featureType["categorical"]].apply(pd.Series.nunique, axis=0)

In [None]:
# print unique values in each categorical columns
for col in train.data[train.featureType["categorical"]]:
    print(col, np.unique(train.data[col]))

##### Validation

In [None]:
# counts of unique values in validation data string columns
valid.data[valid.featureType["categorical"]].apply(pd.Series.nunique, axis=0)

In [None]:
# print unique values in each categorical columns
for col in valid.data[valid.featureType["categorical"]]:
    print(col, np.unique(valid.data[col]))

##### Training vs. validation

In [None]:
# report categorical levels that occur in only one of the two data sets
for col in train.featureType["categorical"]:
    # skip the free-text columns
    if col in ["Name", "Cabin"]:
        continue

    trainLevels = set(train.data[col].unique())
    validLevels = set(valid.data[col].unique())

    onlyInTrain = trainLevels - validLevels
    onlyInValid = validLevels - trainLevels

    # nothing to report when both data sets share the same levels
    if not onlyInTrain and not onlyInValid:
        continue

    print("\n\n*** " + col)
    print("Value present in training data, not in validation data")
    print(onlyInTrain)
    print("Value present in validation data, not in training data")
    print(onlyInValid)

##### Encode

In [None]:
# columns to one-hot encode
nominalColumns = ["Embarked","Sex","CabinQuarter","Title"]

# columns to ordinal encode, with the ordered category list for each column
ordinalColumns = ["Pclass"]
ordinalEncodings = [
        [0, 1, 2, 3], # Pclass 
    ]

# every remaining column is passed through by the "diff" branch. The list is
# built by filtering train.data.columns instead of taking a set difference:
# set iteration order for strings varies between interpreter sessions, which
# made the output column order differ from run to run.
encodedColumns = set(nominalColumns + ordinalColumns)
passthroughColumns = [col for col in train.data.columns if col not in encodedColumns]

# encode pipeline
encodePipe = PandasFeatureUnion([
    ("ordinal", pipeline.make_pipeline(
        DataFrameSelector(ordinalColumns),
        PlayWithPandas(preprocessing.OrdinalEncoder(categories=ordinalEncodings)),
    )),
    ("nominal", pipeline.make_pipeline(
        DataFrameSelector(nominalColumns),
        PlayWithPandas(preprocessing.OneHotEncoder(drop="first")),
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthroughColumns),
    )),
])

# fit on the training data only, then apply the fitted pipeline to validation
train.data = encodePipe.fit_transform(train.data)
valid.data = encodePipe.transform(valid.data)

# refresh the feature type bookkeeping now that the columns have changed
train.featureTypeUpdate()
valid.featureTypeUpdate()

## Transformation

<a id = 'Transformation'></a>

### Polynomial features

<a id = 'Polynomial-features'></a>

##### Transform

In [None]:
# transform pipe: degree-2 polynomial expansion of the numeric features
numericColumns = train.featureType["numeric"]

# every remaining column is passed through by the "diff" branch. The list is
# built by filtering train.data.columns instead of taking a set difference:
# set iteration order for strings varies between interpreter sessions, which
# made the output column order differ from run to run.
passthroughColumns = [col for col in train.data.columns if col not in numericColumns]

polynomialPipe = PandasFeatureUnion([
    ("polynomial", pipeline.make_pipeline(
        DataFrameSelector(numericColumns),
        PlayWithPandas(preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False))
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthroughColumns),
    )),
])

# fit on the training data only, then apply the fitted pipeline to validation
train.data = polynomialPipe.fit_transform(train.data)
valid.data = polynomialPipe.transform(valid.data)

# refresh the feature type bookkeeping now that the columns have changed
train.featureTypeUpdate()
valid.featureTypeUpdate()

### Skew

<a id = 'Skew'></a>

##### Training

In [None]:
# evaluate skew of numeric features - training data
train.skewSummary()

##### Validation

In [None]:
# evaluate skew of numeric features - validation data
valid.skewSummary()

##### Transform

In [None]:
# skew correction pipeline: DualTransformer is applied to the numeric features
numericColumns = train.featureType["numeric"]

# every remaining column is passed through by the "diff" branch. The list is
# built by filtering train.data.columns instead of taking a set difference:
# set iteration order for strings varies between interpreter sessions, which
# made the output column order differ from run to run.
passthroughColumns = [col for col in train.data.columns if col not in numericColumns]

skewPipe = PandasFeatureUnion([
    ("skew", pipeline.make_pipeline(
        DataFrameSelector(numericColumns),
        DualTransformer(),
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthroughColumns),
    )),
])

# fit on the training data only, then apply the fitted pipeline to validation
train.data = skewPipe.fit_transform(train.data)
valid.data = skewPipe.transform(valid.data)

# refresh the feature type bookkeeping now that the columns have changed
train.featureTypeUpdate()
valid.featureTypeUpdate()

### Scale

<a id = 'Scale'></a>

##### Transform

In [None]:
# scale pipeline: standardize the numeric features
numericColumns = train.featureType["numeric"]

# every remaining column is passed through by the "diff" branch. The list is
# built by filtering train.data.columns instead of taking a set difference:
# set iteration order for strings varies between interpreter sessions, which
# made the output column order differ from run to run.
passthroughColumns = [col for col in train.data.columns if col not in numericColumns]

scalePipe = PandasFeatureUnion([
    ("scale", pipeline.make_pipeline(
        DataFrameSelector(numericColumns),
        PlayWithPandas(preprocessing.StandardScaler())
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthroughColumns),
    )),
])

# fit on the training data only, then apply the fitted scaler to validation
train.data = scalePipe.fit_transform(train.data)
valid.data = scalePipe.transform(valid.data)

# refresh the feature type bookkeeping now that the columns have changed
train.featureTypeUpdate()
valid.featureTypeUpdate()

## Outliers


<a id = 'Outliers'></a>

In [None]:
# identify outliers using IQR
# dropOutliers=False is passed, so rows are presumably only flagged here, not
# removed - confirm against OutlierIQR
trainPipe = pipeline.Pipeline([
    ("outlier",train.OutlierIQR(
                outlierCount=25,
                iqrStep=1.5,
                features=train.featureType["numeric"],
                dropOutliers=False,))
    ])
# NOTE(review): transform() is called without a preceding fit(); this relies on
# OutlierIQR.transform populating outliers_ itself - confirm
train.data = trainPipe.transform(train.data)

# capture outliers: sorted values of the step's outliers_ attribute
iqrOutliers = np.array(sorted(trainPipe.named_steps["outlier"].outliers_))
print(iqrOutliers)

In [None]:
# flag outliers with scikit-learn's IsolationForest, fitted on every column
# NOTE(review): the `behaviour` argument was deprecated and later removed from
# scikit-learn; this call is tied to the older release the notebook was written for
clf = ensemble.IsolationForest(
    behaviour="new",
    max_samples=train.data.shape[0],
    random_state=0,
    contamination=0.01,
)
allFeatures = train.data[train.data.columns]
clf.fit(allFeatures)
preds = clf.predict(train.data[train.data.columns])

# predict() labels outliers as -1; keep the index values of those rows
mask = preds == -1
ifOutliers = np.array(train.data[mask].index)
print(ifOutliers)

In [None]:
# identify outliers using extended isolation forest
# dropOutliers=False is passed, so rows are presumably only flagged here, not
# removed - confirm against ExtendedIsoForest
trainPipe = pipeline.Pipeline([
    ("outlier",train.ExtendedIsoForest(
                cols=train.featureType["numeric"],
                nTrees=100,
                sampleSize=256,
                ExtensionLevel=1,
                anomaliesRatio=0.03,
                dropOutliers=False,))
    ])
# NOTE(review): transform() is called without a preceding fit(); this relies on
# ExtendedIsoForest.transform populating outliers_ itself - confirm
train.data = trainPipe.transform(train.data)

# capture outliers: sorted values of the step's outliers_ attribute
eifOutliers = np.array(sorted(trainPipe.named_steps["outlier"].outliers_))
print(eifOutliers)

In [None]:
# identify outliers that are identified in multiple algorithms
outliers = reduce(np.intersect1d, (iqrOutliers, ifOutliers, eifOutliers))
# outliers = reduce(np.intersect1d, (ifOutliers, eifOutliers))
print(outliers)

In [None]:
# review outlier identification summary
outlierSummary = train.outlierSummary(iqrOutliers=iqrOutliers,
                             ifOutliers=ifOutliers,
                             eifOutliers=eifOutliers
                            )
outlierSummary[outlierSummary["Count"] >= 3]

In [None]:
# remove outliers from predictors and response
# NOTE(review): the index values are hard-coded here and replace whatever was
# previously stored in `outliers`; re-check them if the upstream steps change
outliers = np.array([27, 88, 258, 311, 341, 438, 679, 737, 742])
train.data = train.data.drop(outliers)
train.target = train.target.drop(index=outliers)

# Feature importance

<a id = 'Feature-importance'></a>

In [None]:
# generate feature importance summary
estimators = [
    lightgbm.LGBMClassifier,
    ensemble.RandomForestClassifier,
    ensemble.GradientBoostingClassifier,
    ensemble.ExtraTreesClassifier,
    ensemble.AdaBoostClassifier,
    xgboost.XGBClassifier,
]

fs = train.FeatureSelector(
    data=train.data, target=train.target, estimators=estimators, rank=True
)
# featureSelectorSummary = fs.featureSelectorSuite()
# featureSelectorSummary[:20]

In [None]:
# calculate cross-validation performance
estimators = [
    svm.SVC,
    lightgbm.LGBMClassifier,
    linear_model.LogisticRegression,
    xgboost.XGBClassifier,
    ensemble.RandomForestClassifier,
    ensemble.GradientBoostingClassifier,
    ensemble.AdaBoostClassifier,
    ensemble.ExtraTreesClassifier,
    neighbors.KNeighborsClassifier,
]

cvSummary = fs.featureSelectorCrossVal(
    featureSelectorSummary = pd.read_csv("featureSelectionSummary_20191028_173422.csv", index_col=0),
    estimators=estimators,
    scoring=["accuracy"],
    nFolds=8,
    step=1,
    nJobs=8
)

###### Accuracy

In [None]:
# visualize CV performance for diminishing feature set
fs.featureSelectorResultsPlot(
    metric="accuracy",
    cvSummary= pd.read_csv("cvSummary_20191028_173925.csv", index_col=0),
    featureSelectorSummary=pd.read_csv("featureSelectionSummary_20191028_173422.csv", index_col=0),
    titleScale=0.8,
)

In [None]:
crossValFeaturesDf = fs.createCrossValFeaturesDf(
    metric="accuracy",
    cvSummary= pd.read_csv("cvSummary_20191028_173925.csv", index_col=0),
    featureSelectorSummary=pd.read_csv("featureSelectionSummary_20191028_173422.csv", index_col=0),
)
crossValFeaturesDf#[:5]

In [None]:
crossValFeaturesDict = fs.createCrossValFeaturesDict(
    crossValFeaturesDf=crossValFeaturesDf
)
# crossValFeaturesDict

# Modeling

<a id = 'Modeling'></a>

## Data preparation

<a id = 'Data-preparation-1'></a>

##### Prepare data

In [None]:
#################################################################################
# import training data
dfTrain, dfValid = data.titanic()
# dfTrain = pd.read_csv("s3://tdp-ml-datasets/kaggle-titanic/train.csv")
train = mlm.Machine(
    data=dfTrain,
    target="Survived",
    removeFeatures=["PassengerId", "Ticket"],
    overrideCat=["Pclass"],
    targetType="categorical",
)

# import validation data
# dfValid = pd.read_csv("s3://tdp-ml-datasets/kaggle-titanic/test.csv")
valid = mlm.Machine(
    data=dfValid,
    removeFeatures=["PassengerId", "Ticket"],
    overrideCat=["Pclass"],
)

#################################################################################
# impute pipeline

# columns that get a dedicated imputation branch below
imputedColumns = ["Age", "Fare", "Embarked", "Cabin"]

# every remaining column is passed through by the "diff" branch. The list is
# built by filtering train.data.columns instead of taking a set difference:
# set iteration order for strings varies between interpreter sessions, which
# made the output column order differ from run to run.
passthroughColumns = [col for col in train.data.columns if col not in imputedColumns]

imputePipe = PandasFeatureUnion([
    ("age", pipeline.make_pipeline(
        DataFrameSelector(["Age","Pclass"]),
        ContextImputer(nullCol="Age", contextCol="Pclass", strategy="mean")
    )),
    ("fare", pipeline.make_pipeline(
        DataFrameSelector(["Fare","Pclass"]),
        ContextImputer(nullCol="Fare", contextCol="Pclass")
    )),
    ("embarked", pipeline.make_pipeline(
        DataFrameSelector(["Embarked"]),
        PlayWithPandas(impute.SimpleImputer(strategy="most_frequent"))
    )),
    ("cabin", pipeline.make_pipeline(
        DataFrameSelector(["Cabin"]),
        PlayWithPandas(impute.SimpleImputer(strategy="constant", fill_value="X"))
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthroughColumns),
    )),
])

# fit on the training data only, then apply the fitted pipeline to validation
train.data = imputePipe.fit_transform(train.data)
valid.data = imputePipe.transform(valid.data)

#################################################################################
# feature engineering - identical steps for training and validation

# uncommon titles, which may convey status, are pooled into a single "Rare" level
rareTitles = [
    "Lady",
    "the Countess",
    "Countess",
    "Capt",
    "Col",
    "Don",
    "Dr",
    "Major",
    "Rev",
    "Sir",
    "Jonkheer",
    "Dona",
]

# integer code assigned to each (pooled) title
titleCodes = {
    "Master": 0,
    "Miss": 1,
    "Ms": 1,
    "Mme": 1,
    "Mlle": 1,
    "Mrs": 1,
    "Mr": 2,
    "Rare": 3,
}


def _engineerFeatures(machine):
    """Add Title, CabinQuarter and FamilySize to machine.data, then drop Name and Cabin.

    Parameters:
        machine : object exposing a `data` DataFrame with Name, Cabin, SibSp
            and Parch columns, and a `featureTypeUpdate` method (here the
            `train` and `valid` Machine objects). Modified in place.
    """
    # parse titles to learn gender, and identify rare titles which may convey status;
    # the title is the text between the comma and the first period of the name
    titles = [name.split(",")[1].split(".")[0].strip() for name in machine.data["Name"]]
    machine.data["Title"] = pd.Series(
        titles,
        index=machine.data.index,
        dtype="category",
    )
    machine.data["Title"] = machine.data["Title"].replace(rareTitles, "Rare")
    machine.data["Title"] = machine.data["Title"].map(titleCodes)
    machine.data["Title"] = machine.data["Title"].astype("category")

    # distill cabin feature: first character of the cabin value, "X" when null
    machine.data["CabinQuarter"] = pd.Series(
        [i[0] if not pd.isnull(i) else "X" for i in machine.data["Cabin"]],
        index=machine.data.index,
        dtype="category",
    )

    # family size: siblings/spouses + parents/children + the passenger themself
    machine.data["FamilySize"] = machine.data["SibSp"] + machine.data["Parch"] + 1
    machine.data["FamilySize"] = machine.data["FamilySize"].astype("int64")

    # drop the raw text columns and refresh featureType
    machine.featureTypeUpdate(columnsToDrop=["Name","Cabin"])


_engineerFeatures(train)
_engineerFeatures(valid)

# #################################################################################
# # feature transformation pipeline
# nominalColumns = ["Embarked","Sex","CabinQuarter","Title"]

# ordinalColumns = ["Pclass"]
# ordinalEncodings = [
#         [0, 1, 2, 3], # Pclass 
#     ]

# transformPipe = PandasFeatureUnion([
#     ("ordinal", pipeline.make_pipeline(
#         DataFrameSelector(ordinalColumns),
#         PlayWithPandas(preprocessing.OrdinalEncoder(categories=ordinalEncodings)),
#     )),
#     ("nominal", pipeline.make_pipeline(
#         DataFrameSelector(nominalColumns),
#         PlayWithPandas(preprocessing.OneHotEncoder(drop="first")),
#     )),
#     ("numeric", pipeline.make_pipeline(
#         DataFrameSelector(train.featureType["numeric"]),
#         PlayWithPandas(preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)),
# #         DualTransformer(),
#         PlayWithPandas(preprocessing.StandardScaler())
#     )),
#     ("diff", pipeline.make_pipeline(
#         DataFrameSelector(list(set(train.data.columns).difference(nominalColumns + ordinalColumns + train.featureType["numeric"]))),
#     )),
# ])

# train.data = transformPipe.fit_transform(train.data)
# valid.data = transformPipe.transform(valid.data)

# train.featureTypeUpdate()
# valid.featureTypeUpdate()

# #################################################################################
# # remove outliers
# outliers = np.array([27, 88, 258, 311, 341, 438, 679, 737, 742])
# train.data = train.data.drop(outliers)
# train.target = train.target.drop(index=outliers)

# # # accuracy >= 7
# # bestCols = ['Age*Fare','Title_2','Fare*FamilySize','Sex_male','Fare','Pclass','CabinQuarter_X']
# # train.data = train.data[bestCols]
# # valid.data = valid.data[bestCols]

# print('completed')

#### Category encoding

- to do
    - feature engineering
        - target encoding
            - categorical target 
                - categorical features
                - numeric features
                    - bin first (ability to control bins)
                    - then target encode the binned numeric features against the categorical target
            - numeric target
                - categorical features
                - numeric features
            - general
                - feature names suffixed with "_[FEATURE NAME]"
                - methodologies
                    - categorical target
                        - categorical features
                            - 1 - blend of posterior probability of target given a particular level over(?) the prior probability of the target over all training data.
                                - What exactly does blending mean?
                            - 2 - just posterior probability of target given a particular level
                        - numeric features
                            - binning of numeric features, plus the categorical feature methodologies described above
                    - continuous target
                        - categorical features
                            - blend of mean value of target given a particular categorical level and the expected value of the target over(?) all of the training data
                                - what exactly does blending mean?
                        - numeric features
                            - binning of numeric features, plus the categorical feature methodologies described above
                        - general
                            - can take mean, median, mode, std

        - count encoding
            - categorical features
                - count of the total number of appearances of that level
            - numeric features
                - not sure that this is possible
            - general
                - k-fold this? probably
        - binary encoding
            - categorical features
                - When the number of levels in a categorical features reaches a certain level
            - numeric features
                - not sure that this is possible
        - optimal binning (the maxhalfford.github site)
        - additive smoothing (the maxhalfford.github site)
        - binning
            - categorical features
                - not necessary
            - numeric features
                - for each value, replace with the bin label
        - numeric-specific transformations
            - Percentiles
                - for each value, replace with percentile of that value. i could see this helping to diminish the effect of outliers
            - ratios/quotients
            - sums
            - products
            - differences
    - model evaluation
        - incorrect predictions
            - false positives
            - false negatives
   

__feature selection ideas__

- Single Feature performance
    - determine performance for each feature individually
    - by training a model like XGBoost on each feature separately or calculating gini coefficients on binned versions of the features.
- forward selection (check out mlxtend)
- backwards elimination (i think i already do this with RFE)
- noise injection to identify unimportant features
    - i believe the idea is that any actual features that are less important than the noise features can be considered worthless

__cross validation__

- 5 fold
- use stratified k-fold when the test data is not in the future


- http://blog.kaggle.com/2016/04/08/homesite-quote-conversion-winners-write-up-1st-place-kazanova-faron-clobber/
- https://www.kaggle.com/matleonard/categorical-encodings
- http://www.kazanovaforanalytics.com/software.html
- https://github.com/Far0n/xgbfi
- https://towardsdatascience.com/my-secret-sauce-to-be-in-top-2-of-a-kaggle-competition-57cff0677d3c
- https://contrib.scikit-learn.org/categorical-encoding/
- https://github.com/scikit-learn-contrib/categorical-encoding/tree/master/category_encoders
- https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/
- http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-munging/target-encoding.html
- https://medium.com/@pouryaayria/k-fold-target-encoding-dfe9a594874b






----------------------------------------------------------------------------------------------------------



###### binary

In [None]:
import sklearn.model_selection as model_selection
from collections import defaultdict

X = train.data
X_t = train.target
Y = valid.data

# experimental encoding pipeline
# NOTE: this reuses (and overwrites) the name `imputePipe` from the imputation step
imputePipe = PandasFeatureUnion([
#     ("catcount", pipeline.make_pipeline(
#         DataFrameSelector(["Embarked","CabinQuarter"]),
#         PlayWithPandas(ce.CountEncoder(return_df=False))
#     )),
#     ("binar", pipeline.make_pipeline(
#         DataFrameSelector(["Embarked","CabinQuarter"]),
#         PlayWithPandas(ce.BinaryEncoder(return_df=False))
#     )),
#     ("nominal", pipeline.make_pipeline(
#         DataFrameSelector(["Embarked","CabinQuarter"]),
#         PlayWithPandas(preprocessing.OneHotEncoder(drop="first")),
#     )),
    # k-fold target encoding of Embarked, Age and Fare; Age and Fare are given
    # bin counts of 5 and 10 through nBins
    ("num3", pipeline.make_pipeline(
        DataFrameSelector(["Embarked","Age","Fare"]),
        KFoldTargetEncoderTrain(
            target=train.target,
            # random_state is deliberately not passed: it has no effect when
            # shuffle=False, and recent scikit-learn raises a ValueError if it
            # is set without shuffling
            cv=model_selection.KFold(n_splits=5, shuffle=False),
            nBins={"Age": 5, "Fare": 10},
            dropBinCols=False,
            dropOriginalCols=True,
        )
    )),
    # quantile transform of Age and Fare to a normal distribution
    ("norm", pipeline.make_pipeline(
        DataFrameSelector(["Age","Fare"]),
        PlayWithPandas(preprocessing.QuantileTransformer(output_distribution="normal")),
    )),
    # quantile transform of Age and Fare to a uniform distribution
    ("uni", pipeline.make_pipeline(
        DataFrameSelector(["Age","Fare"]),
        PlayWithPandas(preprocessing.QuantileTransformer(output_distribution="uniform")),
    )),
#     ("nu2", pipeline.make_pipeline(
#         DataFrameSelector(["Age","Fare"]),
#         PlayWithPandas(preprocessing.KBinsDiscretizer(n_bins=10, encode="ordinal")),
#     )),
    # pass the original Embarked, Age and Fare columns through unchanged
    ("select2", pipeline.make_pipeline(
        DataFrameSelector(["Embarked","Age","Fare"]),
    )),
])

# X holds the transformed training features; y holds the transformed
# validation features (it is not a target vector)
X = imputePipe.fit_transform(train.data)
y = imputePipe.transform(valid.data)
display(X[:5])
display(y[:5])

In [None]:
# NOTE(review): scratch lookup with an empty column label; this raises KeyError
# unless X really has a column named "" - confirm intent or remove the cell
X[""]

In [None]:
import itertools

# work on a copy so the derived columns are not written into a slice of X
df = X[["Age","Fare","Age_Quantile_normal","Fare_Quantile_uniform"]].copy()

# unordered pairs of distinct columns: each pair appears exactly once, with no
# (a, a) pairs and no mirrored (b, a) duplicates
pairs = list(itertools.combinations(df.columns, 2))

# add sum, ratio and product interaction features for every column pair;
# the ratio is computed in one direction only (first column / second column)
for x in pairs:
    df["{} + {}".format(x[0],x[1])] = df[x[0]] + df[x[1]]
    df["{} / {}".format(x[0],x[1])] = df[x[0]] / df[x[1]]
    df["{} * {}".format(x[0],x[1])] = df[x[0]] * df[x[1]]

# attach the target by index so features and labels stay row-aligned
df = df.merge(train.target, left_index=True, right_index=True)

# the target is the last column after the merge
XX = df.iloc[:,:-1]
YY = df.iloc[:,-1]

In [None]:
# display the engineered feature matrix (all columns of df except the last)
XX

In [None]:
import xgboost
import sklearn.linear_model as linear_model

# fit one model per engineered feature and record its accuracy
scores = []
for x in XX.columns:
#     obj = linear_model.LogisticRegression()
    obj = xgboost.XGBClassifier()

    obj.fit(XX[[x]], YY)
    preds = obj.predict(XX[[x]])

    # in-sample accuracy: the model is scored on the same rows it was fitted on
    acc = (YY == preds).mean()
    scores.append((x, acc))

# build the summary once instead of growing a DataFrame row by row
results = pd.DataFrame(scores, columns=["col","score"])
results

In [None]:
# rank features from highest to lowest single-feature score
results.sort_values(by="score", ascending=False)

###### target

In [None]:
# plot the distinct Age values of d (in order of first appearance) against a
# 1-based position index
age = d[["Age"]].drop_duplicates()
ix = np.arange(1, len(age.index)+1)
plt.plot(ix, age)

In [None]:
# plot the distinct Fare values of d (in order of first appearance) against a
# 1-based position index
fare = d[["Fare"]].drop_duplicates()
ix = np.arange(1, len(fare.index)+1)
plt.plot(ix, fare)

In [None]:
# display the distinct Age values of d
d[["Age"]].drop_duplicates()

In [None]:
# first 20 rows of the columns of X whose name contains "RAD" or "CHAS"
# NOTE(review): no such column names are created anywhere in this section; this
# looks like a filter left over from another dataset - confirm or remove
X.filter(regex="RAD|CHAS")[:20]

## Bayesian hyper-parameter optimization

<a id = 'Bayesian-hyper-parameter-optimization'></a>

In [None]:
# parameter space
# hyperopt search space for the first-level estimators: each key is an estimator
# name in "module.Class" form and each inner dict maps a hyper-parameter name to
# the hyperopt expression it is sampled from. hp.choice with a single option pins
# that parameter to a fixed value.
allSpace = {
    "lightgbm.LGBMClassifier": {
        "class_weight": hp.choice("class_weight", [None]),
        "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
        "boosting_type": hp.choice("boosting_type", ["dart"]),
        "subsample": hp.uniform("subsample", 0.5, 1),
        "learning_rate": hp.uniform("learning_rate", 0.15, 0.25),
        "max_depth": hp.choice("max_depth", np.arange(4, 20, dtype=int)),
        # NOTE(review): hp.quniform yields floats; presumably these are cast to
        # int before reaching the estimator - confirm
        "min_child_samples": hp.quniform("min_child_samples", 50, 150, 5),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "num_leaves": hp.quniform("num_leaves", 30, 70, 1),
        "reg_alpha": hp.uniform("reg_alpha", 0.75, 1.25),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
        "subsample_for_bin": hp.quniform("subsample_for_bin", 100000, 350000, 20000),
    },
    "linear_model.LogisticRegression": {
        "C": hp.uniform("C", 0.04, 0.1),
        # NOTE(review): an l1 penalty needs a solver that supports it (e.g.
        # liblinear or saga) - confirm the model builder selects one
        "penalty": hp.choice("penalty", ["l1"]),
    },
    "xgboost.XGBClassifier": {
        "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
        "gamma": hp.quniform("gamma", 0.0, 10, 0.05),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.2, 0.01),
        "max_depth": hp.choice("max_depth", np.arange(2, 15, dtype=int)),
        "min_child_weight": hp.quniform("min_child_weight", 2.5, 7.5, 1),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "subsample": hp.uniform("subsample", 0.4, 0.7),
    },
    "ensemble.RandomForestClassifier": {
        "bootstrap": hp.choice("bootstrap", [True, False]),
        "max_depth": hp.choice("max_depth", np.arange(2, 10, dtype=int)),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 8000, 10, dtype=int)),
        "max_features": hp.choice("max_features", ["sqrt"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(15, 25, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 20, dtype=int)),
    },
    "ensemble.GradientBoostingClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 11, dtype=int)),
        "max_features": hp.choice("max_features", ["sqrt"]),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.09, 0.01),
        # NOTE(review): "deviance" was renamed "log_loss" in newer scikit-learn
        # releases - confirm against the installed version
        "loss": hp.choice("loss", ["deviance", "exponential"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    },
    "ensemble.AdaBoostClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "learning_rate": hp.quniform("learning_rate", 0.1, 0.25, 0.01),
        "algorithm": hp.choice("algorithm", ["SAMME"]),
    },
    "ensemble.ExtraTreesClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 15, dtype=int)),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(4, 30, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 20, dtype=int)),
        # NOTE(review): max_features="auto" was removed in newer scikit-learn
        # releases - confirm against the installed version
        "max_features": hp.choice("max_features", ["auto"]),
        "criterion": hp.choice("criterion", ["entropy"]),
    },
    "svm.SVC": {
        "C": hp.uniform("C", 4, 15),
        "decision_function_shape": hp.choice("decision_function_shape", ["ovr"]),
        "gamma": hp.uniform("gamma", 0.00000001, 1.5),
    },
    "neighbors.KNeighborsClassifier": {
        "algorithm": hp.choice("algorithm", ["ball_tree", "brute"]),
        "n_neighbors": hp.choice("n_neighbors", np.arange(1, 15, dtype=int)),
        "weights": hp.choice("weights", ["uniform"]),
    },
}

In [None]:
# execute bayesian optimization grid search
# the settings are gathered first so the search call itself stays on one line
firstLevelSearch = dict(
    allSpace=allSpace,
    data=train.data,
    target=train.target,
    columns=crossValFeaturesDict,
    scoring="accuracy",
    nFolds=2,
    nJobs=8,
    iters=1,
    verbose=0,
)
train.execBayesOptimSearch(**firstLevelSearch)

##### Model loss by iteration

In [None]:
# load the saved optimization summary and preview its first five rows
summaryPath = "bayesOptimizationSummary_accuracy_20191028_174038.csv"
bayesOptimSummary = pd.read_csv(summaryPath, na_values="nan")
bayesOptimSummary.head(5)

In [None]:
# one loss plot per estimator present in the summary (sorted unique names)
estimatorNames = np.unique(bayesOptimSummary["estimator"])
for estimator in estimatorNames:
    train.modelLossPlot(estimator=estimator, bayesOptimSummary=bayesOptimSummary)

##### Parameter selection by iteration

In [None]:
# parameter plots for every estimator in the summary; the settings shared by
# all estimators are collected once
paramPlotSettings = dict(
    bayesOptimSummary=bayesOptimSummary,
    allSpace=allSpace,
    nIter=100,
#     chartProp=15,
    titleScale=0.8,
)
for estimator in np.unique(bayesOptimSummary["estimator"]):
    train.modelParamPlot(estimator=estimator, **paramPlotSettings)

In [None]:
# one-parameter hyperopt space passed to train.samplePlot (the second argument,
# 1000, is presumably the number of draws - confirm)
# NOTE(review): hp.uniform is given log-transformed bounds here, which is the
# argument convention of hp.loguniform - confirm which distribution is intended
sampleSpace = {
                'param': hp.uniform('param', np.log(0.4), np.log(0.6))
#     "": 0.000001 + hp.uniform("gamma", 0.000001, 10)
    #             'param2': hp.loguniform('param2', np.log(0.001), np.log(0.01))
}

train.samplePlot(sampleSpace, 1000)

In [None]:
# pair-wise comparison of selected LightGBM trial columns, colored by iteration
p = PrettierPlot(chartProp=12)
lgbTrials = train.unpackBayesOptimSummary(bayesOptimSummary, "lightgbm.LGBMClassifier")
pairCols = ["colsample_bytree", "learning_rate", "iteration","iterLoss"]
p.prettyPairPlotCustom(
    df=lgbTrials, cols=pairCols, gradientCol="iteration", color=style.styleGrey
)

## Model performance evaluation - standard models

<a id = 'Model-performance-evaluation-standard-models'></a>

In [None]:
# pick the top model (numModels=1) per estimator from the optimization summary
# and display the selection
topModels = train.topBayesOptimModels(bayesOptimSummary=bayesOptimSummary, numModels=1)
topModels

In [None]:
# classification panel, single model
# estimator = "svm.SVC"; modelIter = 1224
# estimator = 'ensemble.GradientBoostingClassifier'; modelIter = 590
estimator = 'xgboost.XGBClassifier'
modelIter = 1256

# rebuild the chosen estimator from the parameters stored for that iteration
model = train.BayesOptimModelBuilder(
    bayesOptimSummary=bayesOptimSummary,
    estimator=estimator,
    modelIter=modelIter,
)

# panel settings gathered up front, then evaluated with 5 folds on the training data
panelSettings = dict(
    XTrain=train.data,
    yTrain=train.target,
    cmLabels=["Dies", "Survives"],
    nFolds=5,
    titleScale=0.9,
)
train.classificationPanel(model=model, **panelSettings)

In [None]:
# create classification reports for training data
# flatten {estimator: [iterations]} into (estimator, iteration) pairs
selectedModels = [
    (name, iteration)
    for name, iterations in topModels.items()
    for iteration in iterations
]
for estimator, modelIter in selectedModels:
    model = train.BayesOptimModelBuilder(
        bayesOptimSummary=bayesOptimSummary,
        estimator=estimator,
        modelIter=modelIter,
    )
    train.classificationPanel(
        model=model,
        XTrain=train.data,
        yTrain=train.target,
        cmLabels=['Dies', 'Survives'],
        nFolds=4,
    )

## Model explanability

<a id = 'Model-explanability'></a>

In [None]:
# choose the model to explain, rebuild it from its stored parameters and fit it
# on the full training set
# estimator = "ensemble.ExtraTreesClassifier"; modelIter = 145
# estimator = "svm.SVC"; modelIter = 135
estimator = "ensemble.GradientBoostingClassifier"; modelIter = 1385

model = train.BayesOptimModelBuilder(
    bayesOptimSummary=bayesOptimSummary, estimator=estimator, modelIter=modelIter
)
model.fit(train.data.values, train.target.values)

##### Permutation importance

In [None]:
# permutation importance - how much does performance decrease when shuffling a certain feature?
# computed on the training data with the estimator held in model.model, then
# rendered with the training column names
perm = PermutationImportance(model.model, random_state=1).fit(train.data, train.target)
eli5.show_weights(perm, feature_names=train.data.columns.tolist())

##### SHAP values - training data

###### Force plots - single observations

In [None]:
# SHAP force plots for individual observations
# limited to the first five index labels of the training data
for i in train.data.index[:5]:
    train.singleShapVizTree(obsIx=i, model=model, data=train.data, target=train.target)

###### Force plots - multiple observations

In [None]:
# SHAP force plot a set of data
# covers every training observation; the returned object is displayed as the cell output
visual = train.multiShapVizTree(obsIxs=train.data.index, model=model, data=train.data)
visual

###### Dependence plots

In [None]:
# generate SHAP values for set of observations
# the call returns three values; only the first (observation data) and the
# third (SHAP values) are kept
obsData, _, obsShapValues = train.multiShapValueTree(
    obsIxs=train.data.index, model=model, data=train.data
)

In [None]:
# SHAP dependence plot grid
# gridFeatures lists the features to include; the commented-out line is an
# alternative selection
# gridFeatures = ["Pclass", "Age", "Fare", "SibSp","Parch"]
gridFeatures = ['Age*Fare','Title_2','Fare*FamilySize','Sex_male','Fare','Pclass','CabinQuarter_X']

train.shapDependenceGrid(
    obsData=obsData,
    obsShapValues=obsShapValues,
    gridFeatures=gridFeatures,
    allFeatures=train.data.columns,
    dotSize=35,
    alpha=0.5,
)

In [None]:
# single SHAP dependence plot
# Fare is passed as the scatter feature and Sex_male as the color feature,
# drawn on a fresh canvas
p = PrettierPlot()
ax = p.makeCanvas()

train.shapDependencePlot(
    obsData=obsData,
    obsShapValues=obsShapValues,
    scatterFeature="Fare",
    colorFeature="Sex_male",
    featureNames=train.data.columns,
    dotSize=50,
    alpha=0.5,
    ax=ax
)


In [None]:
# SHAP dependence plots for every feature, each colored by Fare
featureNames = train.data.columns.tolist()

# rank feature positions by total absolute SHAP value, largest first
shapMagnitude = np.abs(obsShapValues).sum(axis=0)
topShap = np.argsort(-shapMagnitude)

for topIx in topShap:
    # one fresh canvas per feature
    p = PrettierPlot()
    ax = p.makeCanvas()

    dependenceSettings = dict(
        obsData=obsData,
        obsShapValues=obsShapValues,
        scatterFeature=featureNames[topIx],
        colorFeature="Fare",
        featureNames=featureNames,
        dotSize=50,
        alpha=0.5,
        ax=ax,
    )
    train.shapDependencePlot(**dependenceSettings)

###### Summary plots

In [None]:
# SHAP summary plot
# uses the observation data and SHAP values computed for the training set
train.shapSummaryPlot(
        obsData=obsData,
        obsShapValues=obsShapValues,
        featureNames=train.data.columns,
    )

##### SHAP values - validation data

###### Force plots - single observations

In [None]:
# SHAP force plots for individual observations
# limited to the first two index labels of the validation data; no target is passed
for i in valid.data.index[:2]:
    valid.singleShapVizTree(obsIx=i, model=model, data=valid.data)

###### Force plots - multiple observations

In [None]:
# SHAP force plot a set of data
# covers every validation observation; the returned object is displayed as the cell output
visual = valid.multiShapVizTree(obsIxs=valid.data.index, model=model, data=valid.data)
visual

###### Dependence plots

In [None]:
# generate SHAP values for set of observations
# rebinds obsData and obsShapValues to the validation set; the second of the
# three returned values is discarded
obsData, _, obsShapValues = valid.multiShapValueTree(
    obsIxs=valid.data.index, model=model, data=valid.data
)

In [None]:
# SHAP dependence plot grid
# gridFeatures lists the features to include in the grid
gridFeatures = ["Pclass", "Age", "Fare", "SibSp","Parch"]

valid.shapDependenceGrid(
    obsData=obsData,
    obsShapValues=obsShapValues,
    gridFeatures=gridFeatures,
    allFeatures=valid.data.columns,
    dotSize=35,
    alpha=0.5,
)

In [None]:
# single SHAP dependence plot
# Age is passed as the scatter feature and Parch as the color feature,
# drawn on a fresh canvas
p = PrettierPlot()
ax = p.makeCanvas()

valid.shapDependencePlot(
    obsData=obsData,
    obsShapValues=obsShapValues,
    scatterFeature="Age",
    colorFeature="Parch",
    featureNames=valid.data.columns,
    dotSize=50,
    alpha=0.5,
    ax=ax
)


In [None]:
# SHAP dependence plots for every feature, each colored by Age
featureNames = valid.data.columns.tolist()

# rank feature positions by total absolute SHAP value, largest first
shapMagnitude = np.abs(obsShapValues).sum(axis=0)
topShap = np.argsort(-shapMagnitude)

for topIx in topShap:
    # one fresh canvas per feature
    p = PrettierPlot()
    ax = p.makeCanvas()

    dependenceSettings = dict(
        obsData=obsData,
        obsShapValues=obsShapValues,
        scatterFeature=featureNames[topIx],
        colorFeature="Age",
        featureNames=featureNames,
        dotSize=50,
        alpha=0.5,
        ax=ax,
    )
    valid.shapDependencePlot(**dependenceSettings)

###### Summary plots

In [None]:
# SHAP summary plot
# uses the observation data and SHAP values computed for the validation set
valid.shapSummaryPlot(
        obsData=obsData,
        obsShapValues=obsShapValues,
        featureNames=valid.data.columns,
    )

## Submission - standard models

<a id = 'Submission-standard-models'></a>

In [None]:
# reference only: estimator name -> list of iteration numbers; the expression is
# displayed as the cell output and is not assigned to a variable
{'lightgbm.LGBMClassifier': [778],
 'linear_model.LogisticRegression': [730],
 'xgboost.XGBClassifier': [371],
 'ensemble.RandomForestClassifier': [712],
 'ensemble.GradientBoostingClassifier': [965],
 'ensemble.AdaBoostClassifier': [512],
 'ensemble.ExtraTreesClassifier': [244],
 'svm.SVC': [551],
 'neighbors.KNeighborsClassifier': [576]}

In [None]:
## standard model fit and predict
# select estimator and iteration (alternatives kept below with the author's result notes)
# estimator = "lightgbm.LGBMClassifier"; modelIter = 778  #147 survived, 0.775
# estimator = "xgboost.XGBClassifier"; modelIter = 371  #142 survived, 0.765
# estimator = "ensemble.RandomForestClassifier"; modelIter = 712  #144 survived, 0.7655
# estimator = "ensemble.GradientBoostingClassifier"; modelIter = 965  #144 survived, 0.7561
estimator = "svm.SVC"; modelIter = 551  # survived,

# extract params, instantiate the model and fit it on the full training set
model = train.BayesOptimModelBuilder(
    bayesOptimSummary=bayesOptimSummary, estimator=estimator, modelIter=modelIter
)
model.fit(train.data.values, train.target.values)

# predict on the validation features and print the sum of the predictions
yPred = model.predict(valid.data.values)
print(sum(yPred))

In [None]:
# generate prediction submission file
# pairs each PassengerId in dfValid with its prediction and writes the result
# to submission.csv without the index column
submit = pd.DataFrame({"PassengerId": dfValid.PassengerId, "Survived": yPred})
submit.to_csv("submission.csv", index=False)

# Stacking

<a id = 'Stacking'></a>

## Primary models

<a id = 'Primary-models'></a>

In [None]:
# reference only: estimator name -> list of iteration numbers; the expression is
# displayed as the cell output and is not assigned to a variable
{'lightgbm.LGBMClassifier': [778],
 'linear_model.LogisticRegression': [730],
 'xgboost.XGBClassifier': [371],
 'ensemble.RandomForestClassifier': [712],
 'ensemble.GradientBoostingClassifier': [965],
 'ensemble.AdaBoostClassifier': [512],
 'ensemble.ExtraTreesClassifier': [244],
 'svm.SVC': [551],
 'neighbors.KNeighborsClassifier': [576]}

In [None]:
def _firstLevelModel(estimatorName, iteration):
    """Build one first-level model from the parameters stored for the given iteration."""
    return train.BayesOptimModelBuilder(
        bayesOptimSummary=bayesOptimSummary,
        estimator=estimatorName,
        modelIter=iteration,
    )


# one model per estimator, each built from a fixed iteration number
lgb = _firstLevelModel("lightgbm.LGBMClassifier", 778)
lr = _firstLevelModel("linear_model.LogisticRegression", 730)
xgb = _firstLevelModel("xgboost.XGBClassifier", 371)
rf = _firstLevelModel("ensemble.RandomForestClassifier", 712)
gb = _firstLevelModel("ensemble.GradientBoostingClassifier", 965)
ada = _firstLevelModel("ensemble.AdaBoostClassifier", 512)
ext = _firstLevelModel("ensemble.ExtraTreesClassifier", 244)
svc = _firstLevelModel("svm.SVC", 551)
kn = _firstLevelModel("neighbors.KNeighborsClassifier", 576)

In [None]:
from vecstack import StackingTransformer
import sklearn.metrics as metrics
# stack the first-level models with vecstack

# Initialize 1st level estimators as (name, estimator) pairs; .model is
# presumably the wrapped estimator held by each builder object
estimators = [('lgb', lgb.model),
              ('lr',lr.model),
              ('xgb',xgb.model),
              ('rf',rf.model),
              ('gb',gb.model),
              ('ada',ada.model),
              ('ext',ext.model),
              ('svc',svc.model),
              ('kn',kn.model),
             ]

# Initialize StackingTransformer for classification (regression=False),
# scored with accuracy
stack = StackingTransformer(
    estimators,
    regression=False,
    metric=metrics.accuracy_score,
    verbose=2
)

# Fit on the training data
stack = stack.fit(train.data, train.target)

# Get the stacked features for the training and validation sets
oofTrain = stack.transform(train.data)
oofValid = stack.transform(valid.data)

# oofTrain / oofValid are the inputs for a 2nd level estimator

# Use 2nd level estimator with stacked features

In [None]:
# get out-of-fold predictions
# rebinds oofTrain and oofValid using the project's own stacker over the models
# listed in topModels; columns is reused below to label the predictions
oofTrain, oofValid, columns = train.modelStacker(
    models=topModels,
    bayesOptimSummary=bayesOptimSummary,
    XTrain=train.data.values,
    yTrain=train.target.values,
    XValid=valid.data.values,
    nFolds=10,
    nJobs=10,
)

In [None]:
# view correlations of predictions
# the out-of-fold training predictions are wrapped in a DataFrame labelled with
# columns and passed to the heatmap with annotations on
p = PrettierPlot()
ax = p.makeCanvas()
p.prettyCorrHeatmap(
    df=pd.DataFrame(oofTrain, columns=columns), annot=True, ax=ax, vmin=0
)
plt.show()

## Meta model

<a id = 'Meta-model'></a>

In [None]:
# parameter space
# hyperopt search space for the meta (second-level) models; this rebinds
# allSpace, replacing the first-level search space defined earlier. Each key is
# an estimator name in "module.Class" form and each inner dict maps a
# hyper-parameter name to the hyperopt expression it is sampled from.
allSpace = {
    "lightgbm.LGBMClassifier": {
        "class_weight": hp.choice("class_weight", [None]),
        "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
        "boosting_type": hp.choice("boosting_type", ["dart"]),
        "subsample": hp.uniform("subsample", 0.5, 1),
        "learning_rate": hp.uniform("learning_rate", 0.15, 0.25),
        "max_depth": hp.choice("max_depth", np.arange(4, 20, dtype=int)),
        # NOTE(review): hp.quniform yields floats; presumably these are cast to
        # int before reaching the estimator - confirm
        "min_child_samples": hp.quniform("min_child_samples", 50, 150, 5),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "num_leaves": hp.quniform("num_leaves", 30, 70, 1),
        "reg_alpha": hp.uniform("reg_alpha", 0.75, 1.25),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
        "subsample_for_bin": hp.quniform("subsample_for_bin", 100000, 350000, 20000),
    },
    "xgboost.XGBClassifier": {
        "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 0.7),
        "gamma": hp.quniform("gamma", 0.0, 10, 0.05),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.2, 0.01),
        "max_depth": hp.choice("max_depth", np.arange(2, 15, dtype=int)),
        "min_child_weight": hp.quniform("min_child_weight", 2.5, 7.5, 1),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "subsample": hp.uniform("subsample", 0.4, 0.7),
    },
    "ensemble.RandomForestClassifier": {
        "bootstrap": hp.choice("bootstrap", [True, False]),
        "max_depth": hp.choice("max_depth", np.arange(2, 10, dtype=int)),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 8000, 10, dtype=int)),
        "max_features": hp.choice("max_features", ["sqrt"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(15, 25, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 20, dtype=int)),
    },
    "ensemble.GradientBoostingClassifier": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 11, dtype=int)),
        "max_features": hp.choice("max_features", ["sqrt"]),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.09, 0.01),
        # NOTE(review): "deviance" was renamed "log_loss" in newer scikit-learn
        # releases - confirm against the installed version
        "loss": hp.choice("loss", ["deviance", "exponential"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    },
    "svm.SVC": {
        "C": hp.uniform("C", 0.00000001, 15),
        "decision_function_shape": hp.choice("decision_function_shape", ["ovr", "ovo"]),
        "gamma": hp.uniform("gamma", 0.00000001, 1.5),
    },
}

In [None]:
# execute bayesian optimization grid search
# meta-level search: the out-of-fold predictions are the input data and the
# original training target is the label; no columns argument is passed here
train.execBayesOptimSearch(
    allSpace=allSpace,
    data=oofTrain,
    target=train.target,
    scoring="accuracy",
    nFolds=8,
    nJobs=8,
    iters=1000,
    verbose=0,
)

In [None]:
# scratch cell: rebinds df to an empty DataFrame
df = pd.DataFrame()

In [None]:
# scratch cell: show the type of df
type(df)

In [None]:
# load the meta-level optimization summary for today's run date and preview it
metaSummaryPath = "{}_hyperopt_meta_{}.csv".format(rundate, analysis)
bayesOptimSummaryMeta = pd.read_csv(metaSummaryPath)
bayesOptimSummaryMeta.head(5)

In [None]:
# one loss plot per estimator present in the meta summary (sorted unique names)
metaEstimatorNames = np.unique(bayesOptimSummaryMeta["estimator"])
for estimator in metaEstimatorNames:
    train.modelLossPlot(estimator=estimator, bayesOptimSummary=bayesOptimSummaryMeta)

In [None]:
# parameter plots for every estimator in the meta summary; the settings shared
# by all estimators are collected once
metaParamPlotSettings = dict(
    bayesOptimSummary=bayesOptimSummaryMeta,
    allSpace=allSpace,
    nIter=100,
    chartProp=15,
)
for estimator in np.unique(bayesOptimSummaryMeta["estimator"]):
    train.modelParamPlot(estimator=estimator, **metaParamPlotSettings)

## Model performance evaluation - stacked models

<a id = 'Model-performance-evaluation-stacked-models'></a>

In [None]:
# pick the top model (numModels=1) per estimator from the meta-level summary;
# this rebinds topModels, replacing the first-level selection
topModels = train.topBayesOptimModels(
    bayesOptimSummary=bayesOptimSummaryMeta, numModels=1
)
topModels

In [None]:
# classification panel, single model
estimator = "svm.SVC"; modelIter = 135
# estimator = 'ensemble.GradientBoostingClassifier'; modelIter = 590
# estimator = 'xgboost.XGBClassifier'; modelIter = 380

# rebuild the selected meta model from the meta-level optimization summary
model = train.BayesOptimModelBuilder(
    bayesOptimSummary=bayesOptimSummaryMeta, estimator=estimator, modelIter=modelIter
)

# evaluate on the out-of-fold predictions; the confusion-matrix labels go in
# through cmLabels, the keyword used by the first-level classificationPanel
# calls in this notebook
train.classificationPanel(
    model=model,
    XTrain=oofTrain,
    yTrain=train.target,
    cmLabels=["Dies", "Survives"],
    nFolds=4,
)

In [None]:
# create classification reports for training data
# one panel per selected meta model, evaluated on the out-of-fold predictions
for estimator, modelIters in topModels.items():
    for modelIter in modelIters:
        model = train.BayesOptimModelBuilder(
            bayesOptimSummary=bayesOptimSummaryMeta,
            estimator=estimator,
            modelIter=modelIter,
        )
        # cmLabels is the keyword used by the first-level classificationPanel
        # calls in this notebook
        train.classificationPanel(
            model=model,
            XTrain=oofTrain,
            yTrain=train.target,
            cmLabels=["Dies", "Survives"],
            nFolds=4,
        )

## Submission - stacked models

<a id = 'Submission-stacked-models'></a>

In [None]:
# best second level learning model (alternatives kept below with the author's score notes)
# estimator = "lightgbm.LGBMClassifier"; modelIter = 876 #0.75119
# estimator = "xgboost.XGBClassifier"; modelIter = 821 #0.779
# estimator = "ensemble.RandomForestClassifier"; modelIter = 82
# estimator = "ensemble.GradientBoostingClassifier"; modelIter = 673 #0.77511
estimator = "svm.SVC"; modelIter = 538 # 0.77511

# extract params and instantiate model
model = train.BayesOptimModelBuilder(
    bayesOptimSummary=bayesOptimSummaryMeta, estimator=estimator, modelIter=modelIter
)

# fit on the out-of-fold training predictions, predict from the out-of-fold
# validation predictions and print the sum of the predictions
model.fit(oofTrain, train.target.values)
yPred = model.predict(oofValid)
print(sum(yPred))

In [None]:
# generate prediction submission file
# writes to submission.csv, the same file name the standard-model submission cell uses
submit = pd.DataFrame({"PassengerId": dfValid.PassengerId, "Survived": yPred})
submit.to_csv("submission.csv", index=False)