__Kaggle competition - Titanic__

1. [Kaggle competition - Titanic](#Kaggle-competition-house-prices)
1. [Import](#Import)
    1. [Tools](#Tools)
    1. [Data](#Data)    
1. [Initial EDA](#Initial-EDA)
    1. [Categorical feature EDA](#Categorical-feature-EDA)
        1. [Univariate & feature vs. target](#Univariate-&-feature-vs.-target)
    1. [Continuous feature EDA](#Continuous-feature-EDA)
        1. [Univariate & feature vs. target](#Univariate-&-feature-vs.-target2)
        1. [Correlation](#Correlation)
            1. [Correlation (all samples)](#Correlation-all-samples)
            1. [Correlation (top vs. target)](#Correlation-top-vs-target)
        1. [Pair plot](#Pair-plot)
    1. [Faceting](#Faceting)
    1. [Target variable evaluation](#Target-variable-evaluation)    
1. [Data cleaning](#Data-cleaning)
    1. [Outliers (preliminary)](#Outliers-preliminary)
        1. [Training](#Training5)
        1. [Validation](#Validation5)
    1. [Missing data](#Missing-data)
        1. [Evaluate](#Evaluate1)
        1. [Training](#Training1)
        1. [Validation](#Validation1)
    1. [Engineering](#Engineering)
        1. [Evaluate](#Evaluate3)
        1. [Training](#Training3)
        1. [Validation](#Validation3)
    1. [Encoding](#Encoding)
        1. [Evaluate](#Evaluate2)
        1. [Training](#Training2)
        1. [Validation](#Validation2)
    1. [Transformation](#Transformation)
        1. [Evaluate](#Evaluate4)
        1. [Training](#Training4)
        1. [Validation](#Validation4)
    1. [Outliers (final)](#Outliers-final)
        1. [Training](#Training6)
1. [Data evaluation](#Data-evaluation)
    1. [Feature importance](#Feature-importance)    
    1. [Rationality](#Rationality)
    1. [Value override](#Value-override)
    1. [Continuous feature EDA](#Continuous-feature-EDA3)
        1. [Univariate & feature vs. target](#Univariate-&-feature-vs.-target3)
        1. [Correlation](#Correlation3)
            1. [Correlation (top vs. target)](#Correlation-top-vs-target3)
1. [Modeling](#Modeling)
    1. [Prepare training data](#Prepare-training-data)
    1. [Prepare validation data](#Prepare-validation-data)
    1. [GridSearch](#GridSearch)
        1. [Evaluation](#Evaluation)
        1. [Model explainability](#Model-explanability)
            1. [Permutation importance](#Permutation-importance)
            1. [Partial plots](#Partial-plots)
            1. [SHAP values](#SHAP-values)
    1. [Stacking](#Stacking)
        1. [Primary models](#Primary-models)
        1. [Meta model](#Meta-model)                
1. [Submission](#Submission)
    1. [Stack](#Stack)
    1. [Standard](#Standard)


# Kaggle competition - Titanic



<a id = 'Kaggle-competition-house-prices'></a>

# Import

<a id = 'Import'></a>

## Tools

<a id = 'Tools'></a>

In [None]:
# Standard library and settings
import os
import sys
import importlib
import itertools
import csv
import ast
from timeit import default_timer as timer
global ITERATION
import time
rundate = time.strftime('%Y%m%d')
comp = 'titanic'

import warnings; warnings.simplefilter('ignore')
from IPython.core.display import display, HTML; display(HTML("<style>.container { width:95% !important; }</style>"))

# Data extensions and settings
import numpy as np
np.set_printoptions(threshold = np.inf, suppress = True)
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.options.display.float_format = '{:,.6f}'.format

# Modeling extensions
import sklearn.base as base
import sklearn.cluster as cluster
import sklearn.datasets as datasets
import sklearn.decomposition as decomposition
import sklearn.discriminant_analysis as discriminant_analysis
import sklearn.ensemble as ensemble
import sklearn.feature_extraction as feature_extraction
import sklearn.feature_selection as feature_selection
import sklearn.gaussian_process as gaussian_process
import sklearn.linear_model as linear_model
import sklearn.kernel_ridge as kernel_ridge
import sklearn.metrics as metrics
import sklearn.model_selection as model_selection
import sklearn.naive_bayes as naive_bayes
import sklearn.neighbors as neighbors
import sklearn.pipeline as pipeline
import sklearn.preprocessing as preprocessing
import sklearn.svm as svm
import sklearn.tree as tree
import sklearn.utils as utils

from scipy import stats, special
import xgboost
import lightgbm
import catboost

from hyperopt import hp, tpe, Trials, fmin, STATUS_OK
from hyperopt.pyll.stochastic import sample

# Visualization extensions and settings
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
%matplotlib inline

# Custom extensions and settings
sys.path.append('/main') if '/main' not in sys.path else None
# sys.path.append('C:/Users/petersont/Atheneum/dev') if 'C:/Users/petersont/Atheneum/dev' not in sys.path else None
sys.path.append('U:\\') if 'U:\\' not in sys.path else None

import mlmachine as mlm
import quickplot as qp


## Data

<a id = 'Data'></a>

In [None]:
# Read the competition's train/test CSVs, then report each frame's (rows, columns)
dfTrain, dfValid = (pd.read_csv(csvPath) for csvPath in ('data/train.csv', 'data/test.csv'))

trainShape, validShape = dfTrain.shape, dfValid.shape
print('Training data dimensions: {}'.format(trainShape))
print('Validation data dimensions: {}'.format(validShape))


In [None]:
# Summarize column dtypes / non-null counts, then preview the first five rows
dfTrain.info()
display(dfTrain.head(5))


In [None]:
# counts of columns types
dfTrain.dtypes.value_counts()


In [None]:
# Load training data into ML machine
# Reload first so edits to the local mlmachine package are picked up without restarting the kernel
importlib.reload(mlm)
# 'Survived' is the label; PassengerId and Ticket are excluded up front, and
# Pclass/SibSp/Parch are passed as categorical overrides
# (presumably so Machine treats them as categories rather than numbers - confirm in mlmachine)
train = mlm.Machine(data = dfTrain
                  ,target = ['Survived']
                  ,removeFeatures = ['PassengerId','Ticket']                      
                  ,overrideCat = ['Pclass','SibSp','Parch']
                  ,targetType = 'categorical'
                )
# dimensions of the feature frame held by the Machine
print(train.X_.shape)


In [None]:
# Load validation data into ML machine
# No target/targetType is passed here; the same columns are removed and the same
# categorical overrides are applied as for the training Machine
valid = mlm.Machine(data = dfValid
                  ,removeFeatures = ['PassengerId','Ticket']                      
                  ,overrideCat = ['Pclass','SibSp','Parch']
                )
# dimensions of the validation feature frame
print(valid.X_.shape)


# Initial EDA

<a id = 'Initial-EDA'></a>

## Categorical feature EDA

<a id = 'Categorical-feature-EDA'></a>

### Univariate & feature vs. target

<a id = 'Univariate-&-feature-vs.-target'></a>

In [None]:
# Categorical features
train.edaCatTargetCatFeat(skipCols = ['Name','Cabin'])


## Continuous feature EDA

<a id = 'Continuous-feature-EDA'></a>

### Univariate & feature vs. target

<a id = 'Univariate-&-feature-vs.-target2'></a>

In [None]:
# Continuous features
train.edaCatTargetNumFeat()


### Correlation

<a id = 'Correlation'></a>

#### Correlation (all samples)

<a id = 'Correlation-all-samples'></a>

In [None]:
# correlation heat map 
p = qp.plotter.QuickPlot(fig = plt.figure(), chartProp = 15)
ax = p.makeCanvas()
p.qpCorrHeatmap(df = train.X_
                ,target = train.y_
                ,targetLabel = train.target[0]
                ,cols = None
                ,annot = True
                ,ax = ax
               )


#### Correlation (top vs. target)

<a id = 'Correlation-top-vs-target'></a>

In [None]:
# correlation heat map with most highly correlated features relative to the target
p = qp.plotter.QuickPlot(fig = plt.figure(), chartProp = 10)
ax = p.makeCanvas()
p.qpCorrHeatmapRefine(df = train.X_
                      ,target = train.y_
                      ,targetLabel = train.target[0]
                      ,cols = None
                      ,annot = True
                      ,thresh = 0.2
                      ,ax = ax
                    )


> Remarks - There are three pairs of highly correlated features:
    - 'GarageArea' and 'GarageCars'
    - 'TotRmsAbvGrd' and 'GrLivArea'
    - '1stFlrSF' and 'TotalBsmtSF'
This makes sense, given what each feature represents and how the items in each pair relate to each other. We likely only need one feature from each pair. (Note: these are house-prices feature names, not columns of the Titanic data loaded above; the remark should be revisited for this dataset.)

### Pair plot

<a id = 'Pair-plot'></a>

In [None]:
# Pair plot
p = qp.plotter.QuickPlot(fig = plt.figure(), chartProp = 10)
p.qpPairPlot(df = train.X_
             ,diag_kind = 'auto')


## Faceting

<a id = 'Faceting'></a>

In [None]:
# Two-category bar chart: Pclass on the x axis, bars split by Embarked, plotted against the target
# (yUnits = 'p' presumably formats the y axis as a percentage - confirm in quickplot)
p = qp.plotter.QuickPlot(fig = plt.figure(), chartProp = 10)
ax = p.makeCanvas(title = '', xLabel = '', yLabel = ''
                  ,yShift = 0.8, position = 111)

p.qpTwoCatBar(df = train.X_
               ,x = 'Pclass'
               ,hue = 'Embarked'
               ,target = train.y_
               ,targetLabel = train.target[0]
               ,yUnits = 'p'
               ,ax = ax)
                   

In [None]:
p = qp.plotter.QuickPlot(fig = plt.figure(), chartProp = 15)
# ax = p.makeCanvas(title = '', xLabel = '', yLabel = ''
#                   ,yShift = 0.8, position = 111)

p.qpCatNumHistFacet(df = train.X_
           ,target = train.y_
           ,targetLabel = train.target[0]
           ,catRow = 'Sex'
           ,catCol = 'Embarked'
           ,numCol = 'Age'
           ,height = 3
           ,aspect = 2
           )


In [None]:
p = qp.plotter.QuickPlot(fig = plt.figure(), chartProp = 15)
# ax = p.makeCanvas(title = '', xLabel = '', yLabel = ''
#                   ,yShift = 0.8, position = 111)

p.qpCatNumHistFacet(df = train.X_
           ,target = train.y_
           ,targetLabel = train.target[0]
           ,catRow = 'Sex'
           ,catCol = 'Pclass'
           ,numCol = 'Age'
           ,height = 3
           ,aspect = 2
           )


In [None]:
p = qp.plotter.QuickPlot(fig = plt.figure(), chartProp = 15)

p.qpTwoCatPointFacet(df = train.X_
           ,target = train.y_
           ,targetLabel = train.target[0]
           ,catLine = 'Pclass'
           ,catPoint = 'Sex'
           ,catGrid = 'Embarked'
           ,order = ['female','male'])


In [None]:
# Point-plot facet grid: one panel per Embarked value, one line per Sex, points at each Pclass
# NOTE(review): `order` lists Sex values ('female','male') while catPoint is 'Pclass'.
# If qpTwoCatPointFacet applies `order` to the catPoint axis this will not match any
# Pclass level - confirm against quickplot and supply Pclass levels if so.
p = qp.plotter.QuickPlot(fig = plt.figure(), chartProp = 15)

p.qpTwoCatPointFacet(df = train.X_
           ,target = train.y_
           ,targetLabel = train.target[0]
           ,catLine = 'Sex'
           ,catPoint = 'Pclass'
           ,catGrid = 'Embarked'
           ,order = ['female','male'])


## Target variable evaluation

<a id = 'Target-variable-evaluation'></a>

In [None]:
# null score
pd.Series(train.y_).value_counts(normalize = True)


# Data cleaning

<a id = 'Data-cleaning'></a>

## Outliers (preliminary)


<a id = 'Outliers-preliminary'></a>

### Training

<a id = 'Training5'></a>

In [None]:
# Columns that contain no missing values, in the frame's own column order
nonNull = train.X_.columns[train.X_.notnull().all()].tolist()

# Restrict to continuous features. Filtering the ordered list (instead of
# list(set(...).intersection(...))) keeps the result in column order, so the
# printed list is the same from run to run.
continuousCols = set(train.featureByDtype_['continuous'])
nonNullNumCol = [col for col in nonNull if col in continuousCols]
print(nonNull)
print(nonNullNumCol)


In [None]:
# IQR-based outlier detection on four numeric columns.
# dropOutliers = False, so rows are only flagged here; the flagged entries are read back
# from the step's `outliers_` attribute below.
# NOTE(review): transform() is called without a preceding fit(); this relies on the
# mlmachine transformers doing their work inside transform - confirm.
trainPipe = pipeline.Pipeline([
    ('outlier', train.OutlierIQR(outlierCount = 2, iqrStep = 1.5, features = ['Age','SibSp','Parch','Fare'], dropOutliers = False))     
    ])
train.X_ = trainPipe.transform(train.X_)

# sorted array of the entries flagged by the IQR step
iqrOutliers = np.array(sorted(trainPipe.named_steps['outlier'].outliers_))
# train.y_ = np.delete(train.y_, trainPipe.named_steps['outlier'].outliers_)


In [None]:
# Flag outliers with scikit-learn's IsolationForest on the numeric columns that
# have no missing values; contamination = 0.02 targets roughly 2% of the rows.
# NOTE(review): the `behaviour` argument only exists in older scikit-learn releases
# (it was deprecated and later removed) - drop it when running on a newer version.
outlierCols = ['SibSp','Parch','Fare']
clf = ensemble.IsolationForest(behaviour = 'new'
                        ,max_samples = train.X_.shape[0]
                        ,random_state = 0
                        ,contamination = 0.02
                        )
clf.fit(train.X_[outlierCols])
preds = clf.predict(train.X_[outlierCols])
# np.unique(preds, return_counts = True)

# predict() labels outliers -1. Store their row positions as a flat 1-D array
# (np.where(mask) returned a 1-tuple wrapping the array), so ifOutliers has the
# same shape as iqrOutliers / eifOutliers and can be indexed or intersected directly.
mask = preds == -1
ifOutliers = np.flatnonzero(mask)


In [None]:
# Third outlier detector: extended isolation forest (eif package) on the same three columns
import eif as iso
if_eif = iso.iForest(train.X_[['SibSp','Parch','Fare']].values
                 ,ntrees = 100
                 ,sample_size = 256
                 ,ExtensionLevel = 1
                )

# calculate anomaly scores
anomalies_ratio = 0.02
anomaly_scores = if_eif.compute_paths(X_in = train.X_[['SibSp','Parch','Fare']].values)
# argsort is ascending, so the last ceil(anomalies_ratio * n_rows) entries are the
# row positions with the highest anomaly scores
anomaly_scores_sorted = np.argsort(anomaly_scores)
eifOutliers = np.array(anomaly_scores_sorted[-int(np.ceil(anomalies_ratio * train.X_.shape[0])):])


In [None]:
# Consensus outliers: values present in all three detectors' results
# (np.intersect1d is folded pairwise across the three arrays)
# NOTE(review): ifOutliers/eifOutliers are row positions; confirm that OutlierIQR's
# outliers_ uses the same convention rather than index labels.
from functools import reduce
reduce(np.intersect1d, (iqrOutliers, ifOutliers, eifOutliers))


In [None]:
# Remove the chosen outlier rows from both the features and the target.
# The positions are hard-coded (presumably copied from the consensus result above - confirm
# they are still current if the upstream steps change).
outliers = np.array([27,  88, 258, 311, 341, 438, 679, 737, 742])
# positions -> index labels for DataFrame.drop; np.delete works on positions directly
train.X_ = train.X_.drop(train.X_.index[outliers])
train.y_ = np.delete(train.y_, outliers)


### Validation

<a id = 'Validation5'></a>

## Missing data


<a id = 'Missing-data'></a>

### Evaluate

<a id = 'Evaluate1'></a>

In [None]:
# evaluate missing data
train.edaMissingSummary()


In [None]:
# evaluate missing data
valid.edaMissingSummary()


In [None]:
# missingdata_df = merged_df.columns[merged_df.isnull().any()].tolist()
# msno.matrix(merged_df[missingdata_df])

# msno.bar(merged_df[missingdata_df], color="blue", log=True, figsize=(30,18))

# # 
# msno.heatmap(merged_df[missingdata_df], figsize=(20,20))

In [None]:
# compare feature with missing data
train.missingColCompare(train.X_, valid.X_)


### Training

<a id = 'Training1'></a>

In [None]:
# impute training data
# Age: median strategy with Parch as context (presumably the median Age within each
# Parch group - confirm in mlmachine); Embarked: mode imputation
trainPipe = pipeline.Pipeline([
    ('imputeMedian', train.ContextImputer(nullCol = 'Age', contextCol = 'Parch', strategy = 'median'))     
    ,('imputeMode', train.ModeImputer(cols = ['Embarked']))
    ])
train.X_ = trainPipe.transform(train.X_)
# re-run the missing-data summary to check the result
train.edaMissingSummary()


### Validation

<a id = 'Validation1'></a>

In [None]:
# impute validation data
# Age is filled from the training pipeline's fitted fill table (train = False, trainDf = ...fillDf),
# so validation rows reuse values learned on the training data.
# The second step then median-imputes Fare and Age on the validation frame itself
# (presumably to catch Fare nulls and any Age values the context table could not fill - confirm).
validPipe = pipeline.Pipeline([
    ('imputeMedian', valid.ContextImputer(nullCol = 'Age', contextCol = 'Parch', train = False, trainDf = trainPipe.named_steps['imputeMedian'].fillDf))
    ,('imputeMedian2', valid.NumericalImputer(cols = ['Fare','Age'], strategy = 'median'))    
    ])
valid.X_ = validPipe.transform(valid.X_)
# re-run the missing-data summary to check the result
valid.edaMissingSummary()


## Engineering

<a id = 'Engineering'></a>

### Evaluate

<a id = 'Evaluate3'></a>

### Training

<a id = 'Training3'></a>

In [None]:
# Parse titles to learn gender, and identify rare titles which may convey status
# The title is the text between the first comma and the following period of each name
title = [i.split(',')[1].split('.')[0].strip() for i in train.X_['Name']]
# Build the Series on train.X_'s own index. A default 0..n-1 index is label-aligned on
# assignment, which shifts titles onto the wrong rows (and leaves NaN at the tail) once
# rows have been dropped from train.X_, as the outlier-removal cell does.
train.X_['Title'] = pd.Series(title, index = train.X_.index)
train.X_['Title'] = train.X_['Title'].replace(['Lady','the Countess','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona']
                                            ,'Rare')
# collapse titles into four integer codes; any title not listed here becomes NaN
train.X_['Title'] = train.X_['Title'].map({'Master' : 0, 'Miss' : 1, 'Ms' : 1 , 'Mme' : 1, 'Mlle' : 1, 'Mrs' : 1, 'Mr' : 2, 'Rare' : 3})

# Distill cabin feature
# first character of the cabin string, or 'X' when the cabin is missing (same index alignment as above)
train.X_['CabinQuarter'] = pd.Series([i[0] if not pd.isnull(i) else 'X' for i in train.X_['Cabin']]
                                     ,index = train.X_.index)

# Family size features and binning
# siblings/spouses + parents/children + the passenger
train.X_['FamilySize'] = train.X_['SibSp'] + train.X_['Parch'] + 1

# bin edges per feature for the custom binner; reused unchanged for the validation data
customBinDict = {'Age' : [16, 32, 48, 64]
                 ,'FamilySize' : [1, 2, 4]
          }

# custom bins from the edges above, then percentile bins (25/50/75) for Age and Fare;
# the fitted percentile step is read back later when transforming the validation data
trainPipe = pipeline.Pipeline([
    ('customBin', train.CustomBinner(customBinDict = customBinDict))
    ,('percentileBin', train.PercentileBinner(cols = ['Age','Fare'], percs = [25, 50, 75]))    
    ])
train.X_ = trainPipe.transform(train.X_)

# drop features
# Name and Cabin have been distilled into Title and CabinQuarter above
train.featureDropper(cols = ['Name','Cabin'])


In [None]:
# List the columns not yet registered under either dtype group (i.e. newly engineered ones)
trackedCols = set(train.featureByDtype_['categorical']) | set(train.featureByDtype_['continuous'])
for col in train.X_.columns:
    if col in trackedCols:
        continue
    print(col)
        

In [None]:
# Register the engineered features under their dtype group.
# Each append is guarded so re-running this cell does not add duplicates, which
# would otherwise produce repeated columns when the lists are used to select from train.X_.

# append new continuous features
for col in ['FamilySize']:
    if col not in train.featureByDtype_['continuous']:
        train.featureByDtype_['continuous'].append(col)

# append new categorical features
# ('FamilySize' is deliberately kept in both groups, as in the original cell)
for col in ['AgeCustomBin','AgePercBin','FarePercBin','FamilySize','FamilySizeCustomBin','Title','CabinQuarter']:
    if col not in train.featureByDtype_['categorical']:
        train.featureByDtype_['categorical'].append(col)


In [None]:
# Evaluate additional features
train.edaCatTargetCatFeat()


### Validation

<a id = 'Validation3'></a>

In [None]:
# Parse titles to learn gender, and identify rare titles which may convey status
# The title is the text between the first comma and the following period of each name
title = [i.split(',')[1].split('.')[0].strip() for i in valid.X_['Name']]
# Build the Series on valid.X_'s own index so the assignment stays row-aligned even if
# the frame's index is not the default 0..n-1 range.
valid.X_['Title'] = pd.Series(title, index = valid.X_.index)
valid.X_['Title'] = valid.X_['Title'].replace(['Lady','the Countess','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona']
                                            ,'Rare')
# collapse titles into four integer codes; any title not listed here becomes NaN
valid.X_['Title'] = valid.X_['Title'].map({'Master' : 0, 'Miss' : 1, 'Ms' : 1 , 'Mme' : 1, 'Mlle' : 1, 'Mrs' : 1, 'Mr' : 2, 'Rare' : 3})

# Distill cabin feature
# first character of the cabin string, or 'X' when the cabin is missing
valid.X_['CabinQuarter'] = pd.Series([i[0] if not pd.isnull(i) else 'X' for i in valid.X_['Cabin']]
                                     ,index = valid.X_.index)

# additional features
valid.X_['FamilySize'] = valid.X_['SibSp'] + valid.X_['Parch'] + 1

validPipe = pipeline.Pipeline([
    ('customBin', valid.CustomBinner(customBinDict = customBinDict))
    ,('percentileBin', valid.PercentileBinner(train = False, trainDict = trainPipe.named_steps['percentileBin'].trainDict_))    
    ])
valid.X_ = validPipe.transform(valid.X_)

# drop features
valid.featureDropper(cols = ['Name','Cabin'])


In [None]:
# List the columns not yet registered under either dtype group (i.e. newly engineered ones)
trackedCols = set(valid.featureByDtype_['categorical']) | set(valid.featureByDtype_['continuous'])
for col in valid.X_.columns:
    if col in trackedCols:
        continue
    print(col)
        

In [None]:
# Register the engineered features under their dtype group.
# Each append is guarded so re-running this cell does not add duplicates, which
# would otherwise produce repeated columns when the lists are used to select from valid.X_.

# append new continuous features
for col in ['FamilySize']:
    if col not in valid.featureByDtype_['continuous']:
        valid.featureByDtype_['continuous'].append(col)

# append new categorical features
# ('FamilySize' is deliberately kept in both groups, as in the original cell)
for col in ['AgeCustomBin','AgePercBin','FarePercBin','FamilySize','FamilySizeCustomBin','Title','CabinQuarter']:
    if col not in valid.featureByDtype_['categorical']:
        valid.featureByDtype_['categorical'].append(col)


## Encoding

<a id = 'Encoding'></a>

### Evaluate

<a id = 'Evaluate2'></a>

In [None]:
# counts of unique values in training data string columns
train.X_[train.featureByDtype_['categorical']].apply(pd.Series.nunique, axis = 0)


In [None]:
# print unique values in each categorical columns
for col in train.X_[train.featureByDtype_['categorical']]:
    try:
        # np.unique returns the values sorted
        print(col, np.unique(train.X_[col]))
    except TypeError:
        # Sorting fails when a column mixes types that cannot be compared (e.g. strings
        # and NaN). Show the values unsorted instead of silently skipping the column,
        # and let any other kind of error surface.
        print(col, train.X_[col].unique())


In [None]:
# counts of unique values in validation data string columns
valid.X_[valid.featureByDtype_['categorical']].apply(pd.Series.nunique, axis = 0)


In [None]:
# Show the sorted distinct values of every categorical validation column, ignoring Name/Cabin
skipCols = ['Name','Cabin']
for col in valid.X_[valid.featureByDtype_['categorical']]:
    if col in skipCols:
        continue
    print(col, np.unique(valid.X_[col]))


In [None]:
# Report categorical levels that appear in only one of the two datasets
for col in train.featureByDtype_['categorical']:
    if col in ['Name','Cabin']:
        continue

    trainValues = train.X_[col].unique()
    validValues = valid.X_[col].unique()

    # levels exclusive to each side
    trainDiff = set(trainValues).difference(validValues)
    validDiff = set(validValues).difference(trainValues)

    if not trainDiff and not validDiff:
        continue

    print('\n\n*** ' + col)
    print('Value present in training data, not in validation data')
    print(trainDiff)
    print('Value present in validation data, not in training data')
    print(validDiff)


Pclass [1 2 3] - ordinal

Sex ['female' 'male'] - nominal

Embarked ['C' 'Q' 'S'] - nominal

HasCabin [0 1] - nominal

Title [0 1 2 3] - nominal

CabinQuarter ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'X'] - nominal

### Training

<a id = 'Training2'></a>

In [None]:
### ordinal columns
# explicit value -> code mapping per ordinal column (Pclass keeps its own 1/2/3 coding)
ordCatCols = {
    'Pclass' : {1 : 1, 2 : 2, 3 : 3}
    }

# encode categorical columns
# nominal columns are dummy-encoded; dropFirst = True presumably drops one level per
# column to avoid redundant dummies - confirm in mlmachine
nomCatCols = ['Embarked','Sex','CabinQuarter','Title']
trainPipe = pipeline.Pipeline([
    ('encodeOrdinal', train.CustomOrdinalEncoder(encodings = ordCatCols))    
    ,('dummyNominal', train.Dummies(cols = nomCatCols, dropFirst = True))
    ])
train.X_ = trainPipe.transform(train.X_)
# preview the encoded frame
train.X_[:5]


### Validation

<a id = 'Validation2'></a>

In [None]:
# encode categorical columns
# Same ordinal mapping and nominal column list as the training data.
# NOTE(review): dummies are created with dropFirst = False here (training used True) and
# the 'levels' step is then given the training columns - presumably MissingDummies
# reconciles the validation columns with the training layout; confirm it also removes
# the extra first-level dummies.
validPipe = pipeline.Pipeline([
    ('encodeOrdinal', valid.CustomOrdinalEncoder(encodings = ordCatCols))    
    ,('dummyNominal', valid.Dummies(cols = nomCatCols, dropFirst = False))
    ,('levels', valid.MissingDummies(trainCols = train.X_.columns))    
    ])
valid.X_ = validPipe.transform(valid.X_)
# preview the encoded frame
valid.X_[:5]


## Transformation

<a id = 'Transformation'></a>

### Evaluate

<a id = 'Evaluate4'></a>

In [None]:
# evaluate skew of continuous features - Train
train.skewSummary()


In [None]:
# evaluate skew of continuous features - Validation
valid.skewSummary()


### Training

<a id = 'Training4'></a>

In [None]:
#
trainPipe = pipeline.Pipeline([
    ('skew', train.SkewTransform(cols = train.featureByDtype_['continuous'], skewMin = 0.75, pctZeroMax = 1.0))
    ])
train.X_ = trainPipe.transform(train.X_)
train.skewSummary()


### Validation

<a id = 'Validation4'></a>

In [None]:
#
validPipe = pipeline.Pipeline([
    ('skew', valid.SkewTransform(train = False, trainDict = trainPipe.named_steps['skew'].colValueDict_))
    ])
valid.X_ = validPipe.transform(valid.X_)
valid.skewSummary()


## Outliers (final)


<a id = 'Outliers-final'></a>

### Training

<a id = 'Training6'></a>

In [None]:
# IQR-based outlier detection across every column of the transformed training frame.
# dropOutliers = False, so rows are only flagged; outlierCount is 5 here versus 2 in the
# preliminary pass.
trainPipe = pipeline.Pipeline([
    ('outlier', train.OutlierIQR(outlierCount = 5, iqrStep = 1.5, features = train.X_.columns, dropOutliers = False))     
    ])
train.X_ = trainPipe.transform(train.X_)

# sorted array of the entries flagged by the IQR step
iqrOutliers = np.array(sorted(trainPipe.named_steps['outlier'].outliers_))
# train.y_ = np.delete(train.y_, trainPipe.named_steps['outlier'].outliers_)
iqrOutliers


In [None]:
# Flag outliers with scikit-learn's IsolationForest on all columns of the transformed
# training frame; contamination = 0.01 targets roughly 1% of the rows.
# NOTE(review): the `behaviour` argument only exists in older scikit-learn releases
# (it was deprecated and later removed) - drop it when running on a newer version.
clf = ensemble.IsolationForest(behaviour = 'new'
                        ,max_samples = train.X_.shape[0]
                        ,random_state = 0
                        ,contamination = 0.01
                        )
# the whole frame is used, so no column re-selection is needed
clf.fit(train.X_)
preds = clf.predict(train.X_)
# np.unique(preds, return_counts = True)

# predict() labels outliers -1. Store their row positions as a flat 1-D array
# (np.where(mask) returned a 1-tuple wrapping the array), matching the shape of
# iqrOutliers / eifOutliers.
mask = preds == -1
ifOutliers = np.flatnonzero(mask)
ifOutliers


In [None]:
# Extended isolation forest (eif package) on all columns of the transformed training frame
import eif as iso
if_eif = iso.iForest(train.X_.values
                 ,ntrees = 100
                 ,sample_size = 256
                 ,ExtensionLevel = 1
                )

# calculate anomaly scores
anomalies_ratio = 0.01
anomaly_scores = if_eif.compute_paths(X_in = train.X_.values)
# argsort is ascending, so the last ceil(anomalies_ratio * n_rows) entries are the
# row positions with the highest anomaly scores
anomaly_scores_sorted = np.argsort(anomaly_scores)
eifOutliers = np.array(anomaly_scores_sorted[-int(np.ceil(anomalies_ratio * train.X_.shape[0])):])
eifOutliers


In [None]:
# Consensus outliers for the final pass: values flagged by both isolation-forest variants.
# The three-way version that also includes the IQR result is left commented out.
from functools import reduce
# reduce(np.intersect1d, (iqrOutliers, ifOutliers, eifOutliers))
reduce(np.intersect1d, (ifOutliers, eifOutliers))


# Data evaluation

<a id = 'Data-evaluation'></a>

## Feature importance

<a id = 'Feature-importance'></a>

In [None]:
# feature importance summary table
featureImp = train.featureImportanceSummary()
featureImp


## Rationality

<a id = 'Rationality'></a>

In [None]:
# percent difference summary
# Absolute percent difference between the validation and training summary statistics.
# describe() is computed once per frame and reused for both the calculation and the display.
trainDesc = train.X_.describe()
validDesc = valid.X_.describe()

# Both sides are shifted by +1 before dividing (presumably so a training statistic of 0
# does not produce a division by zero - confirm intent).
dfDiff = abs((((validDesc + 1) - (trainDesc + 1)) / (trainDesc + 1)) * 100)

# Blank out exact zeros and missing cells so only real differences stand out.
# (The former `dfDiff < 0` mask was dropped: after abs() no value can be negative.)
dfDiff = dfDiff.replace({0 : np.nan})
dfDiff = dfDiff.fillna('')
display(dfDiff)
display(trainDesc)
display(validDesc)


## Value override

<a id = 'Value-override'></a>

In [None]:
# change clearly erroneous value to what it probably was
# exploreValid.X_['GarageYrBlt'].replace({2207 : 2007}, inplace = True)


## Continuous feature EDA

<a id = 'Continuous-feature-EDA3'></a>

### Univariate & feature vs. target

<a id = 'Univariate-&-feature-vs.-target3'></a>

In [None]:
# Continuous features
train.edaCatTargetNumFeat()


### Correlation

<a id = 'Correlation3'></a>

#### Correlation (top vs. target)

<a id = 'Correlation-top-vs-target3'></a>

In [None]:
# correlation heat map with most highly correlated features relative to the target
p = qp.plotter.QuickPlot(fig = plt.figure(), chartProp = 15)
ax = p.makeCanvas()
p.qpCorrHeatmapRefine(df = train.X_
                      ,target = train.y_
                      ,targetLabel = train.target[0]
                      ,cols = None
                      ,annot = True
                      ,thresh = 0.25
                      ,ax = ax
                    )


# Modeling

<a id = 'Modeling'></a>

## Prepare training data

<a id = 'Prepare-training-data'></a>

In [None]:
# Rebuild the training data from the raw CSV in a single pass, combining the cleaning,
# engineering, encoding and transformation steps explored earlier in the notebook.
# Reload the tuning/stacking submodules and the package so local code edits take effect.
importlib.reload(mlm.model.tune.bayesianOptimSearch)
importlib.reload(mlm.model.tune.stack)
importlib.reload(mlm)

### import training data
dfTrain = pd.read_csv('data/train.csv')
train = mlm.Machine(data = dfTrain
                  ,target = ['Survived']
                  ,removeFeatures = ['PassengerId','Ticket']                      
                  ,overrideCat = ['Pclass','SibSp','Parch']
                  ,targetType = 'categorical'
                )

### feature engineering
# Parse titles to learn gender, and identify rare titles which may convey status
# NOTE: Title and CabinQuarter are assigned from default-index Series; that lines up here
# only because no rows have been dropped yet (outliers are removed further down).
title = [i.split(',')[1].split('.')[0].strip() for i in train.X_['Name']]
train.X_['Title'] = pd.Series(title)
train.X_['Title'] = train.X_['Title'].replace(['Lady','the Countess','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona']
                                            ,'Rare')
train.X_['Title'] = train.X_['Title'].map({'Master' : 0, 'Miss' : 1, 'Ms' : 1 , 'Mme' : 1, 'Mlle' : 1, 'Mrs' : 1, 'Mr' : 2, 'Rare' : 3})

# Distill cabin feature
# first character of the cabin string, or 'X' when the cabin is missing
train.X_['CabinQuarter'] = pd.Series([i[0] if not pd.isnull(i) else 'X' for i in train.X_['Cabin']])

# Family size features
train.X_['FamilySize'] = train.X_['SibSp'] + train.X_['Parch'] + 1

# custom bin specifications
customBinDict = {'Age' : [16, 32, 48, 64]
                 ,'FamilySize' : [1, 2, 4]
          }
# categorical column specifications
ordCatCols = {
    'Pclass' : {1 : 1, 2 : 2, 3 : 3}
    }
nomCatCols = ['Embarked','Sex','CabinQuarter','Title']

# remove outliers
# hard-coded row positions (the same list used in the preliminary outlier section);
# dropped from the features by label and from the target by position
outliers = np.array([27, 88, 258, 311, 341, 438, 679, 737, 742])
train.X_ = train.X_.drop(train.X_.index[outliers])
train.y_ = np.delete(train.y_, outliers)

### pipeline
# impute -> bin -> encode -> skew-correct; the fitted steps are read back by name when
# the validation pipeline is built
trainPipe = pipeline.Pipeline([
    ('imputeMedian', train.ContextImputer(nullCol = 'Age', contextCol = 'Parch', strategy = 'median'))     
    ,('imputeMode', train.ModeImputer(cols = ['Embarked']))
    ,('customBin', train.CustomBinner(customBinDict = customBinDict))
    ,('percentileBin', train.PercentileBinner(cols = ['Age','Fare'], percs = [25, 50, 75]))    
    ,('encodeOrdinal', train.CustomOrdinalEncoder(encodings = ordCatCols))    
    ,('dummyNominal', train.Dummies(cols = nomCatCols, dropFirst = True))
    ,('skew', train.SkewTransform(cols = train.featureByDtype_['continuous'], skewMin = 0.75, pctZeroMax = 1.0))    
    ])
train.X_ = trainPipe.transform(train.X_)

# drop features
train.featureDropper(cols = ['Name','Cabin'])


## Prepare validation data

<a id = 'Prepare-validation-data'></a>

In [None]:
### import valid data
# Rebuild the validation data from the raw CSV, mirroring the training preparation and
# reusing the values fitted by trainPipe (imputation table, percentile bins, skew parameters).
dfValid = pd.read_csv('data/test.csv')
valid = mlm.Machine(data = dfValid
                  ,removeFeatures = ['PassengerId','Ticket']                      
                  ,overrideCat = ['Pclass','SibSp','Parch']
                )

### feature engineering
# Parse titles to learn gender, and identify rare titles which may convey status
title = [i.split(',')[1].split('.')[0].strip() for i in valid.X_['Name']]
valid.X_['Title'] = pd.Series(title)
valid.X_['Title'] = valid.X_['Title'].replace(['Lady','the Countess','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona']
                                            ,'Rare')
valid.X_['Title'] = valid.X_['Title'].map({'Master' : 0, 'Miss' : 1, 'Ms' : 1 , 'Mme' : 1, 'Mlle' : 1, 'Mrs' : 1, 'Mr' : 2, 'Rare' : 3})

# Distill cabin feature
# first character of the cabin string, or 'X' when the cabin is missing
valid.X_['CabinQuarter'] = pd.Series([i[0] if not pd.isnull(i) else 'X' for i in valid.X_['Cabin']])

# additional features
valid.X_['FamilySize'] = valid.X_['SibSp'] + valid.X_['Parch'] + 1

### pipeline
# Steps flagged train = False take their parameters from the matching trainPipe step.
# NOTE(review): unlike the training cell, Name and Cabin are not dropped afterwards here -
# presumably the 'levels' step aligns the columns to train.X_.columns; confirm.
validPipe = pipeline.Pipeline([
    ('imputeMedian', valid.ContextImputer(nullCol = 'Age', contextCol = 'Parch', train = False, trainDf = trainPipe.named_steps['imputeMedian'].fillDf))
    ,('imputeMedian2', valid.NumericalImputer(cols = ['Fare','Age'], strategy = 'median'))    
    ,('customBin', valid.CustomBinner(customBinDict = customBinDict))
    ,('percentileBin', valid.PercentileBinner(train = False, trainDict = trainPipe.named_steps['percentileBin'].trainDict_))   
    ,('encodeOrdinal', valid.CustomOrdinalEncoder(encodings = ordCatCols))    
    ,('dummyNominal', valid.Dummies(cols = nomCatCols, dropFirst = False))
    ,('levels', valid.MissingDummies(trainCols = train.X_.columns))    
    ,('skew', valid.SkewTransform(train = False, trainDict = trainPipe.named_steps['skew'].colValueDict_))    
    ])
valid.X_ = validPipe.transform(valid.X_)


## GridSearch

<a id = 'GridSearch'></a>

In [None]:
 # parameter space
# Hyperopt search space for each candidate estimator. Keys are the estimator's dotted
# name as a string (module alias + class); each value maps a hyperparameter name to the
# hyperopt distribution it is sampled from.
# NOTE(review): hp.quniform yields floats (e.g. num_leaves, min_child_samples,
# subsample_for_bin) - presumably cast to int inside mlmachine before the estimator is
# built; confirm.
allSpace = {
            'lightgbm.LGBMClassifier' : {
                'class_weight' : hp.choice('class_weight', [None])
                ,'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.7)
                ,'boosting_type' : hp.choice('boosting_type', ['dart'])                
                ,'subsample': hp.uniform('subsample', 0.5, 1)
                ,'learning_rate' : hp.uniform('learning_rate', 0.15, 0.25)
                ,'max_depth' : hp.choice('max_depth', np.arange(4, 20, dtype = int))
                ,'min_child_samples' : hp.quniform('min_child_samples', 50, 150, 5)
                ,'n_estimators' : hp.choice('n_estimators', np.arange(100, 4000, 10, dtype = int))
                ,'num_leaves': hp.quniform('num_leaves', 30, 70, 1)
                ,'reg_alpha': hp.uniform('reg_alpha', 0.75, 1.25)
                ,'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0)
                ,'subsample_for_bin': hp.quniform('subsample_for_bin', 100000, 350000, 20000)
            }
            ,'linear_model.LogisticRegression' : {
                'C': hp.uniform('C', 0.04, 0.1)
                # NOTE(review): 'l1' needs a solver that supports it - confirm the solver in use
                ,'penalty': hp.choice('penalty', ['l1'])
            }
            ,'xgboost.XGBClassifier' : {
                'colsample_bytree' : hp.uniform('colsample_bytree', 0.4, 0.7)
                ,'gamma' : hp.quniform('gamma', 0.0, 10, 0.05)
                ,'learning_rate' : hp.quniform('learning_rate', 0.01, 0.2, 0.01)
                ,'max_depth' : hp.choice('max_depth', np.arange(2, 15, dtype = int))
                ,'min_child_weight': hp.quniform ('min_child_weight', 2.5, 7.5, 1)
                ,'n_estimators' : hp.choice('n_estimators', np.arange(100, 4000, 10, dtype = int))
                ,'subsample': hp.uniform ('subsample', 0.4, 0.7)
            }
            ,'ensemble.RandomForestClassifier' : {
                'bootstrap' : hp.choice('bootstrap', [True, False])
                ,'max_depth' : hp.choice('max_depth', np.arange(2, 10, dtype = int))
                ,'n_estimators' : hp.choice('n_estimators', np.arange(100, 8000, 10, dtype = int))
                ,'max_features' : hp.choice('max_features', ['sqrt'])
                ,'min_samples_split' : hp.choice('min_samples_split', np.arange(15, 25, dtype = int))
                ,'min_samples_leaf' : hp.choice('min_samples_leaf', np.arange(2, 20, dtype = int))
            }
            ,'ensemble.GradientBoostingClassifier' : {
                'n_estimators' : hp.choice('n_estimators', np.arange(100, 4000, 10, dtype = int))
                ,'max_depth' : hp.choice('max_depth', np.arange(2, 11, dtype = int))
                ,'max_features' : hp.choice('max_features', ['sqrt'])    
                ,'learning_rate' : hp.quniform('learning_rate', 0.01, 0.09, 0.01)
                # NOTE(review): the 'deviance' loss name may not exist in newer scikit-learn - confirm
                ,'loss' : hp.choice('loss', ['deviance','exponential'])    
                ,'min_samples_split' : hp.choice('min_samples_split', np.arange(2, 40, dtype = int))
                ,'min_samples_leaf' : hp.choice('min_samples_leaf', np.arange(2, 40, dtype = int))
            }
            ,'ensemble.AdaBoostClassifier' : {
                'n_estimators' : hp.choice('n_estimators', np.arange(100, 4000, 10, dtype = int))
                ,'learning_rate' : hp.quniform('learning_rate', 0.1, 0.25, 0.01)
                ,'algorithm' : hp.choice('algorithm', ['SAMME'])                    
            }
            ,'naive_bayes.BernoulliNB' : {
                'alpha' :  hp.uniform('alpha', 0.01, 2)
            }
            ,'ensemble.BaggingClassifier' : {
                'n_estimators' : hp.choice('n_estimators', np.arange(100, 4000, 10, dtype = int))
                ,'max_samples' : hp.uniform('max_samples', 0.01, 0.3)                    
            }
            ,'ensemble.ExtraTreesClassifier' : {
                'n_estimators' : hp.choice('n_estimators', np.arange(100, 4000, 10, dtype = int))
                ,'max_depth' : hp.choice('max_depth', np.arange(2, 15, dtype = int))
                ,'min_samples_split' : hp.choice('min_samples_split', np.arange(4, 30, dtype = int))
                ,'min_samples_leaf' : hp.choice('min_samples_leaf', np.arange(2, 20, dtype = int))
                # NOTE(review): max_features = 'auto' may not be accepted by newer scikit-learn - confirm
                ,'max_features' : hp.choice('max_features', ['auto'])
                ,'criterion' : hp.choice('criterion', ['entropy'])
            }
            ,'svm.SVC' : {
                'C' : hp.uniform('C', 4, 15)
                ,'decision_function_shape' : hp.choice('decision_function_shape', ['ovr'])
                ,'gamma' : hp.uniform('gamma', 0.00000001, 1.5)
            }
            ,'neighbors.KNeighborsClassifier' : {
                'algorithm' : hp.choice('algorithm', ['ball_tree','brute'])
                ,'n_neighbors' : hp.choice('n_neighbors', np.arange(1, 15, dtype = int))
                ,'weights' : hp.choice('weights', ['uniform'])
            }
}


In [None]:
# execute bayesian optimization grid search
# Results are written to a CSV whose name embeds today's run date and the analysis label.
# The commented-out `model` argument is left in place as in the original
# (presumably it restricts the search to one estimator - confirm in mlmachine).
analysis = 'titanic'
train.execBayesOptimSearch(allSpace = allSpace
                           ,resultsDir = 'data/{}_hyperopt_{}.csv'.format(rundate, analysis)
#                            ,model = ''
                           ,X = train.X_
                           ,y = train.y_
                           ,scoring = 'accuracy'
                           ,n_folds = 8
                           ,n_jobs = 16
                           ,iters = 1000
                           ,verbose = 0)


### Evaluation

In [None]:
# read scores summary table
# NOTE: the file name is pinned to the 2019-04-23 run rather than built from `rundate`,
# so a search executed on another day is not picked up automatically.
resultsDf = pd.read_csv('data/20190423_hyperopt_titanic.csv', na_values = 'nan')
# presumably expands the stored parameter strings into usable columns - confirm in mlmachine
results = train.unpackParams(resultsDf)


In [None]:
# loss plot
train.lossPlot(resultsDf = results)


In [None]:
# estimator parameter plots
train.paramPlot(results = results, allSpace = allSpace, nIter = 100)


In [None]:
# Scratch cell for visualising what a hyperopt distribution looks like when sampled.
# The alternative distributions are kept commented out for quick swapping.
# NOTE(review): the active entry uses an empty-string key - confirm samplePlot accepts that.
sampleSpace = {
#             'param': hp.uniform('param', np.log(0.4), np.log(0.6))
            '' : 0.000001 + hp.uniform('gamma', 0.000001, 10)
#             'param2': hp.loguniform('param2', np.log(0.001), np.log(0.01))
        }

# second argument is 1000 (presumably the number of draws - confirm in mlmachine)
train.samplePlot(sampleSpace, 1000)


<a id = 'Evaluation'></a>

### Model explainability

https://www.kaggle.com/learn/machine-learning-explainability

<a id = 'Model-explanability'></a>

#### Permutation importance

<a id = 'Permutation-importance'></a>

In [None]:
# permutation importance
# NOTE(review): my_model, val_X and val_y are not defined anywhere in the visible part of
# this notebook - a fitted estimator and a held-out split must be supplied before this runs.
import eli5
from eli5.sklearn import PermutationImportance

# fit the permutation-importance wrapper on the held-out data, then render the weights table
perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names = val_X.columns.tolist())


#### Partial plots

<a id = 'Partial-plots'></a>

In [None]:
# partial dependence of the model output on a single feature
# NOTE(review): tree_model, val_X, feature_names and the 'Goal Scored' column look
# like leftovers from the linked Kaggle course example — confirm / replace for this analysis
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots

# compute the isolated partial-dependence data for the feature
pdp_goals = pdp.pdp_isolate(
    model=tree_model,
    dataset=val_X,
    model_features=feature_names,
    feature='Goal Scored',
)

# render it
pdp.pdp_plot(pdp_goals, 'Goal Scored')
plt.show()


In [None]:
# same partial-dependence plot as for the previous feature, for a second feature;
# feature_to_plot is reused by the random-forest cell that follows
feature_to_plot = 'Distance Covered (Kms)'
pdp_dist = pdp.pdp_isolate(
    model=tree_model,
    dataset=val_X,
    model_features=feature_names,
    feature=feature_to_plot,
)

pdp.pdp_plot(pdp_dist, feature_to_plot)
plt.show()

In [None]:
# fit a random forest and draw the partial-dependence plot for feature_to_plot
# with it, for comparison against the tree_model plot
rf_model = RandomForestClassifier(random_state=0).fit(
    train_X,
    train_y,
)

pdp_dist = pdp.pdp_isolate(
    model=rf_model,
    dataset=val_X,
    model_features=feature_names,
    feature=feature_to_plot,
)

pdp.pdp_plot(pdp_dist, feature_to_plot)
plt.show()

In [None]:
# 2D partial-dependence plot for a pair of features: same flow as the
# single-feature plots, but with pdp_interact in place of pdp_isolate and
# pdp_interact_plot in place of pdp_plot
features_to_plot = ['Goal Scored', 'Distance Covered (Kms)']
inter1 = pdp.pdp_interact(
    model=tree_model,
    dataset=val_X,
    model_features=feature_names,
    features=features_to_plot,
)

pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=features_to_plot,
    plot_type='contour',
)
plt.show()

#### SHAP values

<a id = 'SHAP-values'></a>

In [None]:
# pick one validation row and show the model's class probabilities for it;
# data_for_prediction is reused by the SHAP cells that follow
row_to_show = 5
data_for_prediction = val_X.iloc[row_to_show]  # use 1 row of data here. Could use multiple rows if desired
# reshape the single row to (1, n_features) so predict_proba receives a 2D array
data_for_prediction_array = data_for_prediction.values.reshape(1, -1)


# last expression of the cell: the notebook displays the predicted probabilities
my_model.predict_proba(data_for_prediction_array)

In [None]:
import shap  # package used to calculate Shap values

# Create object that can calculate shap values
# NOTE(review): TreeExplainer presumably requires my_model to be a tree-based model — confirm
explainer = shap.TreeExplainer(my_model)

# Calculate Shap values for the single row selected in the previous cell
shap_values = explainer.shap_values(data_for_prediction)

In [None]:
# initialise shap's JavaScript support before drawing the force plot in the notebook
shap.initjs()
# index [1] selects the second output of the explainer — presumably the positive class; confirm
shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction)


In [None]:
# use Kernel SHAP to explain the same single-row prediction; the explainer is
# built from the model's predict_proba function and train_X
# NOTE(review): train_X is not defined in this section — confirm it exists before running
k_explainer = shap.KernelExplainer(my_model.predict_proba, train_X)
k_shap_values = k_explainer.shap_values(data_for_prediction)
shap.force_plot(k_explainer.expected_value[1], k_shap_values[1], data_for_prediction)


In [None]:
# placeholder: only references the class (the notebook displays it); no explainer is constructed here
shap.DeepExplainer

In [None]:
import shap  # package used to calculate Shap values

# Create object that can calculate shap values
explainer = shap.TreeExplainer(my_model)

# calculate shap values. This is what we will plot.
# Calculate shap_values for all of val_X rather than a single row, to have more data for plot.
shap_values = explainer.shap_values(val_X)

# Make plot. Index [1] selects the second output — presumably the positive class; confirm.
shap.summary_plot(shap_values[1], val_X)

In [None]:
import shap  # package used to calculate Shap values

# Create object that can calculate shap values
explainer = shap.TreeExplainer(my_model)

# calculate shap values. This is what we will plot.
# NOTE(review): X is not defined in this section, and the feature names below look
# like leftovers from the linked Kaggle course example — confirm / replace for this analysis
shap_values = explainer.shap_values(X)

# make plot: SHAP values of one feature, with a second feature passed as the interaction index
shap.dependence_plot('Ball Possession %', shap_values[1], X, interaction_index="Goal Scored")


## Stacking

<a id = 'Stacking'></a>

### Primary models

<a id = 'Primary-models'></a>

In [None]:
# display the five xgboost rows with the highest value in the 'mean' column
resultsDf[resultsDf['estimator'] == 'xgboost.XGBClassifier'].sort_values(['mean'], ascending = [False])[:5]

In [None]:
def topParamSelector(resultsDf, num):
    """
    Collect, for every estimator in the results table, the iteration ids of its
    best-scoring rows.

    Parameters:
        resultsDf : DataFrame with at least the columns 'estimator', 'mean' and 'iteration'.
        num : how many top rows (highest 'mean' first) to keep per estimator.

    Returns:
        dict mapping each estimator name to a list of its top 'iteration' values.
    """
    def _bestIterations(name):
        # restrict to one estimator, rank by 'mean' descending, keep the leading ids
        subset = resultsDf[resultsDf['estimator'] == name]
        ranked = subset.sort_values(['mean'], ascending = [False])
        return ranked['iteration'][:num].values.tolist()

    return {name: _bestIterations(name) for name in resultsDf['estimator'].unique()}
# keep only the single best iteration per estimator and display the resulting mapping
models = topParamSelector(resultsDf = resultsDf, num = 1)
models


In [None]:
# run the selected first-level models through the stacker; oofTrain / oofValid
# are presumably the out-of-fold predictions for the training and validation
# sets, and columns the matching model labels — confirm in modelStacker
oofTrain, oofValid, columns = train.modelStacker(
    models = models,
    resultsDf = resultsDf,
    XTrain = train.X_.values,
    yTrain = train.y_,
    XValid = valid.X_.values,
    nFolds = 2,
    nJobs = 16,
)
                                                             


In [None]:
# correlation heatmap of the stacker's training-set predictions, one column per model
sns.set_style('whitegrid')
p = qp.plotter.QuickPlot(fig = plt.figure(), chartProp = 15)
ax = p.makeCanvas(position = 111)
p.qpCorrHeatmap(
    df = pd.DataFrame(oofTrain, columns = columns),
    annot = True,
    ax = ax,
    vmin = 0,
)


### Meta model

<a id = 'Meta-model'></a>

In [None]:
# re-import the tuning and stacking submodules, then the mlm package itself —
# presumably so edits to the library source are picked up without restarting the kernel
importlib.reload(mlm.model.tune.bayesianOptimSearch)
importlib.reload(mlm.model.tune.stack)
importlib.reload(mlm)


In [None]:
 # parameter space
# Search space for the meta (second-level) model search.
# Outer keys are estimator paths; each inner dict maps a constructor argument
# to the hyperopt expression it is sampled from. A single-option hp.choice
# pins that argument to a fixed value.
allSpace = {
            'lightgbm.LGBMClassifier' : {
                'class_weight' : hp.choice('class_weight', [None])
                ,'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.7)
                ,'boosting_type' : hp.choice('boosting_type', ['dart'])
                ,'subsample': hp.uniform('subsample', 0.5, 1)
                ,'learning_rate' : hp.uniform('learning_rate', 0.15, 0.25)
                ,'max_depth' : hp.choice('max_depth', np.arange(4, 20, dtype = int))
                ,'min_child_samples' : hp.quniform('min_child_samples', 50, 150, 5)
                ,'n_estimators' : hp.choice('n_estimators', np.arange(100, 4000, 10, dtype = int))
                ,'num_leaves': hp.quniform('num_leaves', 30, 70, 1)
                ,'reg_alpha': hp.uniform('reg_alpha', 0.75, 1.25)
                ,'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0)
                ,'subsample_for_bin': hp.quniform('subsample_for_bin', 100000, 350000, 20000)
            }
            ,'linear_model.LogisticRegression' : {
                'C': hp.uniform('C', 0.04, 0.1)
                ,'penalty': hp.choice('penalty', ['l1'])
            }
            ,'xgboost.XGBClassifier' : {
                'colsample_bytree' : hp.uniform('colsample_bytree', 0.4, 0.7)
                ,'gamma' : hp.quniform('gamma', 0.0, 10, 0.05)
                ,'learning_rate' : hp.quniform('learning_rate', 0.01, 0.2, 0.01)
                ,'max_depth' : hp.choice('max_depth', np.arange(2, 15, dtype = int))
                ,'min_child_weight': hp.quniform ('min_child_weight', 2.5, 7.5, 1)
                ,'n_estimators' : hp.choice('n_estimators', np.arange(100, 4000, 10, dtype = int))
                ,'subsample': hp.uniform ('subsample', 0.4, 0.7)
            }
            ,'ensemble.RandomForestClassifier' : {
                'bootstrap' : hp.choice('bootstrap', [True, False])
                ,'max_depth' : hp.choice('max_depth', np.arange(2, 10, dtype = int))
                ,'n_estimators' : hp.choice('n_estimators', np.arange(100, 8000, 10, dtype = int))
                ,'max_features' : hp.choice('max_features', ['sqrt'])
                ,'min_samples_split' : hp.choice('min_samples_split', np.arange(15, 25, dtype = int))
                ,'min_samples_leaf' : hp.choice('min_samples_leaf', np.arange(2, 20, dtype = int))
            }
            ,'ensemble.GradientBoostingClassifier' : {
                'n_estimators' : hp.choice('n_estimators', np.arange(100, 4000, 10, dtype = int))
                ,'max_depth' : hp.choice('max_depth', np.arange(2, 11, dtype = int))
                ,'max_features' : hp.choice('max_features', ['sqrt'])
                ,'learning_rate' : hp.quniform('learning_rate', 0.01, 0.09, 0.01)
                ,'loss' : hp.choice('loss', ['deviance','exponential'])
                ,'min_samples_split' : hp.choice('min_samples_split', np.arange(2, 40, dtype = int))
                ,'min_samples_leaf' : hp.choice('min_samples_leaf', np.arange(2, 40, dtype = int))
            }
            ,'ensemble.AdaBoostClassifier' : {
                'n_estimators' : hp.choice('n_estimators', np.arange(100, 4000, 10, dtype = int))
                ,'learning_rate' : hp.quniform('learning_rate', 0.1, 0.25, 0.01)
                ,'algorithm' : hp.choice('algorithm', ['SAMME'])
            }
            ,'naive_bayes.BernoulliNB' : {
                'alpha' :  hp.uniform('alpha', 0.01, 2)
            }
            ,'ensemble.BaggingClassifier' : {
                'n_estimators' : hp.choice('n_estimators', np.arange(100, 4000, 10, dtype = int))
                ,'max_samples' : hp.uniform('max_samples', 0.01, 0.3)
            }
            ,'ensemble.ExtraTreesClassifier' : {
                'n_estimators' : hp.choice('n_estimators', np.arange(100, 4000, 10, dtype = int))
                ,'max_depth' : hp.choice('max_depth', np.arange(2, 15, dtype = int))
                ,'min_samples_split' : hp.choice('min_samples_split', np.arange(4, 30, dtype = int))
                ,'min_samples_leaf' : hp.choice('min_samples_leaf', np.arange(2, 20, dtype = int))
                ,'max_features' : hp.choice('max_features', ['auto'])
                ,'criterion' : hp.choice('criterion', ['entropy'])
            }
            # the separating comma before this entry was missing, which made the whole literal a SyntaxError
            ,'svm.SVC' : {
                'C' : hp.uniform('C', 0.00000001, 15)
                ,'decision_function_shape' : hp.choice('decision_function_shape', ['ovr','ovo'])
                ,'gamma' : hp.uniform('gamma', 0.00000001, 1.5)
            }
            ,'neighbors.KNeighborsClassifier' : {
                'algorithm' : hp.choice('algorithm', ['ball_tree','brute'])
                ,'n_neighbors' : hp.choice('n_neighbors', np.arange(1, 15, dtype = int))
                ,'weights' : hp.choice('weights', ['uniform'])
            }
}


In [None]:
# Bayesian optimization search for the meta model: same call as the first-level
# search, but fitted on the stacker output (oofTrain) and written to a separate
# "_meta_..._2" results path
train.execBayesOptimSearch(
    allSpace = allSpace,
    resultsDir = 'data/{}_hyperopt_meta_{}_2.csv'.format(rundate, analysis),
    X = oofTrain,
    y = train.y_,
    scoring = 'accuracy',
    n_folds = 8,
    n_jobs = 8,
    iters = 3000,
    verbose = 0,
)


In [None]:
# read scores summary table for the meta-model search
# NOTE(review): the file name is pinned to the 20190423 run rather than built
# from rundate/analysis like the resultsDir handed to the meta search — confirm
# this is the run that should be evaluated
resultsMetaDf = pd.read_csv('data/20190423_hyperopt_meta_titanic_2.csv', na_values = 'nan')
resultsMeta = train.unpackParams(resultsMetaDf)


In [None]:
# loss plot built from the unpacked meta-model search results
train.lossPlot(resultsDf = resultsMeta)


In [None]:
# estimator parameter plots for the meta-model search, drawn against the meta search space (allSpace)
# nIter presumably controls how many iterations are shown — confirm in paramPlot
train.paramPlot(results = resultsMeta, allSpace = allSpace, nIter = 100)


# Submission

<a id = 'Submission'></a>

## Standard

<a id = 'Standard'></a>

In [None]:
## standard model fit and predict

# select estimator and iteration (commented pairs are other candidates that were tried)
# estimator = 'ensemble.RandomForestClassifier'
# iteration = 1955
# estimator = 'xgboost.XGBClassifier'
# iteration = 2097
estimator = 'lightgbm.LGBMClassifier'
iteration = 2264

# extract params and instantiate model
params = train.paramExtractor(resultsDf = resultsDf, estimator = estimator, iteration = iteration)
# NOTE(security): eval() executes a string assembled from the estimator name and
# the params taken from resultsDf (which is read from a CSV) — only run this
# against result files you trust.
# The expression only resolves if the leading module name in `estimator`
# (e.g. lightgbm) is bound in the notebook namespace.
model = eval('{0}(**{1})'.format(estimator, params))

# fit model on the full training set and predict the validation (submission) set
model.fit(train.X_, train.y_)
yPred = model.predict(valid.X_)


In [None]:
# generate prediction submission file: one row per PassengerId with the predicted Survived value
my_submission = pd.DataFrame({'PassengerId': dfValid.PassengerId, 'Survived': yPred})
# index = False keeps the DataFrame index out of the CSV
my_submission.to_csv('data/submission.csv', index = False)


## Stack

<a id = 'Stack'></a>

In [None]:
# display the five meta-model search rows with the highest value in the 'mean' column
resultsMetaDf.sort_values(['mean'], ascending = [False])[:5]

In [None]:
# best second level learning model (commented lines are other candidates that were tried)
# estimator = 'xgboost.XGBClassifier'
# estimator = 'ensemble.RandomForestClassifier'
# estimator = 'ensemble.GradientBoostingClassifier'
estimator = 'svm.SVC'

iteration = 2436

# extract params and instantiate model
params = train.paramExtractor(resultsDf = resultsMetaDf, estimator = estimator, iteration = iteration)
# NOTE(security): eval() executes a string assembled from the estimator name and
# the params taken from resultsMetaDf (which is read from a CSV) — only run this
# against result files you trust.
# The expression only resolves if the leading module name in `estimator`
# (e.g. svm) is bound in the notebook namespace.
model = eval('{0}(**{1})'.format(estimator, params))

# fit the meta model on the stacker's training output and predict from its validation output
model.fit(oofTrain, train.y_)
yPred = model.predict(oofValid)
# quick sanity check: sum of the predictions (the count of 1s if labels are 0/1)
print(sum(yPred))

In [None]:
# generate prediction submission file from the stacked model's predictions
# NOTE(review): this writes to the same path as the standard-model submission
# cell, so whichever cell runs last determines the file's contents
my_submission = pd.DataFrame({'PassengerId': dfValid.PassengerId, 'Survived': yPred})
my_submission.to_csv('data/submission.csv', index = False)


In [None]:
# source article for this stacking experiment (a bare string literal: evaluated and discarded)
'https://medium.com/@rrfd/boosting-bagging-and-stacking-ensemble-methods-with-sklearn-and-mlens-a455c0c982de'

from itertools import combinations
from sklearn.linear_model import LogisticRegression
# meta learner added on top of every candidate stack in the loop further down
lr = LogisticRegression()
# display names for the base learners; presumably index-aligned with clf_array,
# which is not defined in this cell — confirm
names = ['Random Forest', 'Extra Trees', 'KNeighbors', 'SVC', 'Ridge Classifier']
def zip_stacked_classifiers(*args):
    """
    Pair up the non-empty subsets of two index-aligned sequences.

    For each positional argument, every non-empty combination of its items is
    listed, ordered by subset size and then by item position. Because both
    arguments are enumerated in the same order, the i-th subset of the first
    lines up with the i-th subset of the second (e.g. classifiers with their
    display names).

    Parameters:
        *args : sequences to expand; only the first two are paired, and at
            least two must be supplied.

    Returns:
        list of (subset_of_first, subset_of_second) tuples, each subset a list.

    Raises:
        IndexError : if fewer than two sequences are passed.
    """
    to_zip = []
    for arg in args:
        # Build real lists: in Python 3 map()/filter() are lazy iterators, so the
        # earlier sum([map(...), ...], []) raised TypeError (list + map).
        # Starting at size 1 skips the empty combination.
        subsets = [
            list(combo)
            for size in range(1, len(arg) + 1)
            for combo in combinations(arg, size)
        ]
        to_zip.append(subsets)

    return list(zip(to_zip[0], to_zip[1]))
# Try every paired combination of base learners, stack each one under the
# logistic-regression meta learner (lr), and remember the most accurate.
# NOTE(review): clf_array, seed, SuperLearner, accuracy_score and the
# X_train / y_train / X_test / y_test splits are not defined in this cell —
# confirm they exist before running.
stacked_clf_list = zip_stacked_classifiers(clf_array, names)
best_combination = [0.00, ""]  # [best accuracy so far, names of the learners that produced it]
for clf in stacked_clf_list:
    # clf[0] holds the learners for this combination, clf[1] their display names

    ensemble = SuperLearner(scorer = accuracy_score,
                            random_state = seed,
                            folds = 10)
    ensemble.add(clf[0])
    ensemble.add_meta(lr)
    ensemble.fit(X_train, y_train)
    preds = ensemble.predict(X_test)
    accuracy = accuracy_score(preds, y_test)

    if accuracy > best_combination[0]:
        best_combination[0] = accuracy
        best_combination[1] = clf[1]

    # format inside the call: print() returns None in Python 3, so the earlier
    # print(...).format(...) raised AttributeError
    print("Accuracy score: {:.3f} {}".format(accuracy, clf[1]))
print("\nBest stacking model is {} with accuracy of: {:.3f}".format(best_combination[1], best_combination[0]))