__Kaggle competition - house prices__

1. [Import](#Import)
    1. [Tools](#Tools)
    1. [Data](#Data)    
1. [EDA](#EDA)
    1. [Categorical feature EDA](#Categorical-feature-EDA)
    1. [Numeric feature EDA](#numeric-feature-EDA)
    1. [Faceting](#Faceting)
    1. [Target variable evaluation](#Target-variable-evaluation)    
1. [Data preparation](#Data-preparation)
    1. [Missing data](#Missing-data)
    1. [Engineering](#Engineering)
    1. [Encoding](#Encoding)
    1. [Transformation](#Transformation)
        1. [Polynomial features](#Polynomial-features)
        1. [Skew](#Skew)
        1. [Scale](#Scale)
    1. [Outliers](#Outliers)
1. [Feature importance](#Feature-importance)    
1. [Modeling](#Modeling)
    1. [Data preparation](#Data-preparation-1)
    1. [Bayesian hyper-parameter optimization](#Bayesian-hyper-parameter-optimization)
    1. [Model performance evaluation - standard models](#Model-performance-evaluation-standard-models)
    1. [Model explainability](#Model-explanability)
    1. [Submission - standard models](#Submission-standard-models)
1. [Stacking](#Stacking)
    1. [Primary models](#Primary-models)
    1. [Meta model](#Meta-model)                
    1. [Model performance evaluation - stacked models](#Model-performance-evaluation-stacked-models)
    1. [Submission - stacked models](#Submission-stacked-models)    

# Import

<a id = 'Import'></a>

## Tools

<a id = 'Tools'></a>

In [None]:
# standard library and settings
import os
import sys
import importlib
import itertools
from functools import reduce
import time; rundate = time.strftime("%Y%m%d")

import warnings
warnings.simplefilter("ignore")

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# data extensions and settings
import numpy as np
np.set_printoptions(threshold=np.inf, suppress=True)

import pandas as pd
pd.set_option("display.max_rows", 500); pd.set_option("display.max_columns", 500)
pd.options.display.float_format = "{:,.6f}".format

# modeling extensions
import sklearn.base as base
import sklearn.datasets as datasets
import sklearn.ensemble as ensemble
import sklearn.impute as impute
import sklearn.linear_model as linear_model
import sklearn.neighbors as neighbors
import sklearn.pipeline as pipeline
import sklearn.preprocessing as preprocessing
import sklearn.svm as svm

from hyperopt import hp

import lightgbm
import xgboost

import eif
import shap
shap.initjs()
from eli5.sklearn import PermutationImportance
from pdpbox import pdp, get_dataset, info_plots

# visualization extensions and settings
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

%matplotlib inline

# Import mlmachine and prettierplot, preferring local development checkouts.
# NOTE(review): `asdfasd` is presumably a placeholder module that does not exist, so
# the import fails and the except branch (local checkouts) always runs; the imports of
# the installed packages are commented out.
try:
    #     import mlmachine as mlm
    #     from prettierplot.plotter import PrettierPlot
    #     import prettierplot.style as style
    import asdfasd
except ModuleNotFoundError:
    # Make the sibling repositories importable. The membership test uses the same path
    # that gets appended, so re-running this cell does not add duplicate sys.path entries.
    for local_repo in ("../../../mlmachine", "../../../prettierplot"):
        if local_repo not in sys.path:
            sys.path.append(local_repo)

    import mlmachine as mlm
    import mlmachine.data as data
    from mlmachine.features.preprocessing import (
        DataFrameSelector,
        PlayWithPandas,
        UnprocessedColumnAdder,
        ContextImputer,
        PandasFeatureUnion,
        DualTransformer,
    )
    from prettierplot.plotter import PrettierPlot
    import prettierplot.style as style
else:
    # NOTE(review): this branch only runs when the placeholder import above succeeds,
    # in which case none of the names imported in the except branch are defined.
    print(
        "This notebook relies on the libraries mlmachine and prettierplot. Please run:"
    )
    print("\tpip install mlmachine")
    print("\tpip install prettierplot")

## Data

<a id = 'Data'></a>

In [None]:
# load the housing training / validation sets through mlmachine's data module
# (alternative source, kept for reference)
# df_train = pd.read_csv("s3://tdp-ml-datasets/kaggle-housing/train.csv")
# df_valid = pd.read_csv("s3://tdp-ml-datasets/kaggle-housing/test.csv")
df_train, df_valid = data.housing()

# report the shape of each set
train_shape, valid_shape = df_train.shape, df_valid.shape
print("Training data dimensions: {}".format(train_shape))
print("Validation data dimensions: {}".format(valid_shape))

In [None]:
# print the column summary, then render the first 5 rows
df_train.info()
display(df_train.head(5))

In [None]:
# count how many training columns there are of each dtype
df_train.dtypes.value_counts()

In [None]:
# load training data into mlmachine, with SalePrice as the numeric target
# columns passed to force_to_categorical
categorical_overrides = [
    "MSSubClass",
    "OverallQual",
    "OverallCond",
    "YearBuilt",
    "YearRemodAdd",
    "MoSold",
    "YrSold",
]
train = mlm.Machine(
    data=df_train,
    target="SalePrice",
    remove_features=["Id", "MiscVal"],
    force_to_categorical=categorical_overrides,
    target_type="numeric",
)
print(train.data.shape)

In [None]:
# load validation data into mlmachine (no target is supplied for this set)
# columns passed to force_to_categorical
categorical_overrides = [
    "MSSubClass",
    "OverallQual",
    "OverallCond",
    "YearBuilt",
    "YearRemodAdd",
    "MoSold",
    "YrSold",
]
valid = mlm.Machine(
    data=df_valid,
    remove_features=["Id", "MiscVal"],
    force_to_categorical=categorical_overrides,
)
print(valid.data.shape)

# EDA

<a id = 'EDA'></a>

## Categorical feature EDA

<a id = 'Categorical-feature-EDA'></a>

##### Univariate & feature vs. target

In [None]:
# run the categorical-feature vs. numeric-target EDA for every categorical feature
for categorical_feature in train.feature_type["categorical"]:
    train.eda_num_target_cat_feat(
        feature=categorical_feature,
        level_count_cap=50,
    )

## Numeric feature EDA

<a id = 'numeric-feature-EDA'></a>

##### Univariate & feature vs. target

In [None]:
# run the numeric-feature vs. numeric-target EDA for every numeric feature
for numeric_feature in train.feature_type["numeric"]:
    train.eda_num_target_num_feat(feature=numeric_feature)

##### Correlation

###### Correlation (all samples)

In [None]:
# correlation heat map over the training data
plotter = PrettierPlot(chart_prop=25)
canvas = plotter.make_canvas()
plotter.pretty_corr_heatmap(df=train.data, ax=canvas)

###### Correlation (top vs. target)

In [None]:
# heat map of the features most highly correlated with the target (thresh=0.6)
plotter = PrettierPlot(plot_orientation="tall", chart_prop=15)
canvas = plotter.make_canvas()
plotter.pretty_corr_heatmap_target(
    df=train.data,
    target=train.target,
    thresh=0.6,
    annot=True,
    ax=canvas,
)

> Remarks - There are three pairs of highly correlated features:
    - 'GarageArea' and 'GarageCars'
    - 'TotRmsAbvGrd' and 'GrLivArea'
    - '1stFlrSF' and 'TotalBsmtSF'
This makes sense, given what each feature represents and how the two items in each pair relate to each other. We likely only need one feature from each pair.

##### Pair plot

In [None]:
# pair plot of selected numeric features
pair_plot_columns = [
    "LotFrontage",
    "LotArea",
    "MasVnrArea",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "GrLivArea",
    "TotRmsAbvGrd",
    "GarageYrBlt",
    "GarageArea",
    "WoodDeckSF",
    "OpenPorchSF",
]
plotter = PrettierPlot(chart_prop=10)
plotter.pretty_pair_plot(df=train.data, cols=pair_plot_columns, diag_kind="auto")

## Faceting

<a id = 'Faceting'></a>

##### Categorical by categorical

##### Categorical by numeric

## Target variable evaluation

<a id = 'Target-variable-evaluation'></a>

In [None]:
# evaluate distribution of target variable (presumably as-is, then log-transformed)
# NOTE(review): these two method names are camelCase while the other Machine methods
# called in this notebook are snake_case - confirm they exist under these names
train.edaTransformInitial(data=train.target, name=train.target.name)
train.edaTransformLog1(data=train.target, name=train.target.name)

In [None]:
# replace the target with log(1 + target); predictions made on this scale need
# np.expm1 to get back to the original units
train.target = np.log1p(train.target)

# Data preparation

<a id = 'Data-preparation'></a>

## Missing data

-__MCAR__ (missing completely at random) - Completely unsystematic missingness, completely unrelated to any of the other variables. Simple imputation with the mean, median or mode is most acceptable for this type of missingness.

-__MAR__ (missing at random) - The nature of the missing data is related to observed data in other variables, not the missing data. The missing data is conditional on some other variable. For example, men are more likely to tell you their weight than women. The missingness of weight has to do with gender.

-__MNAR__ (missing not at random) - There is a relationship between the propensity of a value to be missing and its values. For example, the wealthiest people choosing not to state their income.



<a id = 'Missing-data'></a>

##### Training

In [None]:
# evaluate missing data in the training set
train.eda_missing_summary()

In [None]:
# missingno matrix - shows where values are missing across the training rows
msno.matrix(train.data)

In [None]:
# missingno bar - per-column count of non-missing values in the training set
msno.bar(train.data)

In [None]:
# missingno heatmap - how strongly missingness in one training column tracks another
msno.heatmap(train.data)

In [None]:
# missingno dendrogram - groups training columns with similar missingness patterns
msno.dendrogram(train.data)

##### Validation

In [None]:
# evaluate missing data in the validation set
valid.eda_missing_summary()

In [None]:
# missingno matrix - shows where values are missing across the validation rows
msno.matrix(valid.data)

In [None]:
# missingno bar - per-column count of non-missing values in the validation set
msno.bar(valid.data)

In [None]:
# missingno heatmap - how strongly missingness in one validation column tracks another
msno.heatmap(valid.data)

In [None]:
# missingno dendrogram - groups validation columns with similar missingness patterns
msno.dendrogram(valid.data)

##### Training vs. validation


In [None]:
# compare the features that have missing data in the training set against those
# that have missing data in the validation set
train.missing_col_compare(train=train.data, validation=valid.data)

##### Impute


In [None]:
# impute pipeline
# Columns are grouped by imputation strategy. Anything not listed is passed through
# unchanged by the final "diff" branch.
categoricalConstant = [
    "GarageFinish", "Alley", "MasVnrType", "GarageType", "BsmtFinType1",
    "BsmtCond", "BsmtFinType2", "BsmtQual", "PoolQC", "GarageCond",
    "FireplaceQu", "GarageQual", "Fence", "BsmtExposure", "MiscFeature",
]
numericConstant = [
    "GarageYrBlt", "MasVnrArea", "BsmtUnfSF", "GarageArea",
    "BsmtFinSF1", "TotalBsmtSF", "BsmtFinSF2",
]
categoricalMode = [
    "Electrical", "Functional", "SaleType", "Exterior1st",
    "MSZoning", "Exterior2nd", "KitchenQual", "Utilities",
]
numericMode = ["BsmtHalfBath", "GarageCars", "BsmtFullBath"]

# Pass-through columns keep the order they have in train.data. Iterating a bare set
# difference instead would make the column order depend on string hashing, which
# differs from one interpreter session to the next.
imputed_columns = {
    "LotFrontage",
    *categoricalConstant,
    *numericConstant,
    *categoricalMode,
    *numericMode,
}
passthrough_columns = [col for col in train.data.columns if col not in imputed_columns]

impute_pipe = PandasFeatureUnion([
    # categorical gaps are filled with the literal level "Nonexistent"
    ("catConstant", pipeline.make_pipeline(
        DataFrameSelector(categoricalConstant),
        PlayWithPandas(impute.SimpleImputer(strategy="constant", fill_value="Nonexistent"))
    )),
    # numeric gaps are filled with 0
    ("numConstant", pipeline.make_pipeline(
        DataFrameSelector(numericConstant),
        PlayWithPandas(impute.SimpleImputer(strategy="constant", fill_value=0))
    )),
    # remaining categorical / numeric gaps take the most frequent training value
    ("catMode", pipeline.make_pipeline(
        DataFrameSelector(categoricalMode),
        PlayWithPandas(impute.SimpleImputer(strategy="most_frequent"))
    )),
    ("numMode", pipeline.make_pipeline(
        DataFrameSelector(numericMode),
        PlayWithPandas(impute.SimpleImputer(strategy="most_frequent"))
    )),
    # LotFrontage is imputed with strategy="mean", using Neighborhood as the context column
    ("LotFrontage", pipeline.make_pipeline(
        DataFrameSelector(["LotFrontage","Neighborhood"]),
        ContextImputer(null_col="LotFrontage", context_col="Neighborhood", strategy="mean")
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthrough_columns),
    )),
])

# fit on the training data only, then apply the fitted imputers to the validation data
train.data = impute_pipe.fit_transform(train.data)
valid.data = impute_pipe.transform(valid.data)

##### Validation

In [None]:
# re-run the missing data summary on the training set after imputation
train.eda_missing_summary()

In [None]:
# re-run the missing data summary on the validation set after imputation
valid.eda_missing_summary()

## Engineering

<a id = 'Engineering'></a>

##### Training

In [None]:
# additional features (training): total finished basement area and total square footage
train.data["BsmtFinSF"] = train.data["BsmtFinSF1"].add(train.data["BsmtFinSF2"])
train.data["TotalSF"] = (
    train.data["TotalBsmtSF"]
    .add(train.data["1stFlrSF"])
    .add(train.data["2ndFlrSF"])
)

# refresh the feature-type bookkeeping so the new columns are registered
train.feature_type_update()

In [None]:
# evaluate the two engineered features against the target
engineered_features = ("BsmtFinSF", "TotalSF")
for engineered in engineered_features:
    train.eda_num_target_num_feat(feature=engineered)

##### Validation

In [None]:
# additional features (validation): same definitions as for the training data
valid.data["BsmtFinSF"] = valid.data["BsmtFinSF1"].add(valid.data["BsmtFinSF2"])
valid.data["TotalSF"] = (
    valid.data["TotalBsmtSF"]
    .add(valid.data["1stFlrSF"])
    .add(valid.data["2ndFlrSF"])
)

# refresh the feature-type bookkeeping so the new columns are registered
valid.feature_type_update()

## Encoding

<a id = 'Encoding'></a>

##### Training

In [None]:
# number of distinct values in each categorical column of the training data
train_categoricals = train.data[train.feature_type["categorical"]]
train_categoricals.nunique(axis=0)

In [None]:
# print the distinct values of every categorical column in the training data
for categorical_col in train.feature_type["categorical"]:
    levels = np.unique(train.data[categorical_col])
    print(categorical_col, levels)

##### Validation

In [None]:
# number of distinct values in each categorical column of the validation data
valid_categoricals = valid.data[valid.feature_type["categorical"]]
valid_categoricals.nunique(axis=0)

In [None]:
# print the distinct values of every categorical column in the validation data
for categorical_col in valid.feature_type["categorical"]:
    levels = np.unique(valid.data[categorical_col])
    print(categorical_col, levels)

##### Training vs. validation

In [None]:
# report, per categorical column, the levels that occur in only one of the two data sets
for col in train.feature_type["categorical"]:
    train_levels = set(train.data[col].unique())
    valid_levels = set(valid.data[col].unique())

    train_only = train_levels - valid_levels
    valid_only = valid_levels - train_levels

    # nothing to report when both sets share exactly the same levels
    if not (train_only or valid_only):
        continue

    print("\n\n*** " + col)
    print("Value present in training data, not in validation data")
    print(train_only)
    print("Value present in validation data, not in training data")
    print(valid_only)

##### Encode


In [None]:
# nominal columns are one-hot encoded
nominal_columns = ["MSSubClass","MSZoning","LandContour","Neighborhood","Condition1","Condition2","BldgType",
    "HouseStyle","RoofStyle","RoofMatl","Exterior1st","Exterior2nd","MasVnrType","Foundation","Heating",
    "GarageType","Fence","SaleType","SaleCondition","MiscFeature",]
# ordinal columns are mapped to integers using the explicit level orders below
ordinal_columns = ["Street","Alley","LotShape","Utilities","LotConfig","LandSlope","ExterQual","ExterCond","BsmtQual",
    "BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","HeatingQC","CentralAir","Electrical","KitchenQual",
    "Functional","FireplaceQu","GarageFinish","GarageQual","GarageCond","PavedDrive","PoolQC",]

# one entry per ordinal column, in the same order as ordinal_columns
ordinal_encodings = [
    ["Grvl", "Pave"],  # Street
    ["Nonexistent", "Grvl", "Pave"],  # Alley
    ["IR3", "IR2", "IR1", "Reg"],  # LotShape
    ["ELO", "NoSeWa", "NoSewr", "AllPub"],  # Utilities
    ["FR3", "FR2", "Corner", "Inside", "CulDSac"],  # LotConfig
    ["Sev", "Mod", "Gtl"],  # LandSlope
    ["Po", "Fa", "TA", "Gd", "Ex"],  # ExterQual
    ["Po", "Fa", "TA", "Gd", "Ex"],  # ExterCond
    ["Nonexistent", "Po", "Fa", "TA", "Gd", "Ex"],  # BsmtQual
    ["Nonexistent", "Po", "Fa", "TA", "Gd", "Ex"],  # BsmtCond
    ["Nonexistent", "No", "Mn", "Av", "Gd"],  # BsmtExposure
    ["Nonexistent", "Unf", "LwQ", "BLQ", "Rec", "ALQ", "GLQ"],  # BsmtFinType1
    ["Nonexistent", "Unf", "LwQ", "BLQ", "Rec", "ALQ", "GLQ"],  # BsmtFinType2
    ["Po", "Fa", "TA", "Gd", "Ex"],  # HeatingQC
    ["N", "Y"],  # CentralAir
    ["FuseP", "FuseF", "FuseA", "Mix", "SBrkr"],  # Electrical
    ["Po", "Fa", "TA", "Gd", "Ex"],  # KitchenQual
    ["Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"],  # Functional
    ["Nonexistent", "Po", "Fa", "TA", "Gd", "Ex"],  # FireplaceQu
    ["Nonexistent", "Unf", "RFn", "Fin"],  # GarageFinish
    ["Nonexistent", "Po", "Fa", "TA", "Gd", "Ex"],  # GarageQual
    ["Nonexistent", "Po", "Fa", "TA", "Gd", "Ex"],  # GarageCond
    ["N", "P", "Y"],  # PavedDrive
    ["Nonexistent", "Fa", "TA", "Gd", "Ex"],  # PoolQC
]

# Pass-through columns keep the order they have in train.data. Iterating a bare set
# difference instead would make the column order depend on string hashing, which
# differs from one interpreter session to the next.
encoded_columns = set(nominal_columns + ordinal_columns)
passthrough_columns = [col for col in train.data.columns if col not in encoded_columns]

# encode pipeline
encode_pipe = PandasFeatureUnion([
    ("ordinal", pipeline.make_pipeline(
        DataFrameSelector(ordinal_columns),
        PlayWithPandas(preprocessing.OrdinalEncoder(categories=ordinal_encodings)),
    )),
    # handle_unknown="ignore": levels unseen during fit do not raise at transform time
    ("nominal", pipeline.make_pipeline(
        DataFrameSelector(nominal_columns),
        PlayWithPandas(preprocessing.OneHotEncoder(handle_unknown="ignore")),
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthrough_columns),
    )),
])

# fit on the training data only, then apply the fitted encoders to the validation data
train.data = encode_pipe.fit_transform(train.data)
valid.data = encode_pipe.transform(valid.data)

train.feature_type_update()
valid.feature_type_update()

## Transformation

<a id = 'Transformation'></a>

### Polynomial features


<a id = 'Polynomial-features'></a>

##### Value override

In [None]:
# change clearly erroneous value to what it probably was
# The result is assigned back rather than using replace(inplace=True) on the selected
# column: the selection can be a temporary copy, in which case an in-place edit never
# reaches valid.data.
valid.data["GarageYrBlt"] = valid.data["GarageYrBlt"].replace({2207: 2007})

##### Transform

In [None]:
# polynomial pipe: degree-2 terms (squares and pairwise products, no bias column)
# for the numeric features; every other column is passed through unchanged
numeric_columns = train.feature_type["numeric"]

# Pass-through columns keep the order they have in train.data. Iterating a bare set
# difference instead would make the column order depend on string hashing, which
# differs from one interpreter session to the next.
numeric_lookup = set(numeric_columns)
passthrough_columns = [col for col in train.data.columns if col not in numeric_lookup]

polynomial_pipe = PandasFeatureUnion([
    ("polynomial", pipeline.make_pipeline(
        DataFrameSelector(numeric_columns),
        PlayWithPandas(preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False))
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthrough_columns),
    )),
])

# fit on the training data only, then apply to the validation data
train.data = polynomial_pipe.fit_transform(train.data)
valid.data = polynomial_pipe.transform(valid.data)

train.feature_type_update()
valid.feature_type_update()

### Skew


<a id = 'Skew'></a>

##### Training

In [None]:
# evaluate skew of numeric features - training data
train.skew_summary()

##### Validation

In [None]:
# evaluate skew of numeric features - validation data
valid.skew_summary()

##### Transform


In [None]:
# # skew pipe
# skew_pipe = PandasFeatureUnion([
#     ("skew", pipeline.make_pipeline(
#         DataFrameSelector(train.feature_type["numeric"]),
#         DualTransformer(),
#     )),
#     ("diff", pipeline.make_pipeline(
#         DataFrameSelector(list(set(train.data.columns).difference(train.feature_type["numeric"]))),
#     )),
# ])

# train.data = skew_pipe.fit_transform(train.data)
# valid.data = skew_pipe.transform(valid.data)

# train.feature_type_update()
# valid.feature_type_update()

### Scale


<a id = 'Scale'></a>

##### Transform

In [None]:
# scale pipe: standardize the numeric features; every other column is passed through
numeric_columns = train.feature_type["numeric"]

# Pass-through columns keep the order they have in train.data. Iterating a bare set
# difference instead would make the column order depend on string hashing, which
# differs from one interpreter session to the next.
numeric_lookup = set(numeric_columns)
passthrough_columns = [col for col in train.data.columns if col not in numeric_lookup]

scale_pipe = PandasFeatureUnion([
    ("scale", pipeline.make_pipeline(
        DataFrameSelector(numeric_columns),
        PlayWithPandas(preprocessing.StandardScaler())
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthrough_columns),
    )),
])

# the scaler is fit on the training data only and then applied to the validation data
train.data = scale_pipe.fit_transform(train.data)
valid.data = scale_pipe.transform(valid.data)

train.feature_type_update()
valid.feature_type_update()

## Outliers

<a id = 'Outliers'></a>

In [None]:
# identify outliers using IQR (drop_outliers=False, so rows are presumably only flagged)
iqr_detector = train.OutlierIQR(
    features=train.feature_type["numeric"],
    outlier_count=20,
    iqr_step=1.5,
    drop_outliers=False,
)
trainPipe = pipeline.Pipeline([("outlier", iqr_detector)])
train.data = trainPipe.transform(train.data)

# capture the flagged index values, sorted ascending
flagged_by_iqr = trainPipe.named_steps["outlier"].outliers_
iqr_outliers = np.array(sorted(flagged_by_iqr))
print(iqr_outliers)

In [None]:
# identify outliers using Isolation Forest
# NOTE(review): the `behaviour` argument was deprecated and later removed from
# scikit-learn's IsolationForest; on recent releases it has to be dropped from this call
forest_input = train.data[train.data.columns]
clf = ensemble.IsolationForest(
    behaviour="new",
    max_samples=train.data.shape[0],
    random_state=0,
    contamination=0.01,
)
preds = clf.fit(forest_input).predict(forest_input)

# a prediction of -1 marks an anomaly; collect the index labels of those rows
mask = preds == -1
if_outliers = np.array(train.data.index[mask])
print(if_outliers)

In [None]:
# identify outliers using extended isolation forest (drop_outliers=False, so rows are
# presumably only flagged)
eif_detector = train.ExtendedIsoForest(
    cols=train.feature_type["numeric"],
    n_trees=100,
    sample_size=256,
    ExtensionLevel=1,
    anomalies_ratio=0.03,
    drop_outliers=False,
)
trainPipe = pipeline.Pipeline([("outlier", eif_detector)])
train.data = trainPipe.transform(train.data)

# capture the flagged index values, sorted ascending
flagged_by_eif = trainPipe.named_steps["outlier"].outliers_
eif_outliers = np.array(sorted(flagged_by_eif))
print(eif_outliers)

In [None]:
# keep only the observations flagged by all three detectors
outliers = iqr_outliers
for flagged in (if_outliers, eif_outliers):
    outliers = np.intersect1d(outliers, flagged)
# outliers = reduce(np.intersect1d, (if_outliers, eif_outliers))
print(outliers)

In [None]:
# review outlier identification summary
outlier_summary = train.outlier_summary(
    iqr_outliers=iqr_outliers,
    if_outliers=if_outliers,
    eif_outliers=eif_outliers,
)

# show only the rows whose "Count" is at least 3
flagged_three_times = outlier_summary["Count"] >= 3
outlier_summary[flagged_three_times]

In [None]:
# capture index values of known outliers
# NOTE(review): train.data was standardized in the Scale section, so these raw-unit
# thresholds may no longer match any rows at this point - confirm
known_outlier_mask = (
    (train.data["LotArea"] > 60000)
    | (train.data["LotFrontage"] > 300)
    | (train.data["GrLivArea"] > 4000)
)
knownOutliers = sorted(set(train.data.index[known_outlier_mask].tolist()))
print(knownOutliers)

# index of known outliers and outliers identified with the known outliers removed
# (hard-coded; this is the list that is actually dropped below)
outliers = [
    53, 185, 197, 437, 492, 762, 796, 821, 847, 1161, 1221, 1318, 1376,
    249, 313, 335, 451, 523, 691, 706, 934, 1182, 1298,
]
print(outliers)

# remove outliers from predictors and response so that both stay aligned
train.data = train.data.drop(index=outliers)
train.target = train.target.drop(index=outliers)

print(train.data.shape)
print(train.target.shape)

# Feature importance

<a id = 'Feature-importance'></a>

In [None]:
# generate feature importance summary
# estimator classes (not instances) handed to the feature selector
estimators = [
    lightgbm.LGBMRegressor,
    ensemble.RandomForestRegressor,
    ensemble.GradientBoostingRegressor,
    ensemble.ExtraTreesRegressor,
    ensemble.AdaBoostRegressor,
    xgboost.XGBRegressor,
]

# regression problem (classification=False)
fs = train.FeatureSelector(
    data=train.data,
    target=train.target,
    estimators=estimators,
    rank=True,
    classification=False,
)
# the summary run is disabled; the cells below use a previously saved summary CSV instead
# feature_selector_summary = fs.featureSelectorSuite()
# feature_selector_summary[:20]

In [None]:
# calculate cross-validation performance
# estimator classes (not instances) to cross-validate
estimators = [
    svm.SVR,
    lightgbm.LGBMRegressor,
    xgboost.XGBRegressor,
    ensemble.RandomForestRegressor,
    ensemble.GradientBoostingRegressor,
    ensemble.AdaBoostRegressor,
    ensemble.ExtraTreesRegressor,
    neighbors.KNeighborsRegressor,
]

# NOTE(review): feature_selector_summary is given as a CSV file name here, while
# createCrossValFeaturesDf below receives a DataFrame - confirm both forms are accepted
cvSummary = fs.feature_selector_cross_val(
    estimators=estimators,
    feature_selector_summary="featureSelectionSummary_20191027_022938.csv",
    scoring=["root_mean_squared_error"],
    n_folds=8,
    step=1,
    n_jobs=4,
)

###### Root mean squared error

In [None]:
# visualize CV performance for diminishing feature set
# both inputs are read from previously saved CSV files rather than in-memory results
fs.featureSelectorResultsPlot(
    metric="root_mean_squared_error",
    feature_selector_summary="featureSelectionSummary_20191027_022938.csv",
    cvSummary="cvSummary_20191028_125617.csv",
    showFeatures=False,
    markerOn=False,
    title_scale=0.8,
)

In [None]:
# build the cross-validation features summary from the saved CV results and
# feature selection summary, both loaded from CSV with the first column as index
crossValFeaturesDf = fs.createCrossValFeaturesDf(
    metric="root_mean_squared_error",
    cvSummary= pd.read_csv("cvSummary_20191028_125617.csv", index_col=0),
    feature_selector_summary=pd.read_csv("featureSelectionSummary_20191027_022938.csv", index_col=0),
)
crossValFeaturesDf#[:5]

In [None]:
# convert the cross-validation features summary into a dictionary
cross_val_feature_dict = fs.createCrossValFeaturesDict(
    crossValFeaturesDf=crossValFeaturesDf
)

##### Rationality

In [None]:
# percent difference summary
# absolute percent difference between the validation and training summary statistics;
# every statistic is shifted by 1 first (presumably to avoid dividing by zero)
train_stats = train.data.describe() + 1
valid_stats = valid.data.describe() + 1
dfDiff = (((valid_stats - train_stats) / train_stats) * 100).abs()

# blank out the cells where there is no difference
dfDiff = dfDiff.replace({0: np.nan})
dfDiff[dfDiff < 0] = np.nan
dfDiff = dfDiff.fillna("")

# show the differences, followed by the underlying statistics of each data set
display(dfDiff)
for machine in (train, valid):
    display(machine.data[dfDiff.columns].describe())

# Modeling

<a id = 'Modeling'></a>

## Data preparation

<a id = 'Data-preparation-1'></a>

In [None]:
#################################################################################
# import training data
# df_train = pd.read_csv("s3://tdp-ml-datasets/kaggle-housing/train.csv")
df_train, df_valid = data.housing()

# columns passed to force_to_categorical
categorical_overrides = [
    "MSSubClass",
    "OverallQual",
    "OverallCond",
    "YearBuilt",
    "YearRemodAdd",
    "MoSold",
    "YrSold",
]
train = mlm.Machine(
    data=df_train,
    target=["SalePrice"],
    remove_features=["Id", "MiscVal"],
    force_to_categorical=categorical_overrides,
    target_type="numeric",
)

# additional features: total finished basement area and total square footage
train.data["BsmtFinSF"] = train.data["BsmtFinSF1"].add(train.data["BsmtFinSF2"])
train.data["TotalSF"] = (
    train.data["TotalBsmtSF"]
    .add(train.data["1stFlrSF"])
    .add(train.data["2ndFlrSF"])
)

#################################################################################
# import validation data
# df_valid = pd.read_csv("s3://tdp-ml-datasets/kaggle-housing/test.csv")
valid = mlm.Machine(
    data=df_valid,
    remove_features=["Id", "MiscVal"],
    force_to_categorical=[
        "MSSubClass",
        "OverallQual",
        "OverallCond",
        "YearBuilt",
        "YearRemodAdd",
        "MoSold",
        "YrSold",
    ],
    target_type="numeric",
)

# change clearly erroneous value to what it probably was
# The result is assigned back rather than using replace(inplace=True) on the selected
# column: the selection can be a temporary copy, in which case an in-place edit never
# reaches valid.data.
valid.data["GarageYrBlt"] = valid.data["GarageYrBlt"].replace({2207: 2007})

# additional features
valid.data["BsmtFinSF"] = valid.data["BsmtFinSF1"] + valid.data["BsmtFinSF2"]
valid.data["TotalSF"] = (
    valid.data["TotalBsmtSF"] + valid.data["1stFlrSF"] + valid.data["2ndFlrSF"]
)
# where TotalSF came out null (a null term in the sum above), fall back to the
# above-ground floors only
valid.data.loc[valid.data["TotalSF"].isnull(), "TotalSF"] = (
    valid.data["1stFlrSF"] + valid.data["2ndFlrSF"]
)

# refresh the feature-type bookkeeping on both machines
train.feature_type_update()
valid.feature_type_update()

#################################################################################
# impute pipeline
# Columns are grouped by imputation strategy. Anything not listed is passed through
# unchanged by the final "diff" branch.
categoricalConstant = [
    "GarageFinish", "Alley", "MasVnrType", "GarageType", "BsmtFinType1",
    "BsmtCond", "BsmtFinType2", "BsmtQual", "PoolQC", "GarageCond",
    "FireplaceQu", "GarageQual", "Fence", "BsmtExposure", "MiscFeature",
]
numericConstant = [
    "GarageYrBlt", "MasVnrArea", "BsmtUnfSF", "GarageArea",
    "BsmtFinSF", "BsmtFinSF1", "TotalBsmtSF", "BsmtFinSF2",
]
categoricalMode = [
    "Electrical", "Functional", "SaleType", "Exterior1st",
    "MSZoning", "Exterior2nd", "KitchenQual", "Utilities",
]
numericMode = ["BsmtHalfBath", "GarageCars", "BsmtFullBath"]

# Pass-through columns keep the order they have in train.data. Iterating a bare set
# difference instead would make the column order depend on string hashing, which
# differs from one interpreter session to the next.
imputed_columns = {
    "LotFrontage",
    *categoricalConstant,
    *numericConstant,
    *categoricalMode,
    *numericMode,
}
passthrough_columns = [col for col in train.data.columns if col not in imputed_columns]

impute_pipe = PandasFeatureUnion([
    # categorical gaps are filled with the literal level "Nonexistent"
    ("catConstant", pipeline.make_pipeline(
        DataFrameSelector(categoricalConstant),
        PlayWithPandas(impute.SimpleImputer(strategy="constant", fill_value="Nonexistent"))
    )),
    # numeric gaps are filled with 0
    ("numConstant", pipeline.make_pipeline(
        DataFrameSelector(numericConstant),
        PlayWithPandas(impute.SimpleImputer(strategy="constant", fill_value=0))
    )),
    # remaining categorical / numeric gaps take the most frequent training value
    ("catMode", pipeline.make_pipeline(
        DataFrameSelector(categoricalMode),
        PlayWithPandas(impute.SimpleImputer(strategy="most_frequent"))
    )),
    ("numMode", pipeline.make_pipeline(
        DataFrameSelector(numericMode),
        PlayWithPandas(impute.SimpleImputer(strategy="most_frequent"))
    )),
    # LotFrontage is imputed with strategy="mean", using Neighborhood as the context column
    ("LotFrontage", pipeline.make_pipeline(
        DataFrameSelector(["LotFrontage","Neighborhood"]),
        ContextImputer(null_col="LotFrontage", context_col="Neighborhood", strategy="mean")
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthrough_columns),
    )),
])

# fit on the training data only, then apply the fitted imputers to the validation data
train.data = impute_pipe.fit_transform(train.data)
valid.data = impute_pipe.transform(valid.data)

#################################################################################
# feature transformation pipeline
# nominal columns are one-hot encoded
nominal_columns = ["MSSubClass","MSZoning","LandContour","Neighborhood","Condition1","Condition2","BldgType",
    "HouseStyle","RoofStyle","RoofMatl","Exterior1st","Exterior2nd","MasVnrType","Foundation","Heating",
    "GarageType","Fence","SaleType","SaleCondition","MiscFeature",]
# ordinal columns are mapped to integers using the explicit level orders below
ordinal_columns = ["Street","Alley","LotShape","Utilities","LotConfig","LandSlope","ExterQual","ExterCond","BsmtQual",
    "BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","HeatingQC","CentralAir","Electrical","KitchenQual",
    "Functional","FireplaceQu","GarageFinish","GarageQual","GarageCond","PavedDrive","PoolQC",]

# one entry per ordinal column, in the same order as ordinal_columns
ordinal_encodings = [
    ["Grvl", "Pave"],  # Street
    ["Nonexistent", "Grvl", "Pave"],  # Alley
    ["IR3", "IR2", "IR1", "Reg"],  # LotShape
    ["ELO", "NoSeWa", "NoSewr", "AllPub"],  # Utilities
    ["FR3", "FR2", "Corner", "Inside", "CulDSac"],  # LotConfig
    ["Sev", "Mod", "Gtl"],  # LandSlope
    ["Po", "Fa", "TA", "Gd", "Ex"],  # ExterQual
    ["Po", "Fa", "TA", "Gd", "Ex"],  # ExterCond
    ["Nonexistent", "Po", "Fa", "TA", "Gd", "Ex"],  # BsmtQual
    ["Nonexistent", "Po", "Fa", "TA", "Gd", "Ex"],  # BsmtCond
    ["Nonexistent", "No", "Mn", "Av", "Gd"],  # BsmtExposure
    ["Nonexistent", "Unf", "LwQ", "BLQ", "Rec", "ALQ", "GLQ"],  # BsmtFinType1
    ["Nonexistent", "Unf", "LwQ", "BLQ", "Rec", "ALQ", "GLQ"],  # BsmtFinType2
    ["Po", "Fa", "TA", "Gd", "Ex"],  # HeatingQC
    ["N", "Y"],  # CentralAir
    ["FuseP", "FuseF", "FuseA", "Mix", "SBrkr"],  # Electrical
    ["Po", "Fa", "TA", "Gd", "Ex"],  # KitchenQual
    ["Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"],  # Functional
    ["Nonexistent", "Po", "Fa", "TA", "Gd", "Ex"],  # FireplaceQu
    ["Nonexistent", "Unf", "RFn", "Fin"],  # GarageFinish
    ["Nonexistent", "Po", "Fa", "TA", "Gd", "Ex"],  # GarageQual
    ["Nonexistent", "Po", "Fa", "TA", "Gd", "Ex"],  # GarageCond
    ["N", "P", "Y"],  # PavedDrive
    ["Nonexistent", "Fa", "TA", "Gd", "Ex"],  # PoolQC
]

numeric_columns = train.feature_type["numeric"]

# Pass-through columns keep the order they have in train.data. Iterating a bare set
# difference instead would make the column order depend on string hashing, which
# differs from one interpreter session to the next.
transformed_columns = set(nominal_columns + ordinal_columns + numeric_columns)
passthrough_columns = [col for col in train.data.columns if col not in transformed_columns]

transform_pipe = PandasFeatureUnion([
    ("ordinal", pipeline.make_pipeline(
        DataFrameSelector(ordinal_columns),
        PlayWithPandas(preprocessing.OrdinalEncoder(categories=ordinal_encodings)),
    )),
    # handle_unknown="ignore": levels unseen during fit do not raise at transform time
    ("nominal", pipeline.make_pipeline(
        DataFrameSelector(nominal_columns),
        PlayWithPandas(preprocessing.OneHotEncoder(handle_unknown="ignore")),
    )),
    # numeric features: degree-2 polynomial expansion followed by standardization
    ("numeric", pipeline.make_pipeline(
        DataFrameSelector(numeric_columns),
        PlayWithPandas(preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)),
#         DualTransformer(),
        PlayWithPandas(preprocessing.StandardScaler()),
    )),
    ("diff", pipeline.make_pipeline(
        DataFrameSelector(passthrough_columns),
    )),
])

# fit on the training data only, then apply to the validation data
train.data = transform_pipe.fit_transform(train.data)
valid.data = transform_pipe.transform(valid.data)

train.feature_type_update()
valid.feature_type_update()

#################################################################################
# remove outliers
# index labels of the training observations to exclude
outliers = [
    53, 185, 197, 437, 492, 762, 796, 821, 847, 1161, 1221, 1318, 1376,
    249, 313, 335, 451, 523, 691, 706, 934, 1182, 1298,
]

# drop the same rows from predictors and target so that both stay aligned
train.data = train.data.drop(index=outliers)
train.target = train.target.drop(index=outliers)

# log transform target: log(1 + target)
train.target = np.log1p(train.target)

## Bayesian hyper-parameter optimization

<a id = 'Bayesian-hyper-parameter-optimization'></a>

In [None]:
# model/parameter space
# Maps each estimator (keyed by a "module.ClassName" string) to its hyperopt search space.
#   hp.uniform(label, low, high) samples a float between low and high
#   hp.choice(label, options)    picks one entry from options
# NOTE(review): labels such as "alpha", "gamma" and "n_estimators" repeat across estimators;
# presumably each estimator's sub-space is searched independently - confirm in
# exec_bayes_optim_search.
# NOTE(review): max_features="auto" and the GradientBoosting losses "ls"/"lad" were renamed or
# removed in newer scikit-learn releases - confirm against the installed version.
all_space = {
    # linear models: only the regularization strength (and the L1/L2 mix for ElasticNet) is tuned
    "linear_model.Lasso": {"alpha": hp.uniform("alpha", 0.0000001, 20)},
    "linear_model.Ridge": {"alpha": hp.uniform("alpha", 0.0000001, 20)},
    "linear_model.ElasticNet": {
        "alpha": hp.uniform("alpha", 0.0000001, 20),
        "l1_ratio": hp.uniform("l1_ratio", 0.0, 0.2),
    },
    "kernel_ridge.KernelRidge": {
        "alpha": hp.uniform("alpha", 0.000001, 15),
        "kernel": hp.choice("kernel", ["linear", "polynomial", "rbf"]),
        "degree": hp.choice("degree", [2, 3]),
        "gamma": hp.uniform("gamma", 0.0, 10),
    },
    # NOTE(review): min_child_samples, num_leaves and subsample_for_bin are sampled as floats by
    # hp.uniform while LightGBM documents them as integers - presumably the search routine casts
    # them; confirm.
    "lightgbm.LGBMRegressor": {
        "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
        "boosting_type": hp.choice("boosting_type", ["gbdt", "dart", "goss"])
        # disabled alternative: nest a boosting-type-specific subsample under each choice
        # ,'boosting_type': hp.choice('boosting_type'
        #                    ,[{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)}
        #                    ,{'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)}
        #                    ,{'boosting_type': 'goss', 'subsample': 1.0}])
        ,
        "learning_rate": hp.uniform("learning_rate", 0.000001, 0.2),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "min_child_samples": hp.uniform("min_child_samples", 20, 500),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "num_leaves": hp.uniform("num_leaves", 8, 150),
        "reg_alpha": hp.uniform("reg_alpha", 0.0, 1.0),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
        "subsample_for_bin": hp.uniform("subsample_for_bin", 20000, 400000),
    },
    "xgboost.XGBRegressor": {
        "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
        "gamma": hp.uniform("gamma", 0.0, 10),
        "reg_alpha": hp.uniform("reg_alpha", 0.0, 1.0),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
        "learning_rate": hp.uniform("learning_rate", 0.000001, 0.2),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "min_child_weight": hp.uniform("min_child_weight", 1, 20),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "subsample": hp.uniform("subsample", 0.5, 1),
    },
    # scikit-learn tree ensembles: integer-valued parameters are drawn with hp.choice over
    # np.arange ranges (upper bound exclusive)
    "ensemble.RandomForestRegressor": {
        "bootstrap": hp.choice("bootstrap", [True, False]),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "max_features": hp.choice("max_features", ["auto", "sqrt"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    },
    "ensemble.GradientBoostingRegressor": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "max_features": hp.choice("max_features", ["auto", "sqrt"]),
        "learning_rate": hp.uniform("learning_rate", 0.000001, 0.2),
        "loss": hp.choice("loss", ["ls", "lad", "huber", "quantile"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    },
    "ensemble.AdaBoostRegressor": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "learning_rate": hp.uniform("learning_rate", 0.000001, 0.2),
        "loss": hp.choice("loss", ["linear", "square", "exponential"]),
    },
    "ensemble.ExtraTreesRegressor": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
        "max_features": hp.choice("max_features", ["auto", "sqrt"]),
    },
    # support vector regression: "degree" only matters for the "poly" kernel
    "svm.SVR": {
        "C": hp.uniform("C", 0.00001, 10),
        "kernel": hp.choice("kernel", ["linear", "poly", "rbf", "sigmoid"]),
        "degree": hp.choice("degree", [2, 3]),
        "gamma": hp.uniform("gamma", 0.0001, 10),
        "epsilon": hp.uniform("epsilon", 0.001, 5),
    },
    # k-nearest neighbors: p selects the Minkowski power (1 = Manhattan, 2 = Euclidean)
    "neighbors.KNeighborsRegressor": {
        "algorithm": hp.choice("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]),
        "n_neighbors": hp.choice("n_neighbors", np.arange(1, 20, dtype=int)),
        "weights": hp.choice("weights", ["distance", "uniform"]),
        "p": hp.choice("p", [1, 2]),
    },
}

In [None]:
# run the Bayesian hyper-parameter search over every estimator listed in all_space,
# scoring each candidate by RMSE on the training data
train.exec_bayes_optim_search(
    all_space=all_space,
    data=train.data,
    target=train.target,
    columns=cross_val_feature_dict,
    scoring="root_mean_squared_error",
    iters=750,
    n_folds=5,
    n_jobs=8,
    verbose=0,
)

##### Model loss by iteration

In [None]:
# read scores summary table
# NOTE(review): the path handed to read_csv is an empty string, so this cell cannot load
# anything as written - presumably it should point at the results file written by
# exec_bayes_optim_search; fill in the real file name before running.
bayes_optim_summary = pd.read_csv("", na_values="nan")
# preview the first five rows
bayes_optim_summary[:5]

In [None]:
# draw the model loss plot once for each distinct estimator found in the search summary
estimator_names = np.unique(bayes_optim_summary["estimator"])
for estimator in estimator_names:
    train.model_loss_plot(
        estimator=estimator,
        bayes_optim_summary=bayes_optim_summary,
    )

##### Parameter selection by iteration

In [None]:
# parameter-selection plots, one call per distinct estimator in the search summary
for estimator in np.unique(bayes_optim_summary["estimator"]):
    train.modelParamPlot(
        bayes_optim_summary=bayes_optim_summary,
        estimator=estimator,
        all_space=all_space,
        n_iter=100,
        chart_prop=15,
    )

In [None]:
# scratch cell for eyeballing how hyperopt samples a single distribution
# NOTE(review): the bounds are log-transformed but the sampler is hp.uniform, so draws fall
# between log(0.4) and log(0.6), which are both negative. If values between 0.4 and 0.6 were
# intended, hp.loguniform is presumably the sampler meant here - confirm.
sample_space = {"param": hp.uniform("param", np.log(0.4), np.log(0.6))}

# second positional argument is presumably the number of draws to plot - confirm
train.sample_plot(sample_space, 1000)

## Model performance evaluation - standard models

<a id = 'Model-performance-evaluation-standard-models'></a>

In [None]:
# pull the leading model(s) per estimator from the search summary (numModels=1)
top_models = train.topBayesOptimModels(
    bayes_optim_summary=bayes_optim_summary,
    numModels=1,
)
top_models

In [None]:
## standard model fit and predict
# pick the estimator and the search iteration to rebuild; other candidates
# (estimator, iteration):
#   "xgboost.XGBRegressor", 418
#   "ensemble.RandomForestRegressor", 382
#   "ensemble.GradientBoostingRegressor", 238
#   "svm.SVR", 259
estimator = "lightgbm.LGBMRegressor"
model_iter = 417

# rebuild the model from the stored search results and fit it on every training row
model = train.BayesOptimModelBuilder(
    estimator=estimator,
    model_iter=model_iter,
    bayes_optim_summary=bayes_optim_summary,
)
model.fit(train.data.values, train.target.values)

# hold-out split (unseeded, so it changes from run to run)
X_train, X_valid, y_train, yValid = model_selection.train_test_split(
    train.data, train.target
)

# in-sample predictions on the full training data
yPred = model.predict(train.data.values)

In [None]:

# regression panel for the fitted model using the full training set; no separate
# validation data is supplied (pass X_train / y_train / X_valid / yValid from the
# train_test_split to use a hold-out instead)
feature_selector_summary = train.regression_panel(
    model=model,
    X_train=train.data,
    y_train=train.target,
    X_valid=None,
    yValid=None,
    randomState=10,
    n_folds=4,
)
feature_selector_summary

In [None]:
# line up in-sample predictions with the training target (both on the log1p scale,
# because train.target was log-transformed above)
results = pd.DataFrame(
    {
        "prediction": model.predict(train.data.values),
        "actual": train.target.values,
    },
    index=train.target.index,
)

# signed, absolute and percentage error per observation
error = results["prediction"] - results["actual"]
results["diff"] = error
results["diffAbs"] = error.abs()
results["diffPerc"] = (error / results["actual"]) * 100

# ten observations with the largest absolute error
results.sort_values("diffAbs", ascending=False).head(10)

## Model explainability

<a id = 'Model-explanability'></a>

In [None]:
# rebuild and refit the XGBoost regressor that the explainability cells below inspect
estimator = "xgboost.XGBRegressor"
model_iter = 418

model = train.BayesOptimModelBuilder(
    estimator=estimator,
    model_iter=model_iter,
    bayes_optim_summary=bayes_optim_summary,
)

model.fit(train.data.values, train.target.values)

##### Permutation importance

In [None]:
# permutation importance - how much does performance drop when one feature is shuffled?
perm_estimator = PermutationImportance(model.model, random_state=1)
perm = perm_estimator.fit(train.data, train.target)

# render the importance table with the training column names
eli5.show_weights(perm, feature_names=train.data.columns.tolist())

##### SHAP values - training data

###### Force plots - single observations

In [None]:
# SHAP force plot for each of the first five training observations
first_obs = train.data.index[:5]
for i in first_obs:
    train.single_shap_viz_tree(
        obsIx=i,
        model=model,
        data=train.data,
        target=train.target,
        classification=False,
    )

###### Force plots - multiple observations

In [None]:
# SHAP force plot spanning every training observation
visual = train.multi_shap_viz_tree(
    obs_ixs=train.data.index,
    model=model,
    data=train.data,
)
visual

###### Dependence plots

In [None]:
# SHAP values for all training observations; the middle return value is not needed here
obs_data, _, obs_shap_values = train.multi_shap_value_tree(
    model=model,
    data=train.data,
    obs_ixs=train.data.index,
)

In [None]:
# SHAP dependence plot grid over a hand-picked set of features
grid_features = [
    "OverallCond",
    "LotFrontage",
    "TotalSF",
    "BsmtFinSF",
    "LotConfig",
]

train.shap_dependence_grid(
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    all_features=train.data.columns,
    grid_features=grid_features,
    alpha=0.5,
    dot_size=35,
)

In [None]:
# single SHAP dependence plot: TotalSF on the scatter axis, colored by LotFrontage
p = PrettierPlot()
ax = p.make_canvas()

train.shap_dependence_plot(
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    feature_names=train.data.columns.tolist(),
    scatter_feature="TotalSF",
    color_feature="LotFrontage",
    alpha=0.5,
    dot_size=50,
    ax=ax,
)

In [None]:
# SHAP dependence plots for every feature relative to one interaction feature
feature_names = train.data.columns.tolist()

# column positions ordered by total absolute SHAP value, largest first
top_shap = np.argsort(-np.abs(obs_shap_values).sum(axis=0))

# one dependence plot (on its own canvas) per feature
# NOTE(review): "Age" is not among the features named elsewhere in this section -
# confirm it is a column of train.data.
for top_ix in top_shap:
    p = PrettierPlot()
    ax = p.make_canvas()

    train.shap_dependence_plot(
        obs_data=obs_data,
        obs_shap_values=obs_shap_values,
        feature_names=feature_names,
        scatter_feature=feature_names[top_ix],
        color_feature="Age",
        alpha=0.5,
        dot_size=35,
        ax=ax,
    )

###### Summary plots

In [None]:
# SHAP summary plot across all training features
feature_names = train.data.columns.tolist()
train.shap_summary_plot(
    feature_names=feature_names,
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
)


##### SHAP values - validation data

###### Force plots - single observations

In [None]:
# SHAP force plot for each of the first five validation observations
first_obs = valid.data.index[:5]
for i in first_obs:
    valid.single_shap_viz_tree(
        obsIx=i,
        model=model,
        data=valid.data,
        classification=False,
    )

###### Force plots - multiple observations

In [None]:
# SHAP force plot spanning every validation observation
visual = valid.multi_shap_viz_tree(
    obs_ixs=valid.data.index,
    model=model,
    data=valid.data,
)
visual

###### Dependence plots

In [None]:
# SHAP values for all validation observations; the middle return value is not needed here
obs_data, _, obs_shap_values = valid.multi_shap_value_tree(
    model=model,
    data=valid.data,
    obs_ixs=valid.data.index,
)

In [None]:
# SHAP dependence plot grid over a hand-picked set of features (validation data)
grid_features = [
    "OverallCond",
    "LotFrontage",
    "TotalSF",
    "BsmtFinSF",
    "LotConfig",
]

valid.shap_dependence_grid(
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    all_features=valid.data.columns,
    grid_features=grid_features,
    alpha=0.5,
    dot_size=35,
)

In [None]:
# single SHAP dependence plot on validation data: TotalSF colored by LotFrontage
p = PrettierPlot()
ax = p.make_canvas()

valid.shap_dependence_plot(
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    feature_names=valid.data.columns.tolist(),
    scatter_feature="TotalSF",
    color_feature="LotFrontage",
    alpha=0.5,
    dot_size=50,
    ax=ax,
)

In [None]:
# SHAP dependence plots for every feature relative to one interaction feature
feature_names = valid.data.columns.tolist()

# column positions ordered by total absolute SHAP value, largest first
top_shap = np.argsort(-np.abs(obs_shap_values).sum(axis=0))

# one dependence plot (on its own canvas) per feature
# NOTE(review): "Age" is not among the features named elsewhere in this section -
# confirm it is a column of valid.data.
for top_ix in top_shap:
    p = PrettierPlot()
    ax = p.make_canvas()

    valid.shap_dependence_plot(
        obs_data=obs_data,
        obs_shap_values=obs_shap_values,
        feature_names=feature_names,
        scatter_feature=feature_names[top_ix],
        color_feature="Age",
        alpha=0.5,
        dot_size=35,
        ax=ax,
    )

###### Summary plots

In [None]:
# SHAP summary plot across all validation features
feature_names = valid.data.columns.tolist()
valid.shap_summary_plot(
    feature_names=feature_names,
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
)


## Submission - standard models

<a id = 'Submission-standard-models'></a>

In [None]:
## standard model fit and predict
# select estimator and iteration
# This is a regression problem and the first-level search space only contains regressors,
# so the key must name a *Regressor (a *Classifier key has no rows in bayes_optim_summary).
# Choices mirror the model evaluation cell earlier in the notebook.
estimator = "lightgbm.LGBMRegressor"; model_iter = 417
# estimator = "xgboost.XGBRegressor"; model_iter = 418
# estimator = "ensemble.RandomForestRegressor"; model_iter = 382
# estimator = "ensemble.GradientBoostingRegressor"; model_iter = 238
# estimator = "svm.SVR"; model_iter = 259

# extract params and instantiate model
model = train.BayesOptimModelBuilder(
    bayes_optim_summary=bayes_optim_summary, estimator=estimator, model_iter=model_iter
)
# fit on the full training set (target is on the log1p scale)
model.fit(train.data.values, train.target.values)

# predict on the held-out data that feeds the submission file
yPred = model.predict(valid.data.values)

In [None]:
# generate prediction submission file
# the model was fit on a log1p-transformed target, so expm1 maps predictions back to prices
predicted_price = np.expm1(yPred)
submit = pd.DataFrame({"Id": dfTest.Id, "SalePrice": predicted_price})
submit.to_csv("data/submission.csv", index=False)

# Stacking

<a id = 'Stacking'></a>

## Primary models

<a id = 'Primary-models'></a>

In [None]:
# build out-of-fold predictions from the top first-level models; these become the
# feature matrices for the meta model
oof_train, oof_valid, columns = train.model_stacker(
    models=top_models,
    bayes_optim_summary=bayes_optim_summary,
    X_train=train.data.values,
    y_train=train.target.values,
    X_valid=valid.data.values,
    n_jobs=10,
    n_folds=10,
)

In [None]:
# correlation heatmap of the first-level models' out-of-fold predictions
oof_frame = pd.DataFrame(oof_train, columns=columns)

p = PrettierPlot()
ax = p.make_canvas()
p.pretty_corr_heatmap(df=oof_frame, annot=True, ax=ax, vmin=0)

## Meta model

<a id = 'Meta-model'></a>

In [None]:
# model/parameter space
# Search space for the meta (second-level) model; this rebinds all_space, replacing the
# first-level search space defined earlier in the notebook.
#   hp.uniform(label, low, high) samples a float between low and high
#   hp.choice(label, options)    picks one entry from options
# NOTE(review): max_features="auto" and the GradientBoosting losses "ls"/"lad" were renamed or
# removed in newer scikit-learn releases - confirm against the installed version.
all_space = {
    "kernel_ridge.KernelRidge": {
        "alpha": hp.uniform("alpha", 0.000001, 15),
        "kernel": hp.choice("kernel", ["linear", "polynomial", "rbf"]),
        "degree": hp.choice("degree", [2, 3]),
        "gamma": hp.uniform("gamma", 0.0, 10),
    },
    # NOTE(review): min_child_samples, num_leaves and subsample_for_bin are sampled as floats by
    # hp.uniform while LightGBM documents them as integers - presumably the search routine casts
    # them; confirm.
    "lightgbm.LGBMRegressor": {
        "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
        "boosting_type": hp.choice("boosting_type", ["gbdt", "dart", "goss"])
        # disabled alternative: nest a boosting-type-specific subsample under each choice
        # ,'boosting_type': hp.choice('boosting_type'
        #                    ,[{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)}
        #                    ,{'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)}
        #                    ,{'boosting_type': 'goss', 'subsample': 1.0}])
        ,
        "learning_rate": hp.uniform("learning_rate", 0.000001, 0.2),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "min_child_samples": hp.uniform("min_child_samples", 20, 500),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "num_leaves": hp.uniform("num_leaves", 8, 150),
        "reg_alpha": hp.uniform("reg_alpha", 0.0, 1.0),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
        "subsample_for_bin": hp.uniform("subsample_for_bin", 20000, 400000),
    },
    "xgboost.XGBRegressor": {
        "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
        "gamma": hp.uniform("gamma", 0.0, 10),
        "reg_alpha": hp.uniform("reg_alpha", 0.0, 1.0),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 1.0),
        "learning_rate": hp.uniform("learning_rate", 0.000001, 0.2),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "min_child_weight": hp.uniform("min_child_weight", 1, 20),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "subsample": hp.uniform("subsample", 0.5, 1),
    },
    # scikit-learn tree ensembles: integer-valued parameters are drawn with hp.choice over
    # np.arange ranges (upper bound exclusive)
    "ensemble.RandomForestRegressor": {
        "bootstrap": hp.choice("bootstrap", [True, False]),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "max_features": hp.choice("max_features", ["auto", "sqrt"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    },
    "ensemble.GradientBoostingRegressor": {
        "n_estimators": hp.choice("n_estimators", np.arange(100, 10000, 10, dtype=int)),
        "max_depth": hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        "max_features": hp.choice("max_features", ["auto", "sqrt"]),
        "learning_rate": hp.uniform("learning_rate", 0.000001, 0.2),
        "loss": hp.choice("loss", ["ls", "lad", "huber", "quantile"]),
        "min_samples_split": hp.choice(
            "min_samples_split", np.arange(2, 40, dtype=int)
        ),
        "min_samples_leaf": hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    },
    # support vector regression: "degree" only matters for the "poly" kernel
    "svm.SVR": {
        "C": hp.uniform("C", 0.00001, 10),
        "kernel": hp.choice("kernel", ["linear", "poly", "rbf", "sigmoid"]),
        "degree": hp.choice("degree", [2, 3]),
        "gamma": hp.uniform("gamma", 0.0001, 10),
        "epsilon": hp.uniform("epsilon", 0.001, 5),
    },
}

In [None]:
# execute the Bayesian hyper-parameter search for the meta (second-level) model, using the
# first-level out-of-fold predictions as features
# NOTE(review): this call passes resultsDir/X/y while the first-level search earlier in the
# notebook passes data/target/columns - confirm which keyword set the current
# exec_bayes_optim_search signature accepts.
# NOTE(review): `analysis` is assigned in the following cell - confirm it is already defined
# when this cell runs.
train.exec_bayes_optim_search(
    all_space=all_space,
    resultsDir="{}_hyperopt_meta_{}.csv".format(rundate, analysis),
    X=oof_train,
    y=train.target,
    # the target is a continuous (log-transformed) sale price and every estimator in
    # all_space is a regressor, so score with RMSE like the first-level search;
    # "accuracy" is a classification metric and is not defined for this target
    scoring="root_mean_squared_error",
    n_folds=8,
    n_jobs=10,
    iters=1000,
    verbose=0,
)

In [None]:
# read the meta-model scores summary table
# NOTE: rundate is pinned to a specific run here, replacing the value set in the import cell
analysis = "housing"
rundate = "20190807"
meta_results_path = "{}_hyperopt_meta_{}.csv".format(rundate, analysis)
bayes_optim_summary_meta = pd.read_csv(meta_results_path)

# preview the first five rows
bayes_optim_summary_meta.head()

In [None]:
# draw the model loss plot once for each distinct estimator in the meta search summary
estimator_names = np.unique(bayes_optim_summary_meta["estimator"])
for estimator in estimator_names:
    train.model_loss_plot(
        estimator=estimator,
        bayes_optim_summary=bayes_optim_summary_meta,
    )

In [None]:
# parameter-selection plots, one call per distinct estimator in the meta search summary
estimator_names = np.unique(bayes_optim_summary_meta["estimator"])
for estimator in estimator_names:
    train.modelParamPlot(
        estimator=estimator,
        bayes_optim_summary=bayes_optim_summary_meta,
        all_space=all_space,
        chart_prop=15,
        n_iter=100,
    )

## Model performance evaluation - stacked models

<a id = 'Model-performance-evaluation-stacked-models'></a>

In [None]:
# pull the leading meta model(s) per estimator from the meta search summary (numModels=1);
# this rebinds top_models, which previously held the first-level selection
top_models = train.topBayesOptimModels(
    numModels=1,
    bayes_optim_summary=bayes_optim_summary_meta,
)
top_models

In [None]:
# best second level learning model
# The meta search space only contains regressors (the target is a continuous, log-transformed
# sale price), so the key must name a regressor; *Classifier keys have no rows in
# bayes_optim_summary_meta.
# NOTE(review): the iteration numbers are carried over unchanged - confirm them against
# top_models / bayes_optim_summary_meta before relying on them.
estimator = "lightgbm.LGBMRegressor"; model_iter = 668
# estimator = "xgboost.XGBRegressor"; model_iter = 380
# estimator = "ensemble.RandomForestRegressor"; model_iter = 411
# estimator = "ensemble.GradientBoostingRegressor"; model_iter = 590
# estimator = "svm.SVR"; model_iter = 135

# extract params and instantiate model
model = train.BayesOptimModelBuilder(
    bayes_optim_summary=bayes_optim_summary_meta, estimator=estimator, model_iter=model_iter
)

# single model evaluation here

In [None]:
# multi-model evaluation here

## Submission - stacked models

<a id = 'Submission-stacked-models'></a>

In [None]:
# best second level learning model
# The meta search space only contains regressors (the target is a continuous, log-transformed
# sale price), so the key must name a regressor; *Classifier keys have no rows in
# bayes_optim_summary_meta.
# NOTE(review): the iteration numbers are carried over unchanged - confirm them against
# top_models / bayes_optim_summary_meta before relying on them.
estimator = "lightgbm.LGBMRegressor"; model_iter = 668
# estimator = "xgboost.XGBRegressor"; model_iter = 380
# estimator = "ensemble.RandomForestRegressor"; model_iter = 411
# estimator = "ensemble.GradientBoostingRegressor"; model_iter = 590
# estimator = "svm.SVR"; model_iter = 135

# extract params and instantiate model
model = train.BayesOptimModelBuilder(
    bayes_optim_summary=bayes_optim_summary_meta, estimator=estimator, model_iter=model_iter
)

# fit the meta model on the first-level out-of-fold predictions, then predict from the
# matching first-level predictions for the held-out data
model.fit(oof_train, train.target.values)
yPred = model.predict(oof_valid)
# print(sum(yPred))

In [None]:
# generate prediction submission file
# the meta model was fit on a log1p-transformed target, so expm1 maps predictions back to prices
# NOTE: writes to the same path as the standard-model submission cell
predicted_price = np.expm1(yPred)
submit = pd.DataFrame({"Id": dfTest.Id, "SalePrice": predicted_price})
submit.to_csv("data/submission.csv", index=False)