# List of Dependencies

In [None]:
!pip install tqdm
!pip install 'seaborn == 0.11.0'
!pip install xgboost
!pip install catboost
# !pip install statsmodels
# !pip install comet_ml
# !pip install sklearn-genetic
# !pip install alibi

In [None]:
import time
from itertools import product
from math import ceil

import ast
import numpy as np
import os
import pandas as pd
import pickle
import re
import seaborn as sns
import tensorflow as tf
import warnings
from catboost import CatBoostClassifier
from matplotlib import pyplot as plt
from scipy.cluster import hierarchy
from scipy.sparse import csr_matrix
from scipy.stats import spearmanr
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning
from sklearn.feature_selection import chi2, f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    RocCurveDisplay,
    auc,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    RandomizedSearchCV,
    StratifiedKFold,
    StratifiedShuffleSplit,
    cross_val_predict,
    cross_validate,
    validation_curve,
)
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from xgboost import XGBClassifier

In [None]:
RANDOM_SEED = 42

# Set random seed for Numpy and TensorFlow
tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Set the default font size of all matplotlib plots
plt.rcParams.update({'font.size': 12})

# Set the display option of pandas objects
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 150)

The functions below are needed to pickle objects later on to cut down total execution time.

In [None]:
PICKLE_PATH = '../input/pickles/health_insurance/'


def dump_objects(file_name, *objects):
    """Pickle ``objects`` back to back into ``<file_name>.sav``.

    The target file is opened in binary write mode, so an existing file with
    the same name is overwritten. Each object is written as its own pickle
    stream, in the order given; reading them back therefore takes one
    ``pickle.load`` call per object, in the same order.
    """
    target = f'{file_name}.sav'
    with open(target, 'wb') as sink:
        for item in objects:
            pickle.dump(item, sink)


def load_objects(file_name, num_objects=1):
    """Read ``num_objects`` consecutive pickles from ``<PICKLE_PATH><file_name>.sav``.

    Returns a list with the unpickled objects in the order they were stored.
    A non-positive ``num_objects`` yields an empty list.
    """
    source = f'{PICKLE_PATH}{file_name}.sav'
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # files you produced yourself.
    with open(source, 'rb') as stream:
        return [pickle.load(stream) for _ in range(num_objects)]

# Fetch Data

The dataset is obtained from https://www.kaggle.com/anmolkumar/health-insurance-cross-sell-prediction

In [None]:
PARENT_DIRECTORY = '../input/health-insurance-cross-sell-prediction'
TRAIN_DATA_PATH = 'train.csv'
TEST_DATA_PATH = 'test.csv'
SAMPLE_SUBMISSION_PATH = 'sample_submission.csv'


def fetch_data(path, parent_dir=PARENT_DIRECTORY):
    """Load a CSV file located at ``parent_dir/path`` into a DataFrame.

    Parameters
    ----------
    path : str
        File name (or relative path) inside ``parent_dir``.
    parent_dir : str
        Directory that contains the file. Defaults to ``PARENT_DIRECTORY``.

    Returns
    -------
    pandas.DataFrame
    """
    # Honour the ``parent_dir`` argument instead of always reading from the
    # module-level PARENT_DIRECTORY.
    full_path = os.path.join(parent_dir, path)
    return pd.read_csv(full_path, low_memory=False)

In [None]:
insurance_train = fetch_data(TRAIN_DATA_PATH)
insurance_train.head(5)

In [None]:
insurance_train.info()

In [None]:
def get_num_cat_attrs(df, y_feature, num_dtype='float64'):
    """Split the columns of ``df`` into numerical and categorical features.

    Columns whose dtype is ``float64`` or ``int64`` are cast to ``num_dtype``
    and reported as numerical; every other column is reported as categorical.
    The target column ``y_feature`` is excluded from both lists.

    The input frame is not modified: the casts are applied to a copy, which is
    the frame that is returned.

    Returns
    -------
    (DataFrame, list, list)
        The converted copy, the numerical column names and the categorical
        column names.

    Raises
    ------
    ValueError
        If ``y_feature`` is not a column of ``df``.
    """
    if y_feature not in df.columns:
        raise ValueError(f'{y_feature!r} is not a column of the DataFrame')

    # Work on a copy so the caller's frame keeps its original dtypes.
    df = df.copy()
    num_attr = []
    cat_attr = []

    num_dtypes = ['float64', 'int64']

    for col in list(df.columns):
        if df[col].dtype in num_dtypes:
            df[col] = df[col].astype(num_dtype)
            num_attr.append(col)
        else:
            cat_attr.append(col)

    # The target is not a feature, whichever group it landed in.
    if y_feature in num_attr:
        num_attr.remove(y_feature)
    else:
        cat_attr.remove(y_feature)

    print(f'Numerical attributes: {", ".join(num_attr)}')
    print(f'Categorical attributes: {", ".join(cat_attr)}')

    return df, num_attr, cat_attr

In [None]:
Y_FEATURE = 'Response'
insurance_train2, num_attr, cat_attr = get_num_cat_attrs(
    insurance_train, Y_FEATURE)

# Data Analysis & Visualization

In [None]:
def create_subplots(cols, num_cols_per_row, fig_w, fig_h):
    """Create a subplot grid with one slot per entry of ``cols``.

    The grid has ``num_cols_per_row`` columns and as many rows as needed.
    The figure is ``fig_w`` inches wide and ``fig_h`` inches tall per row.

    Returns ``(num_rows, indexes, axs, fig)`` where ``indexes`` lists the
    ``(row, column)`` pairs of the grid in row-major order and ``axs`` is
    whatever ``plt.subplots`` returned for that grid shape.
    """
    num_rows = ceil(len(cols) / num_cols_per_row)
    fig, axs = plt.subplots(num_rows, num_cols_per_row)
    fig.set_size_inches(fig_w, num_rows * fig_h)
    indexes = [(row, column)
               for row in range(num_rows)
               for column in range(num_cols_per_row)]
    return num_rows, indexes, axs, fig


def plot_countplot(df, cols, num_cols_per_row=4, fig_w=16,
                   fig_h=7, rotation=0, color='steelblue'):
    """Draw one horizontal count plot per column in ``cols``.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    cols : list of column names, one subplot each.
    num_cols_per_row : number of subplots per grid row.
    fig_w, fig_h : figure width and per-row height in inches.
    rotation : rotation applied to the x tick labels.
    color : bar colour passed to seaborn.
    """
    num_rows, indexes, axs, fig = create_subplots(
        cols, num_cols_per_row, fig_w, fig_h)
    # plt.subplots returns a bare Axes, a 1-D array or a 2-D array depending
    # on the grid shape; flatten to a row-major list so every shape (including
    # a single column with several rows) is indexed the same way.
    flat_axes = list(axs.flat) if isinstance(axs, np.ndarray) else [axs]

    with tqdm(total=100) as pbar:
        progress_unit = 100/len(cols)

        for ax, col in zip(flat_axes, cols):
            sns.countplot(y=col, data=df, color=color, ax=ax)
            ax.set(title=col, ylabel=None)
            ax.tick_params(axis='x', rotation=rotation)

            pbar.update(progress_unit)
    fig.tight_layout()
    plt.show()
    plt.close()


def plot_countplot_by_class(df, cols, num_cols_per_row=4, fig_w=16,
                            fig_h=7, rotation=0, y_feature=Y_FEATURE):
    """Draw, for each column in ``cols``, the counts of ``y_feature`` split by that column.

    Each subplot puts ``y_feature`` on the y axis and uses the column as the
    hue.

    Parameters
    ----------
    df : DataFrame holding ``cols`` and ``y_feature``.
    cols : list of column names, one subplot each.
    num_cols_per_row : number of subplots per grid row.
    fig_w, fig_h : figure width and per-row height in inches.
    rotation : rotation applied to the x tick labels.
    y_feature : name of the target column.
    """
    num_rows, indexes, axs, fig = create_subplots(
        cols, num_cols_per_row, fig_w, fig_h)
    # Flatten whatever plt.subplots returned (Axes, 1-D or 2-D array) so a
    # single-column grid with several rows is handled correctly too.
    flat_axes = list(axs.flat) if isinstance(axs, np.ndarray) else [axs]

    with tqdm(total=100) as pbar:
        progress_unit = 100/len(cols)

        for ax, col in zip(flat_axes, cols):
            sns.countplot(y=y_feature, hue=col, data=df, ax=ax)
            ax.set(title=col, ylabel=None)
            ax.tick_params(axis='x', rotation=rotation)

            pbar.update(progress_unit)
    fig.tight_layout()
    plt.show()
    plt.close()


def plot_boxplot(df, cols, num_cols_per_row=4, fig_w=16, fig_h=7):
    """Draw one vertical box plot per column in ``cols``.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    cols : list of column names, one subplot each.
    num_cols_per_row : number of subplots per grid row.
    fig_w, fig_h : figure width and per-row height in inches.
    """
    num_rows, indexes, axs, fig = create_subplots(
        cols, num_cols_per_row, fig_w, fig_h)
    # Flatten whatever plt.subplots returned (Axes, 1-D or 2-D array) so a
    # single-column grid with several rows is handled correctly too.
    flat_axes = list(axs.flat) if isinstance(axs, np.ndarray) else [axs]

    with tqdm(total=100) as pbar:
        progress_unit = 100/len(cols)

        for ax, col in zip(flat_axes, cols):
            sns.boxplot(y=df[col], ax=ax)
            ax.set(title=col, ylabel=None)
            pbar.update(progress_unit)
    fig.tight_layout()
    plt.show()
    plt.close()


def plot_hist(df, cols, kde=True, num_cols_per_row=3, fig_w=18, fig_h=5):
    """Draw one histogram per column in ``cols``.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    cols : list of column names, one subplot each.
    kde : whether seaborn overlays a kernel density estimate.
    num_cols_per_row : number of subplots per grid row.
    fig_w, fig_h : figure width and per-row height in inches.
    """
    num_rows, indexes, axs, fig = create_subplots(
        cols, num_cols_per_row, fig_w, fig_h)
    # Flatten whatever plt.subplots returned (Axes, 1-D or 2-D array) so a
    # single-column grid with several rows is handled correctly too.
    flat_axes = list(axs.flat) if isinstance(axs, np.ndarray) else [axs]

    with tqdm(total=100) as pbar:
        progress_unit = 100/len(cols)

        for ax, col in zip(flat_axes, cols):
            data_range = df[col].max() - df[col].min()
            # Columns spanning at least 50 units get 50 equal-width bins;
            # narrower columns leave the bin width to seaborn.
            binwidth = data_range / 50 if data_range >= 50 else None
            sns.histplot(data=df, x=col, kde=kde, ax=ax, binwidth=binwidth)
            ax.set(title=col, xlabel=None)
            pbar.update(progress_unit)
    fig.tight_layout()
    plt.show()
    plt.close()


def plot_dendro_corr(X, feature_names, fig_w, fig_h,
                     orientation='top', font_size=15,
                     rotation=90):
    """Plot a dendrogram built from the Spearman correlation of ``X``.

    The Spearman correlation matrix of ``X`` is clustered with Ward linkage
    and drawn with ``feature_names`` as leaf labels on a figure of
    ``fig_w`` x ``fig_h`` inches.
    """
    figure, axis = plt.subplots(figsize=(fig_w, fig_h))
    linkage = hierarchy.ward(spearmanr(X).correlation)
    hierarchy.dendrogram(
        linkage,
        labels=feature_names,
        ax=axis,
        leaf_rotation=rotation,
        leaf_font_size=font_size,
        orientation=orientation,
    )
    figure.tight_layout()
    plt.show()
    plt.close()


def plot_heatmap_corr_full(X, X_features, fig_w, fig_h, annot=False, enable_mask=True):
    """Plot the Spearman correlation matrix of ``X[X_features]`` as a heatmap.

    Parameters
    ----------
    X : DataFrame holding the features.
    X_features : list of column names to correlate.
    fig_w, fig_h : figure size in inches.
    annot : whether to print the coefficient inside each cell.
    enable_mask : when True, the lower triangle and the diagonal are hidden
        so each pair is shown only once (in the upper triangle).
    """
    fig, ax = plt.subplots(figsize=(fig_w, fig_h))

    corr = X[X_features].corr(method='spearman')

    # np.tril marks the lower triangle (diagonal included) as masked, which
    # leaves only the strict upper triangle visible. None means "no mask".
    mask = np.tril(np.ones_like(corr, dtype=bool)) if enable_mask else None

    sns.heatmap(corr, linewidths=0.1, linecolor='white',
                square=True, annot=annot, mask=mask,
                vmin=-1, vmax=1, center=0, ax=ax,
                xticklabels=True,
                yticklabels=True)

    fig.tight_layout()
    plt.tick_params(axis='both', which='minor', labelsize=15)
    plt.show()
    plt.close()


def plot_heatmap_corr(X, X_features, selected_features,
                      fig_w, fig_h, annot=False):
    """Heatmap of Spearman correlations between selected and non-selected features.

    The correlation matrix of ``X[X_features]`` is reduced to the columns in
    ``selected_features`` and the rows of every other feature. The matrix is
    transposed when there are fewer non-selected than selected features, so
    the longer list always runs along the y axis.
    """
    fig, ax = plt.subplots(figsize=(fig_w, fig_h))

    full_corr = X[X_features].corr(method='spearman')
    corr = full_corr[selected_features].drop(index=selected_features)
    non_selected_features = corr.index

    if len(non_selected_features) < len(selected_features):
        corr = corr.T
        xticklabels, yticklabels = non_selected_features, selected_features
    else:
        xticklabels, yticklabels = selected_features, non_selected_features

    sns.heatmap(corr, linewidths=0.1, linecolor='white',
                square=True, annot=annot,
                vmin=-1, vmax=1, center=0, ax=ax,
                xticklabels=True,
                yticklabels=True)

    ax.set_xticklabels(xticklabels, rotation='vertical')
    ax.set_yticklabels(yticklabels, rotation='horizontal')
    fig.tight_layout()
    plt.tick_params(axis='both', which='minor', labelsize=15)
    plt.show()
    plt.close()


def plot_attr_dist_by_class_table(data, col, y_feature=Y_FEATURE):
    """Tabulate the distribution of ``col`` within each class of ``y_feature``.

    Returns a DataFrame with one row per class (sorted) and one column per
    distinct value of ``col`` (named ``'<value>_value'``, ordered by overall
    frequency). Each cell is the fraction of that class's rows taking that
    value; values that never occur in a class are reported as 0.
    """
    classes = sorted(data[y_feature].value_counts().index)
    values = list(data[col].value_counts().index)

    rows = []
    for class_ in classes:
        class_values = data.loc[data[y_feature] == class_, col]
        # value_counts orders by per-class frequency; reindex to the common
        # column order so every fraction ends up under the right label, and
        # fill values that are absent from this class.
        rows.append(class_values.value_counts(normalize=True)
                    .reindex(values, fill_value=0.0))

    return pd.DataFrame(np.array(rows), index=classes,
                        columns=[str(value) + '_value' for value in values])


def compare_distribution_by_class(df, cols, num_cols_per_row=4, y_feature=Y_FEATURE,
                                  fig_w=16, fig_h=7, rotation=0, color='steelblue'):
    """Draw, for each column in ``cols``, a box plot of the column per class of ``y_feature``.

    Parameters
    ----------
    df : DataFrame holding ``cols`` and ``y_feature``.
    cols : list of column names, one subplot each.
    num_cols_per_row : number of subplots per grid row.
    y_feature : name of the target column used on the x axis.
    fig_w, fig_h : figure width and per-row height in inches.
    rotation : rotation applied to the x tick labels.
    color : accepted for signature compatibility; currently not used.
    """
    num_rows, indexes, axs, fig = create_subplots(
        cols, num_cols_per_row, fig_w, fig_h)
    # Flatten whatever plt.subplots returned (Axes, 1-D or 2-D array) so a
    # single-column grid with several rows is handled correctly too.
    flat_axes = list(axs.flat) if isinstance(axs, np.ndarray) else [axs]

    with tqdm(total=100) as pbar:
        progress_unit = 100/len(cols)

        for ax, col in zip(flat_axes, cols):
            # Group by the y_feature argument rather than the global constant.
            sns.boxplot(x=y_feature, y=col, data=df, ax=ax)
            ax.set(title=col, ylabel=None)
            ax.tick_params(axis='x', rotation=rotation)

            pbar.update(progress_unit)
    fig.tight_layout()
    plt.show()
    plt.close()

In [None]:
plot_countplot(insurance_train2, [Y_FEATURE],
               num_cols_per_row=1, fig_w=5, fig_h=4, rotation=90)

In [None]:
plot_countplot_by_class(insurance_train2, cat_attr +
                        [num_attr[2], num_attr[4]], num_cols_per_row=3, fig_w=15, fig_h=4, rotation=90)

In [None]:
insurance_train2['Driving_License'].value_counts(normalize=True)

Check that all *id* values are unique, to confirm that every record is unique.

In [None]:
len(insurance_train2['id'].value_counts().index) == len(insurance_train2)

Drop the *id* column since it is irrelevant for prediction.

In [None]:
insurance_train2 = insurance_train2.drop(columns='id', axis=1)

In [None]:
attr = ['Age', 'Region_Code', 'Policy_Sales_Channel',
        'Vintage', 'Annual_Premium']
plot_hist(insurance_train2, attr, num_cols_per_row=3, fig_w=12, fig_h=4)

Since *Annual_Premium* contains many outliers, we are going to use [robust scaler from scikit-learn API](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler)
to standardize the feature.

In [None]:
compare_distribution_by_class(insurance_train2, cols=['Vintage', 'Age', 'Annual_Premium'],
                              num_cols_per_row=3, fig_w=12, fig_h=5)

## Data Wrangling

Convert *Gender* to *Gender_female* and *Vehicle_Damage* to binary values of 0 or 1.
*Gender_female* would be perfectly negatively correlated with *Gender_male*, so we
only need one of them.

In [None]:
# Gender_female is 0.0 for 'Male' rows and 1.0 for every other row.
index = insurance_train2['Gender'] == 'Male'
insurance_train2['Gender_female'] = np.where(index, 0.0, 1.0)

for column in ('Gender', 'Gender_female'):
    print(insurance_train2[column].value_counts(dropna=False))

insurance_train2 = insurance_train2.drop(columns='Gender')

In [None]:
# 'Yes' becomes 1.0, every other value becomes 0.0.
index = insurance_train2['Vehicle_Damage'] == 'Yes'
insurance_train2['Vehicle_Damage'] = index.astype('float64')

Check for null values in the dataset.

In [None]:
insurance_train2 = insurance_train2.copy()
np.any(insurance_train2.isna(), axis=0)

## Statistical Independence Test

Reassign the types of features.

In [None]:
num_attr = ['Age', 'Vintage']
num_attr_with_outliers = ['Annual_Premium']
binary_cat_attr = ['Driving_License', 'Gender_female',
                   'Vehicle_Damage', 'Previously_Insured']
cat_attr = ['Policy_Sales_Channel', 'Vehicle_Age', 'Region_Code']
Y_FEATURE = 'Response'

### Chi-Squared Test

<p style='text-indent:5.0%;line-height:2.0;text-align:justify;'>
This score can be used to select the n_features features with the highest values for the test chi-squared statistic from X, which must contain only non-negative features such as booleans or frequencies (e.g., term counts in document classification), relative to the classes.
</p>
<p style='text-indent:5.0%;line-height:2.0;text-align:justify;'>
The chi-square test measures dependence between stochastic variables, so using this function "weeds out" the features that are the most likely to be independent of class and therefore irrelevant for classification.
</p>

[Source](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html)

Get the one-hot-encoded categorical variables to perform chi-squared test.

In [None]:
insurance_train2[cat_attr] = insurance_train2[cat_attr].astype('object')
insurance_train2.info()

In [None]:
insurance_train3 = pd.get_dummies(insurance_train2.copy(), columns=cat_attr)
insurance_train3.shape

In [None]:
preprocessed_attr = insurance_train3.columns.tolist()
# Every column whose name starts with one of the categorical feature names,
# grouped in the order of cat_attr.
one_hot_encoded_attr = [
    col
    for attr in cat_attr
    for col in preprocessed_attr
    if re.match(f'^{attr}', col)
]

Calculate the chi-squared values between categorical features and y features.

In [None]:
DECIMAL_PLACE = 5
attr = binary_cat_attr+one_hot_encoded_attr
X = insurance_train3[attr]
y = insurance_train3[Y_FEATURE]

chi2_values, chi2_p_values = chi2(X, y)

# One row per tested feature: statistic rounded to 2 decimals, p-value to 5.
chi2_result = pd.DataFrame(
    {'chi2_value': chi2_values, 'p_value': chi2_p_values}, index=attr)
for column, digits in (('chi2_value', DECIMAL_PLACE - 3),
                       ('p_value', DECIMAL_PLACE)):
    chi2_result[column] = chi2_result[column].apply(
        lambda x, digits=digits: round(x, digits))
chi2_result = chi2_result.sort_values('chi2_value', ascending=False)
chi2_result

Categorical features with a p-value of more than 1% are considered independent of the y feature.

In [None]:
p_value_cutoff = 0.01
# True for features whose (rounded) p-value exceeds the cutoff.
insignificant = chi2_result['p_value'] > p_value_cutoff
# NOTE: despite the "low_p_val" name, this frame holds the rows with a p-value
# ABOVE the cutoff, sorted by ascending p-value.
chi2_result_low_p_val = chi2_result[insignificant].sort_values('p_value')
chi2_result_low_p_val

In [None]:
less_significant_features = chi2_result_low_p_val.index.tolist()
print(
    f'We have eliminated {len(less_significant_features)} features with low chi-squared value.')

### ANOVA f-test

<p style="text-align: justify; line-height: 2.0;">
ANOVA is used when one variable is numeric and one is categorical, such as numerical input variables and a classification target variable in a classification task.
</p>
<p style="text-align: justify; line-height: 2.0;">
The results of this test can be used for feature selection where those features that are
independent of the target variable can be removed from the dataset.
</p>
<p style="text-align: justify; line-height: 2.0;">
Our p-value cutoff is 0.01.
</p>
<p style="text-align: justify; line-height: 2.0;">
Analysis of variance (ANOVA) uses F-tests to statistically assess the equality of means for two or more groups.
Based on the result, Age and Annual Premium have differences in means between the two groups that are statistically significant.
</p>
<p style="text-align: justify; line-height: 2.0;">
Since Vintage has very low f_value and high p_value, we are going to remove this feature.
</p>

In [None]:
attr = num_attr + num_attr_with_outliers
X = insurance_train3[attr]
y = insurance_train3[Y_FEATURE]

anova_f_values, anova_p_values = f_classif(X, y)

# One row per numerical feature: F statistic rounded to 2 decimals, p-value to 5.
anova_result = pd.DataFrame(
    {'f_value': anova_f_values, 'p_value': anova_p_values}, index=attr)
for column, digits in (('f_value', DECIMAL_PLACE - 3),
                       ('p_value', DECIMAL_PLACE)):
    anova_result[column] = anova_result[column].apply(
        lambda x, digits=digits: round(x, digits))
anova_result = anova_result.sort_values('f_value', ascending=False)
anova_result

In [None]:
# Guarded so that re-running this cell neither appends 'Vintage' twice nor
# raises ValueError on a second remove().
if 'Vintage' not in less_significant_features:
    less_significant_features.append('Vintage')
if 'Vintage' in num_attr:
    num_attr.remove('Vintage')

## Correlation

In [None]:
insurance_train4 = insurance_train3.drop(
    columns=less_significant_features, axis=1)
insurance_train4.shape

In [None]:
def correlation_with_output(data, y_feature=Y_FEATURE,
                            method='pearson'):
    """Rank features by the absolute value of their correlation with the target.

    Parameters
    ----------
    data : DataFrame
        Frame holding the features and the ``y_feature`` column.
    y_feature : str
        Name of the target column.
    method : str
        Correlation method forwarded to ``DataFrame.corr``.

    Returns
    -------
    Series of absolute correlations with ``y_feature``, sorted in descending
    order, with the target's own entry removed.

    Notes
    -----
    Earlier versions ignored ``data`` and always read the notebook-level
    ``insurance_train2`` frame. To keep call sites that pass a frame without
    the target column working, that frame is still used as a fallback (with a
    warning) when ``y_feature`` is missing from ``data``.
    """
    if y_feature in data.columns:
        frame = data
    else:
        warnings.warn(
            f'{y_feature!r} is not a column of `data`; falling back to the '
            'global `insurance_train2` frame. Pass a frame that includes the '
            'target column instead.')
        frame = insurance_train2

    cor = frame.corr(method=method)
    # Remove the target by label rather than assuming it sorts first.
    cor_target = cor[y_feature].abs().drop(labels=y_feature)
    return cor_target.sort_values(ascending=False)

None of the features is strongly correlated with the others, since all of them have a low Spearman's correlation coefficient (< 0.5).

In [None]:
attr = num_attr + num_attr_with_outliers
# Include the target column: a correlation with the output cannot be computed
# from a frame that only holds the features.
correlation_with_output(
    insurance_train4[attr + [Y_FEATURE]], method='spearman')

## Data Cleaning Pipeline

In [None]:


class DataCleaner(BaseEstimator, TransformerMixin):
    """Clean the raw insurance frame and drop statistically weak features.

    Cleaning steps (applied by both ``fit`` and ``transform``):

    1. reset the index and drop the ``id`` column;
    2. replace ``Gender`` by ``Gender_female`` (0.0 for 'Male', 1.0 otherwise);
    3. encode ``Vehicle_Damage`` as 1.0 for 'Yes' and 0.0 otherwise;
    4. one-hot encode ``Policy_Sales_Channel``, ``Vehicle_Age`` and
       ``Region_Code``.

    Feature selection: a chi-squared test of the binary and one-hot columns
    against ``Response`` is run, and every column whose p-value (rounded to
    5 decimals) exceeds 0.01 is dropped, together with
    ``features_to_be_dropped``.

    ``fit`` learns the columns to drop and stores them in
    ``features_to_drop_``; ``transform`` then applies that same list, so it
    also works on frames without a ``Response`` column. Calling ``transform``
    on an instance that was never fitted keeps the historical behaviour of
    running the selection on the frame being transformed.

    Parameters
    ----------
    features_to_be_dropped : list of str or None
        Columns removed regardless of the chi-squared outcome. ``None`` keeps
        the historical behaviour of dropping ``'Vintage'``.
    """

    _BINARY_CAT_ATTR = ('Driving_License', 'Gender_female',
                        'Vehicle_Damage', 'Previously_Insured')
    _CAT_ATTR = ('Policy_Sales_Channel', 'Vehicle_Age', 'Region_Code')
    _Y_FEATURE = 'Response'
    _P_VALUE_CUTOFF = 0.01
    _DECIMAL_PLACE = 5
    _DEFAULT_DROPPED = ('Vintage',)

    def __init__(self, features_to_be_dropped=None):
        self.features_to_be_dropped = features_to_be_dropped

    def fit(self, data, y=None):
        """Learn which columns to drop from a labelled frame; returns ``self``."""
        cleaned = self._encode(data)
        self.features_to_drop_ = self._select_features_to_drop(cleaned)
        return self

    def transform(self, data, y=None):
        """Return a cleaned copy of ``data`` with the weak features removed."""
        cleaned = self._encode(data)

        to_drop = getattr(self, 'features_to_drop_', None)
        if to_drop is None:
            # Never fitted: select on the frame itself (needs ``Response``).
            to_drop = self._select_features_to_drop(cleaned)

        # errors='ignore' because a frame other than the fitted one may lack
        # some of the one-hot columns that were marked for removal.
        return cleaned.drop(columns=to_drop, errors='ignore')

    def _encode(self, data):
        """Apply the cleaning steps to a copy of ``data``."""
        data = data.copy().reset_index(drop=True)
        data = data.drop(columns='id')

        data['Gender_female'] = np.where(data['Gender'] == 'Male', 0.0, 1.0)
        data = data.drop(columns='Gender')

        data['Vehicle_Damage'] = (
            data['Vehicle_Damage'] == 'Yes').astype('float64')

        cat_attr = list(self._CAT_ATTR)
        data[cat_attr] = data[cat_attr].astype('object')
        return pd.get_dummies(data, columns=cat_attr)

    def _select_features_to_drop(self, cleaned):
        """Return the columns failing the chi-squared test plus the forced drops."""
        one_hot_attr = [col
                        for attr in self._CAT_ATTR
                        for col in cleaned.columns
                        if col.startswith(attr)]
        tested = list(self._BINARY_CAT_ATTR) + one_hot_attr

        _, p_values = chi2(cleaned[tested], cleaned[self._Y_FEATURE])
        # Round before comparing, as the exploratory analysis does.
        p_values = pd.Series(p_values, index=tested).apply(
            lambda p: round(p, self._DECIMAL_PLACE))
        weak = p_values[p_values > self._P_VALUE_CUTOFF].index.tolist()

        forced = self.features_to_be_dropped
        if forced is None:
            forced = self._DEFAULT_DROPPED
        return weak + [col for col in forced if col not in weak]

## Data Preprocessing Pipeline

In [None]:
class DataPreprocessor():
    """Build the imputation / scaling / encoding pipelines for the dataset.

    Parameters
    ----------
    data : DataFrame
        Frame used to learn the one-hot categories of ``cat_attr``.
    num_attr : list of str or None
        Numerical columns: mean-imputed, then standard-scaled.
    cat_attr : list of str or None
        Categorical columns: most-frequent-imputed, then one-hot encoded.
    y_feature : str
        Name of the target column (stored, not used by the pipelines).
    binary_cat_attr : list of str or None
        Binary columns: most-frequent-imputed only.
    num_attr_with_outliers : list of str or None
        Numerical columns with outliers: most-frequent-imputed, then
        robust-scaled.

    Any group given as ``None`` is left out of the pipelines. Columns that
    belong to no group are passed through unchanged.
    """

    def __init__(self, data, num_attr, cat_attr, y_feature,
                 binary_cat_attr=None, num_attr_with_outliers=None):
        self.num_attr = num_attr
        self.cat_attr = cat_attr
        self.y_feature = y_feature
        self.binary_cat_attr = binary_cat_attr
        self.num_attr_with_outliers = num_attr_with_outliers
        self.data = data
        if cat_attr is not None:
            self.generate_1_hot_enc(self.data)
        else:
            self.one_hot_enc = None

    def get_1_hot_enc(self):
        """Return the fitted ``OneHotEncoder`` (``None`` without ``cat_attr``)."""
        return self.one_hot_enc

    def generate_1_hot_enc(self, data):
        """Fit a ``OneHotEncoder`` on the rows of ``data`` with no missing category."""
        has_missing = data[self.cat_attr].isnull().any(axis=1)
        X_cat = data.loc[~has_missing, self.cat_attr].copy()
        one_hot_enc = OneHotEncoder()
        one_hot_enc.fit(X_cat)
        self.one_hot_enc = one_hot_enc

    def get_1_hot_attr(self):
        """Return the one-hot output column names (empty without ``cat_attr``)."""
        if self.one_hot_enc is None:
            return []
        # get_feature_names was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for older releases.
        if hasattr(self.one_hot_enc, 'get_feature_names_out'):
            return self.one_hot_enc.get_feature_names_out(self.cat_attr)
        return self.one_hot_enc.get_feature_names(self.cat_attr)

    def generate_preprocessed_attr(self):
        """Return the output column names in pipeline order, skipping ``None`` groups."""
        names = []
        for group in (self.num_attr, self.num_attr_with_outliers,
                      self.binary_cat_attr):
            if group is not None:
                names.extend(group)
        names.extend(self.get_1_hot_attr())
        return names

    def preprocessX(self, X):
        """Fit the standardizing pipeline on ``X`` and return the result as a DataFrame."""
        return self._preprocess_to_frame(self.make_preprocess_pipeline(), X)

    def preprocessX_without_standardization(self, X):
        """Same as ``preprocessX`` but with imputation and encoding only (no scaling)."""
        return self._preprocess_to_frame(self.make_preprocess_pipeline2(), X)

    def _preprocess_to_frame(self, pipeline, X):
        """Run ``pipeline.fit_transform(X)`` and label the output columns."""
        transformed = pipeline.fit_transform(X)
        if hasattr(transformed, 'toarray'):
            # Sparse output (produced by the one-hot step) -> dense array.
            transformed = transformed.toarray()

        names = list(self.generate_preprocessed_attr())
        if isinstance(X, pd.DataFrame):
            # remainder='passthrough' appends the columns that belong to no
            # group, in their original order, after the transformed ones.
            consumed = set()
            for group in (self.num_attr, self.num_attr_with_outliers,
                          self.binary_cat_attr, self.cat_attr):
                if group is not None:
                    consumed.update(group)
            names += [col for col in X.columns if col not in consumed]

        return pd.DataFrame(transformed, columns=names)

    def _transformer_specs(self, standardize):
        """Return ``(name, pipeline, columns)`` for every configured column group."""
        def imputed(strategy, scaler=None):
            steps = [SimpleImputer(strategy=strategy)]
            if standardize and scaler is not None:
                steps.append(scaler)
            return make_pipeline(*steps)

        # Reuse the categories learnt from the full frame so every fit yields
        # the same one-hot columns; 'auto' is the encoder's own default.
        if self.one_hot_enc is None:
            categories = 'auto'
        else:
            categories = self.one_hot_enc.categories_

        categorical_pipeline = make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(categories=categories)
        )

        candidates = [
            ('num_pp', imputed('mean', StandardScaler()), self.num_attr),
            ('robust_num_pp', imputed('most_frequent', RobustScaler()),
             self.num_attr_with_outliers),
            ('bin_cat_pp', imputed('most_frequent'), self.binary_cat_attr),
            ('cat_pp', categorical_pipeline, self.cat_attr),
        ]
        return [spec for spec in candidates if spec[2] is not None]

    # Data Preprocessing Pipeline
    def make_preprocess_pipeline(self):
        """Return the ``ColumnTransformer`` with imputation, scaling and encoding."""
        return ColumnTransformer(
            self._transformer_specs(standardize=True), remainder='passthrough')

    def make_preprocess_pipeline2(self):
        """Return the ``ColumnTransformer`` with imputation and encoding only."""
        specs = self._transformer_specs(standardize=False)
        return make_column_transformer(
            *[(pipeline, columns) for _, pipeline, columns in specs],
            remainder='passthrough')

    # Training pipeline
    def make_training_pipeline(self, ml_model):
        """Return a pipeline chaining the standardizing preprocessor and ``ml_model``."""
        return make_pipeline(
            self.make_preprocess_pipeline(),
            ml_model
        )

## Data Preparation Pipeline

In [None]:
class DataPreparator(BaseEstimator, TransformerMixin):
    """Shuffle a labelled frame, type its columns and split it into train/test.

    Parameters
    ----------
    y_feature : str
        Name of the target column.
    test_size : float
        Fraction of the rows assigned to the test split.
    suppress_print : bool
        When True, nothing is printed by ``fit`` or ``split_data``.
    """

    def __init__(self, y_feature, test_size=0.20, suppress_print=False):
        self.num_attr = []
        self.cat_attr = []
        self.y_feature = y_feature
        self.test_size = test_size
        self.suppress_print = suppress_print

    def get_num_attr(self):
        """Return the numerical feature names found by the last ``fit``."""
        return self.num_attr

    def get_cat_attr(self):
        """Return the categorical feature names found by the last ``fit``."""
        return self.cat_attr

    def get_num_cat_attrs(self, num_dtype='float64'):
        """Classify the columns of ``self.data`` as numerical or categorical.

        ``float32`` columns are kept as they are; ``float64`` and ``int64``
        columns are cast to ``num_dtype``. Every other column is categorical.
        The target column is removed from whichever list it landed in.
        """
        # Start from empty lists so that fitting twice does not duplicate names.
        self.num_attr = []
        self.cat_attr = []

        num_dtypes = ['float64', 'int64']

        for col in list(self.data.columns):
            dtype = self.data[col].dtype
            if dtype == 'float32':
                self.num_attr.append(col)
            elif dtype in num_dtypes:
                self.data[col] = self.data[col].astype(num_dtype)
                self.num_attr.append(col)
            else:
                self.cat_attr.append(col)

        if self.y_feature in self.num_attr:
            self.num_attr.remove(self.y_feature)
        else:
            self.cat_attr.remove(self.y_feature)

        if not self.suppress_print:
            print(f'Numerical attributes: {self.num_attr}')
            print(f'Categorical attributes: {self.cat_attr}')

    def shuffle_data(self):
        """Shuffle the rows of ``self.data`` (NumPy global RNG) and reset the index."""
        self.data = self.data.iloc[np.random.permutation(
            len(self.data))].reset_index(drop=True)

    def split_data(self, test_size=None, combine_X_and_y=False):
        """Stratified train/test split of ``self.data`` on the target column.

        Parameters
        ----------
        test_size : float or None
            Overrides the ``test_size`` given at construction when not None.
        combine_X_and_y : bool
            When True, return ``(train, test)`` frames that include the target
            instead of ``(X_train, X_test, y_train, y_test)``.
        """
        if test_size is None:
            test_size = self.test_size

        X = self.data.drop(columns=self.y_feature)
        y = self.data[self.y_feature]

        # Only the first split is consumed, so there is no need to configure more.
        split = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                       random_state=RANDOM_SEED)

        train_index, test_index = next(split.split(X, y))
        # The splitter yields positions, so index positionally on both X and y.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        if not self.suppress_print:
            print(f'Train X   : {X_train.shape}')
            print(f'Train y   : {y_train.shape}')
            print(f'Test X   : {X_test.shape}')
            print(f'Test y   : {y_test.shape}')

        if combine_X_and_y:
            train = pd.concat([X_train, y_train], axis=1)
            test = pd.concat([X_test, y_test], axis=1)
            return train, test

        return X_train, X_test, y_train, y_test

    def fit(self, data, y=None):
        """Store a shuffled copy of ``data`` and classify its columns."""
        self.data = data.copy()
        self.shuffle_data()
        self.get_num_cat_attrs()
        return self

    def transform(self, data=None, y=None):
        """Split the frame stored by ``fit``; the ``data`` argument is ignored."""
        return self.split_data()

# Model Selection

## Preparing Data


We are only going to train on 10% of the dataset to save time,
since overwhelmingly large samples will not improve models' performance in
any significant manner.

In [None]:
# Cleaning data
final_insurance_train = DataCleaner().fit_transform(insurance_train.copy())
final_insurance_train.shape

In [None]:
insurance_train2.info()

In [None]:
cat_attr = ['Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel']
preprocessed_attr = final_insurance_train.columns.tolist()
# Every remaining column whose name starts with one of the categorical
# feature names, grouped in the order of cat_attr.
one_hot_encoded_attr = [
    col
    for attr in cat_attr
    for col in preprocessed_attr
    if re.match(f'^{attr}', col)
]

In [None]:
num_attr = ['Age']
num_attr_with_outliers = ['Annual_Premium']
binary_cat_attr = ['Gender_female', 'Vehicle_Damage',
                   'Previously_Insured']+one_hot_encoded_attr
cat_attr = None
Y_FEATURE = 'Response'

In [None]:
final_insurance_train.drop(
    columns=one_hot_encoded_attr, axis=1).columns.tolist()

In [None]:
# Train test split
X_train, X_test, y_train, y_test = DataPreparator(
    Y_FEATURE, test_size=0.90).fit_transform(final_insurance_train)

# Preparing training pipeline
preprocessor = DataPreprocessor(final_insurance_train, num_attr, cat_attr, Y_FEATURE,
                                binary_cat_attr, num_attr_with_outliers)
training_pipeline = preprocessor.make_training_pipeline

X_train.head(5)

## Preparing Classifiers

We are going to omit machine learning algorithms like
K Nearest Neighbors and Support Vector Machines since the
training set is too large for them.

In [None]:
short_names = ['log_reg', 'neural_network', 'decision_tree', 'rand_forest',
               'extra_tree', 'ada_boost_cf', 'gradient_b_cf', 'bagging_cf',
               'catboost_cf', 'xg_boost']

names = ['Logistic Classifier', 'Multi-layer Perceptron classifier',
         'Decision Tree', 'Random forest Classifier', 'Extra Tree Classifier',
         'AdaBoost Classifier', 'Gradient Boosting Classifier',
         'Bagging Classifier', 'CatBoost Classifier', 'XGBClassifier']

functions = [
    LogisticRegression(random_state=RANDOM_SEED, n_jobs=-1, max_iter=1000),
    MLPClassifier(random_state=RANDOM_SEED, early_stopping=True),
    # Seeded like every other model so the results are reproducible.
    DecisionTreeClassifier(random_state=RANDOM_SEED),
    RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=-1),
    ExtraTreesClassifier(random_state=RANDOM_SEED, n_jobs=-1),
    AdaBoostClassifier(random_state=RANDOM_SEED),
    GradientBoostingClassifier(random_state=RANDOM_SEED),
    BaggingClassifier(random_state=RANDOM_SEED, n_jobs=-1),
    CatBoostClassifier(random_seed=RANDOM_SEED, silent=True),
    XGBClassifier(random_state=RANDOM_SEED, n_jobs=-1)
]

classifiers_idx = {}
classifiers = {}

# Zip all classifiers together into dictionaries for convenient access:
# classifiers_idx is keyed by position, classifiers by short name.
for idx, (s_name, name, func) in enumerate(zip(short_names, names, functions)):
    classifiers_idx[idx] = {'name': name, 'func': func}
    classifiers[s_name] = {'name': name, 'func': func}

## Phase 1: Performance Measure

In [None]:
def get_models_performance(models, X, y, training_pipeline, n_splits,
                           scoring_metrics, random_state=RANDOM_SEED):
    """Cross-validate every model and collect the mean/std of each metric.

    Parameters
    ----------
    models : dict or sequence
        Entries of the form ``{'name': str, 'func': estimator}``. A container
        indexable by ``0..n-1`` is visited in that order; any other mapping is
        visited in iteration order of its values.
    X, y : DataFrame, Series
        Features and target; their indexes are reset before splitting.
    training_pipeline : callable
        Takes an estimator and returns the pipeline to cross-validate.
    n_splits : int
        Number of stratified, shuffled folds.
    scoring_metrics : list of str
        Scorer names passed to ``cross_validate``.
    random_state : int
        Seed of the fold shuffling.

    Returns
    -------
    DataFrame with one row per model: ``model_name``, ``duration`` (seconds),
    and ``{train,test}_<metric>`` / ``{train,test}_<metric>_std`` columns.
    """
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

    cv = StratifiedKFold(n_splits=n_splits,
                         shuffle=True, random_state=random_state)

    mean_cols = []
    std_cols = []
    for name in ['train', 'test']:
        mean_cols += [f'{name}_{metric}' for metric in scoring_metrics]
        std_cols += [f'{name}_{metric}_std' for metric in scoring_metrics]

    results = {'model_name': [], 'duration': []}
    for col in mean_cols + std_cols:
        results[col] = []

    try:
        entries = [models[idx] for idx in range(len(models))]
    except (KeyError, IndexError):
        # Mapping keyed by something other than 0..n-1 (e.g. short names).
        entries = list(models.values())

    # Loop through all models
    for entry in entries:
        cf_name = entry['name']

        print(f'{cf_name} has started...')
        # perf_counter is monotonic, unlike the wall clock.
        start = time.perf_counter()

        # Clone so the estimator stored in `models` is never fitted itself.
        ml_pipeline = training_pipeline(clone(entry['func']))
        # return_train_score=True adds the train_* scores next to the test_* ones.
        cv_scores = cross_validate(ml_pipeline, X, y,
                                   scoring=scoring_metrics, cv=cv,
                                   return_train_score=True)

        duration = time.perf_counter() - start
        print(f'{cf_name} ended in {duration} seconds.\n')

        updateRecord(results, cv_scores, mean_cols, std_cols,
                     cf_name, duration, scoring_metrics)

    # Return as DataFrame instead of dictionary
    return pd.DataFrame(results)

# Append values to the dictionary based on key_name passed into the function


def updateRecord(df, scores, mean_cols, std_cols, model_name, duration, scoring_metrics):
    """Append one model's aggregated cross-validation scores to ``df``.

    ``df`` is a dict of lists (one list per output column) and is modified in
    place. For every (mean column, std column) pair, the per-fold values found
    in ``scores`` under the mean column's name are reduced to their mean and
    standard deviation. ``scoring_metrics`` is accepted but not used.
    """
    df['model_name'].append(model_name)
    df['duration'].append(duration)
    for score_key, spread_key in zip(mean_cols, std_cols):
        fold_scores = scores[score_key]
        df[score_key].append(np.mean(fold_scores))
        df[spread_key].append(np.std(fold_scores))


def sortValues(df, cols, sort_idx, ascending=False):
    """Rank models by metric and render each metric as 'mean +/-std'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'model_name', every metric listed in ``cols`` and the
        matching std column: '<metric>_std', or '<name>_std_before' /
        '<name>_std_after' for metrics named '<name>_before' / '<name>_after'.
    cols : list of str
        Metric columns to display. 'model_name' and 'duration' entries are
        ignored. The list itself is never modified.
    sort_idx : int or list of int
        Position(s), within the filtered ``cols``, of the column(s) to sort by.
    ascending : bool or list of bool
        Passed to ``DataFrame.sort_values``.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'model_name' followed by the metric columns, whose
        cells are strings formatted to four decimals; rows are ordered by the
        numeric metric values.
    """
    df = df.copy()
    # Filter into a fresh list so the caller's list is left untouched and both
    # bookkeeping columns are dropped independently of each other.
    metric_cols = [col for col in cols if col not in ('model_name', 'duration')]

    if isinstance(sort_idx, list):
        sort_cols = [metric_cols[i] for i in sort_idx]
    else:
        sort_cols = [metric_cols[sort_idx]]

    suffix_pattern = re.compile('(?:^.+)(_after|_before)$')
    std_col_of = {}
    for col in metric_cols:
        match = suffix_pattern.search(col)
        if not match:
            col_std = f'{col}_std'
        else:
            # '<name>_before' pairs with '<name>_std_before' (same for _after)
            suffix = match.group(1)
            col_std = f'{col[:-len(suffix)]}_std{suffix}'
        df[col] = df[col].astype('float64')
        df[col_std] = df[col_std].astype('float64')
        std_col_of[col] = col_std

    # Sort while the metrics are still numeric: sorting the formatted strings
    # would be lexicographic ('9.0000' would rank above '10.0000').
    df = df.sort_values(sort_cols, ascending=ascending)

    for col, col_std in std_col_of.items():
        df[col] = [f'{mean:.4f} +/-{std:.4f}'
                   for mean, std in zip(df[col], df[col_std])]

    return df[['model_name'] + metric_cols]

**Note**: Results are loaded from the pickle file because it takes time to run. Please run the code below if you insist.

```python
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

scoring_metrics = ['f1', 'roc_auc', 'precision', 'recall']
performance_results = get_models_performance(classifiers_idx, X_train, y_train,
                                             training_pipeline, 5, scoring_metrics)

dump_objects('performance', performance_results)
```

In [None]:
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

scoring_metrics = ['f1', 'roc_auc', 'precision', 'recall']

[performance_results] = load_objects('performance')

**Models like Gradient Boosting Classifier, AdaBoost Classifier, Multi-layer Perceptron Classifier** perform and generalise well.

However, **Random Forest Classifier, Extra Tree Classifier, Bagging Classifier and Decision Tree** severely
overfit the train dataset.

So let's check whether increasing the train size from 10% to 50% improves these classifiers.

In [None]:
roc = ['train_roc_auc', 'test_roc_auc']
sortValues(performance_results, roc, 1)

In [None]:
# Re-split the data with test_size=0.50 for the larger-training-set experiment
# (assumes DataPreparator's test_size is the held-out fraction — TODO confirm).
X2_train, X2_test, y2_train, y2_test = DataPreparator(
    Y_FEATURE, test_size=0.50).fit_transform(final_insurance_train)

# Re-key a subset of the classifiers to 0..3 so that get_models_performance
# can walk them by position.
# NOTE(review): 2, 3, 4, 7 are presumably the positions of the four
# overfitting classifiers named in the text above — verify against `names`.
classifiers_idx2 = {}
for idx, key in enumerate([2, 3, 4, 7]):
    classifiers_idx2[idx] = classifiers_idx[key]

**Note**: Results are loaded from the pickle file because it takes time to run. Please run the code below if you insist.

```python
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

performance_results2 = get_models_performance(classifiers_idx2, X2_train, y2_train,
                                              training_pipeline, 5, scoring_metrics)

dump_objects('performance2', performance_results2)
```

In [None]:
[performance_results2] = load_objects('performance2')

The results are quite depressing haha. Well, we stick with train size of 10%.

In [None]:
roc = ['train_roc_auc', 'test_roc_auc']
sortValues(performance_results2, roc, 1)

<p style="text-align: justify; line-height: 2.0;">
I will choose the Logistic Classifier, the XGB Classifier and the Random Forest Classifier.
Besides, I am also going to tune the XGB Classifier and the Random Forest Classifier, since they tend to
outperform most models after some hyperparameter tweaking.
</p>

## Feature Selection

<p style="text-align: justify; line-height: 2.0;">
I would like to explore some of the feature selection methods available in the Scikit-Learn API.
I want to know whether these methods are useful for dimensionality reduction without sacrificing too much of
the models' overall performance.
</p>

Two popular feature selection techniques that can be used for numerical input data and a categorical (class) target variable:
- ANOVA F-test
- Mutual Information Statistics

Since we have already tried the ANOVA F-test, we will now try Mutual Information Statistics.

Estimate mutual information for a discrete target variable.

<p style="text-align: justify; line-height: 2.0;">
Mutual information (MI) between two random variables is a non-negative value, which measures the dependency between the variables. It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.
</p>

[Source](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html)

**Note**: Results are loaded from the pickle file because it takes time to run. Please run the code below if you insist.

```python
from sklearn.feature_selection import mutual_info_classif

results = mutual_info_classif(X_train, y_train, discrete_features='auto', n_neighbors=3, random_state=RANDOM_SEED)
result_arr = np.column_stack((list(X_train.columns), results))

MI = 'Mutual Information'
MI_results = pd.DataFrame(result_arr, columns=['col_name', MI])
dump_objects('MI_results', MI_results)
```

We try and see if removing features with value 0 in mutual information will help in
reducing model complexity without introducing too much bias to our model.

In [None]:
MI = 'Mutual Information'
[MI_results] = load_objects('MI_results')

# The cached table was built with np.column_stack next to the column names,
# so the scores are cast to float before any numeric comparison.
MI_results[MI] = MI_results[MI].astype('float64')
# Keep only the features whose mutual information score is non-zero,
# strongest first.
MI_useful = MI_results.loc[MI_results[MI] != 0.0]
MI_useful = MI_useful.sort_values(MI, ascending=False).reset_index(drop=True)
print(f'Total of useful features: {len(MI_useful)}.')
MI_useful.head(20)

In [None]:
# Restrict the training frame to the features with a non-zero mutual
# information score (plus the target column).
useful_features = list(MI_useful['col_name'])
final_insurance_train2 = final_insurance_train[useful_features+[
    Y_FEATURE]].copy()

# Train test split
# NOTE(review): this rebinds X2_train / X2_test / y2_train / y2_test, which
# an earlier cell used for the test_size=0.50 split.
data_prep2 = DataPreparator(Y_FEATURE, test_size=0.90)
X2_train, X2_test, y2_train, y2_test = data_prep2.fit_transform(
    final_insurance_train2)
num_attr2, cat_attr2 = data_prep2.get_num_attr(), data_prep2.get_cat_attr()

# 'Annual_Premium' is passed separately as the attribute with outliers, so it
# is taken out of the regular numeric attributes first.
num_attr2.remove('Annual_Premium')
num_attr_with_outliers2 = ['Annual_Premium']

preprocessor2 = DataPreprocessor(final_insurance_train2, num_attr2, None, Y_FEATURE,
                                 cat_attr2, num_attr_with_outliers2)
# Bound method kept as a factory: called with an estimator wherever a
# trainable pipeline is needed.
training_pipeline2 = preprocessor2.make_training_pipeline

**Note**: Results are loaded from the pickle file because it takes time to run. Please run the code below if you insist.

```python
performance_results3 = get_models_performance(classifiers_idx, X2_train, y2_train,
                                             training_pipeline2, 5, scoring_metrics)

dump_objects('performance3', performance_results3)
```

In [None]:
[performance_results3] = load_objects('performance3')

Hmm, by comparing *performance_results3* with *performance_results*, we notice a small performance boost.

<p style="text-align: justify; line-height: 2.0;">
So, I am going to choose the Gradient Boosting Classifier, the Multi-layer Perceptron classifier, the Logistic Classifier,
    the XGB Classifier, and the Random Forest Classifier.
Besides, I am also going to tune the XGB Classifier and the Random Forest Classifier, since they tend to
outperform most models after some hyperparameter tweaking.
</p>

In [None]:
sortValues(performance_results3, roc, 1)

In [None]:
sortValues(performance_results, roc, 1)

Reset train and test set since we found a better combination of features.

In [None]:
# Adopt the mutual-information feature subset as the working training frame.
useful_features = list(MI_useful['col_name'])
final_insurance_train = final_insurance_train[useful_features+[
    Y_FEATURE]].copy()

# Train test split
data_prep = DataPreparator(Y_FEATURE, test_size=0.90)
X_train, X_test, y_train, y_test = data_prep.fit_transform(
    final_insurance_train)
cat_attr = data_prep.get_cat_attr()
# Keep the attributes with outliers out of the regular numeric attributes,
# as the mutual-information experiment cell does for 'Annual_Premium';
# otherwise the same column is handed to the preprocessor in both groups.
num_attr = [attr for attr in data_prep.get_num_attr()
            if attr not in num_attr_with_outliers]

preprocessor = DataPreprocessor(final_insurance_train, num_attr, None, Y_FEATURE,
                                cat_attr, num_attr_with_outliers)
# Bound method kept as a factory: called with an estimator wherever a
# trainable pipeline is needed.
training_pipeline = preprocessor.make_training_pipeline

## Hyperparameter Tuning

<p style="text-indent: 5.0%; text-align: justify; line-height: 2.0;">
We are going to use randomized search instead of grid search since it takes too much time to go through all possibilities.
</p>

<p style="text-indent: 5.0%; text-align: justify; line-height: 2.0;">
We are going to choose the best combination of hyperparameters sorted by num_trials, mean_cv_score and mean_test_score. Assuming X_train and y_train are the initial training set we feed into the algorithm, the algorithm will perform a nested cross-validated randomized search by splitting X_train and y_train into k1 outer folds, X_outer_train and y_outer_train. Then, the algorithm will split X_outer_train and y_outer_train into k2 inner folds, X_inner_train and y_inner_train. num_trials is the number of times a combination of hyperparameters was explored by the randomized search across the k2-inner-fold sets. mean_test_score is the average test score over the k2-inner-fold sets for each hyperparameter combination. mean_cv_score is the average cross-validated score across all k1-outer-fold sets for each hyperparameter combination.
</p>

<p style="text-indent: 5.0%; text-align: justify; line-height: 2.0;">
To put it simply, the k2-inner-fold sets are used to find the hyperparameter combination of the best estimator, while the k1-outer-fold sets are used to evaluate that best estimator with that hyperparameter combination. The purpose is to prevent the randomized search from producing overly optimistic results, which would cause the model to overfit the original training set and fail to generalize to real-world data with different variations and distributions.
</p>

In [None]:
def nested_cv_param_search(training_pipeline, param_grid, X, y,
                           n_iter, scoring, n_outer_splits, n_inner_spits,
                           random_state=RANDOM_SEED):
    """Randomized hyperparameter search wrapped in an outer cross-validation.

    Pass 1: for every outer fold, a ``RandomizedSearchCV`` with ``n_iter``
    candidates and ``n_inner_spits`` shuffled stratified inner folds is run on
    the outer-training rows, and its per-candidate results are collected.
    Pass 2: every candidate explored in pass 1 is fitted on each outer-training
    part and scored with ROC AUC (from ``predict_proba``) on the matching
    outer-test part.

    Parameters
    ----------
    training_pipeline : estimator
        Pipeline to tune. It is cloned before being fitted, so the object
        passed in is left unfitted and unmodified.
    param_grid : dict
        Parameter distributions for ``RandomizedSearchCV``.
    X, y : pandas objects
        Training features and target; their index is reset internally.
    n_iter : int
        Candidates sampled per outer fold.
    scoring : str
        Scorer used by the inner search. The outer score is always ROC AUC.
    n_outer_splits, n_inner_spits : int
        Number of outer (unshuffled) and inner (shuffled) stratified folds.
    random_state : int
        Seed for the inner fold shuffling.

    Returns
    -------
    pandas.DataFrame
        One row per explored combination with the columns
        'params' (string form of the parameter dict),
        'mean_test_score' / 'std_test_score' (mean and standard deviation,
        across outer folds, of the inner-search mean test score; the standard
        deviation is NaN for a combination seen only once),
        'num_trials' (number of outer folds in which it was sampled),
        'mean_cv_score' / 'std_cv_score' (outer-fold ROC AUC).
    """
    cv_outer = StratifiedKFold(n_splits=n_outer_splits, shuffle=False)
    cv_inner = StratifiedKFold(n_splits=n_inner_spits, shuffle=True,
                               random_state=random_state)

    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

    # The outer splitter does not shuffle, so the folds can be computed once
    # and reused by both passes.
    outer_folds = list(cv_outer.split(X, y))

    features = ['params', 'mean_test_score', 'std_test_score']
    inner_results = []

    with tqdm(total=100) as pbar:
        # Pass 1 accounts for 75% of the progress bar, pass 2 for the rest.
        progress_unit = 75 / n_outer_splits

        for train_ix, _ in outer_folds:
            # refit=False: only cv_results_ is needed here; every candidate is
            # refitted per outer fold in pass 2.
            search = RandomizedSearchCV(training_pipeline, param_grid, n_iter=n_iter,
                                        scoring=scoring, cv=cv_inner, refit=False)
            search.fit(X.iloc[train_ix], y.iloc[train_ix])
            inner_results.append(pd.DataFrame(search.cv_results_)[features])

            pbar.update(progress_unit)

        base = pd.concat(inner_results, ignore_index=True)

        # Parameter dicts are not hashable; their string form is the group key.
        base['params'] = base['params'].astype('str')
        agg_mean = base.groupby('params')['mean_test_score']
        param_result = pd.DataFrame({
            'mean_test_score': agg_mean.mean(),
            'std_test_score': agg_mean.std(),
            'num_trials': agg_mean.count(),
        }).reset_index()

        # Float placeholders, so the scores assigned below keep the dtype.
        param_result['mean_cv_score'] = 0.0
        param_result['std_cv_score'] = 0.0

        progress_unit = 25 / (n_outer_splits * len(param_result))

        for row, param in enumerate(param_result['params']):
            # Work on a clone so the caller's pipeline is never reconfigured.
            candidate = clone(training_pipeline)
            candidate.set_params(**ast.literal_eval(param))

            fold_scores = []
            for train_ix, test_ix in outer_folds:
                candidate.fit(X.iloc[train_ix], y.iloc[train_ix])
                y_test_probas = candidate.predict_proba(X.iloc[test_ix])
                # The last probability column is the positive class
                fold_scores.append(
                    roc_auc_score(y.iloc[test_ix], y_test_probas[:, -1]))

                pbar.update(progress_unit)

            # param_result has a fresh 0..n-1 index, so `row` is also the label
            param_result.loc[row, 'mean_cv_score'] = np.mean(fold_scores)
            param_result.loc[row, 'std_cv_score'] = np.std(fold_scores)

    return param_result

<p style='text-align:justify;line-height:2.0;'>
We are going to manually search for a reasonable range for each hyperparameter by plotting its validation curve.
By doing so, we reduce the burden on the randomized search and obtain better results.
</p>

In [None]:
def plot_validation_curve(model, X, y, param_name, param_range,
                          cv=3, scoring='roc_auc', figsize=(6, 6),
                          random_state=RANDOM_SEED):
    """Plot train and cross-validation scores over a range of one parameter.

    ``model`` is evaluated with ``validation_curve`` for every value in
    ``param_range`` of the parameter ``param_name``, using ``cv`` shuffled
    stratified folds seeded with ``random_state``. The fold-averaged train and
    validation scores are drawn on a new figure, and the value with the
    highest average validation score is printed. Nothing is returned.
    """
    splitter = StratifiedKFold(n_splits=cv, shuffle=True,
                               random_state=random_state)

    # Score arrays have one row per parameter value and one column per fold
    train_score, test_score = validation_curve(model, X, y, param_name=param_name,
                                               param_range=param_range, cv=splitter,
                                               scoring=scoring,
                                               verbose=1, n_jobs=-1)
    avg_train_score = train_score.mean(axis=1)
    avg_test_score = test_score.mean(axis=1)

    fig, ax = plt.subplots(figsize=figsize)
    curves = (
        (avg_train_score, "Training Score"),
        (avg_test_score, "Cross-Validation Score"),
    )
    for averaged, caption in curves:
        ax.plot(param_range, averaged, label=caption)
    ax.set(xlabel=param_name, ylabel=scoring,
           title=f'Cross Validation Curve {scoring} against {param_name}')
    ax.legend(loc="best")
    ax.grid()
    plt.show()

    # Position of the parameter value with the highest averaged CV score
    best_pos = np.argsort(avg_test_score)[-1]

    print(f'{param_name} with value {param_range[best_pos]}' +
          f' generates the highest CV score: {avg_test_score[best_pos]:.4f}')

## Model 1: Random Forest Classifier

In [None]:
rand_forest = RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=-1)

In [None]:
key = 'randomforestclassifier__n_estimators'
rand_forest_param = [10, 50, 100, 150, 200]

plot_validation_curve(training_pipeline(rand_forest),
                      X_train, y_train, key, rand_forest_param)

In [None]:
key = 'randomforestclassifier__max_depth'
rand_forest_param = [10, 30, 50, 70, 90]

plot_validation_curve(training_pipeline(rand_forest),
                      X_train, y_train, key, rand_forest_param)

In [None]:
key = 'randomforestclassifier__min_samples_split'
rand_forest_param = [10, 100, 300, 500, 700, 900]

plot_validation_curve(training_pipeline(rand_forest),
                      X_train, y_train, key, rand_forest_param)

In [None]:
key = 'randomforestclassifier__min_samples_leaf'
rand_forest_param = [1, 2, 5, 10, 20, 30, 40]

plot_validation_curve(training_pipeline(rand_forest),
                      X_train, y_train, key, rand_forest_param)

We are going to tune *max_depth, min_samples_split, and min_samples_leaf* for Random Forest Classifier.

In [None]:
rand_forest = RandomForestClassifier(
    n_estimators=100, random_state=RANDOM_SEED, n_jobs=-1)

rand_forest_param_grid = {
    'randomforestclassifier__max_depth': [10, 20, 30],
    'randomforestclassifier__min_samples_split': [200, 300, 400],
    'randomforestclassifier__min_samples_leaf': [5, 10, 15],
}

Here's the nested cross validated scores from randomized search for each explored hyperparameters' combinations for Random Forest Classifier.

**Note**: Results are loaded from the pickle file because it takes time to run. Please run the code below if you insist.

```python
rand_forest_param_result = nested_cv_param_search(training_pipeline(rand_forest), rand_forest_param_grid,
                                                  X_train, y_train, 9, 'roc_auc', 5, 5)

dump_objects('rand_forest_cf_cv_rand_search', rand_forest_param_result)
```

In [None]:
[rand_forest_param_result] = load_objects('rand_forest_cf_cv_rand_search')
rand_forest_param_result

In [None]:
rand_forst_param_ranked = rand_forest_param_result.sort_values(
    ['mean_test_score', 'mean_cv_score'], ascending=False).reset_index(drop=True)
rand_forst_param_ranked

The best combination of hyperparameters for Random Forest Classifier.

In [None]:
# The ranked table is sorted best-first and re-indexed from 0, so label 0 is
# the top-ranked combination (the same row the XGBoost cell selects).
rand_forest_best_param = rand_forst_param_ranked.loc[0, 'params']
rand_forest_best_param

## Model 2: XGBoost Classifier

In [None]:
xgb_cf = XGBClassifier(random_state=RANDOM_SEED, n_jobs=-1)

In [None]:
key = 'xgbclassifier__n_estimators'
xgb_boost_param = [5, 10, 15, 30, 60]

plot_validation_curve(training_pipeline(xgb_cf), X_train,
                      y_train, key, xgb_boost_param)

In [None]:
key = 'xgbclassifier__max_depth'
xgb_boost_param = [5, 10, 15, 20, 25, 30]

plot_validation_curve(training_pipeline(xgb_cf), X_train,
                      y_train, key, xgb_boost_param)

In [None]:
key = 'xgbclassifier__learning_rate'
xgb_boost_param = [0.01, 0.05, 0.08, 0.12, 0.15]

plot_validation_curve(training_pipeline(xgb_cf), X_train,
                      y_train, key, xgb_boost_param)

In [None]:
key = 'xgbclassifier__min_child_weight'
xgb_boost_param = [5, 15, 20, 25, 30, 50, 80, 100]

plot_validation_curve(training_pipeline(xgb_cf), X_train,
                      y_train, key, xgb_boost_param)

In [None]:
key = 'xgbclassifier__subsample'
xgb_boost_param = [0.9, 0.99, 0.999, 0.9999]

plot_validation_curve(training_pipeline(xgb_cf), X_train,
                      y_train, key, xgb_boost_param)

In [None]:
key = 'xgbclassifier__colsample_bytree'
xgb_boost_param = [0.1, 0.3, 0.5, 0.7, 0.9]

plot_validation_curve(training_pipeline(xgb_cf), X_train,
                      y_train, key, xgb_boost_param)

We are going to tune *min_child_weight, learning_rate, and max_depth* for XGBoost Classifier.

In [None]:
xgb_cf = XGBClassifier(n_estimators=50, colsample_bytree=0.3,
                       random_state=RANDOM_SEED, n_jobs=-1)

xgb_cf_param_grid = {
    'xgbclassifier__min_child_weight': [10, 25, 40],
    'xgbclassifier__learning_rate': [0.01, 0.05, 0.08],
    'xgbclassifier__max_depth': [5, 10]
}

Here's the nested cross validated scores from randomized search for each explored hyperparameters' combinations for XGBoost Classifier.

**Note**: Results are loaded from the pickle file because it takes time to run. Please run the code below if you insist.

```python
xgb_cf_param_result = nested_cv_param_search(training_pipeline(xgb_cf), xgb_cf_param_grid,
                                             X_train, y_train, 8, 'roc_auc', 5, 5)

dump_objects('xgb_cf_cv_rand_search', xgb_cf_param_result)
```

In [None]:
[xgb_cf_param_result] = load_objects('xgb_cf_cv_rand_search')
xgb_cf_param_result

In [None]:
xgb_cf_param_ranked = xgb_cf_param_result.sort_values(
    ['mean_test_score', 'mean_cv_score'], ascending=False).reset_index(drop=True)
xgb_cf_param_ranked

The best combination of hyperparameters for XGBoost Classifier.

In [None]:
xgb_cf_best_param = xgb_cf_param_ranked.loc[0, 'params']
xgb_cf_best_param

# Testing the Final Models

Now, we will set the optimal hyperparameter based on the results from randomized search just now.

In [None]:
def set_optimal_param(model, param):
    """Return a clone of ``model`` configured with tuned hyperparameters.

    Parameters
    ----------
    model : estimator
        Template estimator; it is cloned, never modified.
    param : str
        String form of a parameter dict, as stored in the 'params' column
        produced by ``nested_cv_param_search``. Keys may carry a pipeline step
        prefix ('<step>__<parameter>'); the first such prefix is removed so
        the parameters can be set on the bare estimator.

    Returns
    -------
    estimator
        A new, unfitted estimator with the parameters applied.
    """
    tuned = ast.literal_eval(param)
    # Strip the prefix on the parsed keys rather than on the text: this also
    # works for step names containing underscores and never touches values.
    bare_params = {key.split('__', 1)[-1]: value for key, value in tuned.items()}

    new_model = clone(model)
    new_model.set_params(**bare_params)
    return new_model

Now, we are going to combine Gradient Boosting Classifier, Multi-layer Perceptron classifier, Logistic Classifier,
tuned XGB Classifier, and tuned Random Forest Classifier.

In [None]:
# Apply the selected hyperparameters to fresh copies of the two tuned models
optimal_xg_boost = set_optimal_param(
    classifiers['xg_boost']['func'], xgb_cf_best_param)

optimal_rand_forest = set_optimal_param(
    classifiers['rand_forest']['func'], rand_forest_best_param)

# The three lists below are parallel: same order, one entry per final model
optimized_models = [
    classifiers['gradient_b_cf']['func'],
    classifiers['neural_network']['func'],
    classifiers['log_reg']['func'],
    optimal_rand_forest,
    optimal_xg_boost
]

o_short_names = ['gradient_b_cf', 'neural_network',
                 'log_reg', 'rand_forest', 'xg_boost']

o_names = ['Gradient Boosting Classifier', 'Multi-layer Perceptron classifier', 'Logistic Classifier',
           'Random forest Classifier', 'XGBClassifier']

optimized_cf_idx = {}
optimized_cf = {}

# Zip all classifiers together into dictionaries for convenient access: one
# keyed by position, one keyed by short name. The positions come from the
# optimized lists themselves, not from the full `names` list.
for idx, (s_name, name, model) in enumerate(zip(o_short_names, o_names, optimized_models)):
    optimized_cf_idx[idx] = {'name': name, 'func': model}
    optimized_cf[s_name] = {'name': name, 'func': model}

Let's see if the final models have any performance changes...

**Note**: Results are loaded from the pickle file because it takes time to run. Please run the code below if you insist.

```python
optimized_cf_idx2 = {key: optimized_cf_idx[key] for key in range(5)}

performance_results4 = get_models_performance(optimized_cf_idx2, X_train, y_train,
                                              training_pipeline=training_pipeline,
                                              scoring_metrics=scoring_metrics, n_splits=5)

dump_objects('performance4', performance_results4)
```

In [None]:
[performance_results4] = load_objects(file_name='performance4')
performance_results4

In [None]:
sortValues(performance_results4, roc, 1)

In [None]:
sortValues(performance_results3, roc, 1)

In [None]:
f1 = ['train_f1', 'test_f1']
sortValues(performance_results4, f1, 1)

In [None]:
sortValues(performance_results3, f1, 1)

In [None]:
precision_recall = ['train_precision',
                    'test_precision', 'train_recall', 'test_recall']
sortValues(performance_results4, precision_recall, [1, 3])

In [None]:
sortValues(performance_results3, precision_recall, [1, 3])

## Analyzing Final Models

In [None]:
def plot_precision_vs_recall(classifier, cf_name, X_train, y_train, ax, method, label=False):
    """Draw the cross-validated precision-recall curve of ``classifier`` on ``ax``.

    Scores come from 3-fold ``cross_val_predict`` using the estimator method
    named by ``method``, so every row is scored by a model that did not see
    it during fitting. ``cf_name`` is used in the panel title and, when
    ``label`` is true, as the line label.

    Returns
    -------
    tuple
        ``(precisions, recalls, thresholds)`` as returned by
        ``precision_recall_curve``.
    """
    y_scores_cv = cross_val_predict(classifier, X_train, y_train,
                                    cv=3, method=method, n_jobs=-1)

    # A 2-D result holds one column per class; keep the last column only
    if y_scores_cv.ndim > 1:
        y_scores_cv = y_scores_cv[:, -1]

    precisions, recalls, thresholds = precision_recall_curve(
        y_train, y_scores_cv)

    # The line gets a label only when requested
    ax.plot(recalls, precisions, label=cf_name if label else None)
    ax.set(xlabel='recall', ylabel='precision',
           title=f'PR Curve for {cf_name}')
    ax.title.set_fontsize(16)
    ax.grid()

    return precisions, recalls, thresholds

In [None]:
def plot_precision_vs_recall2(model, model_name, X, y, threshold, ax, method='predict_proba'):
    """Plot a precision-recall curve and mark the point meeting a recall target.

    The curve is drawn by ``plot_precision_vs_recall``. The marked point is
    the one with the highest decision threshold whose recall is still at
    least ``threshold``; its threshold, precision and recall are printed.

    Parameters
    ----------
    model, model_name, X, y, ax, method
        Forwarded to ``plot_precision_vs_recall``.
    threshold : float
        Minimum recall the selected operating point must reach.

    Returns
    -------
    tuple
        ``(precisions, recalls, thresholds)`` of the full curve.

    Raises
    ------
    ValueError
        If no point on the curve reaches a recall of ``threshold``.
    """
    precisions, recalls, thresholds = plot_precision_vs_recall(model, model_name,
                                                               X, y, ax, method)

    # precision_recall_curve returns recalls in decreasing order, so the LAST
    # index that still satisfies the target belongs to the highest usable
    # decision threshold.
    reaching = np.flatnonzero(recalls >= threshold)
    if reaching.size == 0:
        raise ValueError(
            f'No point on the curve reaches a recall of {threshold}.')
    # `thresholds` is one element shorter than `recalls` (the final curve
    # point has no threshold), so clamp the index to stay inside it.
    best_idx = min(int(reaching[-1]), len(thresholds) - 1)
    selected_threshold = thresholds[best_idx]
    selected_precision = precisions[best_idx]
    selected_recall = recalls[best_idx]

    # Adjust settings for the plot (eg. set title of the plot)
    ax.set(title='Precision-Recall Graph')
    ax.title.set_fontsize(20)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1.2)

    # Annotate the chosen point as (recall, precision), slightly offset
    ax.annotate(f'({selected_recall:.2f}, {selected_precision:.2f})',
                (selected_recall + 0.010, selected_precision + 0.010))

    # Dotted guide lines from both axes to the chosen point, plus a marker
    ax.plot([selected_recall, selected_recall], [0, selected_precision], "r:")
    ax.plot([0, selected_recall], [
            selected_precision, selected_precision], "r:")
    ax.plot([selected_recall], [selected_precision], "ro")

    print("Selected Threshold = {:f}".format(selected_threshold))
    print("Selected Precision = {:.2f}".format(selected_precision))
    print("Selected Recall    = {:.2f}".format(selected_recall))

    return precisions, recalls, thresholds

In [None]:
# Create a 2 x 3 grid of subplots: one panel per optimized classifier
num_row = 2
num_col = 3
fig, axs = plt.subplots(num_row, num_col)
fig.set_size_inches(15, 10)

# (row, column) positions in row-major order, one per classifier
indexes = list(product(range(num_row), range(num_col)))[:len(optimized_cf)]

with tqdm(total=100) as pbar:
    progress_unit = 100/len(optimized_cf)
    # Draw the cross-validated precision-recall curve of every optimized classifier
    for index, classifier in zip(indexes, optimized_cf.values()):
        plot_precision_vs_recall(training_pipeline(classifier['func']), classifier['name'],
                                 X_train, y_train, axs[index[0]][index[1]],
                                 method='predict_proba')
        pbar.update(progress_unit)

# Hide the panels that received no curve (axs.flat is row-major as well)
for unused_ax in axs.flat[len(optimized_cf):]:
    unused_ax.set_visible(False)

fig.tight_layout()

In [None]:
# One shared axis: the ROC curves of all optimized classifiers are overlaid
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
# Collects one {'name', 'auc_score'} record per classifier
roc_scores = []

with tqdm(total=100) as pbar:
    progress_unit = 100/len(optimized_cf)
    # Iterate all classifiers to plot on the same axis
    for classifier in optimized_cf.values():

        # Clone so the estimator stored in optimized_cf is never fitted here
        new_cf = clone(classifier['func'])
        method = 'predict_proba'

        # get cross_validated y_score of training set from cross_val_predict,
        # without having to fit the whole training set or use test set
        y_score_cv = cross_val_predict(training_pipeline(
            new_cf), X_train, y_train, cv=3, method=method)

        # Get the last columns of the y_scores only if more than one columns are detected
        if y_score_cv.ndim > 1:
            y_score_cv = y_score_cv[:, -1]

        # Plot the ROC curve
        fpr, tpr, threshold = roc_curve(y_train, y_score_cv)
        roc_auc = auc(fpr, tpr)

        graph = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                                estimator_name=classifier['name'])
        graph.plot(ax=ax)

        roc_scores.append({'name': classifier['name'], 'auc_score': roc_auc})
        pbar.update(progress_unit)

ax.set(title="Receiver operating characteristic with cross validation")
ax.legend(loc="lower right")
plt.show()

# Combining the Models

Now, we are going to combine all the models into a single unified hard voting classifier. Let's see if an ensemble of models performs better.

In [None]:
class RobustVotingClassifier(RegressorMixin, BaseEstimator):
    """Voting ensemble over models trained here and models supplied ready-made.

    ``optimized_models`` are cloned and fitted in ``fit``; ``keras_models`` and
    ``fitted_models`` are used as given, without any fitting. ``predict``
    returns a majority vote and ``predict_proba`` the averaged positive-class
    probability as a 1-D array.

    NOTE(review): the class derives from RegressorMixin although it is used as
    a classifier, so the inherited ``score`` is the regressor one — consider
    ClassifierMixin (not imported in this notebook).
    """

    # The list defaults are shared between instances; they are only read and
    # concatenated, never modified in place.
    def __init__(self, optimized_models=[], keras_models=[], fitted_models=[]):
        self.optimized_models = optimized_models
        self.keras_models = keras_models
        self.fitted_models = fitted_models
        self.init()

    def init(self):
        """Forget every model trained by a previous ``fit`` call."""
        self.trained_models = []

    def fit(self, X, y):
        """Fit a clone of every model in ``optimized_models`` on ``(X, y)``.

        Returns ``self``. Calling ``fit`` again replaces the previously
        trained models instead of adding a second copy of each voter.
        """
        self.init()
        for model in self.optimized_models:
            new_model = clone(model)
            new_model.fit(X, y)
            self.trained_models.append(new_model)
        self.models = self.trained_models + self.keras_models + self.fitted_models
        return self

    def predict(self, X=None):
        """Return 1.0 where at least half of the models (rounded up) vote positive, else 0.0."""
        # The models are fed dense input
        if isinstance(X, csr_matrix):
            X = X.toarray()
        y_preds = [model.predict(X) for model in self.models]

        positive = np.sum(y_preds, axis=0) >= ceil(len(self.models) / 2)
        results = np.zeros(len(X))
        results[positive] = 1.0

        return results

    def predict_proba(self, X=None):
        """Return the positive-class probability averaged over all models.

        The result is a 1-D array with one value per row of ``X`` (not the
        usual two-column layout).
        """
        if isinstance(X, csr_matrix):
            X = X.toarray()

        y_probas = []
        for model in self.models:
            y_proba = np.asarray(model.predict_proba(X))
            # Reduce every output to one 1-D vector so the outputs can be
            # averaged: column 1 of a multi-column result, or the only column
            # of a single-column one.
            if y_proba.ndim > 1:
                y_proba = y_proba[:, 1] if y_proba.shape[1] > 1 else y_proba[:, 0]
            y_probas.append(y_proba)

        return np.mean(y_probas, axis=0)

In [None]:
vote_ensemble_cf = RobustVotingClassifier(optimized_models)
ensemble_pp = training_pipeline(vote_ensemble_cf)
ensemble_pp = ensemble_pp.fit(X_train, y_train)

Wow beautiful, look at that ROC score, 0.85! Phew, no overfitting or underfitting issues :)

In [None]:
y_proba = ensemble_pp.predict_proba(X_test)

final_auc = roc_auc_score(y_test, y_proba)
print(f'The ROC_AUC score after combining the final model is {final_auc}')

# Prepare for Submission

Submit the Kaggle Task! Woohoo!

In [None]:
insurance_test = fetch_data(TEST_DATA_PATH)
insurance_test.head(5)

In [None]:
# For each listed categorical attribute, report the values that occur in the
# test set but never in the train set.
attrs = ['Region_Code', 'Policy_Sales_Channel']
for attr in attrs:
    # value_counts() skips missing values, so NaN is not part of either set
    set1 = set(insurance_train[attr].value_counts().index.tolist())
    set2 = set(insurance_test[attr].value_counts().index.tolist())
    # Values seen in test only
    dff = set2.difference(set1)
    if len(dff) == 0:
        print(f'{attr} in train set contains all the values in test set.')
    else:
        print(f'{attr} in train set doesn\'t contain values which are {list(dff)}, which these values are found in test set.')

In [None]:


class DataCleaner2(BaseEstimator, TransformerMixin):
    """Stateless cleaner that turns a raw insurance frame into model features.

    Parameters
    ----------
    features_to_be_remained : list of str, optional
        Columns (and their order) to return. ``None`` keeps every column
        produced by the cleaning steps.
    """

    def __init__(self, features_to_be_remained=None):
        self.features_to_be_remained = features_to_be_remained

    def fit(self, data, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, data, y=None):
        """Clean ``data`` and return the requested feature columns.

        Steps: drop 'id'; replace 'Gender' by 'Gender_female' (0.0 for
        'Male', 1.0 otherwise); encode 'Vehicle_Damage' as 1.0 for 'Yes' and
        0.0 otherwise; one-hot encode 'Policy_Sales_Channel', 'Vehicle_Age'
        and 'Region_Code'. The input frame is not modified.

        Raises
        ------
        KeyError
            If a requested column that is not a one-hot column is missing.
        """
        # reset_index returns a new frame, so the caller's frame is untouched
        data = data.reset_index(drop=True).drop(columns='id')

        data['Gender_female'] = (data['Gender'] != 'Male').astype('float64')
        data = data.drop(columns='Gender')

        data['Vehicle_Damage'] = (
            data['Vehicle_Damage'] == 'Yes').astype('float64')

        cat_attr = ['Policy_Sales_Channel', 'Vehicle_Age', 'Region_Code']
        # Cast to object first so numeric codes are encoded as categories
        data[cat_attr] = data[cat_attr].astype('object')
        data = pd.get_dummies(data, columns=cat_attr)

        if self.features_to_be_remained is None:
            return data

        wanted = list(self.features_to_be_remained)
        dummy_prefixes = tuple(f'{attr}_' for attr in cat_attr)
        # A one-hot column is legitimately absent when its category level does
        # not occur in this data; any other missing column is an error.
        unexpected = [col for col in wanted
                      if col not in data.columns
                      and not str(col).startswith(dummy_prefixes)]
        if unexpected:
            raise KeyError(f'Columns not found after cleaning: {unexpected}')

        # Absent one-hot columns are added as all-zero columns
        return data.reindex(columns=wanted, fill_value=0)

In [None]:
insurance_test_cleaned = DataCleaner2(
    X_train.columns.tolist()).fit_transform(insurance_test.copy())
insurance_test_cleaned.shape

In [None]:
sample_submission = fetch_data(SAMPLE_SUBMISSION_PATH)
sample_submission.head(5)

In [None]:
y_proba = ensemble_pp.predict_proba(insurance_test_cleaned)
my_submission = pd.DataFrame(
    zip(insurance_test['id'].tolist(), y_proba), columns=sample_submission.columns)
my_submission.head(5)

In [None]:
# index=False: write only the columns of the sample submission, without an
# extra unnamed row-index column.
my_submission.to_csv('my_submission.csv', index=False)