# Ensemble Learning

L'objectif est de construire un modèle englobant des modèles plus simples pour améliorer les résultats globaux par un système de soft voting.
Modèles à utiliser :
* DeepLearning
* RandomForest
* XGB
* AdaBoost
* LogisticRegression
* AutoFeat ?

Imports and functions
========

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics

In [2]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from autofeat import AutoFeatClassifier

In [3]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier
from scipy.stats import reciprocal

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
import joblib

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [6]:
import tensorflow as tf
from tensorflow import keras

In [7]:
def numerical_impute(data, numerical_list):
    imputer_numerical = SimpleImputer(strategy='constant', fill_value=-1, missing_values=np.nan)
    data_numerical = data.loc[:, numerical_list]
    data_numerical_imputed = imputer_numerical.fit_transform(data_numerical)
    data_numerical_imputed = pd.DataFrame(data_numerical_imputed, columns=numerical_list)
    return data_numerical_imputed

def categorical_imputing(data, categorical_list):
    # Imputing
    imputer_categorical = SimpleImputer(strategy='constant', fill_value='missing', missing_values=np.nan)
    data_categorical = data.loc[:, categorical_list]
    data_categorical = imputer_categorical.fit_transform(data_categorical)
    data_categorical_imputed = pd.DataFrame(data_categorical, columns=categorical_list)
    return data_categorical_imputed

def categorical_impute_one_hot(data, categorical_list):
    # Imputing
    data_categorical_imputed = categorical_imputing(data, categorical_list)

    # One hot encoding
    data_one_hot = pd.get_dummies(data_categorical_imputed)

    return data_one_hot

def categorical_impute_ordinal(data, categorical_list):
    # Imputing
    data_categorical_imputed = categorical_imputing(data, categorical_list)

    # Ordinal encoding
    ordinal_encoder = OrdinalEncoder()
    data_ordinal = ordinal_encoder.fit_transform(data_categorical_imputed)
    data_ordinal = pd.DataFrame(data_ordinal, columns=categorical_list)

    return data_ordinal

def data_clean(data, numerical_list, categorical_list, encoding='one_hot'):
    # Changer les listes de features et les fonctions correspondantes
    if encoding == 'ordinal':
        data_categorical_encoded = categorical_impute_ordinal(data, categorical_list)
    else :
        data_categorical_encoded = categorical_impute_one_hot(data, categorical_list)
    data_numerical_imputed = numerical_impute(data, numerical_list)
    data_imputed_encoded = pd.merge(data_numerical_imputed, data_categorical_encoded, left_index=True, right_index=True)

    return data_imputed_encoded

Data prep
=======

In [8]:
FEATURES_TRAINING_PATH = "training_set_features.csv"
LABELS_TRAINING_PATH = "training_set_labels.csv"

features = pd.read_csv(FEATURES_TRAINING_PATH, sep=",", header=0)
labels = pd.read_csv(LABELS_TRAINING_PATH, sep=",", header=0)
data_original = pd.merge(features, labels, on="respondent_id")
respondent_id = data_original.loc[:, ['respondent_id']]
data_original.drop("respondent_id", axis=1, inplace=True)

In [9]:
data = data_original.copy()
arg_list = list(data.keys())
features_list = arg_list.copy()
features_list.remove("h1n1_vaccine")
features_list.remove("seasonal_vaccine")

labels_list = ['h1n1_vaccine', 'seasonal_vaccine']

categorical_list = ['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa','employment_industry', 'employment_occupation']

categorical_list_one_hot = ['race', 'sex', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation']

categorical_list_ordinal = [k for k in categorical_list if k not in categorical_list_one_hot]

numerical_list = [k for k in features_list if k not in categorical_list]

In [10]:
data_encoded = data_clean(data, numerical_list, categorical_list, encoding='one_hot')

In [11]:
labels.drop("respondent_id", axis=1, inplace=True)
Y = labels.to_numpy()
X = data_encoded.to_numpy()

shape_train_data = X.shape[1]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1, test_size=0.2)
# X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, random_state=1, test_size=0.2)

In [12]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

Maintenant X contient les données scaled et encodées par one hot et Y les deux labels

Entrainnement des modèles qui constitueront l'ensemble
===============

RandomForest
--------

In [19]:
params = {
    'n_estimators' : [30, 100, 300, 500],
    'max_features' : [4, 8, 12],
    'max_leaf_nodes' : [2, 4, 6]
}

In [26]:
rndf_clf = RandomForestClassifier()
grid_rndf = GridSearchCV(rndf_clf, param_grid=params, cv=5, scoring="neg_mean_squared_error", return_train_score=True)
grid_rndf.fit(X_train_scaled, Y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': [4, 8, 12],
                         'max_leaf_nodes': [2, 4, 6],
                         'n_estimators': [30, 100, 300, 500]},
             return_train_score=True, scoring='neg_mean_squared_error')

In [30]:
grid_rndf.best_params_

{'max_features': 8, 'max_leaf_nodes': 6, 'n_estimators': 300}

In [28]:
grid_test_pred = grid_rndf.predict_proba(X_test_scaled)
h1n1_grid_pred = grid_test_pred[0][:, 1]
seasonal_grid_pred = grid_test_pred[1][:, 1]
grid_pred = np.c_[h1n1_grid_pred, seasonal_grid_pred]
roc_auc_score(Y_test, grid_pred)

0.8285420134411676

In [13]:
rnd_clf = RandomForestClassifier()
rnd_clf.fit(X_train_scaled, Y_train)

RandomForestClassifier()

In [14]:
rnd_test_pred = rnd_clf.predict_proba(X_test_scaled)
h1n1_pred = rnd_test_pred[0][:,1]
seasonal_pred = rnd_test_pred[1][:,1]
rnd_test_pred = np.c_[h1n1_pred, seasonal_pred]
roc_auc_score(Y_test, rnd_test_pred)

0.8574289911081037

ROCAUC score : 0.857 (sans grid search et CV)

XGBoost
---

In [16]:
xgb_clf_h1n1 = XGBClassifier()
xgb_clf_h1n1.fit(X_train_scaled, Y_train[:, 0])





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [17]:
xgb_clf_seasonal = XGBClassifier()
xgb_clf_seasonal.fit(X_train_scaled, Y_train[:, 1])





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
xgb_clf_h1n1_pred = xgb_clf_h1n1.predict_proba(X_test_scaled)[:, 1]
xgb_clf_seasonal_pred = xgb_clf_seasonal.predict_proba(X_test_scaled)[:, 1]
xgb_pred = np.c_[xgb_clf_h1n1_pred, xgb_clf_seasonal_pred]
roc_auc_score(Y_test, xgb_pred)

0.8586314822015296

ROCAUC score : 0.858

AdaBoost
------

In [34]:
ada_clf_h1n1 = AdaBoostClassifier()
ada_clf_h1n1.fit(X_train_scaled, Y_train[:, 0])

AdaBoostClassifier()

In [36]:
ada_clf_seasonal = AdaBoostClassifier()
ada_clf_seasonal.fit(X_train_scaled, Y_train[:, 1])

AdaBoostClassifier()

In [37]:
ada_clf_h1n1_pred = ada_clf_h1n1.predict_proba(X_test_scaled)[:, 1]
ada_clf_seasonal_pred = ada_clf_seasonal.predict_proba(X_test_scaled)[:, 1]
ada_pred = np.c_[ada_clf_h1n1_pred, ada_clf_seasonal_pred]
roc_auc_score(Y_test, ada_pred)

0.8637746854415985

ROCAUC score : 0.864

CatBoost
--------

In [46]:
cb_clf_h1n1 = CatBoostClassifier()
cb_clf_h1n1.fit(X_train_scaled, Y_train[:, 0], verbose=0)

<catboost.core.CatBoostClassifier at 0x2126975fb80>

In [47]:
cb_clf_seasonal = CatBoostClassifier()
cb_clf_seasonal.fit(X_train_scaled, Y_train[:, 1], verbose=0)

<catboost.core.CatBoostClassifier at 0x2126975fdf0>

In [49]:
cb_clf_h1n1_pred = cb_clf_h1n1.predict_proba(X_test_scaled)[:, 1]
cb_clf_seasonal_pred = cb_clf_seasonal.predict_proba(X_test_scaled)[:, 1]
cb_pred = np.c_[cb_clf_h1n1_pred, cb_clf_seasonal_pred]
roc_auc_score(Y_test, cb_pred)

0.8710041487146035

ROCAUC score : 0.871

LogisticRegression
--------

Se renseigner sur le paramètres "multinomial" ?

In [26]:
lr_clf_h1n1 = LogisticRegression()
lr_clf_h1n1.fit(X_train_scaled, Y_train[:, 0])

LogisticRegression()

In [27]:
lr_clf_seasonal = LogisticRegression()
lr_clf_seasonal.fit(X_train_scaled, Y_train[:, 1])

LogisticRegression()

In [30]:
lr_clf_h1n1_pred = lr_clf_h1n1.predict_proba(X_test_scaled)[:, 1]
lr_clf_seasonal_pred = lr_clf_seasonal.predict_proba(X_test_scaled)[:, 1]
lr_pred = np.c_[lr_clf_h1n1_pred, lr_clf_seasonal_pred]
roc_auc_score(Y_test, lr_pred)

0.8550356336880862

ROCAUC score : 0.855

SVC
------

On peut surement largement améliorer la prédiction en utilisant les kernel polynomiaux. Il faudrait faire un tuning du degré des polys pour optimiser les prédictions

In [51]:
svc_clf_h1n1 = SVC(probability=True, kernel="poly", degree=3)
svc_clf_h1n1.fit(X_train_scaled, Y_train[:, 0])

SVC(probability=True)

In [52]:
svc_clf_seasonal = SVC(probability=True, kernel="poly", degree=3)
svc_clf_seasonal.fit(X_train_scaled, Y_train[:, 1])

SVC(probability=True)

In [53]:
svc_clf_h1n1_pred = svc_clf_h1n1.predict_proba(X_test_scaled)[:, 1]
svc_clf_seasonal_pred = svc_clf_seasonal.predict_proba(X_test_scaled)[:, 1]
svc_pred = np.c_[svc_clf_h1n1_pred, svc_clf_seasonal_pred]
roc_auc_score(Y_test, svc_pred)

0.8502236838526532

ROCAUC score : 0.850

DeepLearning
-------

In [41]:
dl_clf = keras.models.load_model("best_model.h5")

In [43]:
dl_clf_pred = dl_clf.predict(X_test_scaled)
roc_auc_score(Y_test, dl_clf_pred)

0.8594474913605683

ROCAUC score : 0.859

AutoFeat
-------

In [13]:
af_clf = joblib.load("autoFeatModel.save")

Il faut transformer les données pour que AutoFeat fonctionne

af_clf_h1n1_pred = af_clf.predict_proba(X_train_scaled)[:, 1]
roc_auc_score(Y_test[:, 0], af_clf_h1n1_pred)

Ensemble
======

Ensemble learning for 'h1n1_vaccine' label
---------

We test ensemble learning just for 'h1n1_vaccine'

Now that all of our models are built (even though I didn't take the time to optimize them with CrossValidation), we can gather them into one algorithm thanks to the power of soft voting.
On peut mettre des pipelines plutot que des algos dans les estimateurs et je crois que le type d'estimateur n'a pas d'importance.

In [13]:
lr_clf_h1n1 = LogisticRegression()
rndf_clf_h1n1 = RandomForestClassifier()
xgb_clf_h1n1 = XGBClassifier()
ada_clf_h1n1 = AdaBoostClassifier()
cb_clf_h1n1 = CatBoostClassifier()
svc_clf_h1n1 = SVC(probability=True, kernel="poly", degree=3)

Soft voting classifier :

In [18]:
vote_clf_h1n1 = VotingClassifier(
    estimators=[
        ('lr', lr_clf_h1n1),
        ('rndf', rndf_clf_h1n1),
        ('xgb', xgb_clf_h1n1),
        ('ada', ada_clf_h1n1),
        ('cb', cb_clf_h1n1),
        ('svc', svc_clf_h1n1),
    ],
    voting="soft",
    verbose=True,
    n_jobs=-1
)
vote_clf_h1n1.fit(X_train_scaled, Y_train[:, 0]) # On entraine le modèle ensembliste sur le label 'h1n1_vaccine'

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rndf', RandomForestClassifier()),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            enable_categorical=False,
                                            gamma=None, gpu_id=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_d...
                                            num_parallel_tree=None,
                                            predictor=None, random_state=None

Résultat sur le set d'entrainnement :

In [19]:
vote_clf_h1n1_pred = vote_clf_h1n1.predict_proba(X_train_scaled)[:, 1]
roc_auc_score(Y_train[:, 0], vote_clf_h1n1_pred)

0.9823358426198993

Résultat sur le set de test (sans optimisation des params) :

In [20]:
vote_clf_h1n1_pred = vote_clf_h1n1.predict_proba(X_test_scaled)[:, 1]
roc_auc_score(Y_test[:, 0], vote_clf_h1n1_pred)

0.8733484130403563

Ensemble learning for 'seasonal_vaccine' label
-----------------

We test ensemble learning just for 'h1n1_vaccine'

In [21]:
lr_clf_seasonal = LogisticRegression()
rndf_clf_seasonal = RandomForestClassifier()
xgb_clf_seasonal = XGBClassifier()
ada_clf_seasonal = AdaBoostClassifier()
cb_clf_seasonal = CatBoostClassifier()
svc_clf_seasonal = SVC(probability=True, kernel="poly", degree=3)

Soft voting classifier :

In [22]:
vote_clf_seasonal = VotingClassifier(
    estimators=[
        ('lr', lr_clf_seasonal),
        ('rndf', rndf_clf_seasonal),
        ('xgb', xgb_clf_seasonal),
        ('ada', ada_clf_seasonal),
        ('cb', cb_clf_seasonal),
        ('svc', svc_clf_seasonal),
    ],
    voting="soft",
    verbose=True,
    n_jobs=-1
)
vote_clf_seasonal.fit(X_train_scaled, Y_train[:, 1]) # On entraine le modèle ensembliste sur le label 'seasonal_vaccine'

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rndf', RandomForestClassifier()),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            enable_categorical=False,
                                            gamma=None, gpu_id=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_d...
                                            num_parallel_tree=None,
                                            predictor=None, random_state=None

Résultat sur le set d'entrainnement :

In [23]:
vote_clf_seasonal_pred = vote_clf_seasonal.predict_proba(X_train_scaled)[:, 1]
roc_auc_score(Y_train[:, 1], vote_clf_seasonal_pred)

0.9681467453726371

Résultat sur le set de test (sans optimisation des params) :

In [24]:
vote_clf_seasonal_pred = vote_clf_seasonal.predict_proba(X_test_scaled)[:, 1]
roc_auc_score(Y_test[:, 1], vote_clf_seasonal_pred)

0.8672901476944609

Merging of the two models
-------------------

In [25]:
pred = np.c_[vote_clf_h1n1_pred, vote_clf_seasonal_pred]
roc_auc_score(Y_test, pred)

0.8703192803674086

Résultat après fusion : 87 % sans optimisation des paramètres de chaque modèle.

Points d'amélioration :
* Optimisation des paramètres de chaque modèle et cross validation.
* Réessayer les modèles avec les paramètres générés par autofeat (pour l'instant seulement pour 'h1n1_vaccine').
* Essayer d'entrainer un modèle remplaçant le soft voting