# H1N1 - Flu Shot Learning

In [1]:
import keras_tuner
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense
from keras.layers import Input

In [3]:
from sklearn.model_selection import RandomizedSearchCV
from scikeras.wrappers import KerasClassifier
from scipy.stats import reciprocal

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
import joblib

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [6]:
import tensorflow as tf
from tensorflow import keras

In [7]:
def numerical_impute(data, numerical_list):
    imputer_numerical = SimpleImputer(strategy='constant', fill_value=-1, missing_values=np.nan)
    data_numerical = data.loc[:, numerical_list]
    data_numerical_imputed = imputer_numerical.fit_transform(data_numerical)
    data_numerical_imputed = pd.DataFrame(data_numerical_imputed, columns=numerical_list)
    return data_numerical_imputed

def categorical_imputing(data, categorical_list):
    # Imputing
    imputer_categorical = SimpleImputer(strategy='constant', fill_value='missing', missing_values=np.nan)
    data_categorical = data.loc[:, categorical_list]
    data_categorical = imputer_categorical.fit_transform(data_categorical)
    data_categorical_imputed = pd.DataFrame(data_categorical, columns=categorical_list)
    return data_categorical_imputed

def categorical_impute_one_hot(data, categorical_list):
    # Imputing
    data_categorical_imputed = categorical_imputing(data, categorical_list)

    # One hot encoding
    data_one_hot = pd.get_dummies(data_categorical_imputed)

    return data_one_hot

def categorical_impute_ordinal(data, categorical_list):
    # Imputing
    data_categorical_imputed = categorical_imputing(data, categorical_list)

    # Ordinal encoding
    ordinal_encoder = OrdinalEncoder()
    data_ordinal = ordinal_encoder.fit_transform(data_categorical_imputed)
    data_ordinal = pd.DataFrame(data_ordinal, columns=categorical_list)

    return data_ordinal

def categorical_impute_encode1(data, categorical_list):
    # Imputing
    data_categorical = categorical_imputing(data, categorical_list)

    # Ordinal encoding
    ordinal_encoder = OrdinalEncoder()
    data_categorical_encoded = ordinal_encoder.fit_transform(data_categorical)
    data_categorical_encoded = pd.DataFrame(data_categorical_encoded, columns=categorical_list)

    return data_categorical_encoded

def categorical_impute_encode2(data, categorical_list_one_hot, categorical_list_ordinal):
    # Imputing
    data_categorical = categorical_imputing(data, categorical_list_ordinal + categorical_list_one_hot)

    # Ordinal encoding
    ordinal_encoder = OrdinalEncoder()
    data_categorical_ordinal = ordinal_encoder.fit_transform(data_categorical.loc[:, categorical_list_ordinal])
    data_categorical_ordinal = pd.DataFrame(data_categorical_ordinal, columns=categorical_list_ordinal)

    # One hot encoding
    one_hot_encoder = OneHotEncoder()
    data_categorical_one_hot = one_hot_encoder.fit_transform(data_categorical.loc[:, categorical_list_one_hot])
    data_categorical_one_hot = pd.DataFrame(data_categorical_one_hot, columns=categorical_list_one_hot)

    data_categorical_encoded = pd.merge(data_categorical_ordinal, data_categorical_one_hot, left_index=True, right_index=True)

    return data_categorical_encoded

def data_clean(data, numerical_list, categorical_list, encoding='one_hot'):
    # Changer les listes de features et les fonctions correspondantes
    if encoding == 'ordinal':
        data_categorical_encoded = categorical_impute_ordinal(data, categorical_list)
    else :
        data_categorical_encoded = categorical_impute_one_hot(data, categorical_list)
    data_numerical_imputed = numerical_impute(data, numerical_list)
    data_imputed_encoded = pd.merge(data_numerical_imputed, data_categorical_encoded, left_index=True, right_index=True)
    
    return data_imputed_encoded

Import des données
===========

In [8]:
FEATURES_TRAINING_PATH = "training_set_features.csv"
LABELS_TRAINING_PATH = "training_set_labels.csv"

features = pd.read_csv(FEATURES_TRAINING_PATH, sep=",", header=0)
labels = pd.read_csv(LABELS_TRAINING_PATH, sep=",", header=0)
data_original = pd.merge(features, labels, on="respondent_id")
respondent_id = data_original.loc[:, ['respondent_id']]
data_original.drop("respondent_id", axis=1, inplace=True)
data = data_original.copy()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   h1n1_concern                 26615 non-null  float64
 1   h1n1_knowledge               26591 non-null  float64
 2   behavioral_antiviral_meds    26636 non-null  float64
 3   behavioral_avoidance         26499 non-null  float64
 4   behavioral_face_mask         26688 non-null  float64
 5   behavioral_wash_hands        26665 non-null  float64
 6   behavioral_large_gatherings  26620 non-null  float64
 7   behavioral_outside_home      26625 non-null  float64
 8   behavioral_touch_face        26579 non-null  float64
 9   doctor_recc_h1n1             24547 non-null  float64
 10  doctor_recc_seasonal         24547 non-null  float64
 11  chronic_med_condition        25736 non-null  float64
 12  child_under_6_months         25887 non-null  float64
 13  health_worker   

Liste des attributs
-------------------------

In [9]:
arg_list = list(data.keys())
features_list = arg_list.copy()
features_list.remove("h1n1_vaccine")
features_list.remove("seasonal_vaccine")

labels_list = ['h1n1_vaccine', 'seasonal_vaccine']

categorical_list = ['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa','employment_industry', 'employment_occupation']

categorical_list_one_hot = ['race', 'sex', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation']

categorical_list_ordinal = [k for k in categorical_list if k not in categorical_list_one_hot]

numerical_list = [k for k in features_list if k not in categorical_list]

On sauvegarde les listes de label

In [7]:
fichier_sauvegarde_listes = 'lists.save'
dic_label = {
    'numerical_list' : numerical_list,
    'categorical_list' : categorical_list,
}
joblib.dump(dic_label, fichier_sauvegarde_listes)

['lists.save']

In [8]:
lists = joblib.load("lists.save")
lists

{'numerical_list': ['h1n1_concern',
  'h1n1_knowledge',
  'behavioral_antiviral_meds',
  'behavioral_avoidance',
  'behavioral_face_mask',
  'behavioral_wash_hands',
  'behavioral_large_gatherings',
  'behavioral_outside_home',
  'behavioral_touch_face',
  'doctor_recc_h1n1',
  'doctor_recc_seasonal',
  'chronic_med_condition',
  'child_under_6_months',
  'health_worker',
  'health_insurance',
  'opinion_h1n1_vacc_effective',
  'opinion_h1n1_risk',
  'opinion_h1n1_sick_from_vacc',
  'opinion_seas_vacc_effective',
  'opinion_seas_risk',
  'opinion_seas_sick_from_vacc',
  'household_adults',
  'household_children'],
 'categorical_list': ['age_group',
  'education',
  'race',
  'sex',
  'income_poverty',
  'marital_status',
  'rent_or_own',
  'employment_status',
  'hhs_geo_region',
  'census_msa',
  'employment_industry',
  'employment_occupation']}

In [12]:
imputer_categorical = SimpleImputer(strategy='constant', fill_value='missing', missing_values=np.nan)
data_categorical = data.loc[:, categorical_list]
data_categorical = imputer_categorical.fit_transform(data_categorical)
data_categorical = pd.DataFrame(data_categorical, columns=categorical_list)

In [13]:
ordinal_encoder = OrdinalEncoder()
data_categorical_encoded = ordinal_encoder.fit_transform(data_categorical)
data_categorical_encoded = pd.DataFrame(data_categorical_encoded, columns=categorical_list)
data_categorical_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age_group              26707 non-null  float64
 1   education              26707 non-null  float64
 2   race                   26707 non-null  float64
 3   sex                    26707 non-null  float64
 4   income_poverty         26707 non-null  float64
 5   marital_status         26707 non-null  float64
 6   rent_or_own            26707 non-null  float64
 7   employment_status      26707 non-null  float64
 8   hhs_geo_region         26707 non-null  float64
 9   census_msa             26707 non-null  float64
 10  employment_industry    26707 non-null  float64
 11  employment_occupation  26707 non-null  float64
dtypes: float64(12)
memory usage: 2.4 MB


In [14]:
imputer_numerical = SimpleImputer(strategy='constant', fill_value=-1, missing_values=np.nan)
data_numerical = data.loc[:, numerical_list]
data_numerical = imputer_numerical.fit_transform(data_numerical)
data_numerical = pd.DataFrame(data_numerical, columns=numerical_list)
data_numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   h1n1_concern                 26707 non-null  float64
 1   h1n1_knowledge               26707 non-null  float64
 2   behavioral_antiviral_meds    26707 non-null  float64
 3   behavioral_avoidance         26707 non-null  float64
 4   behavioral_face_mask         26707 non-null  float64
 5   behavioral_wash_hands        26707 non-null  float64
 6   behavioral_large_gatherings  26707 non-null  float64
 7   behavioral_outside_home      26707 non-null  float64
 8   behavioral_touch_face        26707 non-null  float64
 9   doctor_recc_h1n1             26707 non-null  float64
 10  doctor_recc_seasonal         26707 non-null  float64
 11  chronic_med_condition        26707 non-null  float64
 12  child_under_6_months         26707 non-null  float64
 13  health_worker   

In [15]:
data_encoded = pd.merge(data_numerical, data_categorical_encoded, left_index=True, right_index=True)

# Deep learning classification

Modèle Simple
===============

On veut déterminer les probabilités d'appartenance à chaque classe : multilabel classification

In [10]:
data_encoded = data_clean(data, numerical_list, categorical_list, encoding='one_hot')

In [11]:
labels.drop("respondent_id", axis=1, inplace=True)
Y = labels.to_numpy()
X = data_encoded.to_numpy()

shape_train_data = X.shape[1]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1, test_size=0.2)
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, random_state=1, test_size=0.2)

On scale les données :

In [12]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

In [13]:
X_train_scaled.shape

(17092, 112)

In [14]:
Y_train.shape

(17092, 2)

Sauvegarde du scaler

In [30]:
joblib.dump(scaler, "scaler.save")

['scaler.save']

Modèle séquentiel

In [35]:
n_neurons = 35
dropout = 0.4
n_layers = 3
learning_rate = 3e-3
model_seq = Sequential()
model_seq.add(Input(shape=shape_train_data))
for layer in range(n_layers):
    model_seq.add(Dense(n_neurons, activation="relu"))
    model_seq.add(Dropout(dropout))
model_seq.add(Dense(2, activation="sigmoid"))
optimizer = keras.optimizers.Adam()
model_seq.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=['AUC'])

Modèle fonctionnel

In [20]:
# modele assez performant : on peut modifier la structure en ajoutant une voie annexe par ex
input_ = keras.layers.Input(shape=shape_train_data)
hidden1 = keras.layers.Dense(35, activation="relu")(input_)
dropout1 = keras.layers.Dropout(0.5)(hidden1)
hidden2 = keras.layers.Dense(15, activation="relu")(dropout1)
output = keras.layers.Dense(2, activation="sigmoid")(hidden2)
model = keras.Model(inputs=[input_], outputs=[output])

model.compile(loss="binary_crossentropy", optimizer="adam")

On set up un callback de checkpoint et un early stopping :

In [20]:
checkpoint_cb = keras.callbacks.ModelCheckpoint("best_model.h5", save_best_only=True)
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10)

Entrainnement du modèle séquentiel

In [36]:
history_seq = model_seq.fit(X_train_scaled, Y_train, batch_size=32, epochs=100, validation_data=(X_valid_scaled, Y_valid), callbacks=[checkpoint_cb, early_stopping_cb], verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100


Entrainnement du modèle fonctionnel

In [50]:
history = model.fit(X_train_scaled, Y_train, epochs=100, validation_data=(X_valid_scaled, Y_valid), callbacks=[checkpoint_cb, early_stopping_cb], verbose=1)

NameError: name 'model' is not defined

On récupère le meilleur modèle

In [102]:
best_model_ever = keras.models.load_model("best_model_ever_8_12_2021.h5")

In [37]:
model = keras.models.load_model("best_model.h5")

In [38]:
Y_pred = model_seq.predict(X_test_scaled)

In [103]:
Y_pred_best = best_model_ever.predict(X_test_scaled)

In [39]:
print("model : ", roc_auc_score(Y_test, Y_pred)) # C'est la mesure utilisée par DrivenData

model :  0.85896695070634


In [None]:
print("best model : ", roc_auc_score(Y_test, Y_pred_best))

Yeahhhh score de 85,97% c'est top ! Objectif : 86.6%
Malheureusement, le score sur driven data est moins important et vaut seulement 84.5%, donc il y a de la marge de progrès !

Points d'amélioration :
* A voir si en enlevant des données manquantes on ne peut pas encore améliorer le score
* Il faudrait essayer le feature engineering
* On peut utiliser un OneHot encoder pour certaines données catégorielles

Fine tuning Neural Network Hyperparameters
=========

Possible to use :
* RandomizedSearchCV from scikit learn with some wrapper
* Keras tuner from tensorflow

Scikeras and RandomizedSearchCV
--------

In [15]:
def build_model(n_hidden=1, hidden_layer_sizes=30, learning_rate=3e-3, input_shape=[35]):
    model = Sequential()
    model.add(Input(input_shape=input_shape))
    for layer in range(n_hidden):
        model.add(Dense(hidden_layer_sizes, activation="relu"))
    model.add(Dense(2, activation="sigmoid"))
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss="binary_cross_entropy", optimizer=optimizer)
    return model

In [34]:
keras_model = KerasClassifier(build_model, epochs=100, callbacks=[early_stopping_cb])

In [38]:
param_distribs = {
    'n_hidden' : [0,1,2,3],
    'hidden_layer_sizes' : list(range(1,100)),
    'optimizer__learning_rate' : [3e-4, 3e-3, 3e-2]
}

In [39]:
rnd_search_cv = RandomizedSearchCV(keras_model, param_distribs, n_iter=10, cv=3, scoring='roc_auc')
rnd_search_cv.fit(X_train_scaled, Y_train, validation_data=(X_valid_scaled, Y_valid))

ValueError: Invalid parameter n_neurons for estimator KerasClassifier.
This issue can likely be resolved by setting this parameter in the KerasClassifier constructor:
`KerasClassifier(n_neurons=41)`
Check the list of available parameters with `estimator.get_params().keys()`

Scikeras un peu chiant à utiliser et pas super bien documenté

Keras Tuner
-------

In [16]:
import keras_tuner as kt
from keras_tuner import HyperModel
from keras_tuner.tuners import Hyperband

In [16]:
class ClassificationHyperModel(HyperModel):
    """ Keras HyperModel applied to the classification model to put the search space together """
    def __init__(self, input_shape):
        self.input_shape = input_shape

    def build(self, hp):
        # Defining the hyperparameters to thune
        nb_layers = hp.Int('n_layers', min_value=1, max_value=15, step=1, default=3)
        nb_neurons = hp.Int('n_neurons', min_value=10, max_value=300, default=30)
        dropout_rate = hp.Float('rate', min_value=0, max_value=0.9, default=0.5, step=0.1)
        learning_rate = hp.Float('learnin_rate', min_value=1e-4, max_value=1e-2, default=1e-3, sampling='LOG')

        # Building the model structure
        model = Sequential()
        model.add(Input(shape=self.input_shape))
        for layer in range(nb_layers):
            model.add(Dense(units=nb_neurons, activation="relu"))
            model.add(Dropout(rate=dropout_rate))
        model.add(Dense(2, activation="sigmoid"))

        model.compile(loss="binary_crossentropy", optimizer=keras.optimizers.Adam(learning_rate), metrics=['AUC'])

        return model

hypermodel = ClassificationHyperModel(input_shape=shape_train_data)

In [19]:
tuner = Hyperband(
    hypermodel,
    max_epochs=40,
    objective=kt.Objective('val_auc', direction='max'),
    executions_per_trial=2,
    seed=1,
    directory='Keras-tuning',
    project_name='neuralNetwork'
)

In [20]:
tuner.search(X_train_scaled, Y_train, validation_split=0.2, epochs=40, callbacks=[keras.callbacks.EarlyStopping(patience=10)], verbose=1)

Trial 90 Complete [00h 01m 15s]
val_auc: 0.8617083430290222

Best val_auc So Far: 0.8694462180137634
Total elapsed time: 00h 36m 23s
INFO:tensorflow:Oracle triggered exit


In [21]:
# Show summary of the results
tuner.results_summary()

# Retrieve best model
best_model = tuner.get_best_models(num_models=1)[0]

Results summary
Results in Keras-tuning\neuralNetwork
Showing 10 best trials
Objective(name='val_auc', direction='max')
Trial summary
Hyperparameters:
n_layers: 4
n_neurons: 236
rate: 0.30000000000000004
learnin_rate: 0.00013414267455355165
tuner/epochs: 40
tuner/initial_epoch: 14
tuner/bracket: 1
tuner/round: 1
tuner/trial_id: ebcccdd5539913b002832ab11bc5ee77
Score: 0.8694462180137634
Trial summary
Hyperparameters:
n_layers: 1
n_neurons: 62
rate: 0.2
learnin_rate: 0.003098269932407016
tuner/epochs: 40
tuner/initial_epoch: 14
tuner/bracket: 2
tuner/round: 2
tuner/trial_id: 3f2518980de84cf0204b939c8bc779a6
Score: 0.8689286708831787
Trial summary
Hyperparameters:
n_layers: 1
n_neurons: 62
rate: 0.2
learnin_rate: 0.003098269932407016
tuner/epochs: 5
tuner/initial_epoch: 0
tuner/bracket: 2
tuner/round: 0
Score: 0.8688881099224091
Trial summary
Hyperparameters:
n_layers: 2
n_neurons: 178
rate: 0.2
learnin_rate: 0.00041268008323824807
tuner/epochs: 14
tuner/initial_epoch: 5
tuner/bracket: 3


In [23]:
Y_pred = best_model.predict(X_test_scaled)
print(roc_auc_score(Y_test, Y_pred))

0.8570719932365425


In [24]:
best_model.save('model_hypertunned.h5')

Best three models after tunning
-------

In [17]:
def create_model(input_shape, nb_layers, nb_neurons, dropout_rate, learning_rate):
    model = Sequential()
    model.add(Input(shape=input_shape))
    for layer in range(nb_layers):
        model.add(Dense(units=nb_neurons, activation="relu"))
        model.add(Dropout(rate=dropout_rate))
    model.add(Dense(2, activation="sigmoid"))

    model.compile(loss="binary_crossentropy", optimizer=keras.optimizers.Adam(learning_rate), metrics=['AUC'])

    return model

In [18]:
top_three_parameters = [
    {
        'nb_layers' : 4,
        'nb_neurons' : 236,
        'dropout_rate' : 0.3,
        'learning_rate' : 0.0001341426
    },
    {
        'nb_layers' : 1,
        'nb_neurons' : 62,
        'dropout_rate' : 0.2,
        'learning_rate' : 0.00309826
    },
    {
        'nb_layers' : 2,
        'nb_neurons' : 178,
        'dropout_rate' : 0.2,
        'learning_rate' : 0.000412680
    }
]

In [19]:
best_three_models = []
for parameters in top_three_parameters:
    best_three_models.append(create_model(shape_train_data, parameters['nb_layers'], parameters['nb_neurons'], parameters['dropout_rate'], parameters['learning_rate']))

In [20]:
for model in best_three_models:
    model.fit(X_train_scaled, Y_train, epochs=40, validation_data=(X_valid_scaled, Y_valid), callbacks=[keras.callbacks.EarlyStopping(patience=10)], verbose=0)

In [21]:
for model in best_three_models:
    Y_pred = model.predict(X_test)
    print("ROCAUC score : ", roc_auc_score(Y_test, Y_pred))

ROCAUC score :  0.8045863124923993
ROCAUC score :  0.8223497087478446
ROCAUC score :  0.8019032995921855


Résultats assez décevant, il fallait probablement utiliser plus de données pour commencer ou augmenter l'espace de recherche :

Résultats obtenus pour 36 min de recherche :
* ROCAUC score :  0.8043674994274905
* ROCAUC score :  0.816739745544371
* ROCAUC score :  0.8113774287104558

La recherche s'est terminée par "oracle triggered exit"