# Deep neural network optimization

Imports
========

In [2]:
import tensorflow as tf
from tensorflow import keras

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [4]:
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense
from keras.layers import Input

In [5]:
from sklearn.impute import SimpleImputer
import joblib

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [7]:
# Data processing functions
def numerical_impute(data, numerical_list):
    """Replace missing values in the numerical columns with the constant -1.

    Args:
        data: DataFrame holding at least the columns named in
            ``numerical_list``.
        numerical_list: Names of the numerical columns to select and impute.

    Returns:
        A new DataFrame with one column per entry of ``numerical_list``.
        No index is passed when the frame is rebuilt from the imputer
        output, so the index of ``data`` is not carried over explicitly.
    """
    selected = data.loc[:, numerical_list]
    constant_filler = SimpleImputer(
        missing_values=np.nan, strategy='constant', fill_value=-1)
    return pd.DataFrame(
        constant_filler.fit_transform(selected), columns=numerical_list)


def categorical_imputing(data, categorical_list):
    """Replace missing values in the categorical columns with 'missing'.

    Args:
        data: DataFrame holding at least the columns named in
            ``categorical_list``.
        categorical_list: Names of the categorical columns to select and
            impute.

    Returns:
        A new DataFrame with one column per entry of ``categorical_list``.
        No index is passed when the frame is rebuilt from the imputer
        output, so the index of ``data`` is not carried over explicitly.
    """
    selected = data.loc[:, categorical_list]
    constant_filler = SimpleImputer(
        missing_values=np.nan, strategy='constant', fill_value='missing')
    filled_values = constant_filler.fit_transform(selected)
    return pd.DataFrame(filled_values, columns=categorical_list)


def categorical_impute_one_hot(data, categorical_list):
    """Impute the categorical columns, then one-hot encode them.

    Args:
        data: DataFrame holding at least the columns named in
            ``categorical_list``.
        categorical_list: Names of the categorical columns to process.

    Returns:
        The result of ``pd.get_dummies`` applied to the imputed categorical
        columns.
    """
    # Missing values are filled first (via categorical_imputing), then the
    # filled frame is expanded into indicator columns.
    return pd.get_dummies(categorical_imputing(data, categorical_list))


def data_clean(data, numerical_list, categorical_list):
    """Build the model-ready feature frame from the raw data.

    The categorical columns are imputed and one-hot encoded, the numerical
    columns are imputed, and the two results are joined on their index.

    Args:
        data: Raw DataFrame containing all listed columns.
        numerical_list: Names of the numerical columns.
        categorical_list: Names of the categorical columns.

    Returns:
        DataFrame with the imputed numerical columns followed by the
        one-hot encoded categorical columns.
    """
    # To change the preprocessing, adjust the feature lists and the
    # corresponding helper functions.
    encoded_part = categorical_impute_one_hot(data, categorical_list)
    numeric_part = numerical_impute(data, numerical_list)
    return numeric_part.merge(
        encoded_part, left_index=True, right_index=True)

Data prep
=========

In [8]:
# Paths to the data files
LABELS_TRAINING_PATH = os.path.join("data", "training_set_labels.csv")
FEATURES_TRAINING_PATH = os.path.join("data", "training_set_features.csv")

# Load the data and join features with labels on the respondent id
features = pd.read_csv(FEATURES_TRAINING_PATH, sep=",", header=0)
labels = pd.read_csv(LABELS_TRAINING_PATH, sep=",", header=0)
data = pd.merge(features, labels, on="respondent_id")
respondent_id = data['respondent_id']

# Target columns
labels_list = ['h1n1_vaccine', 'seasonal_vaccine']

# Complete feature list: every merged column except the id and the targets
arg_list = list(data.keys())
non_feature_columns = {"respondent_id", *labels_list}
features_list = [k for k in arg_list if k not in non_feature_columns]

# Useful feature sub-lists
categorical_list = ['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own',
                    'employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation']
categorical_list_one_hot = ['race', 'sex', 'marital_status', 'rent_or_own', 'employment_status',
                            'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation']
categorical_list_ordinal = [
    k for k in categorical_list if k not in categorical_list_one_hot]
numerical_list = [k for k in features_list if k not in categorical_list]

# Build the model inputs (X) and targets (Y).
# The id column is still dropped from `labels` in place, as before.
labels.drop("respondent_id", inplace=True, axis=1)
# Y is read from the merged frame rather than from `labels`, so each row of
# Y belongs to the same respondent as the matching row of X even if the two
# CSV files list respondents in a different order or with different ids.
Y = data[labels_list].to_numpy()
X = data_clean(data, numerical_list, categorical_list).to_numpy()
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1)

In [9]:
# Scale the data.
# The validation rows are split off BEFORE the scaler is fit, so the
# min/max statistics come from the training rows only and nothing from the
# validation set leaks into the preprocessing. The partition is the same as
# splitting the scaled array: same number of rows, same random_state.
# NOTE: Y_train is rebound here, so re-running this cell without re-running
# the previous one splits the already reduced training labels again.
X_train_part, X_valid_part, Y_train, Y_valid = train_test_split(
    X_train, Y_train, test_size=0.3, random_state=1)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_part)
X_valid_scaled = scaler.transform(X_valid_part)
X_test_scaled = scaler.transform(X_test)

Neural Network
==============

In [25]:
def optimize_network(drops, nbr_layers, lrs, nbr_neurons, X_train_scaled, Y_train, X_test_scaled, Y_test, X_valid_scaled, Y_valid):
    """Grid-search a dense network over dropout, depth, learning rate and width.

    For every combination of the four hyperparameter lists a network is
    built (input, ``nbr_layer`` Dense+Dropout pairs, 2-unit sigmoid output),
    trained with Adam and early stopping on the validation data, and scored
    with ROC AUC on the test data.

    Args:
        drops: Iterable of dropout rates to try.
        nbr_layers: Iterable of hidden-layer counts to try.
        lrs: Iterable of Adam learning rates to try.
        nbr_neurons: Iterable of hidden-layer widths to try.
        X_train_scaled: Training features; ``X_train_scaled[0].shape`` is
            used as the network input shape.
        Y_train: Training targets (presumably two binary columns, matching
            the 2-unit sigmoid output -- confirm against the caller).
        X_test_scaled: Features used to score each candidate.
        Y_test: Targets used to score each candidate.
        X_valid_scaled: Validation features monitored during training.
        Y_valid: Validation targets monitored during training.

    Returns:
        ``[drop, nbr_layer, lr, nbr_neuron]`` of the candidate with the
        highest ROC AUC, or an empty list if no candidate scored above 0.
    """
    from itertools import product

    score = 0
    best_params = []

    # product() visits the combinations in the same order as four nested
    # loops (drops outermost, nbr_neurons innermost).
    for drop, nbr_layer, lr, nbr_neuron in product(drops, nbr_layers, lrs, nbr_neurons):
        # Drop the state Keras accumulated for the previous candidate so
        # memory does not grow over the whole grid.
        keras.backend.clear_session()

        # Build the model with the current hyperparameters
        model = Sequential()
        model.add(Input(shape=X_train_scaled[0].shape))
        for _ in range(nbr_layer):
            model.add(Dense(nbr_neuron, activation="relu"))
            model.add(Dropout(drop))
        model.add(Dense(2, activation="sigmoid"))

        # The learning rate under test must actually reach the optimizer;
        # otherwise every value in `lrs` trains the same configuration.
        optimizer = keras.optimizers.Adam(learning_rate=lr)
        model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=['AUC'])

        # Fresh callbacks per candidate, so no callback state is carried
        # over from one model's training run to the next.
        my_callbacks = [
            keras.callbacks.ModelCheckpoint("checkpoint_model.h5", save_best_only=True),
            keras.callbacks.EarlyStopping(patience=10),
        ]
        model.fit(X_train_scaled, Y_train, epochs=500, validation_data=(X_valid_scaled, Y_valid), batch_size=64, callbacks=my_callbacks)
        pred = model.predict(X_test_scaled)

        # A diverged run (possible with large learning rates) can yield
        # non-finite predictions, which roc_auc_score rejects; skip it
        # instead of aborting the whole search.
        if not np.isfinite(pred).all():
            continue

        # NOTE(review): candidates are ranked on the test set, so the
        # winning score is an optimistic estimate of generalization.
        candidate_score = roc_auc_score(Y_test, pred)
        if score < candidate_score:
            score = candidate_score
            best_params = [drop, nbr_layer, lr, nbr_neuron]

    return best_params

In [20]:
# Hyperparameters of the single reference model built in the next cell.
nbr_neurons = 150  # units in each hidden Dense layer
nbr_layers = 4     # number of hidden Dense + Dropout pairs
drop = 0.25        # dropout rate applied after each hidden layer
lr = 1             # defined here but not passed to the optimizer below

In [21]:
# Assemble the reference network: an input layer, `nbr_layers` hidden
# (Dense + Dropout) pairs, and a 2-unit sigmoid output layer.
network_layers = [Input(shape=X_train_scaled[0].shape)]
for _ in range(nbr_layers):
    network_layers += [Dense(nbr_neurons, activation="relu"), Dropout(drop)]
network_layers.append(Dense(2, activation="sigmoid"))
model = Sequential(network_layers)

# Adam is created with its default settings.
optimizer = keras.optimizers.Adam()
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=['AUC'])


In [22]:
# Training callbacks: save the model to "checkpoint_model.h5" only when it
# improves, and stop training after 10 epochs without improvement.
my_callbacks = [
    keras.callbacks.ModelCheckpoint("checkpoint_model.h5", save_best_only=True),
    keras.callbacks.EarlyStopping(patience=10),
]

In [None]:
# Train the reference model; the callbacks end the run before 500 epochs
# once the monitored validation metric stops improving.
model.fit(
    x=X_train_scaled,
    y=Y_train,
    batch_size=64,
    epochs=500,
    callbacks=my_callbacks,
    validation_data=(X_valid_scaled, Y_valid),
)

In [24]:
# Score the reference model on the held-out test set (ROC AUC).
pred = model.predict(X_test_scaled)
print(roc_auc_score(y_true=Y_test, y_score=pred))

0.839896832853844


In [26]:
# Search grid handed to optimize_network.
# NOTE: `nbr_layers` and `nbr_neurons` are rebound from ints to lists here,
# so the single-model cells above must not be re-run after this one without
# first re-running the cell that defines the scalar values.
nbr_neurons = [10, 25, 50, 100]  # hidden-layer widths
nbr_layers = [2, 5, 20, 100]     # hidden-layer counts
lrs = [1e-2, 1e-1, 0.5, 1]       # learning rates
drops = [0.1, 0.2, 0.3, 0.5]     # dropout rates

In [None]:
# Run the grid search and keep the best hyperparameter combination.
best_params = optimize_network(
    drops=drops,
    nbr_layers=nbr_layers,
    lrs=lrs,
    nbr_neurons=nbr_neurons,
    X_train_scaled=X_train_scaled,
    Y_train=Y_train,
    X_test_scaled=X_test_scaled,
    Y_test=Y_test,
    X_valid_scaled=X_valid_scaled,
    Y_valid=Y_valid,
)

In [None]:
# Persist the selected hyperparameters to disk.
joblib.dump(best_params, filename="DNN_best_params.save")