<a href="https://colab.research.google.com/github/serivan/mldmlab/blob/master/Optuna%2C_bayesian_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installazioni

In [1]:
!pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/2b/21/d13081805e1e1afc71f5bb743ece324c8bd576237c51b899ecb38a717502/optuna-2.7.0-py3-none-any.whl (293kB)
[K     |████████████████████████████████| 296kB 28.1MB/s 
Collecting cliff
[?25l  Downloading https://files.pythonhosted.org/packages/87/11/aea1cacbd4cf8262809c4d6f95dcb3f2802594de1f51c5bd454d69bf15c5/cliff-3.8.0-py3-none-any.whl (80kB)
[K     |████████████████████████████████| 81kB 9.1MB/s 
Collecting alembic
[?25l  Downloading https://files.pythonhosted.org/packages/d5/80/ef186e599a57d0e4cb78fc76e0bfc2e6953fa9716b2a5cf2de0117ed8eb5/alembic-1.6.5-py2.py3-none-any.whl (164kB)
[K     |████████████████████████████████| 174kB 31.0MB/s 
Collecting cmaes>=0.8.2
  Downloading https://files.pythonhosted.org/packages/01/1f/43b01223a0366171f474320c6e966c39a11587287f098a5f09809b45e05f/cmaes-0.8.2-py3-none-any.whl
Collecting colorlog
  Downloading https://files.pythonhosted.org/packages/32/e6/e9ddc6fa1104fda7183

# Imports

In [2]:
import optuna
import numpy as np
import pandas as pd
import tensorflow as tf
import joblib

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    make_scorer,
)
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, MaxAbsScaler
from sklearn.utils import compute_class_weight
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold



from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Rimozione dei warnings

In [3]:
# Remove all warnings
import warnings
warnings.filterwarnings("ignore")
import os
os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

# Costanti

In [4]:
LEARNING_RATE = 0.0001
EPOCHS = 200
RANDOM_STATE = 3993
DATA_PATH = "https://raw.githubusercontent.com/serivan/mldmlab/master/Kaggle/KAGGLE21/"
tf.random.set_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
TRAIN_SIZE = 0.8
DROPCOLUMS=["Id","target"]
FOLDS = 3

early_stopping = EarlyStopping(
    monitor="binary_accuracy", 
    verbose=1,
    patience=20,
    mode="max",
    restore_best_weights=True,
)

# Modello di rete neurale
Implementiamo un semplice modello di rete neurale, con una serie di iperparametri:


1.   numero di unità nel primo layer nascosto, viene poi decrementato per ogni successivo.
2.   numero di layer nascosti.
3.   funzione di attivazione dei layer nascosti.
4.   valore del dropout nel primo layer nascosto, viene poi decrementato per ogni successivo.
5.   booleano che indica se la batch normalization sia attiva nei layer nascosti.
6.   booleano che indica se l'activity regularization sia attiva nei layer nascosti.

Utilizziamo **Adam** come ottimizzatore con  un learning rate pari a $10^{-4}$, **softmax** come funzione di attivazione del layer di output e come loss la **binary crossentropy**, affiancata alla **binary accuracy**, come metrica.
Un altra importante implementazione è l'utilizzo del corretto inizializzatore dei pesi rispetto alla funzione di attivazione inserita come iperparametro.

In [5]:
def deep_dense_nn(max_hidden_units: int,
                  hidden_layers: int = 1,
                  hidden_activation: str = 'relu',
                  max_dropout_rate: float = 0.5,
                  batch_norm: bool = False,
                  activity_regularizer: bool = False,
                 ):
    output_units = 2
    output_activation = 'softmax'
    
    # Define the correct kernel initialization for the selected activation function
    act_kinit_dict = {'relu': 'he_uniform', 'selu': 'lecun_normal', 'elu': 'he_uniform', 'swish': 'he_uniform'}
    kernel_intializer = act_kinit_dict[hidden_activation] if hidden_activation in act_kinit_dict else 'glorot_uniform'
    
    # Create the list of layers
    layers = list()
       
    for i in range(1, hidden_layers + 1):        
        # Add dense layer
        layers.append(Dense(units=int(max_hidden_units/i) + output_units,
                                activation=hidden_activation,
                                kernel_initializer=kernel_intializer,
                                activity_regularizer=l2(1e-5) if activity_regularizer else None,
                               ))
        # Add batch normalization if it is setted in params
        if batch_norm:
            layers.append(BatchNormalization())
        # Add Dropout
        layers.append(Dropout(max_dropout_rate/i))
        
    # Extend with last part of the layers
    layers.append(Dense(output_units, activation=output_activation))
    
#     print(layers)
    
    # Create sequential model
    model = Sequential(layers)
    optimizer = Adam(learning_rate=LEARNING_RATE)
    metrics = ["binary_accuracy"]
    # Compile the model
    model.compile(loss = "binary_crossentropy",
                  optimizer=optimizer,
                  metrics=metrics)
    
    #model.summary()
    
    return model

# Caricamento del dataset e preprocessing 
load del dataset e varie analisi dei dati.

In [6]:

url = DATA_PATH + "train.csv"
training_set = pd.read_csv(url)
training_set.dtypes

Id                       int64
st_slope               float64
age                    float64
chest_pain_type        float64
cholesterol            float64
exercise_angina        float64
fasting_blood_sugar    float64
max_heart_rate         float64
oldpeak                float64
pulse                  float64
resting_bp_s           float64
resting_ecg            float64
sex                    float64
synt                   float64
target                   int64
dtype: object

In [7]:
training_set.head()

Unnamed: 0,Id,st_slope,age,chest_pain_type,cholesterol,exercise_angina,fasting_blood_sugar,max_heart_rate,oldpeak,pulse,resting_bp_s,resting_ecg,sex,synt,target
0,1000,1.0,62.0,2.0,213.0,0.0,1.0,141.0,-0.051026,348.0,128.0,2.0,1.0,0.536459,0
1,1001,2.0,72.0,3.0,2.0,0.0,0.0,115.0,1.626599,287.0,159.0,2.0,1.0,0.334897,0
2,1002,2.0,49.0,3.0,183.0,0.0,0.0,156.0,0.968111,391.0,161.0,0.0,0.0,0.720858,1
3,1003,1.0,35.0,2.0,,0.0,0.0,179.0,0.021913,449.0,119.0,2.0,1.0,0.11756,0
4,1004,2.0,51.0,4.0,-4.0,0.0,1.0,104.0,-0.01097,258.0,120.0,0.0,1.0,0.790254,1


In [8]:
number_of_missing_in_cols = training_set.shape[0] - training_set.count()
number_of_missing_in_cols

Id                       0
st_slope                 1
age                      4
chest_pain_type          9
cholesterol            202
exercise_angina          3
fasting_blood_sugar     11
max_heart_rate           7
oldpeak                  8
pulse                    3
resting_bp_s             7
resting_ecg              3
sex                      8
synt                     0
target                   0
dtype: int64

Spezziamo il data set in X e Y.
Poi eseguiamo lo split in traininig e validation, in modo tale da poter aver un riscontro rispetto a quello che stiamo svolgendo.

In [9]:
# creo X e Y eseguendo il drop di id e target
cols_to_drop = ["Id", "target"]
X_train = training_set.drop(columns=cols_to_drop)
# setta la colonna target
y_train = training_set["target"]
y_train_cat = to_categorical(y_train)
X_train = X_train.fillna(np.nan)
X_train = X_train.astype(np.float32)

X_train, X_val, y_train_cat, y_val_cat = train_test_split(
    X_train, y_train_cat, test_size=0.2, random_state=RANDOM_STATE, stratify=y_train_cat,
)
X_train.shape

(1040, 13)

In [10]:
rounded_labels = np.argmax(y_train_cat, axis=1).reshape(-1, 1)
rounded_labels2 = np.argmax(y_val_cat, axis=1).reshape(-1, 1)
rounded_labels.shape

(1040, 1)

# Sbilanciamento delle classi
Possiamo vedere come il dataset risulta non sbilanciato, il numero dei pazienti con un attacco cardiaco è minore rispetto a coloro che risultano invece sani. Qualora si volesse avere la stessa percentuale si potrebbe equilibrare le classi attraverso differenti strategie: downsampling, upsampling oppure l'assegnamento dei pesi alle classi. La strategia scelta per questo esempio è l'ultimo metodo.

In [11]:
print("Campioni totali nel training set: "+str(training_set.shape[0]))
numbers_hearth_attack = training_set[training_set["target"]==1].shape[0]
numbers_patients = training_set[training_set["target"]==0].shape[0]
print("Numero dei pazienti con attacco cardiaco: " +str(numbers_hearth_attack)+", numero dei pazienti sani: "+str(numbers_patients))

Campioni totali nel training set: 1301
Numero dei pazienti con attacco cardiaco: 503, numero dei pazienti sani: 798


In [12]:
# calcolo i pesi derivanti da ogni classe
class_weight = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
class_weight = {0: np.float32(class_weight[0]), 1: np.float32(class_weight[1])}
print("Peso per pazienti sani: " +str(class_weight[0])+", peso per pazienti con attacco di cuore: "+str(class_weight[1]))

Peso per pazienti sani: 0.8151629, peso per pazienti con attacco di cuore: 1.2932405


# Prova di una configurazione
Qualora, non avessimo nessuna strategia di scelta degli iperparametri, sarebbe necessario sceglierli a mano, come nell'esempio successivo. Nella pipeline, nel fit inseriamo il numero di epoche, l'early stopping, un ulteriore regolarizzazione e i pesi relativi alle classi. 

In [13]:
model = deep_dense_nn(max_hidden_units = 300,
                  hidden_layers = 3,
                  hidden_activation = 'relu',
                  max_dropout_rate  = 0.25,
                  batch_norm = True,
                  activity_regularizer = True,
                 )
pipe = Pipeline([
        ('imputer', SimpleImputer()),
        ('scalers', StandardScaler()),
        ('model', model)
        ])
pipeline = pipe.fit(
    X_train,
    y_train_cat,#rounded_labels,
    model__epochs=EPOCHS,
    model__callbacks=[early_stopping],
    model__class_weight=class_weight,
    model__verbose=0, 
)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Restoring model weights from the end of the best epoch.
Epoch 00169: early stopping


In [14]:
predictions = pipeline.predict(X_train)
predictions = np.argmax(predictions.round(), axis=1)
print("Accuracy sul training set: "+str(accuracy_score(rounded_labels, predictions) * 100))
cm_training = confusion_matrix(rounded_labels, predictions)
print("Confusion matrix sul training: ")
print(cm_training)

predictions = pipeline.predict(X_val)
predictions = np.argmax(predictions.round(), axis=1)
print("Accuracy sul validation set: "+str(accuracy_score(rounded_labels2, predictions) * 100))
cm_training = confusion_matrix(rounded_labels2, predictions)
print("Confusion matrix sul validation: ")
print(cm_training)

Accuracy sul training set: 98.75
Confusion matrix sul training: 
[[630   8]
 [  5 397]]
Accuracy sul validation set: 88.88888888888889
Confusion matrix sul validation: 
[[144  16]
 [ 13  88]]


# Optuna, ricerca degli iperparametri tramite ottimizzazione bayesiana
Come già enunciato nelle slides, utilizzeremo Optuna, per sfruttare questo framework dovremo definire una serie di passi necessari:


1.   Definire uno spazio degli iperparametri (tramite un dizionario in cui, per ogni iperparametro, porremo una distribuzione fra quelle offerte sul sito https://optuna.readthedocs.io/en/stable/reference/distributions.html)
2.   Definire una pipeline (la nostra rete neurale con il suo preprocessing)
3.   Definire una funzione di score (la funzione su cui valutiamo la nostra rete neurale)

Per svolgere queste operazioni andiamo a creare una funzione che daremo in pasto all'algoritmo, in cui estrarremo gli iperparametri, alleneremo il modello e calcoleremo lo score, il nostro valore di ritorno.


Una volta definiti i 3 passi necessari per il funzionamento di Optuna andiamo ora a definire l'ottimizzatore:
1.   il numero di prove da effettuare
2.   il grado di parallelizzazione(meglio porre 1)

Una volta definito è sufficiente poi eseguire il train, come quando svolgiamo un semplice modello di rete neurale.
 
 
 

In [15]:
pipeline_list=[]

In [16]:
def objective(trial):
    model = deep_dense_nn(trial.suggest_int('max_hidden_units',100,1000),
                  hidden_layers = trial.suggest_int('hidden_layers',1,10),
                  hidden_activation = trial.suggest_categorical('hidden_activation', ["relu","elu","tanh","swish","selu"]),
                  max_dropout_rate  = trial.suggest_uniform('max_dropout_rate', 0, 0.6),
                  batch_norm = trial.suggest_categorical('batch_norm', ["True","False"]),
                  activity_regularizer = trial.suggest_categorical('activity_regularizer', ["True","False"]),
                 )
        
    pipe = Pipeline([
                ('imputer', SimpleImputer()),
                ('scalers', StandardScaler()),
                ('model', model)
                ])
        
    pipeline = pipe.fit(
            X_train,
            y_train_cat,
            model__epochs=EPOCHS,
            model__callbacks=[early_stopping],
            model__class_weight=class_weight,
            model__batch_size = trial.suggest_categorical("batch_size",[8,16,32,64]),
            model__verbose=0,
        )
    pipeline_list.append(pipeline)
    predictions = pipeline.predict(X_train)
    predictions = np.argmax(predictions.round(), axis=1)
    print("Accuracy sul training set: "+str(accuracy_score(rounded_labels, predictions) * 100))
    cm_training = confusion_matrix(rounded_labels, predictions)
    print("Confusion matrix sul training: ")
    print(cm_training)

    predictions = pipeline.predict(X_val)
    predictions = np.argmax(predictions.round(), axis=1)
    print("Accuracy sul validation set: "+str(accuracy_score(rounded_labels2, predictions) * 100))
    cm_training = confusion_matrix(rounded_labels2, predictions)
    print("Confusion matrix sul validation: ")
    print(cm_training)
    return accuracy_score(rounded_labels2, predictions) * 100
    

In [None]:
study_name = "trial"
study = optuna.create_study(direction = "maximize",
                            load_if_exists = True,
                            study_name=study_name,
                           )
study.optimize(objective, 
               n_trials =10,
               n_jobs = 1,
               gc_after_trial = True )

[32m[I 2021-05-31 11:21:10,910][0m A new study created in memory with name: trial[0m


Restoring model weights from the end of the best epoch.
Epoch 00063: early stopping


[32m[I 2021-05-31 11:22:36,942][0m Trial 0 finished with value: 85.82375478927203 and parameters: {'max_hidden_units': 974, 'hidden_layers': 10, 'hidden_activation': 'tanh', 'max_dropout_rate': 0.541340446343995, 'batch_norm': 'True', 'activity_regularizer': 'False', 'batch_size': 16}. Best is trial 0 with value: 85.82375478927203.[0m


Accuracy sul training set: 87.98076923076923
Confusion matrix sul training: 
[[554  84]
 [ 41 361]]
Accuracy sul validation set: 85.82375478927203
Confusion matrix sul validation: 
[[132  28]
 [  9  92]]
Restoring model weights from the end of the best epoch.
Epoch 00155: early stopping


[32m[I 2021-05-31 11:22:59,296][0m Trial 1 finished with value: 90.42145593869732 and parameters: {'max_hidden_units': 425, 'hidden_layers': 5, 'hidden_activation': 'selu', 'max_dropout_rate': 0.12807082048101154, 'batch_norm': 'True', 'activity_regularizer': 'False', 'batch_size': 64}. Best is trial 1 with value: 90.42145593869732.[0m


Accuracy sul training set: 98.5576923076923
Confusion matrix sul training: 
[[629   9]
 [  6 396]]
Accuracy sul validation set: 90.42145593869732
Confusion matrix sul validation: 
[[147  13]
 [ 12  89]]
Restoring model weights from the end of the best epoch.
Epoch 00130: early stopping


In [None]:
grid_result_PD = pd.DataFrame(study.trials_dataframe())
grid_result_PD =grid_result_PD.sort_values(by=['value'],ascending=False)
grid_result_PD

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_activity_regularizer,params_batch_norm,params_batch_size,params_hidden_activation,params_hidden_layers,params_max_dropout_rate,params_max_hidden_units,state
4,4,90.038314,2021-05-31 08:47:47.105920,2021-05-31 08:48:23.249881,0 days 00:00:36.143961,False,False,64,tanh,4,0.491052,599,COMPLETE
1,1,89.655172,2021-05-31 08:43:33.994571,2021-05-31 08:47:08.188398,0 days 00:03:34.193827,False,True,8,tanh,9,0.329461,905,COMPLETE
3,3,89.655172,2021-05-31 08:47:20.906562,2021-05-31 08:47:46.864657,0 days 00:00:25.958095,False,True,32,elu,6,0.516461,750,COMPLETE
6,6,88.888889,2021-05-31 08:48:37.880496,2021-05-31 08:50:13.774852,0 days 00:01:35.894356,False,True,8,relu,5,0.361602,739,COMPLETE
7,7,86.97318,2021-05-31 08:50:14.058422,2021-05-31 08:51:43.638571,0 days 00:01:29.580149,False,True,8,tanh,8,0.5884,950,COMPLETE
9,9,86.97318,2021-05-31 08:51:54.600674,2021-05-31 08:54:19.905235,0 days 00:02:25.304561,True,False,8,elu,8,0.286986,950,COMPLETE
2,2,86.590038,2021-05-31 08:47:08.407521,2021-05-31 08:47:20.674081,0 days 00:00:12.266560,False,True,32,selu,4,0.482527,142,COMPLETE
0,0,86.206897,2021-05-31 08:42:33.448510,2021-05-31 08:43:33.794257,0 days 00:01:00.345747,False,True,8,relu,10,0.297069,362,COMPLETE
8,8,85.823755,2021-05-31 08:51:43.931114,2021-05-31 08:51:54.302650,0 days 00:00:10.371536,True,False,64,tanh,7,0.509312,271,COMPLETE
5,5,85.057471,2021-05-31 08:48:23.496077,2021-05-31 08:48:37.629547,0 days 00:00:14.133470,False,True,64,elu,9,0.43939,163,COMPLETE


In [None]:
study.best_trial

FrozenTrial(number=4, values=[90.03831417624522], datetime_start=datetime.datetime(2021, 5, 31, 8, 47, 47, 105920), datetime_complete=datetime.datetime(2021, 5, 31, 8, 48, 23, 249881), params={'max_hidden_units': 599, 'hidden_layers': 4, 'hidden_activation': 'tanh', 'max_dropout_rate': 0.4910515199949189, 'batch_norm': 'False', 'activity_regularizer': 'False', 'batch_size': 64}, distributions={'max_hidden_units': IntUniformDistribution(high=1000, low=100, step=1), 'hidden_layers': IntUniformDistribution(high=10, low=1, step=1), 'hidden_activation': CategoricalDistribution(choices=('relu', 'elu', 'tanh', 'swish', 'selu')), 'max_dropout_rate': UniformDistribution(high=0.6, low=0.0), 'batch_norm': CategoricalDistribution(choices=('True', 'False')), 'activity_regularizer': CategoricalDistribution(choices=('True', 'False')), 'batch_size': CategoricalDistribution(choices=(8, 16, 32, 64))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=4, state=TrialState.COMPLETE, value=No

In [None]:
print("Index migliore: " +str(study.best_trial.number))
print("Parametri migliori: "+str(study.best_params))
print("Score migliore: "+str(study.best_value))

Index migliore: 4
Parametri migliori: {'max_hidden_units': 599, 'hidden_layers': 4, 'hidden_activation': 'tanh', 'max_dropout_rate': 0.4910515199949189, 'batch_norm': 'False', 'activity_regularizer': 'False', 'batch_size': 64}
Score migliore: 90.03831417624522


# Salvataggio del CSV e dello studio

In [None]:
#grid_result_PD.to_csv("./dataset_kaggle/grid_result_PD.csv")
#joblib.dump(grid_result.study_, "./dataset_kaggle/study.pkl")

# Visualizzazione dello studio tramite grafici

In [None]:
#study = joblib.load('study.pkl')
#study = grid_result.study_

In [None]:
optuna.visualization.plot_optimization_history(study).show()

In [None]:
optuna.visualization.plot_param_importances(study, params= [
                                                      'batch_size',
                                                      'hidden_activation',
                                                      'hidden_layers',
                                                      'max_dropout_rate',
                                                      'max_hidden_units',
                                                     ]).show()

In [None]:
optuna.visualization.plot_slice(study, params= [
                                                      'batch_size',
                                                      'hidden_activation',
                                                      'hidden_layers',
                                                      'max_dropout_rate',
                                                      'max_hidden_units',
                                                     ]).show()

In [None]:
optuna.visualization.plot_parallel_coordinate(study, params= [
                                                      'batch_size',
                                                      'hidden_activation',
                                                      'hidden_layers',
                                                      'max_dropout_rate',
                                                      'max_hidden_units',
                                                     ]).show()