# Deeplearning LSTM Model Hyperpamaters Tuning

In [1]:
import sys
sys.path.append('..')

In [2]:
import numpy as np
import random
import tensorflow
import keras_tuner

from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, TimeDistributed, RepeatVector
from keras.optimizers import Adam
from lib.read_data import read_and_join_output_file
from lib.deeplearning import get_train_test_datasets,  get_sets_shapes

In [3]:
RANDOM_SEED = 31
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tensorflow.random.set_seed(RANDOM_SEED)

## Preparing the Dataset
The dataset is prepared as explained in the /ml/deeplearning.ipynb notebook. Please refer to it for more details. As a summary:
* The train and test sets are split by Township-Ranges, i.e. some Township-Ranges data are either fully in the train or test set.
* The target value is the value of that variable for 2021
* Data are imputed using a custom pipeline

The resulting train and test sets are of shape [number of Township-Ranges, 7 years (2014-2020), the number of features].
We do not create a validation dataset as we use Keras internal cross-validation mechanism to shuffle the data points (i.e., the Township-Ranges) and keep some for the validation at each training epoch.

In [4]:
test_size=0.15
target_variable="GSE_GWE"
# Load the data from the ETL output files
X = read_and_join_output_file()
# Split the input pandas Dataframe into training and test datasets, applies the impute pipeline
# transformation and reshapes the datasets to 3D (samples, time, features) numpy arrays
X_train, X_test, y_train, y_test, _, _ = get_train_test_datasets(X, target_variable=target_variable, test_size=test_size, random_seed=RANDOM_SEED)
nb_features = X_train.shape[-1]
get_sets_shapes(X_train, X_test)

Unnamed: 0,nb_items,nb_timestamps,nb_features
training dataset,406,7,82
test dataset,72,7,82


## Hyperparameters Tuning
For each of the 3 LSTM models architectures (from simplest to most complex), we use the Keras BayesianOptimization hyperparameters tuner to estimate the best values for the following hyperparameters:
* the number of units for each *LSTM* or *Dense* unit
* the activation function (*sigmoid*, *tanh*, *relu*) used for all layers, except the output layer which is fixed to a *linear* activation function.
* the learning rate
* the size of the validation dataset
* the batch size
* the number of epochs
## Simple Model Hyper-parameter Tuning
![Simple LSTM Model](../doc/images/deeplearning-architecture-1.jpg)

In [5]:
class Model1(keras_tuner.HyperModel):
    def build(self, hp):
        model = Sequential()
        hp_units = hp.Int("units", min_value=10, max_value=300, step=10)
        hp_activ = hp.Choice("activation", values=["tanh", "sigmoid"])
        model.add(LSTM(units=hp_units, activation=hp_activ, input_shape=(7, nb_features)))
        model.add(Dense(1, activation="linear"))
        hp_learning_rate = hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])
        model.compile(loss="mse", optimizer=Adam(learning_rate=hp_learning_rate), metrics=[keras.metrics.RootMeanSquaredError()])
        return model

    def fit(self, hp, model, *args, **kwargs):
        return model.fit(
            *args,
            validation_split=hp.Choice("validation_split", values=[0.05, 0.1, 0.15, 0.2]),
            batch_size=hp.Int("batch_size", min_value=32, max_value=192, step=16),
            epochs=hp.Int("epochs", min_value=30, max_value=500, step=5),
            shuffle=True,
            **kwargs,
        )

In [6]:
stop_early = tensorflow.keras.callbacks.EarlyStopping(monitor='val_root_mean_squared_error', patience=10, verbose=1)
tuner = keras_tuner.BayesianOptimization(Model1(),
                             objective=keras_tuner.Objective("val_root_mean_squared_error", direction="min"),
                             max_trials=250,
                             beta=3.2,
                             seed=RANDOM_SEED,
                             overwrite=True,
                             directory="keras_tuner",
                             project_name="model1_tuner")
tuner.search(X_train, y_train, callbacks=[stop_early])

Trial 150 Complete [00h 00m 31s]
val_root_mean_squared_error: 0.08564456552267075

Best val_root_mean_squared_error So Far: 0.0542435497045517
Total elapsed time: 01h 10m 42s

Search: Running Trial #151

Value             |Best Value So Far |Hyperparameter
300               |120               |units
tanh              |sigmoid           |activation
0.0001            |0.01              |learning_rate
0.2               |0.05              |validation_split
80                |32                |batch_size
440               |500               |epochs

Epoch 1/440
Epoch 2/440
Epoch 3/440
Epoch 4/440
Epoch 5/440
Epoch 6/440
Epoch 7/440
Epoch 8/440
Epoch 9/440
Epoch 10/440
Epoch 11/440
Epoch 12/440


KeyboardInterrupt: 

### Best Model Hyperparameters

In [None]:
# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"""
The hyperparameter search is complete.
validation_split: {best_hps.get('validation_split')}
lstm_units: {best_hps.get('units')}
lstm_activation: {best_hps.get('activation')}
learning_rate: {best_hps.get('learning_rate')}
batch_size: {best_hps.get('batch_size')}
epochs: {best_hps.get('epochs')}
""")

### Hyperparameters Tuning Summary

In [None]:
tuner.results_summary()

## Model2 Hyper-parameter tuning
![LSTM Model With Dense Layer](../doc/images/deeplearning-architecture-2.jpg)

In [10]:
class Model2(keras_tuner.HyperModel):
    def build(self, hp):
        model = Sequential()
        lstm_units = hp.Int("lstm_units", min_value=10, max_value=300, step=10)
        lstm_activ = hp.Choice("lstm_activation", values=["tanh", "sigmoid"])
        model.add(LSTM(units=lstm_units, activation=lstm_activ, input_shape=(7, nb_features)))
        dense_units = hp.Int("dense_units", min_value=11, max_value=101, step=2)
        dense_activation = hp.Choice("dense_activation", values=["relu", "tanh", "sigmoid"])
        model.add(Dense(dense_units, activation=dense_activation))
        hp_dropout = hp.Float("dropout_rate", min_value=0.05, max_value=0.25, step=0.05)
        model.add(Dropout(hp_dropout))
        model.add(Dense(1, activation="linear"))
        hp_learning_rate = hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])
        model.compile(loss="mse", optimizer=Adam(learning_rate=hp_learning_rate), metrics=[keras.metrics.RootMeanSquaredError()])
        return model

    def fit(self, hp, model, *args, **kwargs):
        return model.fit(
            *args,
            validation_split=hp.Choice("validation_split", values=[0.05, 0.1, 0.15, 0.2]),
            batch_size=hp.Int("batch_size", min_value=32, max_value=192, step=16),
            epochs=hp.Int("epochs", min_value=30, max_value=500, step=5),
            shuffle=True,
            **kwargs,
        )

In [11]:
stop_early = tensorflow.keras.callbacks.EarlyStopping(monitor="val_root_mean_squared_error", patience=10, verbose=1)
tuner = keras_tuner.BayesianOptimization(Model2(),
                              objective=keras_tuner.Objective("val_root_mean_squared_error", direction="min"),
                              max_trials=400,
                              beta=3.2,
                              seed=RANDOM_SEED,
                              overwrite=True,
                              directory="keras_tuner",
                              project_name="model2_tuner")
tuner.search(X_train, y_train, callbacks=[stop_early])

Trial 400 Complete [00h 00m 06s]
val_root_mean_squared_error: 0.08002614974975586

Best val_root_mean_squared_error So Far: 0.06468775868415833
Total elapsed time: 03h 14m 13s
INFO:tensorflow:Oracle triggered exit


### Best Model Hyperparameters

In [12]:
# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"""
The hyperparameter search is complete.
validation_split: {best_hps.get('validation_split')}
lstm_units: {best_hps.get('lstm_units')}
lstm_activation: {best_hps.get('lstm_activation')}
dense_units: {best_hps.get('dense_units')}
dense_activation: {best_hps.get('dense_activation')}
dropout_rate: {best_hps.get('dropout_rate')}
learning_rate: {best_hps.get('learning_rate')}
batch_size: {best_hps.get('batch_size')}
epochs: {best_hps.get('epochs')}
""")


The hyperparameter search is complete.
validation_split: 0.05
lstm_units: 10
lstm_activation: sigmoid
dense_units: 11
dense_activation: relu
dropout_rate: 0.05
learning_rate: 0.01
batch_size: 32
epochs: 500



### Hyperparameters Tuning Summary

In [13]:
tuner.results_summary()

Results summary
Results in keras_tuner\model2_tuner
Showing 10 best trials
<keras_tuner.engine.objective.Objective object at 0x00000220D8130B50>
Trial summary
Hyperparameters:
lstm_units: 10
lstm_activation: sigmoid
dense_units: 11
dense_activation: relu
dropout_rate: 0.05
learning_rate: 0.01
validation_split: 0.05
batch_size: 32
epochs: 500
Score: 0.06468775868415833
Trial summary
Hyperparameters:
lstm_units: 10
lstm_activation: sigmoid
dense_units: 11
dense_activation: relu
dropout_rate: 0.1
learning_rate: 0.01
validation_split: 0.05
batch_size: 32
epochs: 500
Score: 0.06481722742319107
Trial summary
Hyperparameters:
lstm_units: 10
lstm_activation: sigmoid
dense_units: 11
dense_activation: relu
dropout_rate: 0.15000000000000002
learning_rate: 0.01
validation_split: 0.05
batch_size: 32
epochs: 415
Score: 0.06549284607172012
Trial summary
Hyperparameters:
lstm_units: 10
lstm_activation: sigmoid
dense_units: 11
dense_activation: relu
dropout_rate: 0.15000000000000002
learning_rate: 0.01

## Model3 Hyper-parameter tuning
![Encoder-Decoder LSTM Model](../doc/images/deeplearning-architecture-3.jpg)

In [7]:
class Model3(keras_tuner.HyperModel):
    def build(self, hp):
        model = Sequential()
        lstm_units = hp.Int("lstm_units", min_value=10, max_value=300, step=10)
        lstm_activ = hp.Choice("lstm_activation", values=["tanh", "sigmoid"])
        model.add(LSTM(units=lstm_units, activation="sigmoid", input_shape=(7, nb_features)))
        model.add(RepeatVector(1))
        lstm_units_2 = hp.Int("2nd_lstm_units", min_value=10, max_value=300, step=10)
        lstm_activ_2 = hp.Choice("2nd_lstm_activation", values=["tanh", "sigmoid"])
        model.add(LSTM(units=lstm_units_2, activation="sigmoid", return_sequences=True))
        dense_units = hp.Int("dense_units", min_value=11, max_value=101, step=2)
        dense_activation = hp.Choice("dense_activation", values=["relu", "tanh", "sigmoid"])
        model.add(TimeDistributed(Dense(dense_units, activation=dense_activation)))
        hp_dropout = hp.Float("dropout_rate", min_value=0.05, max_value=0.25, step=0.05)
        model.add(Dropout(hp_dropout))
        model.add(Dense(1, activation="linear"))
        hp_learning_rate = hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])
        model.compile(loss="mse", optimizer=Adam(learning_rate=hp_learning_rate), metrics=[keras.metrics.RootMeanSquaredError()])
        return model

    def fit(self, hp, model, *args, **kwargs):
        return model.fit(
            *args,
            validation_split=hp.Choice("validation_split", values=[0.05, 0.1, 0.15, 0.2]),
            batch_size=hp.Int("batch_size", min_value=32, max_value=192, step=16),
            epochs=hp.Int("epochs", min_value=30, max_value=500, step=5),
            shuffle=True,
            **kwargs,
        )

In [8]:
stop_early = tensorflow.keras.callbacks.EarlyStopping(monitor="val_root_mean_squared_error", patience=10, verbose=1)
tuner = keras_tuner.BayesianOptimization(Model3(),
                              objective=keras_tuner.Objective("val_root_mean_squared_error", direction="min"),
                              max_trials=400,
                              beta=3.2,
                              seed=RANDOM_SEED,
                              overwrite=True,
                              directory="keras_tuner",
                              project_name="model3_tuner")
tuner.search(X_train, y_train, callbacks=[stop_early])

Trial 500 Complete [00h 00m 18s]
val_root_mean_squared_error: 0.1150362491607666

Best val_root_mean_squared_error So Far: 0.06309118866920471
Total elapsed time: 07h 29m 36s
INFO:tensorflow:Oracle triggered exit


### Best Model Hyperparameters

In [9]:
# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"""
The hyperparameter search is complete.
validation_split: {best_hps.get('validation_split')}
lstm_units: {best_hps.get('lstm_units')}
lstm_activation: {best_hps.get('lstm_activation')}
2nd_lstm_units: {best_hps.get('2nd_lstm_units')}
2nd_lstm_activation: {best_hps.get('2nd_lstm_activation')}
dense_units: {best_hps.get('dense_units')}
dense_activation: {best_hps.get('dense_activation')}
dropout_rate: {best_hps.get('dropout_rate')}
learning_rate: {best_hps.get('learning_rate')}
batch_size: {best_hps.get('batch_size')}
epochs: {best_hps.get('epochs')}
""")


The hyperparameter search is complete.
validation_split: 0.05
lstm_units: 300
lstm_activation: sigmoid
2nd_lstm_units: 300
2nd_lstm_activation: tanh
dense_units: 11
dense_activation: sigmoid
dropout_rate: 0.05
learning_rate: 0.01
batch_size: 32
epochs: 30



### Hyperparameters Tuning Summary

In [10]:
tuner.results_summary()

Results summary
Results in keras_tuner\model3_tuner
Showing 10 best trials
<keras_tuner.engine.objective.Objective object at 0x000002575F038250>
Trial summary
Hyperparameters:
lstm_units: 300
lstm_activation: sigmoid
2nd_lstm_units: 300
2nd_lstm_activation: tanh
dense_units: 11
dense_activation: sigmoid
dropout_rate: 0.05
learning_rate: 0.01
validation_split: 0.05
batch_size: 32
epochs: 30
Score: 0.06309118866920471
Trial summary
Hyperparameters:
lstm_units: 300
lstm_activation: sigmoid
2nd_lstm_units: 300
2nd_lstm_activation: tanh
dense_units: 11
dense_activation: sigmoid
dropout_rate: 0.05
learning_rate: 0.01
validation_split: 0.05
batch_size: 32
epochs: 30
Score: 0.06499446928501129
Trial summary
Hyperparameters:
lstm_units: 300
lstm_activation: sigmoid
2nd_lstm_units: 300
2nd_lstm_activation: sigmoid
dense_units: 11
dense_activation: sigmoid
dropout_rate: 0.05
learning_rate: 0.01
validation_split: 0.05
batch_size: 32
epochs: 30
Score: 0.06594016402959824
Trial summary
Hyperparamete