In [17]:
import sys
# Add the parent directory to the module search path; the `lib.*` imports
# further down are presumably resolved from there (the notebook appears to
# live one level below the project root — verify against the repo layout).
sys.path.append('..')

In [18]:
import numpy as np
import pandas as pd
import pickle
import random

from sklearn.preprocessing import MinMaxScaler
import tensorflow
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, TimeDistributed, RepeatVector
from keras.optimizers import RMSprop, Adam, Adamax, Adagrad

from lib.read_data import read_and_join_output_file
#from lib.create_pipeline import create_transformation_pipeline
from lib.deeplearning import create_transformation_pipelines, evaluate_forecast
from lib.transform_impute import convert_back_df
from lib.split_data import train_test_group_time_split

In [19]:
# One seed shared by Python's `random`, NumPy and TensorFlow so that the
# train/test split and the stochastic parts of training start from the same
# state on every run.
RANDOM_SEED = 31
for _set_seed in (random.seed, np.random.seed, tensorflow.random.set_seed):
    _set_seed(RANDOM_SEED)

In [20]:
# Report how many GPU devices TensorFlow can see on this machine.
gpu_devices = tensorflow.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  0


## Preparing the Dataset
### The Train-Test Split
The dataset is made of 478 Township-Ranges, each containing a multivariate (81 features) time series (data from 2014 to 2021). This dataset can thus be seen as a 3-dimensional dataset of
$478 \text{ Township-Ranges} \times 8 \text{ time stamps} \times 81 \text{ features}$
The objective is to predict the 2022 target value of GSE_GWE (Ground Surface Elevation to Groundwater Water Elevation - Depth to groundwater elevation in feet below ground surface) for each Township-Range.

LSTM neural networks can be used for time series forecasting and take inputs of the shape *[samples, time series steps, features]*. This perfectly fits our dataset.
To fit our dataset and objective, as well as the LSTM neural network architecture, we will thus perform the train-test split as follows:
* Training and Test sets will be split by Township-Ranges. I.e., some Township-Ranges will have all their 2014-2021 data points in the training set, while others will be in the test set.
* The model will take the 2014-2020 data for all features - including the target feature - as input, and will be trained and tested on the 2021 value of the target feature.

With such a method, unlike a simple time series forecasting where the target feature is forecasted based only on its past values, we allow past values of other features (in our case cultivated crops, precipitation, population density, number of wells drilled) to influence the future value of the target feature.

![Train-Test Split](../doc/images/deeplearning-train-test-split.jpg)
### Data Imputation and Scaling
For neural networks we use a MinMax scaler to scale all values between 0 and 1.
We do not need to do any data imputation on the training and test sets' *y* target feature since it does not have any missing data points.

In [21]:
# Fraction of the Township-Ranges held out for the test set.
test_size = 0.15

# Load the data from the ETL output files
X = read_and_join_output_file()

# Split the data into a training and a test set. The split is done per
# Township-Range (group="TOWNSHIP_RANGE") and seeded for reproducibility.
X_train_df, X_test_df, y_train_df, y_test_df = train_test_group_time_split(
    X,
    index=["TOWNSHIP_RANGE", "YEAR"],
    group="TOWNSHIP_RANGE",
    test_size=test_size,
    random_seed=RANDOM_SEED,
)

# Create the transformation pipeline, fit it on the training set only and
# apply the fitted pipeline to both the training and the test sets.
impute_pipeline, columns = create_transformation_pipelines(X_train_df)
X_train_impute = impute_pipeline.fit_transform(X_train_df)
X_test_impute = impute_pipeline.transform(X_test_df)

# Convert the transformed arrays back to dataframes, keeping the original
# (Township-Range, year) index so the rows can be reshaped per group later.
X_train_impute_df = pd.DataFrame(X_train_impute, index=X_train_df.index, columns=columns)
X_test_impute_df = pd.DataFrame(X_test_impute, index=X_test_df.index, columns=columns)

# Square-root the past GSE_GWE values used as an input feature.
# NOTE(review): the target (y) below is only min-max scaled, not square-rooted,
# and the sqrt is applied after the pipeline output — confirm both are intended.
X_train_impute_df["GSE_GWE"] = np.sqrt(X_train_impute_df["GSE_GWE"])
X_test_impute_df["GSE_GWE"] = np.sqrt(X_test_impute_df["GSE_GWE"])

# Keep only the GSE_GWE variable as the outcome variable. The scaler is fitted
# on the training target only and reused for the test target; it is also used
# later to map predictions back to the original scale.
scaler = MinMaxScaler()
y_train = scaler.fit_transform(y_train_df[["GSE_GWE"]])
y_test = scaler.transform(y_test_df[["GSE_GWE"]])

# Same training target with a trailing axis added, for models whose output
# is 3-dimensional (used when fitting the encoder-decoder model).
y_train_3d = y_train[..., np.newaxis]

# Number of input features fed to the first layer of every model.
nb_features = len(X_train_impute_df.columns)

In [22]:
# Change the shape of the input array to (number of Township-Ranges, 7 years (2014-2020), the number of features)
def _to_lstm_input(frame):
    """Reshape a 2-level-indexed dataframe into a 3-D array.

    The output shape is (number of unique values in index level 0,
    number of unique values in index level 1, number of columns), i.e.
    (Township-Ranges, years, features) for the dataframes used here.
    NOTE(review): a plain reshape assumes the rows are already ordered by
    level 0 then level 1 and that every group has the same number of rows
    — confirm against the split/pipeline helpers.
    """
    n_groups = len(frame.index.get_level_values(0).unique())
    n_steps = len(frame.index.get_level_values(1).unique())
    return frame.values.reshape(n_groups, n_steps, frame.shape[1])


# Change the shape of the input arrays to
# (number of Township-Ranges, 7 years (2014-2020), the number of features)
X_train = _to_lstm_input(X_train_impute_df)
X_test = _to_lstm_input(X_test_impute_df)

In [23]:
# Sanity check of the array shapes fed to the models. Only train and test
# arrays exist here, so only those two are reported for X and for y.
separator = "=" * 100
print(
    separator,
    "Checking the train, validation and test input (X) datasets sizes:",
    f"Size of the X_train dataset: {X_train.shape}",
    f"Size of the X_test dataset: {X_test.shape}",
    separator,
    "Checking the train, validation and test output (y) datasets sizes:",
    f"Size of the y_train dataset: {y_train.shape}",
    f"Size of the y_test dataset: {y_test.shape}",
    sep="\n",
)

Checking the train, validation and test input (X) datasets sizes:
Size of the X_train dataset: (406, 7, 81)
Size of the X_test dataset: (72, 7, 81)
Checking the train, validation and test output (y) datasets sizes:
Size of the y_train dataset: (406, 1)
Size of the y_test dataset: (72, 1)


In [1]:
# Start the results table from the observed test-set target values; each
# experiment below adds its own prediction column next to them.
results_df = y_test_df.loc[:, ["GSE_GWE"]].copy()

NameError: name 'y_test_df' is not defined

## Training Different Models
We tried 3 different LSTM models:
* A simple model made of a single *LSTM* layer and an output *Dense* layer
* A model made of an *LSTM* layer followed by *Dense* and *Dropout* layers before the output layer
* An Encoder-Decoder model made of 2 LSTM models

### The Simple LSTM Model
![Simple LSTM Model](../doc/images/deeplearning-architecture-1.jpg)

### LSTM Model With a Dense Layer
![LSTM Model With Dense Layer](../doc/images/deeplearning-architecture-2.jpg)

### Encoder-Decoder LSTM Model
Encoder-decoder architectures are more common for sequence-to-sequence learning, e.g., when forecasting the next 3 days based on the past months or years of data. In our case we only predict data for 1 time step in the future. The output sequence being of length 1, this architecture might seem superfluous but has been tested anyway. This architecture was inspired by the Encoder-Decoder architecture in this article: *[CNN-LSTM-Based Models for Multiple Parallel Input and Multi-Step Forecast](https://towardsdatascience.com/cnn-lstm-based-models-for-multiple-parallel-input-and-multi-step-forecast-6fe2172f7668)*.

The model is made of
* an encoding *LSTM* layer
* a *RepeatVector* layer. The role of this layer is simply to repeat the output of the encoding LSTM layer for the number of time steps in the output sequence (in our case 1).
* a decoding *LSTM* layer
* a fully connected *Dense* layer is applied to each time step using the *TimeDistributed* wrapper
* a *Dropout* layer

As such models are made for sequence-to-sequence learning and forecasting, the output of such a model is different from the previous ones. It has an output of size *[samples, forecasting sequence length, target features]*. In our case the forecasting sequence length and number of target features are both 1.

![Encoder-Decoder LSTM Model](../doc/images/deeplearning-architecture-3.jpg)

## Simple Model
This model is just made of a single LSTM layer

In [27]:
# Hyper-parameters of the simple LSTM model.
# An earlier variant of this dictionary (validation_split=0.05, lstm_units=80)
# was defined right before this one and immediately overwritten by it, so it
# never took effect. That dead definition has been removed so the
# configuration actually used by the model is unambiguous.
m1_hyper_parameters = {
    "random_seed": RANDOM_SEED,
    "test_size": test_size,
    "validation_split": 0.1,
    "learning_rate": 0.01,
    "batch_size": 32,
    "epochs": 200,
    "lstm_units": 40,
    "lstm_activation": "sigmoid",
    "output_activation": "linear",
    "nb_features": nb_features,
    "optimizer": "Adam"
}

# One optimizer instance per supported name, all built with the configured
# learning rate; the one named in m1_hyper_parameters["optimizer"] is looked
# up when the model is compiled.
m1_optimizer = {
    "RMSprop": RMSprop(learning_rate=m1_hyper_parameters["learning_rate"]),
    "Adam": Adam(learning_rate=m1_hyper_parameters["learning_rate"]),
    "Adamax": Adamax(learning_rate=m1_hyper_parameters["learning_rate"]),
    "Adagrad": Adagrad(learning_rate=m1_hyper_parameters["learning_rate"])
}

# Architecture: a single LSTM layer reading sequences of 7 time steps
# (the 2014-2020 years) x nb_features, followed by a 1-unit Dense output.
model1 = Sequential()
model1.add(LSTM(m1_hyper_parameters["lstm_units"], activation=m1_hyper_parameters["lstm_activation"], input_shape=(7, nb_features)))
model1.add(Dense(1, activation=m1_hyper_parameters["output_activation"]))
model1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 40)                19520     
                                                                 
 dense_2 (Dense)             (None, 1)                 41        
                                                                 
Total params: 19,561
Trainable params: 19,561
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Compile with the optimizer selected in the hyper-parameters; MSE is the
# training loss and RMSE is tracked as an additional metric.
m1_selected_optimizer = m1_optimizer[m1_hyper_parameters["optimizer"]]
model1.compile(
    loss="mse",
    optimizer=m1_selected_optimizer,
    metrics=[keras.metrics.RootMeanSquaredError()],
)

# Train on the training set; Keras holds out `validation_split` of it for
# validation.
model1.fit(
    X_train,
    y_train,
    validation_split=m1_hyper_parameters["validation_split"],
    batch_size=m1_hyper_parameters["batch_size"],
    epochs=m1_hyper_parameters["epochs"],
    shuffle=True,
)

# Predict on the test set and undo the min-max scaling on both predictions
# and ground truth before storing and scoring them.
yhat = model1.predict(X_test, verbose=0)
yhat_inverse = scaler.inverse_transform(yhat)
y_test_inverse = scaler.inverse_transform(y_test)
results_df["experiment_1_prediction"] = yhat_inverse
mae, mse, rmse = evaluate_forecast(y_test_inverse, yhat_inverse)

Epoch 1/200


KeyboardInterrupt: 

## Model 2
This model is made of
* a simple or bidirectional LSTM layer
* a Dense unit
* a Dropout Unit

In [None]:
# Hyper-parameters of the LSTM + Dense + Dropout model.
m2_hyper_parameters = {
    "random_seed": RANDOM_SEED,
    "test_size": test_size,
    "validation_split": 0.1,
    "learning_rate": 0.01,
    "batch_size": 64,
    "epochs": 200,
    "lstm_units": 20,
    "lstm_activation": "sigmoid",
    "dense_units": 61,
    "dense_activation": "tanh",
    "dropout": 0.2,
    "output_activation": "linear",
    "nb_features": nb_features,
    "optimizer": "RMSprop",
}

# One optimizer instance per supported name, all sharing the configured
# learning rate; the compile step looks up the one named in the
# hyper-parameters.
m2_learning_rate = m2_hyper_parameters["learning_rate"]
m2_optimizer = {
    optimizer_name: optimizer_cls(learning_rate=m2_learning_rate)
    for optimizer_name, optimizer_cls in (
        ("RMSprop", RMSprop),
        ("Adam", Adam),
        ("Adamax", Adamax),
        ("Adagrad", Adagrad),
    )
}

# Architecture: LSTM over 7 time steps (2014-2020) x nb_features, then a
# Dense layer, a Dropout layer and a 1-unit Dense output.
model2 = Sequential([
    LSTM(
        m2_hyper_parameters["lstm_units"],
        activation=m2_hyper_parameters["lstm_activation"],
        input_shape=(7, nb_features),
    ),
    Dense(
        m2_hyper_parameters["dense_units"],
        activation=m2_hyper_parameters["dense_activation"],
    ),
    Dropout(m2_hyper_parameters["dropout"]),
    Dense(1, activation=m2_hyper_parameters["output_activation"]),
])
model2.summary()

In [None]:
# Compile with the optimizer selected in the hyper-parameters; MSE is the
# training loss and RMSE is tracked as an additional metric.
model2.compile(loss="mse", optimizer=m2_optimizer[m2_hyper_parameters["optimizer"]], metrics=[keras.metrics.RootMeanSquaredError()])

# Train on the training set; Keras holds out `validation_split` of it for
# validation.
model2.fit(X_train, y_train,
                     validation_split=m2_hyper_parameters["validation_split"],
                     batch_size=m2_hyper_parameters["batch_size"],
                     epochs=m2_hyper_parameters["epochs"],
                     shuffle=True)

# Predict on the test set and undo the min-max scaling on both predictions
# and ground truth before storing and scoring them.
yhat = model2.predict(X_test, verbose=0)
yhat_inverse = scaler.inverse_transform(yhat)
y_test_inverse = scaler.inverse_transform(y_test)
results_df["experiment_2_prediction"] = yhat_inverse

# Evaluate once (the original cell called evaluate_forecast twice in a row and
# discarded the first result).
mae, mse, rmse = evaluate_forecast(y_test_inverse, yhat_inverse)

## Model 3 : Encoder-Decoder
This model is made of
* a simple or bidirectional encoding LSTM layer
* a simple or bidirectional decoding LSTM layer
* a Dense unit
* a Dropout Unit

In [None]:
# Hyper-parameters of the encoder-decoder LSTM model.
m3_hyper_parameters = {
    "random_seed": RANDOM_SEED,
    "test_size": test_size,
    "validation_split": 0.1,
    "learning_rate": 0.01,
    "batch_size": 64,
    "epochs": 200,
    "lstm_units": 200,
    "2nd_lstm_units": 100,
    "lstm_activation": "sigmoid",
    "dense_units": 81,
    "dense_activation": "tanh",
    "dropout": 0.2,
    "output_activation": "linear",
    "nb_features": nb_features,
    "optimizer": "RMSprop",
}

# One optimizer instance per supported name, all sharing the configured
# learning rate; the compile step looks up the one named in the
# hyper-parameters.
m3_learning_rate = m3_hyper_parameters["learning_rate"]
m3_optimizer = {
    optimizer_name: optimizer_cls(learning_rate=m3_learning_rate)
    for optimizer_name, optimizer_cls in (
        ("RMSprop", RMSprop),
        ("Adam", Adam),
        ("Adamax", Adamax),
        ("Adagrad", Adagrad),
    )
}

# Architecture: encoding LSTM over 7 time steps (2014-2020) x nb_features,
# its output repeated once (output sequence length 1), a decoding LSTM that
# returns sequences, a time-distributed Dense layer, Dropout and a 1-unit
# Dense output.
m3_lstm_activation = m3_hyper_parameters["lstm_activation"]
model3 = Sequential([
    LSTM(
        m3_hyper_parameters["lstm_units"],
        activation=m3_lstm_activation,
        input_shape=(7, nb_features),
    ),
    RepeatVector(1),
    LSTM(
        m3_hyper_parameters["2nd_lstm_units"],
        activation=m3_lstm_activation,
        return_sequences=True,
    ),
    TimeDistributed(
        Dense(
            m3_hyper_parameters["dense_units"],
            activation=m3_hyper_parameters["dense_activation"],
        )
    ),
    Dropout(m3_hyper_parameters["dropout"]),
    Dense(1, activation=m3_hyper_parameters["output_activation"]),
])
model3.summary()

In [None]:
# Compile with the optimizer selected in the hyper-parameters; MSE is the
# training loss and RMSE is tracked as an additional metric.
model3.compile(loss="mse", optimizer=m3_optimizer[m3_hyper_parameters["optimizer"]], metrics=[keras.metrics.RootMeanSquaredError()])

# The encoder-decoder model has a 3-dimensional output, so it is fitted against
# the 3-D version of the training target.
# NOTE(review): shuffling is disabled here while the other two models train
# with shuffle=True — confirm this difference is intentional.
model3.fit(X_train, y_train_3d,
                     validation_split=m3_hyper_parameters["validation_split"],
                     batch_size=m3_hyper_parameters["batch_size"],
                     epochs=m3_hyper_parameters["epochs"],
                     shuffle=False)

# Predict on the test set, drop the trailing axis so the scaler receives a 2-D
# array, and undo the min-max scaling on predictions and ground truth.
yhat = model3.predict(X_test, verbose=0)
yhat_inverse = scaler.inverse_transform(yhat.squeeze(2))
y_test_inverse = scaler.inverse_transform(y_test)

# Store under this experiment's own column. The original cell wrote to
# "experiment_2_prediction", silently overwriting model 2's predictions.
results_df["experiment_3_prediction"] = yhat_inverse

# Evaluate once (the original cell called evaluate_forecast twice in a row and
# discarded the first result).
mae, mse, rmse = evaluate_forecast(y_test_inverse, yhat_inverse)

In [None]:
# Display the observed test-set target next to the prediction columns added
# by the experiments above.
results_df