In [18]:
import sys
sys.path.append('..')

In [19]:
import numpy as np
import pandas as pd
import pickle

import neptune.new as neptune
from neptune.new.integrations.tensorflow_keras import NeptuneCallback

from sklearn.preprocessing import MinMaxScaler, StandardScaler
import tensorflow
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, TimeDistributed, Conv1D, MaxPooling1D, Flatten, Bidirectional, Input, Flatten, Activation, Reshape, RepeatVector, Concatenate
from keras.optimizers import RMSprop, Adam, Adamax

from lib.read_data import read_and_join_output_file
from lib.create_pipeline import create_transformation_pipeline
from lib.transform_impute import convert_back_df
from lib.split_data import train_test_group_time_split

In [20]:
print("Num GPUs Available: ", len(tensorflow.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [21]:
# During experiment we can try to use neptune.ai to log all the Tensorflow experiments results
neptune_key = pickle.load(open("./neptune.pkl", "rb"))


## Preparing the Dataset
The train and test sets are split by Township-Ranges, i.e. some Township-Ranges data are either fully in the train or test set.
The target value is the value of that variable for 2021
Thus train/test sets are of shape (number of Township-Ranges, 7 years (2014-2020), the number of features).
The input of 1 data point in the model is of shape (7x81


In [22]:
RANDOM_SEED = 42
# Load the data from the ETL output files
X = read_and_join_output_file()
#X["WELL_COUNT"] = X["WELL_COUNT_PUBLIC"] + X["WELL_COUNT_AGRICULTURE"] + X["WELL_COUNT_DOMESTIC"] + X["WELL_COUNT_INDUSTRIAL"]
#X.drop(columns=["WELL_COUNT_PUBLIC", "WELL_COUNT_AGRICULTURE", "WELL_COUNT_DOMESTIC", "WELL_COUNT_INDUSTRIAL"], inplace=True)
# Split the data into a training and a test set
X_train_df, X_test_df, y_train_df, y_test_df = train_test_group_time_split(X, index=["TOWNSHIP_RANGE", "YEAR"], group="TOWNSHIP_RANGE", random_seed=RANDOM_SEED)
# Create, fit and apply the data imputation pipeline to the training and test sets
impute_pipeline = create_transformation_pipeline(X_train_df, scaler = MinMaxScaler())
X_train_impute = impute_pipeline.fit_transform(X_train_df)
X_test_impute = impute_pipeline.fit_transform(X_test_df)
# Convert the X_train and X_test back to dataframes
X_train_impute_df = convert_back_df(X_train_impute, impute_pipeline, X_train_df)
X_test_impute_df = convert_back_df(X_test_impute, impute_pipeline, X_test_df)
# Keep only the GSE_GWE variable as the outcome variable
scaler = MinMaxScaler()
y_train = scaler.fit_transform(y_train_df[["GSE_GWE"]])
y_test = scaler.transform(y_test_df[["GSE_GWE"]])
X_train_impute_df

Unnamed: 0_level_0,Unnamed: 1_level_0,TOTALDRILLDEPTH_AVG,WELLYIELD_AVG,STATICWATERLEVEL_AVG,TOPOFPERFORATEDINTERVAL_AVG,BOTTOMOFPERFORATEDINTERVAL_AVG,TOTALCOMPLETEDDEPTH_AVG,VEGETATION_BLUE_OAK-GRAY_PINE,VEGETATION_CALIFORNIA_COAST_LIVE_OAK,VEGETATION_CANYON_LIVE_OAK,VEGETATION_HARD_CHAPARRAL,...,POPULATION_DENSITY,PCT_OF_CAPACITY,GROUNDSURFACEELEVATION_AVG,AVERAGE_YEARLY_PRECIPITATION,SHORTAGE_COUNT,GSE_GWE,WELL_COUNT_AGRICULTURE,WELL_COUNT_DOMESTIC,WELL_COUNT_INDUSTRIAL,WELL_COUNT_PUBLIC
TOWNSHIP_RANGE,YEAR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
T01N R03E,2014,0.097778,0.018246,0.037145,0.098039,0.111111,0.105856,0.000037,0.000137,0.000000,0.000386,...,0.252900,0.717075,0.023626,0.163573,0.0,0.043005,0.029412,0.041667,0.0,0.0
T01N R03E,2015,0.095238,0.021053,0.025042,0.117647,0.080460,0.079848,0.000037,0.000137,0.000000,0.000386,...,0.252799,0.717075,0.018249,0.217900,0.0,0.050637,0.000000,0.027778,0.0,0.0
T01N R03E,2016,0.114286,0.007916,0.022398,0.152614,0.103768,0.104880,0.000037,0.000137,0.000000,0.000386,...,0.250621,0.717075,0.024153,0.209056,0.0,0.035780,0.029412,0.055556,0.0,0.0
T01N R03E,2017,0.000000,0.013684,0.030885,0.127451,0.082375,0.081749,0.000037,0.000137,0.000000,0.000386,...,0.254669,0.717075,0.023541,0.213645,0.0,0.033202,0.000000,0.027778,0.0,0.0
T01N R03E,2018,0.083873,0.002474,0.034558,0.148257,0.093934,0.107605,0.000037,0.000137,0.000000,0.000386,...,0.256461,0.800728,0.020523,0.181012,0.0,0.030798,0.000000,0.097222,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T32S R30E,2016,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033178,0.000000,0.002023,0.003535,...,0.004469,0.496289,0.139007,0.111564,0.0,0.621728,0.000000,0.000000,0.0,0.0
T32S R30E,2017,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033178,0.000000,0.002023,0.003535,...,0.004457,0.496289,0.139007,0.169284,0.0,0.527907,0.000000,0.000000,0.0,0.0
T32S R30E,2018,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033178,0.000000,0.002023,0.003535,...,0.004474,0.496289,0.139007,0.079747,0.0,0.556283,0.000000,0.000000,0.0,0.0
T32S R30E,2019,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033178,0.000000,0.002023,0.003535,...,0.004491,0.580893,0.139007,0.158678,0.0,0.566892,0.000000,0.000000,0.0,0.0


In [23]:
# Change the shape of the input array to (number of Township-Ranges, 7 years (2014-2020), the number of features)
X_train = X_train_impute_df.values.reshape(len(X_train_impute_df.index.get_level_values(0).unique()), len(X_train_impute_df.index.get_level_values(1).unique()), X_train_impute_df.shape[1])
X_test = X_test_impute_df.values.reshape(len(X_test_impute_df.index.get_level_values(0).unique()), len(X_test_impute_df.index.get_level_values(1).unique()), X_test_impute_df.shape[1])

In [24]:
print("="*100)
print("Checking the train, validation and test input (X) datasets sizes:")
print(f"Size of the X_train dataset: {X_train.shape}")
#print(f"Size of the X_val dataset: {X_val.shape}")
print(f"Size of the X_test dataset: {X_test.shape}")
print("="*100)
print("Checking the train, validation and test output (y) datasets sizes:")
print(f"Size of the y_train dataset: {y_train.shape}")
#print(f"Size of the y_val dataset: {y_val_df.shape}")
print(f"Size of the y_test dataset: {y_test.shape}")

Checking the train, validation and test input (X) datasets sizes:
Size of the X_train dataset: (382, 7, 81)
Size of the X_test dataset: (96, 7, 81)
Checking the train, validation and test output (y) datasets sizes:
Size of the y_train dataset: (382, 1)
Size of the y_test dataset: (96, 1)


In [25]:
results_df = y_test_df[["GSE_GWE"]].copy()

In [26]:
hyper_parameters = {
    "validation_split": 0.05,
    "learning_rate": 0.001,
    "batch_size": 32,
    "epochs": 100,
    "lstm_units": 200,
    "dense_units": 20,
    "dropout": 0.2,
    "output_activation": "linear",
}
nb_features = len(X_train_impute_df.columns)

adam_optimizer = Adam(learning_rate=hyper_parameters["learning_rate"])
rms_optimizer = RMSprop(learning_rate=hyper_parameters["learning_rate"])
adamax_optimizer = Adamax(learning_rate=hyper_parameters["learning_rate"])

In [27]:
def evaluate_forecast(y_test_inverse, yhat_inverse):
    mse_ = keras.metrics.MeanSquaredError()
    mae_ = keras.metrics.MeanAbsoluteError()
    rmse_ = keras.metrics.RootMeanSquaredError()
    mae = mae_(y_test_inverse,yhat_inverse)
    print('mae:', mae)
    mse = mse_(y_test_inverse,yhat_inverse)
    print('mse:', mse)
    rmse = rmse_(y_test_inverse,yhat_inverse)
    print('rmse:', rmse)
    return mae, mse, rmse

In [28]:
model1 = Sequential()
model1.add(LSTM(hyper_parameters["lstm_units"], input_shape=(7, nb_features)))
model1.add(Dense(1, activation=hyper_parameters["output_activation"]))
model1.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 200)               225600    
                                                                 
 dense_3 (Dense)             (None, 1)                 201       
                                                                 
Total params: 225,801
Trainable params: 225,801
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Start experiment
run = neptune.init(
    project="mlienart/milestone2",
    api_token=neptune_key,
    name="Basic Model"
)
neptune_cbk = NeptuneCallback(run=run, base_namespace='metrics')
hyper_parameters["optimizer"] = "RMSprop"
run['hyper-parameters'] = hyper_parameters

model1.compile(loss="mse", optimizer=rms_optimizer)
history = model1.fit(X_train, y_train,
                     validation_split=hyper_parameters["validation_split"],
                     batch_size=hyper_parameters["batch_size"],
                     epochs=hyper_parameters["epochs"],
                     shuffle=True,
                     callbacks=[neptune_cbk])
yhat = model1.predict(X_test, verbose=0)
yhat_inverse = scaler.inverse_transform(yhat)
y_test_inverse = scaler.inverse_transform(y_test)
results_df["experiment_1_prediction"] = yhat_inverse
mae, mse, rmse = evaluate_forecast(y_test_inverse, yhat_inverse)
run["eval/mae"] = mae
run["eval/mse"] = mse
run["eval/rmse"] = rmse
run.stop()

https://app.neptune.ai/mlienart/milestone2/e/MIL06KDIEQ-6
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
mae: tf.Tensor(38.663807, shape=(), dtype=float32)
mse: tf.Tensor(4080.8516, shape=(), dtype=float32)
rmse: tf.Tensor(63.881542, shape=(), dtype=float32)
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 110 operations to synchronize with Neptune. Do not kill this process.
All 110 operations synced, thanks for waiting!
Explore the met

In [30]:
model2 = Sequential()
model2.add(Bidirectional(LSTM(hyper_parameters["lstm_units"]), input_shape=(7, nb_features)))
model2.add(Dense(hyper_parameters["dense_units"], activation="tanh"))
model2.add(Dropout(hyper_parameters["dropout"]))
model2.add(Dense(1, activation=hyper_parameters["output_activation"]))
model2.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_1 (Bidirectio  (None, 400)              451200    
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 20)                8020      
                                                                 
 dropout_1 (Dropout)         (None, 20)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 21        
                                                                 
Total params: 459,241
Trainable params: 459,241
Non-trainable params: 0
_________________________________________________________________


In [31]:
run = neptune.init(
    project="mlienart/milestone2",
    api_token=neptune_key,
    name="Advanced Model 1"
)
neptune_cbk = NeptuneCallback(run=run, base_namespace='metrics')
hyper_parameters["optimizer"] = "Adam"
run['hyper-parameters'] = hyper_parameters

model2.compile(loss="mse", optimizer=adam_optimizer)
history = model2.fit(X_train, y_train,
                     validation_split=hyper_parameters["validation_split"],
                     batch_size=hyper_parameters["batch_size"],
                     epochs=hyper_parameters["epochs"],
                     shuffle=True,
                     callbacks=[neptune_cbk])
yhat = model2.predict(X_test, verbose=0)
yhat_inverse = scaler.inverse_transform(yhat)
y_test_inverse = scaler.inverse_transform(y_test)
results_df["experiment_2_prediction"] = yhat_inverse
evaluate_forecast(y_test_inverse, yhat_inverse)
mae, mse, rmse = evaluate_forecast(y_test_inverse, yhat_inverse)
run["eval/mae"] = mae
run["eval/mse"] = mse
run["eval/rmse"] = rmse
run.stop()

https://app.neptune.ai/mlienart/milestone2/e/MIL06KDIEQ-7
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
mae: tf.Tensor(47.703247, shape=(), dtype=float32)
mse: tf.Tensor(4278.1597, shape=(), dtype=float32)
rmse: tf.Tensor(65.40764, shape=(), dtype=float32)
mae: tf.Tensor(47.703247, shape=(), dtype=float32)
mse: tf.Tensor(4278.1597, shape=(), dtype=float32)
rmse: tf.Tensor(65.40764, shape=(), dtype=float32)
Shutting down background jobs, please wait a moment...
Done!
Waitin

In [32]:
run = neptune.init(
    project="mlienart/milestone2",
    api_token=neptune_key,
    name="Advanced Model 1"
)
neptune_cbk = NeptuneCallback(run=run, base_namespace='metrics')
hyper_parameters["optimizer"] = "RMSprop"
run['hyper-parameters'] = hyper_parameters

model2.compile(loss="mse", optimizer=rms_optimizer)
history = model2.fit(X_train, y_train,
                     validation_split=hyper_parameters["validation_split"],
                     batch_size=hyper_parameters["batch_size"],
                     epochs=hyper_parameters["epochs"],
                     shuffle=True,
                     callbacks=[neptune_cbk])
yhat = model2.predict(X_test, verbose=0)
yhat_inverse = scaler.inverse_transform(yhat)
y_test_inverse = scaler.inverse_transform(y_test)
results_df["experiment_3_prediction"] = yhat_inverse
evaluate_forecast(y_test_inverse, yhat_inverse)
mae, mse, rmse = evaluate_forecast(y_test_inverse, yhat_inverse)
run["eval/mae"] = mae
run["eval/mse"] = mse
run["eval/rmse"] = rmse
run.stop()

https://app.neptune.ai/mlienart/milestone2/e/MIL06KDIEQ-8
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
mae: tf.Tensor(37.92641, shape=(), dtype=float32)
mse: tf.Tensor(3045.455, shape=(), dtype=float32)
rmse: tf.Tensor(55.185642, shape=(), dtype=float32)
mae: tf.Tensor(37.92641, shape=(), dtype=float32)
mse: tf.Tensor(3045.455, shape=(), dtype=float32)
rmse: tf.Tensor(55.185642, shape=(), dtype=float32)
Shutting down background jobs, please wait a moment...
Done!
Waiting 

In [33]:
run = neptune.init(
    project="mlienart/milestone2",
    api_token=neptune_key,
    name="Advanced Model 1"
)
neptune_cbk = NeptuneCallback(run=run, base_namespace='metrics')
hyper_parameters["optimizer"] = "Adamx"
run['hyper-parameters'] = hyper_parameters

model2.compile(loss="mse", optimizer=adamax_optimizer)
history = model2.fit(X_train, y_train,
                     validation_split=hyper_parameters["validation_split"],
                     batch_size=hyper_parameters["batch_size"],
                     epochs=hyper_parameters["epochs"],
                     shuffle=True,
                     callbacks=[neptune_cbk])
yhat = model2.predict(X_test, verbose=0)
yhat_inverse = scaler.inverse_transform(yhat)
y_test_inverse = scaler.inverse_transform(y_test)
results_df["experiment_4_prediction"] = yhat_inverse
evaluate_forecast(y_test_inverse, yhat_inverse)
mae, mse, rmse = evaluate_forecast(y_test_inverse, yhat_inverse)
run["eval/mae"] = mae
run["eval/mse"] = mse
run["eval/rmse"] = rmse
run.stop()

https://app.neptune.ai/mlienart/milestone2/e/MIL06KDIEQ-9
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
mae: tf.Tensor(36.350475, shape=(), dtype=float32)
mse: tf.Tensor(4081.327, shape=(), dtype=float32)
rmse: tf.Tensor(63.885265, shape=(), dtype=float32)
mae: tf.Tensor(36.350475, shape=(), dtype=float32)
mse: tf.Tensor(4081.327, shape=(), dtype=float32)
rmse: tf.Tensor(63.885265, shape=(), dtype=float32)
Shutting down background jobs, please wait a moment...
Done!
Waitin

In [34]:
results_df

Unnamed: 0_level_0,Unnamed: 1_level_0,GSE_GWE,experiment_1_prediction,experiment_2_prediction,experiment_3_prediction,experiment_4_prediction
TOWNSHIP_RANGE,YEAR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
T01N R02E,2021,53.193636,91.465973,72.094315,68.847969,60.577919
T01N R11E,2021,107.955000,130.581757,184.866486,153.414902,168.913544
T01S R03E,2021,24.494538,60.070644,54.500401,34.797050,33.061378
T01S R07E,2021,38.644000,74.647034,64.518814,50.588486,46.458038
T01S R10E,2021,113.651250,134.993210,174.184525,100.749992,138.436981
...,...,...,...,...,...,...
T31S R26E,2021,173.915909,216.072052,245.963211,135.791000,219.470413
T31S R31E,2021,403.900000,362.841461,387.175690,294.881317,353.500610
T32S R22E,2021,160.340000,238.303131,300.588348,196.507721,252.820343
T32S R25E,2021,190.120000,214.127045,222.470657,127.866165,214.638840
