# ACEA WATER ANALYTICS

A how-to for multivariate time-series analysis.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns
import pandas as pd
import numpy as np
!pip install odfpy 
from statsmodels.tsa.stattools import adfuller
import missingno as msno
import pickle 

from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from keras import backend as K
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.inspection import permutation_importance
from sklearn.decomposition import PCA

from statsmodels.tsa.vector_ar.var_model import VAR

# RIVER

In [None]:
river = pd.read_csv('/kaggle/input/acea-water-prediction/River_Arno.csv') 
river.head()

In [None]:
river.shape

## EDA

Checking amount of na values and removing them.

In [None]:
river.isnull().sum()

In [None]:
msno.bar(river)

In [None]:
# Drop every row that has a missing value. reset_index() keeps the old index
# as a new leading 'index' column, so Date becomes column 1 and the data
# columns start at position 2 (later cells slice with iloc[:, 2:]).
river = river.dropna().reset_index()

In [None]:
river.shape

In [None]:
river['Date'] = pd.to_datetime(river['Date'], dayfirst=True)

Check each series for non-stationarity and remove it where it is found.
Non-stationary inputs tend to produce worse-performing time-series models.

In [None]:
# Augmented Dickey-Fuller test on every data column. Columns 0 and 1 are the
# old index (added by reset_index) and Date, so the data starts at position 2.
for i in range(2, river.shape[1]):
    p_value = adfuller(river.iloc[:,i])[1]  # run the (expensive) test once per column
    print(p_value)
    if p_value > 0.05:
        print('{} has p value > 0.05'.format(river.columns.values[i]))
# A p value above 0.05 means non-stationarity cannot be rejected for that column;
# per the original author's note the temperature column is the one flagged here.

In [None]:
pd.plotting.autocorrelation_plot(river.iloc[:,-2])

The autocorrelation plot shows a cycle of roughly one year for temperature.

In [None]:
# First-order difference of the second-to-last column. diff() leaves a NaN in
# the first row, which is filled with the differenced value at index label 365.
differenced = river.iloc[:,-2].diff(1)
river.iloc[:,-2] = differenced.fillna(differenced[365])

In [None]:
# Re-run the Augmented Dickey-Fuller test after differencing. Columns 0 and 1
# are the old index and Date, so the data starts at position 2.
for i in range(2, river.shape[1]):
    p_value = adfuller(river.iloc[:,i])[1]  # run the (expensive) test once per column
    print(p_value)
    if p_value > 0.05:
        print('{} has p value > 0.05'.format(river.columns.values[i]))
# Any column still reported above has a p value > 0.05 and remains non-stationary.

In [None]:
# Pairwise correlation of the numeric columns only. The datetime Date column is
# excluded explicitly because pandas 2.0+ no longer drops non-numeric columns
# from corr() by default.
correlation = river.select_dtypes(include='number').corr()
plt.figure(figsize=(20,10), facecolor='w')
sns.heatmap(correlation, xticklabels=correlation.columns, yticklabels=correlation.columns, annot=True)
plt.title("Correlation among the variables", size=15)

## Features with > 0.4 corr

Features with a low correlation to the target (`Hydrometry_Nave_di_Rosano`) will not be used. As the heatmap shows, some features have a correlation below 0.4 and others above it; only the features with a correlation greater than 0.4 are kept.

## Feature selection methods

The following cell contains the code for the various feature selection methods.

In [None]:
#PCA 
def pca(X,Y, col):
    """Reduce the features to four principal components.

    X is standardised with a StandardScaler and projected with
    PCA(n_components=4).  `col` is accepted so that every selector shares the
    same signature; it is not used here.

    Returns [components, Y, column names 'PCA1'..'PCA4'].
    """
    standardised = StandardScaler().fit_transform(X)
    reducer = PCA(n_components=4)
    components = reducer.fit_transform(standardised)
    col_name = ['PCA%d' % k for k in range(1, 5)]
    return [components, Y, col_name]

#TREE FEATURE IMPORTANCE
def tree_feat_imp(X,Y, col):
    """Keep the five features a random forest ranks as most important for Y.

    X   - 2-D array of candidate features
    Y   - 1-D target array the forest is fitted against
    col - boolean mask over river.iloc[:,2:] that produced the columns of X;
          used only to recover the column names

    Returns [features, Y, col_name].  The feature columns are ordered by
    ascending importance (most important column last) and col_name lists the
    names in that same order, so each label matches its data column.
    """
    estimator = RandomForestRegressor(n_estimators=500, random_state=1)
    # Fit against the supplied target rather than whatever happens to be the last column of X.
    estimator.fit(X, Y)
    # Rank once and reuse it for both the data and the labels so they cannot drift apart.
    top = estimator.feature_importances_.argsort()[-5:]
    features = X[:,list(top)]
    names = river.iloc[:,2:].loc[:,list(col)].columns.values
    col_name = names[top]
    print('METHOD - random forest feature importance')
    # Report from most to least important, as before.
    print('important features : {}'.format(col_name[::-1]))
    return [features, Y, col_name]

#RFE
def rfe(X, Y, col):
    """Select five features by recursive feature elimination with a random forest.

    X   - 2-D array of candidate features
    Y   - 1-D target array
    col - boolean mask over river.iloc[:,2:] that produced the columns of X;
          used only to recover the column names

    Returns [features, Y, col_name] where col_name is a list of the selected
    column names in their original column order.
    """
    selector = RFE(RandomForestRegressor(n_estimators=500, random_state=1), n_features_to_select=5)
    fit = selector.fit(X, Y)
    names = river.iloc[:,2:].loc[:,list(col)].columns.values
    print('METHOD - rfe')
    col_name = list(names[fit.support_])
    print(col_name)
    # Slice the array that was actually passed in instead of re-reading the global frame.
    features = X[:,fit.support_]
    return [features, Y, col_name]

#F REGRESSION
def f_reg(X,Y, col):
    """Keep the five features with the largest absolute univariate F-statistic.

    `col` is the boolean mask over river.iloc[:,2:] that produced the columns
    of X; it is used only to recover the column names.

    Returns [features, Y, col_name], both ordered from highest to lowest score.
    """
    scores, _ = f_regression(X,Y)
    # Indices of the five largest |F| values, highest first.
    ranking = abs(scores).argsort()[:-6:-1]
    candidate_names = river.iloc[:,2:].loc[:,list(col)].columns.values
    col_name = candidate_names[ranking]
    print('METHOD - f regression')
    print('important features : {}'.format(col_name))
    return [X[:,list(ranking)], Y, col_name]

#MUTUAL INFO REGRESSION
def mutual_info(X,Y, col):
    """Keep the five features with the highest estimated mutual information with Y.

    `col` is the boolean mask over river.iloc[:,2:] that produced the columns
    of X; it is used only to recover the column names.

    Returns [features, Y, col_name], both ordered from highest to lowest score.
    """
    # The estimator is randomised; a fixed seed (the same one the other
    # selectors use) keeps the selection reproducible across runs.
    mi = mutual_info_regression(X,Y, random_state=1)
    ranking = mi.argsort()[:-6:-1]
    features = X[:,list(ranking)]
    names = river.iloc[:,2:].loc[:,list(col)].columns.values
    col_name = names[ranking]
    print('METHOD - mutual info regression')
    print('important features : {}'.format(col_name))
    return [features, Y, col_name]

#PERMUTATION IMPORTANCE
def pi(X,Y, col):
    """Keep the five features with the highest mean permutation importance.

    A random forest is fitted on (X, Y) and each feature is permuted 10 times.
    `col` is the boolean mask over river.iloc[:,2:] that produced the columns
    of X; it is used only to recover the column names.

    Returns [features, Y, col_name], both ordered from highest to lowest score.
    """
    # Both the forest and the permutations are random; seed them (with the seed
    # the other selectors use) so the selection is reproducible.
    estimator = RandomForestRegressor(random_state=1).fit(X,Y)
    result = permutation_importance(estimator, X, Y, n_repeats=10, random_state=1)
    ranking = result.importances_mean.argsort()[:-6:-1]
    features = X[:,list(ranking)]
    names = river.iloc[:,2:].loc[:,list(col)].columns.values
    col_name = names[ranking]
    print('METHOD - permutation importance')
    print('important features : {}'.format(col_name))
    return [features, Y, col_name]

In [None]:
def select_features(method=None):
    """Run every feature selection method on the river data.

    method - optional dict to add the results to; a new dict is created when
             omitted (a dict default would be shared between calls).

    Returns the dict, mapping a method name to [features, target, column names].
    """
    if method is None:
        method = {}
    # Data columns only: positions 0 and 1 are the old index and Date. Building
    # the mask from this same slice keeps it aligned with the columns it filters,
    # regardless of how pandas treats the non-numeric Date column.
    candidates = river.iloc[:,2:]
    col = (candidates.corrwith(river['Hydrometry_Nave_di_Rosano'], axis=0) > 0.4)
    features = candidates.loc[:,list(col)].copy().to_numpy()
    target = river['Hydrometry_Nave_di_Rosano'].copy().to_numpy()
    #following are various feature selection methods
    method['0.4_corr'] = [features,target, candidates.loc[:,list(col)].columns.values]
    # '0.4_corr' contains features with more than 0.4 correlation with Hydrometry
    method['pca'] = pca(features, target, col)
    #'pca' contains the 4 principal components of those features
    method['tree'] = tree_feat_imp(features, target, col)
    #'tree' uses tree based models to select top 5 features
    method['rfe'] = rfe(features, target, col)
    #'rfe' uses recursive feature elimination to select top 5 features
    method['f_reg'] = f_reg(features,target, col)
    #selecting features using f regression
    method['mutual_info'] = mutual_info(features, target, col)
    #selecting features using mutual info
    method['perm_imp'] = pi(features, target, col)
    #selecting features using permutation importance
    return method

In [None]:
feat_sel_method = select_features()

## Models

### VAR

The following cell contains the training loop for the VAR model.

In [None]:
def VAR_training(feat_sel_dict):
    """Fit and evaluate a VAR model for every feature selection method.

    feat_sel_dict maps a method name to [features, target, column names].
    For each entry the rows are split chronologically in half, the features
    are standardised on the first half, a VAR is fitted with the lag order
    chosen by AIC (at most 30) and the second half is forecast one step at a
    time from the preceding `lag` observations.

    Returns {method: {'RMSE', 'MAE', 'lag', 'forecast'}} where 'forecast' is a
    DataFrame with the columns 'pred' and 'test'.
    """
    comparison={}
    for key in feat_sel_dict.keys():
        print('Method - {}'.format(key))
        feats, target, names = feat_sel_dict[key]
        x , xtest, y, ytest = train_test_split(feats, target
                                              , test_size=0.5, shuffle=False)

        # Locate the feature column that carries the target series. Not every
        # selector places it last (score-ranked selectors put it first), so it
        # is found by value. Falls back to the last column when no column
        # matches (e.g. PCA components).
        target_idx = -1
        for j in range(feats.shape[1]):
            if np.array_equal(feats[:,j], target):
                target_idx = j
                break

        scaler = StandardScaler()
        x = scaler.fit_transform(x)
        xtest = scaler.transform(xtest)

        xtrain = pd.DataFrame(data=x, columns = names)

        # fit() with ic='aic' already returns the model estimated at the
        # selected lag order, so no second fit is needed.
        result_lag = VAR(xtrain).fit(maxlags=30, ic='aic')
        lag = result_lag.k_ar
        print('lag - {}'.format(lag))

        # One-step-ahead forecast for every test row that has `lag` predecessors.
        pred = [result_lag.forecast(xtest[i:i+lag], steps=1)
                for i in range(ytest.shape[0]-lag)]

        #arr shape : [no of samples ,1,no. of features]
        arr = np.squeeze(np.array(pred), axis=1)

        ypred = scaler.inverse_transform(arr)
        rounded = ypred[:,target_idx].round(2)

        rmse = tf.keras.metrics.RootMeanSquaredError()
        rmse.update_state(ytest[lag:], rounded)

        print('RMSE - {}'.format(rmse.result().numpy()))

        mae = tf.keras.losses.MeanAbsoluteError()(ytest[lag:], rounded).numpy()

        print('MAE - {}'.format(mae))

        comparison[key] = {'RMSE':rmse.result().numpy(), 'MAE':mae, 'lag':lag,
                          'forecast':pd.DataFrame({'pred': rounded,'test': ytest[lag:]})}

    return comparison

In [None]:
var = VAR_training(feat_sel_method)

### LSTM

The following cell contains the training loop for the LSTM model.

In [None]:
def training(X,Y,model_type, save=False):
    '''Train and evaluate one single-layer LSTM per window length (lag 1 to 7).

    The rows are split chronologically: the first half is used for training
    (its last 10% for validation) and the second half for testing. Features
    and target are standardised with statistics of the first half.

    X          - 2-D array of features, one row per time step
    Y          - 1-D array with the target value of every time step
    model_type - label under which the results are stored in the returned
                 dict (the notebook passes the feature selection method name)
    save       - when True, keep the test RMSE, the training history and the
                 forecast of every lag in the returned dict

    Returns {model_type: {lag: {'prediction_metric', 'history', 'forecast'}}};
    the inner dict is empty when save is False.
    '''
    x, xtest, y, ytest = train_test_split(X, Y, test_size=0.5,
                                                shuffle=False)

    scaler = StandardScaler()
    x = scaler.fit_transform(x)
    xtest = scaler.transform(xtest)

    # The target gets its own scaler, so training and un-scaling no longer
    # depend on the target being the last feature column.
    y_scaler = StandardScaler()
    y_scaled = y_scaler.fit_transform(np.asarray(y, dtype=float).reshape(-1, 1)).ravel()

    xtrain, xval, ytrain, yval = train_test_split(x, y_scaled, test_size=0.1,
                                                shuffle=False)

    models = {}
    models[model_type] = {}
    for lag in range(1, 8):
        train_generator = TimeseriesGenerator(xtrain, ytrain, length=lag, sampling_rate=1, batch_size=32)
        val_generator = TimeseriesGenerator(xval, yval, length=lag, sampling_rate=1, batch_size=32)
        # Only the inputs of the test generator are used (predict), not its targets.
        test_generator = TimeseriesGenerator(xtest, ytest, length=lag, sampling_rate=1, batch_size=1)

        model = tf.keras.Sequential()
        model.add(tf.keras.layers.LSTM(64, activation='relu', input_shape=(lag, xtrain.shape[1]), return_sequences=False))
        model.add(tf.keras.layers.Dropout(0.3))
        model.add(tf.keras.layers.Dense(1))

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                        patience=2,
                                                        mode='min'
        )

        model.compile(loss=tf.losses.MeanSquaredError(),
                    optimizer=tf.optimizers.Adam(),
                    metrics=[tf.keras.metrics.RootMeanSquaredError(),
                            tf.metrics.MeanAbsoluteError()]
        )

        history = model.fit(train_generator,
                          epochs=30,
                          validation_data=val_generator,
                          shuffle=False,
                          callbacks=[early_stopping],
                          verbose=0)
        print('------------- EVALUATION - {}-----------------'.format(lag))
        ypred = model.predict(test_generator)
        # Back to the original units of the target.
        rounded = y_scaler.inverse_transform(ypred.reshape(-1, 1))[:,0].round(2)

        # The first `lag` test rows have no full input window, hence no prediction.
        test = ytest[lag:]

        rmse = tf.keras.metrics.RootMeanSquaredError()
        rmse.update_state(test, rounded)

        print('RMSE = {}'.format(rmse.result().numpy()))

        mae = tf.keras.losses.MeanAbsoluteError()(test, rounded).numpy()

        print('MAE = {}'.format(mae))

        if save:
            models[model_type][lag] = {}
            models[model_type][lag]['prediction_metric'] = rmse.result().numpy()
            models[model_type][lag]['history'] = history.history
            models[model_type][lag]['forecast'] = pd.DataFrame({'pred': rounded,
                                                                'test': test})

    # Return only after every lag has been trained (returning from inside the
    # loop would stop after lag 1).
    return models

In [None]:
for key in feat_sel_method.keys():
    print('_____'+key+'_______')
    training(feat_sel_method[key][0], feat_sel_method[key][1], key)

In [None]:
lstm = training(feat_sel_method['rfe'][0], feat_sel_method['rfe'][1],
                'rfe', True)

In [None]:
lstm['rfe'][1]['forecast'].plot(y=['test', 'pred'], figsize=(10,5))

In [None]:
var['tree']['forecast'].plot(y=['test', 'pred'], figsize=(10,5))

The VAR model has a lower RMSE, and the plots show that it predicts peak values better than the LSTM. The VAR model will therefore be used to forecast the final test set.

# Final TESTING

Training a VAR model on the full dataset (the train and test halves combined).

In [None]:
def final_VAR_model(feat_sel_dict):
    """Fit a VAR on all rows for every feature selection method and save the 'tree' one.

    feat_sel_dict maps a method name to [features, target, column names].
    The features are standardised and a VAR is fitted with the lag order
    chosen by AIC (at most 30). For the 'tree' method the fitted results are
    written to 'river_var_rmse_0.3_lag_3.pkl' and the scaler to
    'river_scaler_var_tree.pkl'. Returns None.
    """
    for key in feat_sel_dict.keys():
        print('Method - {}'.format(key))

        scaler = StandardScaler()
        X = scaler.fit_transform(feat_sel_dict[key][0])

        xtrain = pd.DataFrame(data=X, columns = feat_sel_dict[key][2])

        # fit() with ic='aic' already returns the model estimated at the
        # selected lag order, so no second fit is needed.
        result_lag = VAR(xtrain).fit(maxlags=30, ic='aic')
        lag = result_lag.k_ar
        print('lag - {}'.format(lag))

        if key == 'tree':
            result_lag.save('river_var_rmse_0.3_lag_3.pkl')
            #saving scaler to file; the with-block closes (and flushes) the file
            #scaler name contains details of relevant model and feature selection methods
            with open('river_scaler_var_tree.pkl', 'wb') as scaler_file:
                pickle.dump(scaler, scaler_file)
            #scaler name contains details of relevant model and feature selection methods

In [None]:
final_VAR_model(feat_sel_method)

## Future Forecast

In [None]:
#loading model and running it on test set
import statsmodels.api as sm
# Same file name that final_VAR_model() writes for the 'tree' method.
model = sm.load('river_var_rmse_0.3_lag_3.pkl') #model path
model.summary()#check if model loaded correctly

# 1. load test set of river dataframe in test
# (placeholder: `test` must be defined before this cell is run; it is assumed
#  to have the same column layout as `river` - index, Date, then the data)
test 

# 2. run cell containing code of different feature selection methods
# The mask is built from the data columns only so that it stays aligned with them.
col = (test.iloc[:,2:].corrwith(test['Hydrometry_Nave_di_Rosano'], axis=0) > 0.4)
features = test.iloc[:,2:].loc[:,list(col)].copy().to_numpy()#selects all columns except index and datatime
target = test['Hydrometry_Nave_di_Rosano'].copy().to_numpy()
# The river version of tree_feat_imp takes the column mask as third argument.
X,Y, col_name = tree_feat_imp(features,target, col)

# 3. scale data using same scaler
# Same file name that final_VAR_model() writes. NOTE: only unpickle files you
# created yourself - unpickling untrusted data can execute arbitrary code.
with open('river_scaler_var_tree.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)
X = scaler.transform(X)

# 4. preparing input for VAR model
xtest = pd.DataFrame(data=X, columns = col_name)

lag = 3 # lag of saved model

In [None]:
#define how far in future do you want the forcasts for
# EX : for a week in future, steps = 7
steps = 7
# Forecast `steps` days ahead from the last `lag` observed rows.
forecast = model.forecast(xtest.values[-lag:], steps=steps)

ypred = scaler.inverse_transform(forecast)

In [None]:
'''
in case you want to predict data for next day
given the data of selected features for past number of days

lag = number for days

'''
pred=[]
# `target` holds the observed series of the test frame; every row that has
# `lag` predecessors gets a one-step-ahead forecast.
for i in range(target.shape[0]-lag):
    val = model.forecast(xtest.values[i:i+lag], steps=1)
    pred.append(val)

arr = np.array(pred)
#arr shape : [no of samples ,1,no. of features]
arr = np.squeeze(arr, axis=1)

ypred = scaler.inverse_transform(arr)

In [None]:
# Score the forecast of the last column against the observed target; the first
# `lag` observations have no forecast and are skipped.
rounded = ypred[:,-1].round(2)
observed = target[lag:]

rmse = tf.keras.metrics.RootMeanSquaredError()
rmse.update_state(observed, rounded)
print('RMSE - {}'.format(rmse.result().numpy()))

mae = tf.keras.losses.MeanAbsoluteError()(observed, rounded).numpy()
print('MAE - {}'.format(mae))

# LAKE

In [None]:
lake = pd.read_csv('/kaggle/input/acea-water-prediction/Lake_Bilancino.csv') 
lake.head()

In [None]:
lake.shape

## EDA

In [None]:
lake.isnull().sum()

In [None]:
msno.bar(lake)

In [None]:
# Drop every row that has a missing value. reset_index() keeps the old index
# as a new leading 'index' column, so Date becomes column 1 and the data
# columns start at position 2 (later cells slice with iloc[:, 2:]).
lake = lake.dropna().reset_index()

In [None]:
lake.shape

In [None]:
lake.head()

In [None]:
lake['Date'] = pd.to_datetime(lake['Date'], dayfirst=True)

In [None]:
# Augmented Dickey-Fuller test on every data column. Columns 0 and 1 are the
# old index (added by reset_index) and Date, so the data starts at position 2.
for i in range(2, lake.shape[1]):
    p_value = adfuller(lake.iloc[:,i])[1]  # run the (expensive) test once per column
    print(p_value)
    if p_value > 0.05:
        # a p value above 0.05 means non-stationarity cannot be rejected
        print('-'*20)
        print('{} has p value > 0.05'.format(lake.columns.values[i]))
        print('-'*20)

In [None]:
# Pairwise correlation of the numeric columns only. The datetime Date column is
# excluded explicitly because pandas 2.0+ no longer drops non-numeric columns
# from corr() by default.
correlation = lake.select_dtypes(include='number').corr()
plt.figure(figsize=(20,10), facecolor='w')
sns.heatmap(correlation, xticklabels=correlation.columns, yticklabels=correlation.columns, annot=True)
plt.title("Correlation among the variables", size=15)

In [None]:
#PCA 
def pca(X,Y):
    """Reduce the features to three principal components.

    X is standardised with a StandardScaler and projected with
    PCA(n_components=3). The summed explained-variance ratio (each ratio
    rounded to two decimals) is printed.

    Returns [components, Y, column names 'PCA1'..'PCA3'].
    """
    scaled_feat = StandardScaler().fit_transform(X)
    # One fit yields both the projection and the explained variance.
    reducer = PCA(n_components=3)
    components = reducer.fit_transform(scaled_feat)
    col_name = ['PCA1', 'PCA2', 'PCA3']
    print('METHOD - PCA')
    print('feature variance = {}'.format(sum(reducer.explained_variance_ratio_.round(2))))
    return [components, Y, col_name]

#TREE FEATURE IMPORTANCE
def tree_feat_imp(X,Y):
    """Keep the three features a random forest ranks as most important for Y.

    X - 2-D array holding the columns of lake.iloc[:,2:]
    Y - 1-D target array the forest is fitted against

    Returns [features, Y, col_name], both ordered from most to least important.
    """
    estimator = RandomForestRegressor(n_estimators=500, random_state=1)
    # Fit against the requested target; the last column of X is not the
    # target for every call (X always ends with the same column).
    estimator.fit(X, Y)
    ranking = estimator.feature_importances_.argsort()[:-4:-1]
    features = X[:,list(ranking)]
    names = lake.iloc[:,2:].columns.values
    col_name = names[ranking]
    print('METHOD - random forest feature importance')
    print('important features : {}'.format(col_name))
    return [features, Y, col_name]

#RFE
def rfe(X, Y):
    """Select three features by recursive feature elimination with a random forest.

    X - 2-D array holding the columns of lake.iloc[:,2:]
    Y - 1-D target array

    Returns [features, Y, col_name] where col_name is a list of the selected
    column names in their original column order.
    """
    selector = RFE(RandomForestRegressor(n_estimators=500, random_state=1), n_features_to_select=3)
    fit = selector.fit(X, Y)
    names = lake.iloc[:,2:].columns.values
    print('METHOD - rfe')
    col_name = list(names[fit.support_])
    print(col_name)
    # Slice the array that was actually passed in instead of re-reading the global frame.
    features = X[:,fit.support_]
    return [features, Y, col_name]

#F REGRESSION
def f_reg(X,Y):
    """Keep the three features with the largest absolute univariate F-statistic.

    X is expected to hold the columns of lake.iloc[:,2:], which is where the
    column names are read from.

    Returns [features, Y, col_name], both ordered from highest to lowest score.
    """
    scores, _ = f_regression(X,Y)
    # Indices of the three largest |F| values, highest first.
    ranking = abs(scores).argsort()[:-4:-1]
    candidate_names = lake.iloc[:,2:].columns.values
    col_name = candidate_names[ranking]
    print('METHOD - f regression')
    print('important features : {}'.format(col_name))
    return [X[:,list(ranking)], Y, col_name]

#MUTUAL INFO REGRESSION
def mutual_info(X,Y):
    """Keep the three features with the highest estimated mutual information with Y.

    X is expected to hold the columns of lake.iloc[:,2:], which is where the
    column names are read from.

    Returns [features, Y, col_name], both ordered from highest to lowest score.
    """
    # The estimator is randomised; a fixed seed (the same one the other
    # selectors use) keeps the selection reproducible across runs.
    mi = mutual_info_regression(X,Y, random_state=1)
    ranking = mi.argsort()[:-4:-1]
    features = X[:,list(ranking)]
    names = lake.iloc[:,2:].columns.values
    col_name = names[ranking]
    print('METHOD - mutual info regression')
    print('important features : {}'.format(col_name))
    return [features, Y, col_name]

#PERMUTATION IMPORTANCE
def pi(X,Y):
    """Keep the three features with the highest mean permutation importance.

    A random forest is fitted on (X, Y) and each feature is permuted 10 times.
    X is expected to hold the columns of lake.iloc[:,2:], which is where the
    column names are read from.

    Returns [features, Y, col_name], both ordered from highest to lowest score.
    """
    # Both the forest and the permutations are random; seed them (with the seed
    # the other selectors use) so the selection is reproducible.
    estimator = RandomForestRegressor(random_state=1).fit(X,Y)
    result = permutation_importance(estimator, X, Y, n_repeats=10, random_state=1)
    ranking = result.importances_mean.argsort()[:-4:-1]
    features = X[:,list(ranking)]
    names = lake.iloc[:,2:].columns.values
    col_name = names[ranking]
    print('METHOD - permutation importance')
    print('important features : {}'.format(col_name))
    return [features, Y, col_name]

In [None]:
lake.columns.values

In [None]:
def select_features(y):
    """Run the feature selection methods on the lake data for target column `y`.

    Returns a dict mapping a method name to [features, target, column names].
    The 'all' entry holds every column of lake.iloc[:,2:]; each of the other
    entries holds the three columns chosen by that method.
    """
    candidates = lake.iloc[:,2:]
    features = candidates.copy().to_numpy()
    target = lake[y].copy().to_numpy()

    # 'all' keeps every data column (no selection)
    method = {'all': [features, target, candidates.columns.values]}

    # PCA is not run for the lake data:
    #method['pca'] = pca(features, target)

    # Remaining selectors, run in this order: random forest importance,
    # recursive feature elimination, f regression, mutual information and
    # permutation importance.
    selectors = [('tree', tree_feat_imp),
                 ('rfe', rfe),
                 ('f_reg', f_reg),
                 ('mutual_info', mutual_info),
                 ('perm_imp', pi)]
    for label, selector in selectors:
        method[label] = selector(features, target)
    return method

In [None]:
flow = select_features('Flow_Rate')

In [None]:
level = select_features('Lake_Level')

In [None]:
def VAR_training(feat_sel_dict, ycol):
    """Fit and evaluate a VAR model for every feature selection method.

    feat_sel_dict - maps a method name to [features, target, column names]
    ycol          - name of the target column; it must be one of the column
                    names of every entry (KeyError otherwise)

    For each entry the rows are split chronologically in half, the features
    are standardised on the first half, a VAR is fitted with the lag order
    chosen by AIC (at most 30) and the second half is forecast one step at a
    time from the preceding `lag` observations.

    Returns {method: {'RMSE', 'MAE', 'lag', 'forecast'}} where 'forecast' is a
    DataFrame with the columns 'pred' and 'test'.
    """
    comparison={}

    for key in feat_sel_dict.keys():
        print('Method - {}'.format(key))
        feats, target, names = feat_sel_dict[key]
        x , xtest, y, ytest = train_test_split(feats, target
                                              , test_size=0.5, shuffle=False)

        scaler = StandardScaler()
        x = scaler.fit_transform(x)
        xtest = scaler.transform(xtest)

        xtrain = pd.DataFrame(data=x, columns = names)

        # fit() with ic='aic' already returns the model estimated at the
        # selected lag order, so no second fit is needed.
        result_lag = VAR(xtrain).fit(maxlags=30, ic='aic')
        lag = result_lag.k_ar
        print('lag - {}'.format(lag))

        # Position of the target among the model's columns.
        idx = xtrain.columns.get_loc(ycol)

        # One-step-ahead forecast for every test row that has `lag` predecessors.
        pred = [result_lag.forecast(xtest[i:i+lag], steps=1)
                for i in range(ytest.shape[0]-lag)]

        #arr shape : [no of samples ,1,no. of features]
        arr = np.squeeze(np.array(pred), axis=1)

        ypred = scaler.inverse_transform(arr)
        rounded = ypred[:,idx].round(2)

        rmse = tf.keras.metrics.RootMeanSquaredError()
        rmse.update_state(ytest[lag:], rounded)

        print('RMSE - {}'.format(rmse.result().numpy()))

        mae = tf.keras.losses.MeanAbsoluteError()(ytest[lag:], rounded).numpy()

        print('MAE - {}'.format(mae))

        comparison[key] = {'RMSE':rmse.result().numpy(), 'MAE':mae, 'lag':lag,
                          'forecast':pd.DataFrame({'pred': rounded,'test': ytest[lag:]})}

    # Callers assign the result (var_flow / var_level), so hand the metrics back.
    return comparison

In [None]:
var_flow = VAR_training(flow, 'Flow_Rate')

In [None]:
var_level = VAR_training(level, 'Lake_Level')

In [None]:
def training(X,Y,model_type,ycol,save=False):
    '''Train and evaluate one single-layer LSTM per window length (lag 1 to 29).

    The rows are split chronologically: the first half is used for training
    (its last 10% for validation) and the second half for testing. Features
    and target are standardised with statistics of the first half.

    X          - 2-D array of features, one row per time step
    Y          - 1-D array with the target value of every time step
    model_type - label under which the results are stored in the returned
                 dict (the notebook passes the feature selection method name)
    ycol       - name of the target column; kept for backward compatibility,
                 the target is scaled with its own scaler so its position
                 among the feature columns is no longer needed
    save       - when True, keep the test RMSE, the training history and the
                 forecast of every lag in the returned dict

    Returns {model_type: {lag: {'prediction_metric', 'history', 'forecast'}}};
    the inner dict is empty when save is False.
    '''
    x, xtest, y, ytest = train_test_split(X, Y, test_size=0.5,
                                                shuffle=False)

    scaler = StandardScaler()
    x = scaler.fit_transform(x)
    xtest = scaler.transform(xtest)

    # Train on a standardised target and undo exactly that scaling afterwards.
    # (Un-scaling predictions of a raw target with the statistics of some
    # feature column would distort them.)
    y_scaler = StandardScaler()
    y_scaled = y_scaler.fit_transform(np.asarray(y, dtype=float).reshape(-1, 1)).ravel()

    xtrain, xval, ytrain, yval = train_test_split(x, y_scaled, test_size=0.1,
                                                shuffle=False)

    models = {}
    models[model_type] = {}
    for lag in range(1, 30):
        train_generator = TimeseriesGenerator(xtrain, ytrain, length=lag, sampling_rate=1, batch_size=32)
        val_generator = TimeseriesGenerator(xval, yval, length=lag, sampling_rate=1, batch_size=32)
        # Only the inputs of the test generator are used (predict), not its targets.
        test_generator = TimeseriesGenerator(xtest, ytest, length=lag, sampling_rate=1, batch_size=1)

        model = tf.keras.Sequential()
        model.add(tf.keras.layers.LSTM(64, activation='relu', input_shape=(lag, xtrain.shape[1]), return_sequences=False))
        model.add(tf.keras.layers.Dropout(0.3))
        model.add(tf.keras.layers.Dense(1))

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                        patience=2,
                                                        mode='min'
        )

        model.compile(loss=tf.losses.MeanSquaredError(),
                    optimizer=tf.optimizers.Adam(),
                    metrics=[tf.keras.metrics.RootMeanSquaredError(),
                            tf.metrics.MeanAbsoluteError()]
        )

        history = model.fit(train_generator,
                          epochs=30,
                          validation_data=val_generator,
                          shuffle=False,
                          callbacks=[early_stopping],
                          verbose=0)

        print('------------- EVALUATION - {}-----------------'.format(lag))
        ypred = model.predict(test_generator)
        # Back to the original units of the target.
        rounded = y_scaler.inverse_transform(ypred.reshape(-1, 1))[:,0].round(2)

        # The first `lag` test rows have no full input window, hence no prediction.
        test = ytest[lag:]

        rmse = tf.keras.metrics.RootMeanSquaredError()
        rmse.update_state(test, rounded)

        print('RMSE = {}'.format(rmse.result().numpy()))

        mae = tf.keras.losses.MeanAbsoluteError()(test, rounded).numpy()

        print('MAE = {}'.format(mae))

        if save:
            models[model_type][lag] = {}
            models[model_type][lag]['prediction_metric'] = rmse.result().numpy()
            models[model_type][lag]['history'] = history.history
            models[model_type][lag]['forecast'] = pd.DataFrame({'pred': rounded,
                                                                'test': test})

    # Return only after every lag has been trained (returning from inside the
    # loop would stop after lag 1).
    return models

In [None]:
for i in flow.keys():
    print(i)
    print(flow[i][0][:5], flow[i][1][:3], flow[i][2])

In [None]:
for key in ['all', 'tree', 'rfe']:
    print('_____'+key+'_______')
    training(flow[key][0], flow[key][1], key, 'Flow_Rate')

In [None]:
for key in ['all', 'tree', 'rfe']:
    print('_____'+key+'_______')
    training(level[key][0], level[key][1], key, 'Lake_Level')