In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing all the required libraries**

In [None]:
!pip install minepy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd
from pandas import DataFrame
%matplotlib inline
import missingno as msno
from datetime import datetime
from numpy.random import multivariate_normal as mvnrnd
from scipy.stats import wishart
from scipy.stats import invwishart
from numpy.linalg import inv as inv
import scipy.io
import time
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from minepy import MINE
import torch
from torch import nn
from torch.autograd import Variable
from sklearn.linear_model import RidgeCV
from math import sqrt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from pykalman import KalmanFilter
from sklearn.ensemble import RandomForestRegressor
import random

# Data import

We use the data of Aquifer_Auser as an example to demonstrate our model.

In [None]:
# Load the Auser aquifer table and parse the day-first 'Date' strings into a
# datetime column. pd.to_datetime is vectorised and turns a missing date into
# NaT, whereas a row-wise datetime.strptime call raises on a non-string value.
df = pd.read_csv("/kaggle/input/acea-water-prediction/Aquifer_Auser.csv")
df['date'] = pd.to_datetime(df['Date'], format="%d/%m/%Y")

# Depth series that this notebook forecasts.
target_variable = ['Depth_to_Groundwater_SAL', 'Depth_to_Groundwater_CoS', 'Depth_to_Groundwater_LT2']

# Group the column names by measurement type using their name prefix.
columns_name = df.columns.values.tolist()
rain_list = [name for name in columns_name if name.startswith('Rain')]
Temp_list = [name for name in columns_name if name.startswith('Temperature')]
Depth_list = [name for name in columns_name if name.startswith('Depth')]
n_row = df.shape[0]

# Visualise where values are missing across the whole table.
msno.matrix(df)

In [None]:
# Display the combined list of rainfall and temperature column names.
rain_list+Temp_list

From the figure above, we can see that there are a huge number of missing values, and that many of them are concentrated in dense blocks. The variables that will be used as prediction targets also contain many sparse, scattered missing values. We decided to first remove the rows where missing values are dense, and then fill in the remaining sparse missing values.

We found that there are some runs of consecutive zero values in the temperature and depth variables, which is quite abnormal. Because these zeros are most likely caused by measurement errors, we turn them into missing values.

In [None]:
# Treat exact zeros in the depth and temperature readings as missing values.
for zero_col in Depth_list + Temp_list:
    df.loc[df[zero_col] == 0, zero_col] = np.nan

We created some graphs to observe the missing value distribution.

In [None]:
# Missing values in the rainfall variables: for every rainfall column, plot the
# per-row count of nulls over the pair ['Date', column].
row_positions = np.arange(n_row)
for rain_col in rain_list:
    nullnum = df[['Date', rain_col]].isna().sum(axis=1).to_numpy()
    plt.plot(row_positions, nullnum, label=rain_col)
plt.legend()
plt.show()

In [None]:
# Missing values in the depth variables: for every depth column, plot the
# per-row count of nulls over the pair ['Date', column].
row_positions = np.arange(n_row)
for depth_col in Depth_list:
    nullnum = df[['Date', depth_col]].isna().sum(axis=1).to_numpy()
    plt.plot(row_positions, nullnum, label=depth_col)
plt.legend()
plt.show()

In [None]:
# Missing values in the temperature variables: for every temperature column,
# plot the per-row count of nulls over the pair ['Date', column].
row_positions = np.arange(n_row)
for temp_col in Temp_list:
    nullnum = df[['Date', temp_col]].isna().sum(axis=1).to_numpy()
    plt.plot(row_positions, nullnum, label=temp_col)
plt.legend()
plt.show()

In [None]:
# Based on the missing-value plots above, keep the rows from position 4685 up
# to (but not including) position 7000.
dfsecond = df.iloc[4685:7000]
msno.matrix(dfsecond)

# Feature Engineering

Rainfall variables and temperature variables are important independent variables in this dataset, 
but their influence on the Depth_to_Groundwater variables may be delayed. So we need to find how many days it takes for their impact to be reflected in the target variables. 
Taking 'Depth_to_Groundwater_CoS' as an example, we shift the data in this variable by 0 to 30 days (31 shifts) and create a new variable for each shift.

In [None]:
# Copy the predictor columns so the lag columns are added to an independent
# frame instead of a slice of dfsecond (avoids pandas' SettingWithCopyWarning).
dfTestLag = dfsecond[rain_list+Temp_list].copy()

# One candidate target per lag of 0..30 rows: 'CoS0', 'CoS1', ..., 'CoS30'.
targetlistCoS = ['CoS'+str(lag) for lag in range(0,31)]
for lag, lag_col in enumerate(targetlistCoS):
    # shift(-lag) places the depth recorded `lag` rows later on the current row.
    dfTestLag[lag_col] = dfsecond['Depth_to_Groundwater_CoS'].shift(-lag)

We use RidgeCV regression to fit the independent variables to each of the 31 new variables, which gives 31 models.
Then we compute the RMSE of each model, plot the values, and choose the lag with the smallest RMSE
as the lag value.

In [None]:
# Fill the remaining gaps (including the tail rows emptied by the shifts)
# before fitting.
dfTestLag = dfTestLag.ffill().bfill()
CoSList = []
X = dfTestLag[rain_list+Temp_list]
ridge_alphas = [0.01, 0.1, 0.5, 1, 5, 7, 10, 30,100, 200]

for i, lag_name in enumerate(targetlistCoS):
    YCoS = dfTestLag[lag_name]
    # Every lag gets its own 80/20 split, seeded with the lag number.
    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        X, YCoS, test_size=0.2, random_state=i)
    ridgecv = RidgeCV(alphas=ridge_alphas)
    model = ridgecv.fit(X_train1, y_train1)
    y_pred1 = model.predict(X_test1)
    rms1 = sqrt(mean_squared_error(y_test1, y_pred1))
    CoSList.append(rms1)

# Test-set RMSE per lag, plotted against the lag number.
dfLT2 = pd.DataFrame({'rms': CoSList})
plt.title("CoS lag")
plt.ylabel("RMS")
plt.xlabel('lag')
plt.plot(dfLT2['rms'])

We used the same method to test the lag value of other target variables, then we chose '28' as the lag value of target variables.

Bayesian Temporal Matrix Factorization (BTMF)

We chose BTMF method to fill in the missing values of Depth_to_Groundwater variables. The BTMF method has better performance in filling the missing values of long-term time series data sets.

In [None]:
# Depth series handed to the BTMF imputer; this order is the row order of the
# matrices built below.
depth_cols = ['Depth_to_Groundwater_LT2',
 'Depth_to_Groundwater_SAL',
 'Depth_to_Groundwater_PAG',
 'Depth_to_Groundwater_CoS',
 'Depth_to_Groundwater_DIEC']
dfmeasure = dfsecond[depth_cols]

# Reference matrix: linear interpolation, then forward/backward fill for the
# edges. Built as a new frame so nothing is assigned into a slice of dfsecond.
dfdens = dfmeasure.interpolate().ffill().bfill()

# Drop the leading rows so the number of time steps is a multiple of 28, and
# transpose to (series, time).
n_lead_drop = len(dfsecond) % 28
dfdens = np.delete(dfdens.to_numpy().T, range(n_lead_drop), axis = 1)
# Matrix to impute: missing entries are encoded as 0.
dfdealMis = np.delete(dfmeasure.fillna(0).to_numpy().T, range(n_lead_drop), axis = 1)

def kr_prod(a, b):
    """Column-wise Khatri-Rao product of two matrices with the same number of columns.

    For ``a`` of shape (I, R) and ``b`` of shape (J, R) the result has shape
    (I * J, R); row ``i * J + j`` holds the element-wise product of ``a[i]``
    and ``b[j]``.
    """
    rows_a, rows_b = a.shape[0], b.shape[0]
    pairwise = a[:, np.newaxis, :] * b[np.newaxis, :, :]
    return pairwise.reshape(rows_a * rows_b, -1)

def cov_mat(mat):
    """Return the (unnormalised) scatter matrix of the rows of ``mat``.

    ``mat`` has shape (n_rows, n_cols). The result has shape (n_cols, n_cols)
    and equals the sum over rows of ``outer(row - mean, row - mean)``, where
    ``mean`` is the column-wise mean. The sum is not divided by the row count.
    """
    # A single matrix product replaces the per-row Python loop of outer products.
    centered = mat - np.mean(mat, axis = 0)
    return np.matmul(centered.T, centered)

def ten2mat(tensor, mode):
    """Unfold ``tensor`` along axis ``mode`` into a matrix.

    Axis ``mode`` becomes the rows; the remaining axes are flattened into the
    columns in Fortran (first-index-fastest) order.
    """
    mode_first = np.moveaxis(tensor, mode, 0)
    n_rows = tensor.shape[mode]
    return mode_first.reshape((n_rows, -1), order = 'F')

def mat2ten(mat, tensor_size, mode):
    """Fold a mode-``mode`` unfolding ``mat`` back into a tensor.

    ``tensor_size`` is a 1-D NumPy array with the target tensor shape. The
    matrix is reshaped in Fortran order with axis ``mode`` leading, and that
    leading axis is then moved back to position ``mode``.
    """
    remaining_axes = [axis for axis in range(tensor_size.shape[0]) if axis != mode]
    unfolded_shape = list(tensor_size[[mode] + remaining_axes])
    folded = np.reshape(mat, unfolded_shape, order = 'F')
    return np.moveaxis(folded, 0, mode)

def mnrnd(M, U, V):
    """
    Generate matrix normal distributed random matrix.
    M is a m-by-n matrix, U is a m-by-m matrix, and V is a n-by-n matrix.

    The draw is ``M + P @ X0 @ Q.T`` where ``P`` and ``Q`` are the Cholesky
    factors of ``U`` and ``V`` and ``X0`` has independent standard-normal
    entries. ``U`` and ``V`` must be positive definite, otherwise
    ``np.linalg.cholesky`` raises ``LinAlgError``.
    """
    dim1, dim2 = M.shape
    # Standard-normal noise is required here. The previous uniform draw
    # (np.random.rand) produced samples with a non-zero mean offset and the
    # wrong spread.
    X0 = np.random.randn(dim1, dim2)
    P = np.linalg.cholesky(U)
    Q = np.linalg.cholesky(V)
    return M + np.matmul(np.matmul(P, X0), Q.T)

def BTMF(dense_mat, sparse_mat, init, rank, time_lags, maxiter1, maxiter2):
    """Bayesian Temporal Matrix Factorization, BTMF.

    Approximates ``sparse_mat`` (shape ``(dim1, dim2)``) by ``W @ X.T`` using
    repeated random draws of ``W``, ``X``, a lag-coefficient tensor ``A`` and
    the noise precision ``tau``.

    Parameters
    ----------
    dense_mat : ndarray, same shape as ``sparse_mat``
        Reference values; used only to compute the printed RMSE/MAPE.
    sparse_mat : ndarray of shape (dim1, dim2)
        Matrix to impute. Entries equal to 0 are treated as unobserved.
    init : dict
        Starting factors under the keys ``"W"`` and ``"X"``. Both arrays are
        updated in place by this function.
    rank : int
        Number of columns of ``W`` and ``X``.
    time_lags : 1-D integer ndarray
        Lags used by the autoregressive part acting on the rows of ``X``.
    maxiter1 : int
        Total number of iterations.
    maxiter2 : int
        Number of final iterations whose draws are averaged for the outputs.

    Returns
    -------
    mat_hat : ndarray of shape (dim1, dim2 + 1)
        Averaged reconstruction, including one extra forecast column.
    W : ndarray of shape (dim1, rank)
        Averaged first factor.
    X_new : ndarray of shape (dim2 + 1, rank)
        Averaged second factor, including one extra forecast row.
    A : ndarray of shape (rank, rank, d)
        Averaged lag coefficients, with ``d = len(time_lags)``.
    """
    W = init["W"]
    X = init["X"]
    
    d = time_lags.shape[0]
    dim1, dim2 = sparse_mat.shape
    # Evaluation entries: non-zero in dense_mat but zero (missing) in sparse_mat.
    pos = np.where((dense_mat != 0) & (sparse_mat == 0))
    # Observed entries: non-zero in sparse_mat. binary_mat is their 0/1 indicator.
    position = np.where(sparse_mat != 0)
    binary_mat = np.zeros((dim1, dim2))
    binary_mat[position] = 1
    
    # Fixed prior settings used by the updates below.
    beta0 = 1
    nu0 = rank
    mu0 = np.zeros((rank))
    W0 = np.eye(rank)
    tau = 1
    alpha = 1e-6
    beta = 1e-6
    S0 = np.eye(rank)
    Psi0 = np.eye(rank * d)
    M0 = np.zeros((rank * d, rank))
    
    # Running sums over the last maxiter2 iterations (divided by maxiter2 after the loop).
    W_plus = np.zeros((dim1, rank))
    X_plus = np.zeros((dim2, rank))
    X_new_plus = np.zeros((dim2 + 1, rank))
    A_plus = np.zeros((rank, rank, d))
    mat_hat_plus = np.zeros((dim1, dim2 + 1))
    for iters in range(maxiter1):
        # --- Draw the hyperparameters for the rows of W (Wishart draw, then a normal draw).
        W_bar = np.mean(W, axis = 0)
        var_mu_hyper = (dim1 * W_bar)/(dim1 + beta0)
        var_W_hyper = inv(inv(W0) + cov_mat(W) + dim1 * beta0/(dim1 + beta0) * np.outer(W_bar, W_bar))
        var_Lambda_hyper = wishart(df = dim1 + nu0, scale = var_W_hyper, seed = None).rvs()
        var_mu_hyper = mvnrnd(var_mu_hyper, inv((dim1 + beta0) * var_Lambda_hyper))
        
        # --- Draw every row of W, using only the observed entries of that row.
        var1 = X.T
        var2 = kr_prod(var1, var1)
        var3 = tau * np.matmul(var2, binary_mat.T).reshape([rank, rank, dim1]) + np.dstack([var_Lambda_hyper] * dim1)
        var4 = (tau * np.matmul(var1, sparse_mat.T)
                + np.dstack([np.matmul(var_Lambda_hyper, var_mu_hyper)] * dim1)[0, :, :])
        for i in range(dim1):
            inv_var_Lambda = inv(var3[:, :, i])
            W[i, :] = mvnrnd(np.matmul(inv_var_Lambda, var4[:, i]), inv_var_Lambda)
        if iters + 1 > maxiter1 - maxiter2:
            W_plus += W
        
        # --- Draw Sigma and the lag coefficients A from the current X.
        # Z_mat holds the rows of X from max(time_lags) onwards; each row of Q_mat
        # stacks the correspondingly lagged rows of X.
        Z_mat = X[np.max(time_lags) : dim2, :]
        Q_mat = np.zeros((dim2 - np.max(time_lags), rank * d))
        for t in range(np.max(time_lags), dim2):
            Q_mat[t - np.max(time_lags), :] = X[t - time_lags, :].reshape([rank * d])
        var_Psi = inv(inv(Psi0) + np.matmul(Q_mat.T, Q_mat))
        var_M = np.matmul(var_Psi, np.matmul(inv(Psi0), M0) + np.matmul(Q_mat.T, Z_mat))
        var_S = (S0 + np.matmul(Z_mat.T, Z_mat) + np.matmul(np.matmul(M0.T, inv(Psi0)), M0) 
                 - np.matmul(np.matmul(var_M.T, inv(var_Psi)), var_M))
        Sigma = invwishart(df = nu0 + dim2 - np.max(time_lags), scale = var_S, seed = None).rvs()
        A = mat2ten(mnrnd(var_M, var_Psi, Sigma).T, np.array([rank, rank, d]), 0)
        if iters + 1 > maxiter1 - maxiter2:
            A_plus += A

        # --- Draw every row of X from the observed data (var3/var4), the lagged
        # rows before it (Qt) and the rows after it that depend on it (Mt/Nt).
        Lambda_x = inv(Sigma)
        var1 = W.T
        var2 = kr_prod(var1, var1)
        var3 = tau * np.matmul(var2, binary_mat).reshape([rank, rank, dim2]) + np.dstack([Lambda_x] * dim2)
        var4 = tau * np.matmul(var1, sparse_mat)
        for t in range(dim2):
            Mt = np.zeros((rank, rank))
            Nt = np.zeros(rank)
            if t < np.max(time_lags):
                # Not enough history for the autoregressive term at this step.
                Qt = np.zeros(rank)
            else:
                Qt = np.matmul(Lambda_x, np.matmul(ten2mat(A, 0), X[t - time_lags, :].reshape([rank * d])))
            if t < dim2 - np.min(time_lags):
                # Pick the lags k for which row t + time_lags[k] exists and has full history.
                if t >= np.max(time_lags) and t < dim2 - np.max(time_lags):
                    index = list(range(0, d))
                else:
                    index = list(np.where((t + time_lags >= np.max(time_lags)) & (t + time_lags < dim2)))[0]
                for k in index:
                    Ak = A[:, :, k]
                    Mt += np.matmul(np.matmul(Ak.T, Lambda_x), Ak)
                    # A0 is A with lag k zeroed, so var5 excludes row t's own contribution.
                    A0 = A.copy()
                    A0[:, :, k] = 0
                    var5 = (X[t + time_lags[k], :] 
                            - np.matmul(ten2mat(A0, 0), X[t + time_lags[k] - time_lags, :].reshape([rank * d])))
                    Nt += np.matmul(np.matmul(Ak.T, Lambda_x), var5)
            var_mu = var4[:, t] + Nt + Qt
            if t < np.max(time_lags):
                # For the first max(time_lags) rows, Lambda_x is replaced by the identity.
                inv_var_Lambda = inv(var3[:, :, t] + Mt - Lambda_x + np.eye(rank))
            else:
                inv_var_Lambda = inv(var3[:, :, t] + Mt)
            X[t, :] = mvnrnd(np.matmul(inv_var_Lambda, var_mu), inv_var_Lambda)
        mat_hat = np.matmul(W, X.T)
        
        # In the averaging phase, append one extra row to X computed from A and
        # the lagged rows, then accumulate the extended reconstruction.
        X_new = np.zeros((dim2 + 1, rank))
        if iters + 1 > maxiter1 - maxiter2:
            X_new[0 : dim2, :] = X.copy()
            X_new[dim2, :] = np.matmul(ten2mat(A, 0), X_new[dim2 - time_lags, :].reshape([rank * d]))
            X_new_plus += X_new
            mat_hat_plus += np.matmul(W, X_new.T)
        
        # --- Draw tau from a gamma distribution based on the residuals at observed entries.
        tau = np.random.gamma(alpha + 0.5 * sparse_mat[position].shape[0], 
                              1/(beta + 0.5 * np.sum((sparse_mat - mat_hat)[position] ** 2)))
        rmse = np.sqrt(np.sum((dense_mat[pos] - mat_hat[pos]) ** 2)/dense_mat[pos].shape[0])
        # Progress report every 200 iterations, only before the averaging phase starts.
        if (iters + 1) % 200 == 0 and iters < maxiter1 - maxiter2:
            print('Iter: {}'.format(iters + 1))
            print('RMSE: {:.6}'.format(rmse))
            print()

    # Average the accumulated draws.
    W = W_plus/maxiter2
    X_new = X_new_plus/maxiter2
    A = A_plus/maxiter2
    mat_hat = mat_hat_plus/maxiter2
    # Final error metrics on the evaluation entries; printed only when maxiter1 >= 100.
    if maxiter1 >= 100:
        final_mape = np.sum(np.abs(dense_mat[pos] - mat_hat[pos])/dense_mat[pos])/dense_mat[pos].shape[0]
        final_rmse = np.sqrt(np.sum((dense_mat[pos] - mat_hat[pos]) ** 2)/dense_mat[pos].shape[0])
        print('Imputation MAPE: {:.6}'.format(final_mape))
        print('Imputation RMSE: {:.6}'.format(final_rmse))
        print()
    
    return mat_hat, W, X_new, A


# Seed NumPy's global generator so the random initial factors and every draw
# inside BTMF are reproducible from run to run. SciPy's wishart/invwishart are
# called with seed=None and therefore use the same global generator.
np.random.seed(42)

sparse_mat = dfdealMis
dense_mat = dfdens
start = time.time()
dim1, dim2 = sparse_mat.shape
rank = 10
# NOTE(review): the third lag is len(dfsecond)//28 (the number of 28-row
# blocks), not 28 itself; confirm this is the intended seasonal lag.
time_lags = np.array([1, 2, (len(dfsecond)//28)])
init = {"W": 0.1 * np.random.rand(dim1, rank), "X": 0.1 * np.random.rand(dim2, rank)}
maxiter1 = 1100
maxiter2 = 100
# a: imputed matrix, b: W, c: X_new, d: A (see BTMF's return statement).
a,b,c,d = BTMF(dense_mat, sparse_mat, init, rank, time_lags, maxiter1, maxiter2)
end = time.time()
print('Running time: %d seconds'%(end - start))

After imputation we get 'a', the array that contains all 'Depth_to_Groundwater' variable values 
with the missing values filled in.
We use this array to replace the original data of the 'Depth_to_Groundwater' variables in the dataset.

In [None]:
# BTMF's returned matrix has one extra (forecast) column; drop it so `a` covers
# exactly the time steps that were passed in.
a = np.delete(a,-1,axis = 1)

# Trim the same number of leading rows that was trimmed from the BTMF inputs,
# so every block has a row count that is a multiple of 28.
n_lead_drop = len(dfsecond) % 28
dfRainfall = dfsecond[rain_list].to_numpy()[n_lead_drop:]
dfTemp = dfsecond[Temp_list].to_numpy()[n_lead_drop:]
# The date column is carried along as a float32 cast of the stored date values.
pdate = pd.DataFrame(dfsecond['date'].values.astype('float32'), columns=['Date'])
dfDate = pdate['Date'].to_numpy()[n_lead_drop:].reshape(-1,1)

# Back to (time, series) so the blocks can be stacked side by side.
a = a.T
wholedata = np.hstack((a,dfRainfall,dfTemp,dfDate))
# Label the imputed block with the column order that was actually fed to BTMF
# (dfmeasure), rather than relying on Depth_list happening to match it.
wholelist = list(dfmeasure.columns)+rain_list+Temp_list+['date']
newFrame = DataFrame(wholedata,index=None,columns = wholelist)

PCA

We found there are too many rainfall variables, so we decided to use PCA to reduce the number of these variables.

In [None]:
# Rainfall columns only, with gaps filled so PCA receives a complete matrix.
test = newFrame[rain_list].ffill().bfill()

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

# Fit a 10-component PCA and round the explained-variance ratios to 2 decimals.
pca = PCA(n_components=10)
pca.fit(test)
v = pca.explained_variance_ratio_.round(2)

# One bar per component, numbered from 1.
component_numbers = range(1, 11)
ax.bar(component_numbers, v)
plt.xlabel("PCA")
plt.title("variance")
plt.show()

We chose the first two principal components to replace the rainfall variables.

In [None]:
# Project the rainfall columns onto two principal components.
Xpca = PCA(n_components=2).fit_transform(test)
pf = pd.DataFrame(Xpca, columns=['PCA1','PCA2'])
for component in ('PCA1', 'PCA2'):
    newFrame[component] = pf[component]

# Remove the original rainfall columns, then fill any remaining gaps.
newFrame = newFrame.drop(columns=rain_list)
newFrame = newFrame.ffill().bfill()

We use ".shift(-28)" to create three new variables, which are the target variables for the prediction model of this dataset.

In [None]:
# For every target column add a '<name>28' column that holds the value found
# 28 rows later.
lag_target = [name + '28' for name in target_variable]
for name, lagged_name in zip(target_variable, lag_target):
    newFrame[lagged_name] = newFrame[name].shift(-28)
newFrame

Correlation Matrix of the new dataset 

We checked the MIC values between all pairs of variables, and deleted those variables that had high MIC values with other variables.

In [None]:
def MIC_matirx(dataframe, mine):
    """Return the pairwise MIC matrix of the columns of ``dataframe``.

    ``mine`` is an object exposing ``compute_score(x, y)`` followed by
    ``mic()`` (the minepy ``MINE`` interface). The result is an
    ``n_columns x n_columns`` ``pd.DataFrame`` with default integer labels.
    """
    data_array = np.array(dataframe)
    # shape[1] also works for a frame with zero rows, unlike indexing row 0.
    n = data_array.shape[1]
    output = np.zeros([n, n])

    # The matrix is filled symmetrically, so each unordered pair is scored once
    # (upper triangle incl. diagonal) instead of twice.
    for i in range(n):
        for j in range(i, n):
            mine.compute_score(data_array[:, i], data_array[:, j])
            output[i, j] = output[j, i] = mine.mic()
    return pd.DataFrame(output)


# MINE estimator created with alpha=0.6 and c=15.
mine = MINE(alpha=0.6, c=15)
# Pairwise MIC values for every pair of columns in newFrame.
Matrix_mic_value = MIC_matirx(newFrame, mine)

def HeatMap(DataFrame):
    """Draw an annotated, square heat map titled 'MIC' for a matrix of values.

    ``DataFrame`` is a pandas frame whose values can be cast to float. The
    parameter name shadows the imported ``DataFrame`` class inside this
    function; it is kept unchanged for compatibility with existing callers.
    """
    # The notebook already enables inline plotting once in the import cell, so
    # no IPython magic is needed (or valid plain Python) inside this function.
    colormap = plt.cm.RdBu
    plt.figure(figsize=(14,12))
    plt.title('MIC', y=1.05, size=15)
    sns.heatmap(DataFrame.astype(float),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)
    plt.show()
# Render the pairwise MIC matrix computed above as a heat map.
HeatMap(Matrix_mic_value)

In [None]:
# List the column names of newFrame; the heat map above labels them by position.
newFrame.columns.values.tolist()

According to the MIC matrix above, we can delete columns named 'Temperature_Ponte_a_Moriano' and 'Temperature_Lucca_Orto_Botanico'.

In [None]:
# Temperature columns removed on the basis of the MIC matrix above.
redundant_temperature_cols = ['Temperature_Ponte_a_Moriano', 'Temperature_Lucca_Orto_Botanico']
newFrame = newFrame.drop(columns=redundant_temperature_cols)

# Seasonal and Trend

In [None]:
# Depth_to_Groundwater_LT2 as a Series indexed by the frame's 'date' values.
dfseason = pd.Series(
    data=newFrame['Depth_to_Groundwater_LT2'].tolist(),
    index=newFrame['date'].tolist(),
)

# Additive decomposition with a 365-step period (two_sided=False).
decomposition = seasonal_decompose(
    dfseason,
    model='additive',
    period=365,
    two_sided=False,
)
decomposition.plot()
plt.show()

We use the ADF to check the Stationarity of the target variable.

In [None]:
# Run the augmented Dickey-Fuller test on the undifferenced depth series.
adfuller(dfseason)

In [None]:
# Difference each value against the next observation (x[t] - x[t+1]), drop the
# undefined last entry, and repeat the ADF test on the result.
dfseasonshift = dfseason.shift(periods=-1)
dfseasondff = dfseason.sub(dfseasonshift).dropna()
adfuller(dfseasondff)

In [None]:
# Autocorrelation plot of the differenced depth series.
plot_acf(dfseasondff)

In [None]:
# Partial autocorrelation plot of the differenced depth series.
plot_pacf(dfseasondff)

In [None]:
# Rebuild the column groups from newFrame. Depth_list now also picks up the
# lagged '...28' target columns, because they start with 'Depth' too.
columns_name = list(newFrame.columns)

Depth_list = [col for col in columns_name if col.startswith('Depth')]
Temp_list = [col for col in columns_name if col.startswith('Temperature')]

# Kalman filter

Errors such as measurement errors add noise to the data, which affects the accuracy of prediction.
We used a Kalman filter to remove this noise from the Depth_to_Groundwater variables and the temperature variables.

In [None]:
def Kalman1D(data,damping=1):
    """Smooth a one-dimensional series with a scalar Kalman smoother.

    The state model uses a transition matrix of 1 and a transition covariance
    of 0.1. ``damping`` is used both as the observation covariance and as the
    initial state covariance, and the first element of ``data`` is the initial
    state mean.

    Returns the smoothed state means produced by ``KalmanFilter.smooth``.
    """
    kf = KalmanFilter(
        initial_state_mean=data[0],
        initial_state_covariance=damping,
        observation_covariance=damping,
        transition_covariance=0.1,
        transition_matrices=1,
    )
    # smooth() returns (state means, state covariances); only the means are used.
    smoothed_means, _smoothed_covs = kf.smooth(data)
    return smoothed_means

In [None]:
# Drop the final 28 rows: their '...28' targets were left empty by shift(-28).
dffull = newFrame.iloc[:-28]

# Compare one temperature series before and after Kalman smoothing.
orenArray = dffull['Temperature_Orentano'].to_numpy()
orenkal = Kalman1D(orenArray, 0.1)
sample_axis = np.arange(len(newFrame) - 28)
for series, series_label in ((orenArray, 'measured'), (orenkal, 'kal')):
    plt.plot(sample_axis, series, label=series_label)

plt.legend()
plt.show()

The orange line shows the temperature values after the noise is removed.
We can see that it is smoother than the blue line, which shows the original values.

In [None]:
# Start from the columns that are not smoothed, then append a Kalman-smoothed
# version of every depth and temperature column (depth columns first).
dffullkal = dffull.drop(Depth_list+Temp_list,axis =1 )
for smooth_col in Depth_list + Temp_list:
    smoothed = Kalman1D(dffull[smooth_col].to_numpy(), 0.1)
    # The smoother output is indexed as (n_samples, 1), matching the earlier
    # per-row x[0] access. Taking column 0 as a plain array assigns the values
    # by position, so the result does not depend on index alignment or on
    # pandas consuming a lazy `map` object.
    dffullkal[smooth_col] = smoothed[:, 0]

In [None]:
# Display the frame that holds the Kalman-smoothed depth and temperature columns.
dffullkal

# LSTM

A Recurrent Neural Network (RNN) is a neural network used to process sequence data. Compared with a general neural network, it can model how data changes along a sequence.
Long short-term memory (LSTM) is a special kind of RNN, designed mainly to solve the vanishing-gradient and exploding-gradient problems that arise when training on long sequences. Simply put, LSTM can perform better on longer sequences than ordinary RNNs.

In [None]:
class lstm_reg(nn.Module):
    """Stacked LSTM followed by a linear read-out applied at every time step.

    Input has shape (seq_len, batch, input_size); output has shape
    (seq_len, batch, output_size).
    """

    def __init__(self, input_size, hidden_size, output_size=1, num_layers=2):
        super().__init__()

        # Recurrent part, with a dropout of 0.3 passed to nn.LSTM.
        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                           num_layers=num_layers, dropout=0.3)
        # Maps each hidden vector to the regression output.
        self.reg = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        hidden_seq, _state = self.rnn(x)
        seq_len, batch, hidden = hidden_seq.shape
        # Flatten (seq_len, batch) so the linear layer sees one row per step.
        flat = hidden_seq.view(seq_len * batch, hidden)
        projected = self.reg(flat)
        return projected.view(seq_len, batch, -1)

In [None]:
# List the columns available as model inputs and targets.
dffullkal.columns.values.tolist()

Take 'Depth_to_Groundwater_SAL28' as the example.
If we want to predict the value of 'Depth_to_Groundwater_SAL28', the features we need are 'date','PCA1','PCA2','Depth_to_Groundwater_SAL', 'Temperature_Orentano','Temperature_Monte_Serra'.

In [None]:
# Split sizes: the last 28 rows are left out of both sets, and the test set is
# a whole number of 28-row blocks (about one fifth of the blocks).
usable_rows = len(dffullkal) - 28
n_test = int((usable_rows / 28) // 5 * 28)
n_train = usable_rows - n_test

dftrain = dffullkal[:n_train]
dftest = dffullkal[n_train:usable_rows]

feature_cols = ['date','PCA1','PCA2','Depth_to_Groundwater_SAL', 'Temperature_Orentano','Temperature_Monte_Serra']
dftrainX = dftrain[feature_cols]
n_feature = len(dftrainX.columns.values.tolist())
# NOTE(review): a C-order reshape to (28, -1, n_feature) makes axis 0 step
# through the rows in strides of n_train/28, so each length-28 "sequence" is
# not 28 consecutive rows; confirm this layout is intended.
dflistX = np.reshape(dftrainX.values.tolist(), (28, -1, n_feature)).astype('float32')

dftrainY = dftrain['Depth_to_Groundwater_SAL28']
dflistY = np.reshape(dftrainY.values.tolist(), (28, -1, 1)).astype('float32')
tensorx = torch.from_numpy(dflistX)
tensory = torch.from_numpy(dflistY)

net = lstm_reg(n_feature, 100)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-2)

# The inputs do not change between epochs, so they are wrapped once.
var_x = Variable(tensorx)
var_y = Variable(tensory)

# 100 full-batch epochs; gradients are clipped to a norm of 1.1 before each step.
for e in range(100):
    out = net(var_x)
    loss = criterion(out, var_y)

    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(net.parameters(), 1.1)

    optimizer.step()
    epoch_number = e + 1
    if epoch_number % 10 == 0:
        print('Epoch: {}, Loss: {:.5f}'.format(epoch_number, loss.data))
                


In [None]:
# Build the test tensors with the same feature columns and layout as training.
dftestX = dftest[['date','PCA1','PCA2','Depth_to_Groundwater_SAL', 'Temperature_Orentano','Temperature_Monte_Serra']]
dftestlistX = np.reshape(dftestX.values.tolist(),(28,-1,n_feature))

dftestY = dftest['Depth_to_Groundwater_SAL28']
dftestlistY = np.reshape(dftestY.values.tolist(),(28,-1,1))
dftestlistX = dftestlistX.astype('float32')
dftestlistY = dftestlistY.astype('float32')
tensortestx = torch.from_numpy(dftestlistX)
tensortesty = torch.from_numpy(dftestlistY)
testvar_x = Variable(tensortestx)
testvar_y = Variable(tensortesty)

# eval() switches dropout off. no_grad() skips building the autograd graph,
# which is not needed for evaluation and only costs memory.
nettest = net.eval()
a = nn.L1Loss()
with torch.no_grad():
    pred_teste = nettest(testvar_x)
    loss = criterion(pred_teste, testvar_y)
    maeloss = a(pred_teste, testvar_y)

# The first placeholder carries the metric name ('mse' / 'mae'), not an epoch.
print('Epoch: {}, Loss: {:.5f}'.format('mse', loss.data))
print('Epoch: {}, Loss: {:.5f}'.format('mae', maeloss.data))

# Create Method

Reference table

We used the missingno library to visualize the missing values of each table and made line charts to observe how the missing values are distributed. From these we determined the range of rows used to build the prediction model for each table.
We referred to the description of each table in 'datasets_description.xlsx' to determine the outputs of each table and the variables (other than the outputs themselves) that may be used to predict these outputs.
We collected this information in a table so that it can be looked up when needed.

In [None]:
# Per-table row window and candidate feature groups used to build each model.
referdata = {
    'table': [
        'Aquifer_Auser', 'Aquifer_Doganella', 'Aquifer_Luco',
        'Aquifer_Petrignano', 'Lake_Bilancino', 'River_Arno',
        'Water_Spring_Amiata', 'Water_Spring_Lupa', 'Water_Spring_Madonna_di_Canneto',
    ],
    # Row from which data is taken for each table.
    'start': [4685, 3075, 6540, 1000, 1000, 2250, 5600, 600, 1600],
    # Row at which reading stops for each table.
    'end': [7000, 3950, 6950, 5223, 6000, 3450, 7487, 4199, 2500],
    # Feature groups per table, in the same order as 'table'.
    'feature': [
        ['Rain', 'Temperature', 'date'],  # Aquifer_Auser
        ['Rain', 'Temperature', 'date'],  # Aquifer_Doganella
        ['Rain', 'Temperature', 'date'],  # Aquifer_Luco
        ['Rain', 'Temperature', 'date'],  # Aquifer_Petrignano
        ['Rain', 'Temperature', 'date'],  # Lake_Bilancino
        ['Rain', 'Temperature', 'date'],  # River_Arno
        ['Rain', 'Temperature', 'date'],  # Water_Spring_Amiata
        ['Rain', 'date'],                 # Water_Spring_Lupa
        ['Rain', 'Temperature', 'date'],  # Water_Spring_Madonna_di_Canneto
    ],
}
referdf = DataFrame(referdata)
referdf

Create a class that collects all the data-processing methods.

In [None]:
class data_cook():
    
    

    def __init__(self, dataframe, start,end,target_variable):
        '''
        target_variable contains the name of all the variables which cound be used as target variable in this table.
        '''
        
        self.dfsecond = dataframe[start:end]# the start and end number could be check in the table 'referdf'.
        self.columns_name = dataframe.columns.values.tolist()
        self.Rain_list = [ a for a in self.columns_name if a.startswith('Rain')]
        self.Depth_list = [ a for a in self.columns_name if a.startswith('Depth')]
        self.Temp_list = [ a for a in self.columns_name if a.startswith('Temperature')]
        self.Flow_list = [ a for a in self.columns_name if a.startswith('Flow')]
        self.n_row = dataframe.shape[0]
        self.target_list = target_variable
    
    def BasicInformation(self):
        '''Draw the missingno matrix of the working slice and print the table's column names.'''
        window = self.dfsecond
        msno.matrix(window)
        print(self.columns_name)
        
    #BTMF
    def FillNullBTMF(self,nullValue_list):
        '''
        the nullValue_list contain the columns' name which we want to fill the null value.
        '''
        for i in range(len(self.Depth_list)):
            self.dfsecond[self.dfsecond[[self.Depth_list[i]]]==0]=np.nan
    
        for i in range(len(self.Temp_list)):
            self.dfsecond[self.dfsecond[[self.Temp_list[i]]]==0]=np.nan
        
        for i in range(len(self.Flow_list)):
            self.dfsecond[self.dfsecond[[self.Flow_list[i]]]==0]=np.nan
        
        dfmeasure = self.dfsecond[nullValue_list]
        dfdens = dfmeasure[nullValue_list]
        for i in range(len(nullValue_list)):
            dfdens[nullValue_list[i]] = dfdens[nullValue_list[i]].interpolate()

        dfdens = dfdens.ffill().bfill()
        dfdens = np.delete(dfdens.to_numpy().T,range(len(self.dfsecond)-(len(self.dfsecond)//28)*28),axis = 1)
        dfdealMis = np.delete(dfmeasure.fillna(0).to_numpy().T,range(len(self.dfsecond)-(len(self.dfsecond)//28)*28),axis = 1)

        def kr_prod(a, b):
            return np.einsum('ir, jr -> ijr', a, b).reshape(a.shape[0] * b.shape[0], -1)

        def cov_mat(mat):
            dim1, dim2 = mat.shape
            new_mat = np.zeros((dim2, dim2))
            mat_bar = np.mean(mat, axis = 0)
            for i in range(dim1):
                new_mat += np.einsum('i, j -> ij', mat[i, :] - mat_bar, mat[i, :] - mat_bar)
            return new_mat

        def ten2mat(tensor, mode):
            return np.reshape(np.moveaxis(tensor, mode, 0), (tensor.shape[mode], -1), order = 'F')

        def mat2ten(mat, tensor_size, mode):
            index = list()
            index.append(mode)
            for i in range(tensor_size.shape[0]):
                if i != mode:
                    index.append(i)
            return np.moveaxis(np.reshape(mat, list(tensor_size[index]), order = 'F'), 0, mode)

        def mnrnd(M, U, V):
            """
            Generate matrix normal distributed random matrix.
            M is a m-by-n matrix, U is a m-by-m matrix, and V is a n-by-n matrix.
            """
            dim1, dim2 = M.shape
            X0 = np.random.rand(dim1, dim2)
            P = np.linalg.cholesky(U)
            Q = np.linalg.cholesky(V)
            return M + np.matmul(np.matmul(P, X0), Q.T)

        def BTMF(dense_mat, sparse_mat, init, rank, time_lags, maxiter1, maxiter2):
            """Bayesian Temporal Matrix Factorization, BTMF."""
            W = init["W"]
            X = init["X"]

            d = time_lags.shape[0]
            dim1, dim2 = sparse_mat.shape
            pos = np.where((dense_mat != 0) & (sparse_mat == 0))
            position = np.where(sparse_mat != 0)
            binary_mat = np.zeros((dim1, dim2))
            binary_mat[position] = 1

            beta0 = 1
            nu0 = rank
            mu0 = np.zeros((rank))
            W0 = np.eye(rank)
            tau = 1
            alpha = 1e-6
            beta = 1e-6
            S0 = np.eye(rank)
            Psi0 = np.eye(rank * d)
            M0 = np.zeros((rank * d, rank))

            W_plus = np.zeros((dim1, rank))
            X_plus = np.zeros((dim2, rank))
            X_new_plus = np.zeros((dim2 + 1, rank))
            A_plus = np.zeros((rank, rank, d))
            mat_hat_plus = np.zeros((dim1, dim2 + 1))
            for iters in range(maxiter1):
                W_bar = np.mean(W, axis = 0)
                var_mu_hyper = (dim1 * W_bar)/(dim1 + beta0)
                var_W_hyper = inv(inv(W0) + cov_mat(W) + dim1 * beta0/(dim1 + beta0) * np.outer(W_bar, W_bar))
                var_Lambda_hyper = wishart(df = dim1 + nu0, scale = var_W_hyper, seed = None).rvs()
                var_mu_hyper = mvnrnd(var_mu_hyper, inv((dim1 + beta0) * var_Lambda_hyper))

                var1 = X.T
                var2 = kr_prod(var1, var1)
                var3 = tau * np.matmul(var2, binary_mat.T).reshape([rank, rank, dim1]) + np.dstack([var_Lambda_hyper] * dim1)
                var4 = (tau * np.matmul(var1, sparse_mat.T)
                        + np.dstack([np.matmul(var_Lambda_hyper, var_mu_hyper)] * dim1)[0, :, :])
                for i in range(dim1):
                    inv_var_Lambda = inv(var3[:, :, i])
                    W[i, :] = mvnrnd(np.matmul(inv_var_Lambda, var4[:, i]), inv_var_Lambda)
                if iters + 1 > maxiter1 - maxiter2:
                    W_plus += W

                Z_mat = X[np.max(time_lags) : dim2, :]
                Q_mat = np.zeros((dim2 - np.max(time_lags), rank * d))
                for t in range(np.max(time_lags), dim2):
                    Q_mat[t - np.max(time_lags), :] = X[t - time_lags, :].reshape([rank * d])
                var_Psi = inv(inv(Psi0) + np.matmul(Q_mat.T, Q_mat))
                var_M = np.matmul(var_Psi, np.matmul(inv(Psi0), M0) + np.matmul(Q_mat.T, Z_mat))
                var_S = (S0 + np.matmul(Z_mat.T, Z_mat) + np.matmul(np.matmul(M0.T, inv(Psi0)), M0) 
                         - np.matmul(np.matmul(var_M.T, inv(var_Psi)), var_M))
                Sigma = invwishart(df = nu0 + dim2 - np.max(time_lags), scale = var_S, seed = None).rvs()
                A = mat2ten(mnrnd(var_M, var_Psi, Sigma).T, np.array([rank, rank, d]), 0)
                if iters + 1 > maxiter1 - maxiter2:
                    A_plus += A

                Lambda_x = inv(Sigma)
                var1 = W.T
                var2 = kr_prod(var1, var1)
                var3 = tau * np.matmul(var2, binary_mat).reshape([rank, rank, dim2]) + np.dstack([Lambda_x] * dim2)
                var4 = tau * np.matmul(var1, sparse_mat)
                for t in range(dim2):
                    Mt = np.zeros((rank, rank))
                    Nt = np.zeros(rank)
                    if t < np.max(time_lags):
                        Qt = np.zeros(rank)
                    else:
                        Qt = np.matmul(Lambda_x, np.matmul(ten2mat(A, 0), X[t - time_lags, :].reshape([rank * d])))
                    if t < dim2 - np.min(time_lags):
                        if t >= np.max(time_lags) and t < dim2 - np.max(time_lags):
                            index = list(range(0, d))
                        else:
                            index = list(np.where((t + time_lags >= np.max(time_lags)) & (t + time_lags < dim2)))[0]
                        for k in index:
                            Ak = A[:, :, k]
                            Mt += np.matmul(np.matmul(Ak.T, Lambda_x), Ak)
                            A0 = A.copy()
                            A0[:, :, k] = 0
                            var5 = (X[t + time_lags[k], :] 
                                    - np.matmul(ten2mat(A0, 0), X[t + time_lags[k] - time_lags, :].reshape([rank * d])))
                            Nt += np.matmul(np.matmul(Ak.T, Lambda_x), var5)
                    var_mu = var4[:, t] + Nt + Qt
                    if t < np.max(time_lags):
                        inv_var_Lambda = inv(var3[:, :, t] + Mt - Lambda_x + np.eye(rank))
                    else:
                        inv_var_Lambda = inv(var3[:, :, t] + Mt)
                    X[t, :] = mvnrnd(np.matmul(inv_var_Lambda, var_mu), inv_var_Lambda)
                mat_hat = np.matmul(W, X.T)

                X_new = np.zeros((dim2 + 1, rank))
                if iters + 1 > maxiter1 - maxiter2:
                    X_new[0 : dim2, :] = X.copy()
                    X_new[dim2, :] = np.matmul(ten2mat(A, 0), X_new[dim2 - time_lags, :].reshape([rank * d]))
                    X_new_plus += X_new
                    mat_hat_plus += np.matmul(W, X_new.T)

                tau = np.random.gamma(alpha + 0.5 * sparse_mat[position].shape[0], 
                                      1/(beta + 0.5 * np.sum((sparse_mat - mat_hat)[position] ** 2)))
                rmse = np.sqrt(np.sum((dense_mat[pos] - mat_hat[pos]) ** 2)/dense_mat[pos].shape[0])
                if (iters + 1) % 200 == 0 and iters < maxiter1 - maxiter2:
                    print('Iter: {}'.format(iters + 1))
                    print('RMSE: {:.6}'.format(rmse))
                    print()

            W = W_plus/maxiter2
            X_new = X_new_plus/maxiter2
            A = A_plus/maxiter2
            mat_hat = mat_hat_plus/maxiter2
            if maxiter1 >= 100:
                final_mape = np.sum(np.abs(dense_mat[pos] - mat_hat[pos])/dense_mat[pos])/dense_mat[pos].shape[0]
                final_rmse = np.sqrt(np.sum((dense_mat[pos] - mat_hat[pos]) ** 2)/dense_mat[pos].shape[0])
                print('Imputation MAPE: {:.6}'.format(final_mape))
                print('Imputation RMSE: {:.6}'.format(final_rmse))
                print()

            return mat_hat, W, X_new, A

        sparse_mat = dfdealMis
        dense_mat = dfdens
        if (np.isnan(sparse_mat).any()==False):
            self.dfsecond = self.dfsecond.reset_index(drop = True)
            pdate = pd.DataFrame(self.dfsecond['date'].values.astype('float32'), columns=['Datefloat'])
            self.dfsecond['Datefloat'] = pdate['Datefloat']
            return self.dfsecond
        start = time.time()
        dim1, dim2 = sparse_mat.shape
        rank = 10
        time_lags = np.array([1, 2, (len(self.dfsecond)//28)])
        init = {"W": 0.1 * np.random.rand(dim1, rank), "X": 0.1 * np.random.rand(dim2, rank)}
        maxiter1 = 1100
        maxiter2 = 100
        a,b,c,d = BTMF(dense_mat, sparse_mat, init, rank, time_lags, maxiter1, maxiter2)
        end = time.time()
        print('Running time: %d seconds'%(end - start))

        a = np.delete(a,-1,axis = 1)
        dfRainfall = self.dfsecond[self.Rain_list].to_numpy()
        dfRainfall = np.delete(dfRainfall,range(len(self.dfsecond)-(len(self.dfsecond)//28)*28),axis = 0)
        dfFlow = self.dfsecond[self.Flow_list].to_numpy()
        dfFlow = np.delete(dfFlow,range(len(self.dfsecond)-(len(self.dfsecond)//28)*28),axis = 0)
        dfTemp = self.dfsecond[self.Flow_list].to_numpy()
        dfTemp = np.delete(dfTemp,range(len(self.dfsecond)-(len(self.dfsecond)//28)*28),axis = 0)
        pdate = pd.DataFrame(self.dfsecond['date'].values.astype('float32'), columns=['Datefloat'])
        dfDate = pdate['Datefloat'].to_numpy()
        dfDate = np.delete(dfDate,range(len(self.dfsecond)-(len(self.dfsecond)//28)*28),axis = 0)
        dfDate = dfDate.reshape(-1,1)
        a = a.T
        wholedata = np.hstack((a,dfRainfall,dfTemp,dfDate))
        wholelist = self.Depth_list+self.Rain_list+self.Temp_list+['Datefloat']
        newFrame = DataFrame(wholedata,index=None,columns = wholelist)
        self.dfsecond = newFrame
        self.dfsecond = self.dfsecond.reset_index(drop = True)
        return self.dfsecond

    def PCA_trans(self,feature_list):
        if len(feature_list)<=2:
            return self.dfsecond
        test=self.dfsecond[feature_list].ffill().bfill()
        Xpca= PCA(n_components=2).fit_transform(test)
        pf = pd.DataFrame(Xpca, columns=['PCA1','PCA2'])
        self.dfsecond['PCA1'] = pf['PCA1']
        self.dfsecond['PCA2'] = pf['PCA2']
        for i in range(len(feature_list)):
            self.dfsecond = self.dfsecond.drop(feature_list[i],axis=1)
        self.dfsecond = self.dfsecond.ffill().bfill()
        self.PCA_list = ['PCA1','PCA2']
        return self.dfsecond
    
    

    #use.shift(-28) made target variable
    def target_made(self,potential_list):
        self.lag_target=[]
        '''
        potential_list contains the name of all the variables which cound be seen as output in this table.
        '''
        for i in range(len(potential_list)):
            name = potential_list[i]+'28'
            self.dfsecond[name] = self.dfsecond[potential_list[i]].shift(-28)
            self.lag_target.append(name)      
        return self.dfsecond

    
    def MICMethod(self):
        '''
        use MICMethod to delete those variables which have higher MIC values with some other variables.
        '''
        mine = MINE(alpha=0.6, c=15)
        deldep_feature =[]
        deltem_feature =[]
        delflow_feature = []
        
        if((len(self.Depth_list)!=0)&(self.target_list[0] not in self.Depth_list)):
            dataDepth = self.dfsecond[self.Depth_list]
            data_array = np.array(dataDepth)
            n = len(data_array[0, :])
            for i in range(n):
                for j in range(n):
                    mine.compute_score(data_array[:, i], data_array[:, j])
                    if((mine.mic()>=0.9)&(i!=j)):
                        if (self.Depth_list[j] not in deldep_feature):
                            deldep_feature.append(self.Depth_list[i])
                            break
        
        if(len(self.Temp_list)!=0):
            dataTem = self.dfsecond[self.Temp_list]
            data_array = np.array(dataTem)
            n = len(data_array[0, :])
            for i in range(n):
                for j in range(n):
                    mine.compute_score(data_array[:, i], data_array[:, j])
                    if((mine.mic()>=0.9)&(i!=j)):
                        if (self.Temp_list[j] not in deltem_feature):
                            deltem_feature.append(self.Temp_list[i])
                            break
        
        if((len(self.Flow_list)!=0)&(self.target_list[0] not in self.Flow_list)):
            dataflow = self.dfsecond[self.Flow_list]
            data_array = np.array(dataflow)
            n = len(data_array[0, :])
            for i in range(n):
                for j in range(n):
                    mine.compute_score(data_array[:, i], data_array[:, j])
                    if((mine.mic()>=0.9)&(i!=j)):
                        if (self.Flow_list[j] not in delflow_feature):
                            delflow_feature.append(self.Flow_list[i])
                            break
        
        if len(self.PCA_list): 
            datapca = self.dfsecond[['PCA1','PCA2']]
            data_array = np.array(datapca)
            mine.compute_score(data_array[:, 0], data_array[:, 1])
            if(mine.mic()>=0.9):
                delpca_feature = ['PCA2']
        self.dfsecond = self.dfsecond.drop(deldep_feature+deltem_feature+delpca_feature+delflow_feature,axis =1)
        
        for i in range(len(deldep_feature)):
            self.Depth_list.remove(deldep_feature[i])
        for i in range(len(deltem_feature)):
            self.Temp_list.remove(deltem_feature[i])
        for i in range(len(delflow_feature)):
            self.Flow_list.remove(delflow_feature[i])
        self.PCA_list = ['PCA1']  
        return self.dfsecond
    
    def KalmanCook(self):
        '''
        used Kalman filter to remove noise in Depth_to_Groundwater,flow, and temperature variables.
        '''
        def Kalman1D(data,damping=1):
            observation_covariance = damping
            first_value = data[0]
            transition_matrix = 1
            transition_covariance = 0.1
            first_value
            kf = KalmanFilter(
                    initial_state_mean=first_value,
                    initial_state_covariance=observation_covariance,
                    observation_covariance=observation_covariance,
                    transition_covariance=transition_covariance,
                    transition_matrices=transition_matrix
                )
            pred_state, state_cov = kf.smooth(data)
            return pred_state
        
        dfreborn = self.dfsecond.drop(self.Depth_list+self.Temp_list+self.Flow_list,axis =1)
        for i in range(len(self.Depth_list)):
            tryArray = self.dfsecond[self.Depth_list[i]].to_numpy()
            trykal = Kalman1D(tryArray,0.1)
            kallist = map(lambda x: x[0], trykal)
            trykalseries = pd.Series(kallist)
            dfreborn[self.Depth_list[i]] = trykalseries

        for i in range(len(self.Temp_list)):
            tryArray = self.dfsecond[self.Temp_list[i]].to_numpy()
            trykal = Kalman1D(tryArray,0.1)
            kallist = map(lambda x: x[0], trykal)
            trykalseries = pd.Series(kallist)
            dfreborn[self.Temp_list[i]] = trykalseries
            
        for i in range(len(self.Flow_list)):
            tryArray = self.dfsecond[self.Flow_list[i]].to_numpy()
            trykal = Kalman1D(tryArray,0.1)
            kallist = map(lambda x: x[0], trykal)
            trykalseries = pd.Series(kallist)
            dfreborn[self.Flow_list[i]] = trykalseries
        self.dfsecond = dfreborn
        return self.dfsecond,self.lag_target
    
    
    def LSTMGo(self,target_variable):
        '''
        Target_variable is the name of the variable which would be used as dependent variable in the LSTM model.
        This method will print out the results of the training phase and the test phase, and return the forcast
        results of the last 28 days.
        '''
        self.target_variable = target_variable
        self.n_test = int(((len(self.dfsecond)-28)/28)//5*28)
        self.n_train = int(((len(self.dfsecond)-28)/28)//5*4*28)

        self.dftrain = self.dfsecond[:self.n_train]
        self.dftest = self.dfsecond[self.n_train:self.n_test+self.n_train]
        
        if(target_variable.startswith('Depth')):
            fake_target = [a for a in self.Depth_list if target_variable.startswith(a)][0]
        else:
            fake_target = [a for a in self.Flow_list if target_variable.startswith(a)][0]
        
        self.feature_name = ['Datefloat']+self.PCA_list+self.Temp_list+[fake_target]
        dftrainX = self.dftrain[self.feature_name]
        n_feature = len(dftrainX.columns.values.tolist())
        dflistX = np.reshape(dftrainX.values.tolist(),(28,-1,n_feature))

        dftrainY = self.dftrain[target_variable]
        dflistY = np.reshape(dftrainY.values.tolist(),(28,-1,1))
        dflistX = dflistX.astype('float32')
        dflistY = dflistY.astype('float32')
        tensorx = torch.from_numpy(dflistX)
        tensory = torch.from_numpy(dflistY)

        net = lstm_reg(n_feature, 100)
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(net.parameters(), lr=1e-2)

        for e in range(100):
            var_x = Variable(tensorx)
            var_y = Variable(tensory)

            out = net(var_x)
            loss = criterion(out, var_y)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), 1.1)#gradient clipping, used to avoid Exploding Gradients 

            optimizer.step()
            if (e + 1) % 10 == 0: 
                    print('Epoch: {}, Loss: {:.5f}'.format(e + 1, loss.data))
        
    
        dftestX = self.dftest[self.feature_name]
        n_feature = len(dftestX.columns.values.tolist())
        dftestlistX = np.reshape(dftestX.values.tolist(),(28,-1,n_feature))

        dftestY = self.dftest[self.target_variable]
        dftestlistY = np.reshape(dftestY.values.tolist(),(28,-1,1))
        dftestlistX = dftestlistX.astype('float32')
        dftestlistY = dftestlistY.astype('float32')
        tensortestx = torch.from_numpy(dftestlistX)
        tensortesty = torch.from_numpy(dftestlistY)
        testvar_x = Variable(tensortestx)
        testvar_y = Variable(tensortesty)

        nettest = net.eval()
        pred_teste = nettest(testvar_x)
        loss = criterion(pred_teste, testvar_y)
        print('Epoch: {}, Loss: {:.5f}'.format('mse', loss.data))

        a = nn.L1Loss()
        maeloss = a(pred_teste, testvar_y)
        print('Epoch: {}, Loss: {:.5f}'.format('mae', maeloss.data))
        
        dfpre = self.dfsecond.tail(28)
        dfpreX = dfpre[self.feature_name]
        n_feature = len(dfpreX.columns.values.tolist())
        dfprelistX = np.reshape(dfpreX.values.tolist(),(28,-1,n_feature))
        dfprelistX = dfprelistX.astype('float32')
        tensorprex = torch.from_numpy(dfprelistX)
        prevar_x = Variable(tensorprex)
        preY= net(prevar_x)
        return preY
        

class lstm_reg(nn.Module):
    """Stacked LSTM followed by a linear layer applied at every time step.

    Inputs are laid out as (sequence, batch, feature); outputs as
    (sequence, batch, output_size).
    """

    def __init__(self, input_size, hidden_size, output_size=1, num_layers=2):
        super().__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers, dropout=0.3)
        self.reg = nn.Linear(hidden_size, output_size)

    def _regress(self, seq_out):
        """Apply the linear head to every (step, batch) position of `seq_out`."""
        steps, batch, hidden = seq_out.shape
        flat = self.reg(seq_out.view(steps * batch, hidden))
        return flat.view(steps, batch, -1)

    def forward(self, x):
        """Run the LSTM from its default initial state and return the per-step regression."""
        seq_out, _ = self.rnn(x)
        return self._regress(seq_out)

    def output_y_hc(self, x, hc):
        """Like forward, but start from the state `hc` = (h, c) and also return the final state."""
        seq_out, hc = self.rnn(x, hc)
        return self._regress(seq_out), hc

**2- Random Forest**

**2.1- Predicting the Depth_to_Groundwater_SAL 28**

`dftrain` contains the training dataset, while `dftest` contains the testing dataset.

In [None]:

# Chronological split: the last 28 rows are left out, the n_test rows before
# them form the test set and everything earlier is the training set.
n_train = len(dffullkal) - n_test - 28
dftrain, dftest = dffullkal[:n_train], dffullkal[n_train:len(dffullkal)-28]

# Feature and target frames.
dftrainX = dftrain[['date','PCA1','PCA2','Depth_to_Groundwater_SAL', 'Temperature_Orentano','Temperature_Monte_Serra']]
dftestX = dftest[['date','PCA1','PCA2','Depth_to_Groundwater_SAL', 'Temperature_Orentano','Temperature_Monte_Serra']]
dftrainY = dftrain['Depth_to_Groundwater_SAL28']
dftestY = dftest['Depth_to_Groundwater_SAL28']
n_feature = dftrainX.shape[1]

# The same data as float32 arrays of shape (28, n, width) and as torch tensors.
dflistX = np.reshape(dftrainX.values.tolist(), (28, -1, n_feature)).astype('float32')
dflistY = np.reshape(dftrainY.values.tolist(), (28, -1, 1)).astype('float32')
tensorx = torch.from_numpy(dflistX)
tensory = torch.from_numpy(dflistY)

dftestlistX = np.reshape(dftestX.values.tolist(), (28, -1, n_feature)).astype('float32')
dftestlistY = np.reshape(dftestY.values.tolist(), (28, -1, 1)).astype('float32')
tensortestx = torch.from_numpy(dftestlistX)
tensortesty = torch.from_numpy(dftestlistY)
testvar_x = Variable(tensortestx)
testvar_y = Variable(tensortesty)


In [None]:
# RandomForestRegressor is not part of the notebook's import cell, so it is
# imported here to keep this cell runnable after Restart & Run All.
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 1000 decision trees (fixed seed for reproducibility)
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(dftrainX, dftrainY);
# Predict the held-out test rows
predictions = rf.predict(dftestX)

In [None]:
#RMSE
# Computed as sqrt(MSE): the `squared=False` shortcut of mean_squared_error is
# deprecated and no longer available in recent scikit-learn releases.
print('RMSE for predicting the Depth_to_Groundwater_SAL28 using the Random Forest')
RMSE= np.sqrt(mean_squared_error(dftestY, predictions))
print(RMSE)
#MAE
print('MAE for predicting the Depth_to_Groundwater_SAL28 using the Random Forest')
MAE=mean_absolute_error(dftestY, predictions)
print(MAE)

In [None]:
# Predicted vs. true values on the test set (random forest).
fig, ax = plt.subplots()
ax.plot(dftestY, predictions, 'bo')
ax.grid()
ax.set_xlabel('True')
ax.set_ylabel('Predicted')
ax.set_title('Predicting the Depth_to_Groundwater_SAL 28')
plt.show()

**3- The Steepest Descent algorithm**

**3.1- Predicting the Depth_to_Groundwater_SAL28**

In [None]:
# NOTE(review): this cell repeats the initialisation done in the following
# cell; one of the two can be removed.
import random

# Fixed seed so that the random starting point is the same on every run.
random.seed(42)
# One weight per feature column of dftrainX (sizing from dffullkal or a fixed
# count makes the dot products below fail on a shape mismatch).
w_hat_old = np.ones(dftrainX.shape[1])
rd = [random.randint(1,100) for i in range(dftrainX.shape[1])]
w_hat =np.array(rd) # generating the same random value
y_hat_trainSAL= np.dot(dftrainX,w_hat_old)
# calculating the gradient of ||y - Xw||^2 at w_hat
X_train_transposeSAL = dftrainX.T
gradient_w_hat = -2*np.dot(dftrainX.T,dftrainY) + 2*np.dot(dftrainX.T,np.dot(dftrainX, w_hat))
# hessian matrix of ||y - Xw||^2, i.e. 2 X^T X (constant in w)
Hessian= 2*np.dot( X_train_transposeSAL,dftrainX)
iterations=0
max_iterations = 1e4
old_error=[]

In [None]:
#%%
# using the Steepest Descent algorithm
import random

# Fixed seed so that the random starting point is the same on every run.
random.seed(42)
w_hat_old = np.ones(len(dftrainX.columns))
# One random starting weight per feature column of dftrainX (a fixed count
# makes the dot products below fail on a shape mismatch).
rd = [random.randint(1,100) for i in range(len(dftrainX.columns))]
w_hat =np.array(rd) # generating the same random value
y_hat_trainSAL= np.dot(dftrainX,w_hat_old)
# calculating the gradient of ||y - Xw||^2 at w_hat
X_train_transposeSAL = dftrainX.T
gradient_w_hat = -2*np.dot(dftrainX.T,dftrainY) + 2*np.dot(dftrainX.T,np.dot(dftrainX, w_hat))
# hessian matrix of ||y - Xw||^2, i.e. 2 X^T X (constant in w)
Hessian= 2*np.dot( X_train_transposeSAL,dftrainX)
iterations=0
max_iterations = 1e4
old_error=[]

In [None]:
# Steepest descent: step along the negative gradient with the step size
# gamma = g.g / (g.H.g), until the weights stop moving or the iteration cap is hit.
while np.linalg.norm(w_hat- w_hat_old) > 1e-8 and iterations < max_iterations:
    iterations += 1
    w_hat_old=w_hat
    curvature = np.dot(np.dot(gradient_w_hat.T,Hessian),gradient_w_hat)
    if curvature == 0:
        # The gradient has vanished: w_hat is already a stationary point.
        # Dividing would give 0/0 and turn the weights into NaN.
        break
    gamma=np.linalg.norm(gradient_w_hat)**2/curvature
    w_hat = w_hat - gamma * gradient_w_hat # update the guess
    gradient_w_hat = -2*np.dot(dftrainX.T,dftrainY ) + 2*np.dot(dftrainX.T,np.dot(dftrainX, w_hat))

In [None]:
# Linear predictions X @ w_hat on the training and the test features.
y_hat_trainSAL, y_hat_testSAL = (np.dot(features, w_hat) for features in (dftrainX, dftestX))

In [None]:
#MAE
print('MAE for predicting the Depth_to_Groundwater_SAL28 using the Steepest Descent in testing phase')
MAE=mean_absolute_error(dftestY, y_hat_testSAL)
print(MAE)

#RMSE
# Computed as sqrt(MSE): the `squared=False` shortcut of mean_squared_error is
# deprecated and no longer available in recent scikit-learn releases.
print('RMSE for predicting the Depth_to_Groundwater_SAL28 using the Steepest Descent in testing phase')
RMSE=np.sqrt(mean_squared_error(dftestY, y_hat_testSAL))
print(RMSE)

In [None]:
# Predicted vs. true values on the test set (steepest descent).
fig, ax = plt.subplots()
ax.plot(dftestY, y_hat_testSAL, 'bo')
ax.grid()
ax.set_xlabel('True')
ax.set_ylabel('Predicted')
ax.set_title('Depth_to_Groundwater_SAL28 prediction with Steepest Descent Algorithm ')
plt.show()

# Recommendation

We recommend using the LSTM model, which has more stable results and higher accuracy for large amounts of data.