In [None]:
!pip install pmdarima

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA

# Load specific forecasting tools
from statsmodels.tsa.statespace.sarimax import SARIMAX

from statsmodels.graphics.tsaplots import plot_acf,plot_pacf # for determining (p,q) orders
from statsmodels.tsa.seasonal import seasonal_decompose      # for ETS Plots
from pmdarima import auto_arima                              # for determining ARIMA orders

# Ignore harmless warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Apply matplotlib's built-in 'dark_background' style to all figures created below.
plt.style.use('dark_background')

In [None]:
# Load the Auser aquifer CSV; the 'Date' column becomes the (parsed) index.
# NOTE(review): if the file stores dates as dd/mm/yyyy, `dayfirst=True` is needed — confirm against the raw CSV.
df = pd.read_csv('../input/acea-water-prediction/Aquifer_Auser.csv', parse_dates=True, index_col='Date')


It appears that the earliest date for which our target data is available is 01-01-2006. Hence we are going to work on the data from 2006 onwards.

In [None]:
# Keep only rows dated 2006-01-01 or later (label-based slice on the Date index).
df = df.loc['2006-01-01':]

Setting the frequency as Daily

In [None]:
# Conform the index to a daily frequency; any calendar day absent from the data becomes a NaN row.
df = df.asfreq('D')

# Deal with Null Values

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Snapshot of the column names as a plain list (fill_values relies on list.index on this).
cols = list(df.columns)

Let's fill missing rainfall values with 0.

In [None]:
# Replace NaN with 0 in the first ten columns.
# NOTE(review): assumes cols[:10] are exactly the rainfall columns — confirm against df.columns.
df[cols[:10]]=df[cols[:10]].fillna(0)

Now let's create a function that fills missing values with a regional (local) average. That means we look at a few values before and after the missing value and fill the gap with their average. The function below does that for us.

In [None]:
def fill_values(col_name=None, patch_size=5, data=None):
    """Return the values of one column with NaNs replaced by a local average.

    For every missing entry the replacement is chosen in this order:

    1. mean of the window ``[idx - patch_size, idx + patch_size)`` around it,
    2. if that window holds no valid value, mean of the forward window
       ``[idx, idx + 2 * patch_size)``,
    3. if that is also empty of valid values, the mean of the whole column.

    All means ignore NaN and are computed from the *original* column, so a
    filled value never influences another fill.

    Parameters
    ----------
    col_name : str
        Name of the column to fill.
    patch_size : int, default 5
        Half-width of the local window (previously ignored and forced to 5).
    data : pandas.DataFrame, optional
        Frame to read from; defaults to the notebook-level ``df``.

    Returns
    -------
    list
        One value per row of the frame, or ``[]`` (after printing a message)
        when ``col_name`` is missing or not a column of the frame.
    """
    frame = df if data is None else data
    if col_name is None:
        print('Print Provide the column name')
        return []
    if col_name not in frame.columns:
        print('Invalid Column Name')
        return []

    series = frame[col_name]
    global_mean = series.mean()  # last-resort fallback, computed once
    values = series.tolist()

    # Only the missing positions need any work.
    for idx in np.flatnonzero(series.isna().to_numpy()):
        # Clamp at 0: a negative start would wrap to the end of the frame
        # and silently produce an empty window for the first rows.
        start = max(idx - patch_size, 0)
        value = series.iloc[start:idx + patch_size].mean()
        if np.isnan(value):
            value = series.iloc[idx:idx + 2 * patch_size].mean()
            if np.isnan(value):
                value = global_mean  # Fill by global mean
        values[idx] = value
    return values
        

Let's fill the remaining columns that have missing values with the regional average.

In [None]:
# Impute every remaining column with fill_values, in the same order as before.
for column in (
    'Depth_to_Groundwater_LT2',
    'Depth_to_Groundwater_SAL',
    'Depth_to_Groundwater_PAG',
    'Depth_to_Groundwater_CoS',
    'Depth_to_Groundwater_DIEC',
    'Hydrometry_Monte_S_Quirico',
    'Hydrometry_Piaggione',
    'Temperature_Orentano',
    'Temperature_Monte_Serra',
    'Temperature_Ponte_a_Moriano',
    'Temperature_Lucca_Orto_Botanico',
    'Volume_POL',
    'Volume_CC1',
    'Volume_CC2',
    'Volume_CSA',
    'Volume_CSAL',
):
    df[column] = fill_values(column)


The function below will be used to plot the data for different columns together.

In [None]:
def plot_data(start_day='2006-01-01', end_day=None, cols=None):
    """Plot the daily values of the given columns of ``df`` on one axes.

    Parameters
    ----------
    start_day : str, default '2006-01-01'
        First date (index label) to include.
    end_day : str, optional
        Last date to include; ``None`` plots through the end of ``df`` and
        the title then shows the last date in the index.
    cols : list of str
        Columns to plot. When omitted a message is printed and nothing is
        drawn.

    Returns
    -------
    None
    """
    # Validate before creating the figure so a bad call leaves no empty plot.
    if cols is None:
        print('Please provide the target columns name')
        return

    fig, ax = plt.subplots(figsize=(15, 8))
    df[cols].loc[start_day:end_day].plot(ax=ax, cmap='Set1')

    if end_day is None:
        end_day = str(df.index[-1]).split()[0]  # date part of the last timestamp
    ax.set_title(f' Daily Data plot form  {start_day} to {end_day}')
    ax.legend()


In [None]:
def plot_resample_data(start_day='2006-01-01', end_day=None, cols=None, resample_rule='W'):
    """Plot resampled means of the given columns of ``df`` on one axes.

    Parameters
    ----------
    start_day : str, default '2006-01-01'
        First date (index label) to include.
    end_day : str, optional
        Last date to include; ``None`` plots through the end of the data and
        the title then shows the last date in ``df``'s index.
    cols : list of str
        Columns to plot. When omitted a message is printed and nothing is
        drawn.
    resample_rule : str, default 'W'
        Pandas resampling rule. The title uses the readable name from
        ``resample_dict`` when one exists, otherwise the rule itself.

    Returns
    -------
    None
    """
    # Validate before creating the figure so a bad call leaves no empty plot.
    if cols is None:
        print('Please provide the target columns name')
        return

    fig, ax = plt.subplots(figsize=(15, 8))
    resampled = df[cols].resample(resample_rule).mean()
    resampled.loc[start_day:end_day].plot(ax=ax, cmap='Set1')

    if end_day is None:
        end_day = str(df.index[-1]).split()[0]  # date part of the last timestamp
    # Fall back to the raw rule so an unlisted rule does not raise KeyError.
    rule_label = resample_dict.get(resample_rule, resample_rule)
    ax.set_title(f' Resample Data plot for {rule_label} form  {start_day} to {end_day}')
    ax.legend()

In [None]:
# Human-readable names for the resampling rules, looked up by plot_resample_data for its title.
resample_dict = {'W':'Weekly','M':'Monthly','A':'Annually','QS':'Quarter Start'}

Let's plot the daily data for all 5 well depths.

In [None]:

# The five well-depth columns, then their daily values from 2007 onwards.
target = [f'Depth_to_Groundwater_{well}' for well in ('CoS', 'SAL', 'PAG', 'DIEC', 'LT2')]
plot_data('2007-01-01', cols=target)

Let's resample the data weekly, monthly, quarterly and annually so that we can see smoother distributions.

In [None]:
# Weekly means (default rule 'W') of the well-depth columns held in `target`.
plot_resample_data('2007-01-01', cols=target)

In [None]:
# Monthly means of the well-depth columns held in `target`.
plot_resample_data('2007-01-01', cols=target, resample_rule='M')

In [None]:
# Quarter-start means of the well-depth columns held in `target`.
plot_resample_data('2007-01-01', cols=target, resample_rule='QS')

In [None]:
# Annual means of the well-depth columns held in `target`.
plot_resample_data('2007-01-01', cols=target, resample_rule='A')

In [None]:
# Daily series for the SAL well alone, over the default date range.
plot_data(cols=['Depth_to_Groundwater_SAL'])

In [None]:
# Daily series for the CoS well alone, over the default date range.
plot_data(cols=['Depth_to_Groundwater_CoS'])

In [None]:
# Daily series for the DIEC well alone, over the default date range.
plot_data(cols=['Depth_to_Groundwater_DIEC'])

In [None]:
# Correlation heatmap across every column of df.
# plt.subplots returns (fig, ax); unpack it and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(df.corr(), cmap='viridis', annot=True, ax=ax);

In [None]:
# SAL well depth against the POL volume.
# No `cmap`/legend: there is no colour variable and no labelled artist to show.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, y='Depth_to_Groundwater_SAL', x='Volume_POL', ax=ax);

In [None]:
# Gallicano rainfall against Fabbriche di Vallico rainfall.
# No `cmap`/legend: there is no colour variable and no labelled artist to show.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, y='Rainfall_Gallicano', x='Rainfall_Fabbriche_di_Vallico', ax=ax);

In [None]:
# Correlation heatmap restricted to the first ten columns listed in `cols`.
# plt.subplots returns (fig, ax); unpack it and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(df[cols[:10]].corr(), cmap='viridis', annot=True, ax=ax);

In [None]:
# Refresh `cols`; note it is now a pandas Index rather than the plain list built earlier.
cols=df.columns

In [None]:
# Rainfall columns grouped together as the "north" stations for the steps below.
north_areas = ['Rainfall_Gallicano','Rainfall_Pontetetto','Rainfall_Borgo_a_Mozzano',
               'Rainfall_Calavorno','Rainfall_Fabbriche_di_Vallico']

In [None]:
# Correlation heatmap of the north rainfall stations.
# plt.subplots returns (fig, ax); unpack it and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df[north_areas].corr(), cmap='viridis', annot=True, ax=ax);

Rainfall in these areas is highly correlated, so we will create a single column holding the average of these rainfall columns and remove the 5 original columns. We can also safely assume that missing rainfall data means zero rainfall, which is why the nulls were filled with 0.

In [None]:
# Row-wise average of the north stations, vectorised instead of a Python-level
# apply(sum) per row; the divisor now follows the length of `north_areas`.
# skipna=False keeps the old behaviour: a NaN in any station yields NaN.
df['north_avg_rainfall'] = df[north_areas].mean(axis=1, skipna=False)

In [None]:
# The individual north stations are now represented by their average.
df = df.drop(columns=north_areas)

In [None]:
# Refresh `cols` after the north rainfall columns were replaced by their average.
cols=df.columns

In [None]:
# Correlation heatmap across every remaining column of df.
# plt.subplots returns (fig, ax); unpack it and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(df.corr(), cmap='viridis', annot=True, ax=ax);

In [None]:
# Orentano rainfall against Piaggione rainfall.
# No `cmap`/legend: there is no colour variable and no labelled artist to show.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, y='Rainfall_Orentano', x='Rainfall_Piaggione', ax=ax);

In [None]:
# Orentano rainfall against Monte Serra rainfall.
# No `cmap`/legend: there is no colour variable and no labelled artist to show.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, y='Rainfall_Orentano', x='Rainfall_Monte_Serra', ax=ax);


From the diagrams above it appears that rainfall at Monte_Serra, Orentano and Piaggione is highly correlated. This makes sense because these stations belong to the south. Hence we will drop these columns and create a new column holding the average rainfall in the south region.

In [None]:
# Rainfall columns grouped together as the "south" stations for the steps below.
south_region = ['Rainfall_Monte_Serra', 'Rainfall_Orentano', 'Rainfall_Piaggione']

In [None]:
# Row-wise average of the south stations, vectorised instead of a Python-level
# apply(sum) per row; the divisor now follows the length of `south_region`.
# skipna=False keeps the old behaviour: a NaN in any station yields NaN.
df['south_average_rainfall'] = df[south_region].mean(axis=1, skipna=False)

In [None]:
# The individual south stations are now represented by their average.
df = df.drop(columns=south_region)

In [None]:
 # Pick the temperature columns by name: the old positional slice read a `cols` snapshot taken before the south rainfall columns were dropped.
 temp_cols = [name for name in df.columns if name.startswith('Temperature_')]

In [None]:
# Correlation heatmap of the columns listed in `temp_cols`.
# plt.subplots returns (fig, ax); unpack it and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(df[temp_cols].corr(), cmap='viridis', annot=True, ax=ax);

In [None]:
# Monte Serra temperature against Lucca Orto Botanico temperature.
# No legend: the plot has no labelled artist to show.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, y='Temperature_Monte_Serra',
                x='Temperature_Lucca_Orto_Botanico', ax=ax);

As can be seen in the diagram above, these 2 temperatures are extremely correlated, hence we are going to drop the temperature column Temperature_Lucca_Orto_Botanico.

In [None]:
# Remove the redundant Lucca Orto Botanico temperature column.
df = df.drop(columns='Temperature_Lucca_Orto_Botanico')

In [None]:
# Select the remaining temperature columns by name rather than by position,
# so the selection cannot drift when columns are added or removed.
temp_cols = [name for name in df.columns if name.startswith('Temperature_')]

In [None]:
# Correlation heatmap of the columns listed in `temp_cols`.
# plt.subplots returns (fig, ax); unpack it and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(df[temp_cols].corr(), cmap='viridis', annot=True, ax=ax);

In [None]:
# Show the current column set.
df.columns

In [None]:
# The five volume columns examined in the cells below.
volume_cols = ['Volume_POL','Volume_CC1', 'Volume_CC2', 'Volume_CSA', 'Volume_CSAL']

In [None]:
# Display the list for reference.
volume_cols

In [None]:
# Correlation heatmap of the volume columns.
# plt.subplots returns (fig, ax); unpack it and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(df[volume_cols].corr(), cmap='viridis', annot=True, ax=ax);

In [None]:
# Volume_CSA against Volume_CSAL.
# No legend: the plot has no labelled artist to show.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, x='Volume_CSA',
                y='Volume_CSAL', ax=ax);

In [None]:
# Volume_POL against Volume_CSAL.
# No legend: the plot has no labelled artist to show.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, x='Volume_POL',
                y='Volume_CSAL', ax=ax);

In [None]:
# Volume_POL against Volume_CC2.
# No legend: the plot has no labelled artist to show.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, x='Volume_POL',
                y='Volume_CC2', ax=ax);

In [None]:
# Volume_POL against Volume_CSA.
# No legend: the plot has no labelled artist to show.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, x='Volume_POL',
                y='Volume_CSA', ax=ax);

In [None]:
# Same heatmap with Volume_CC1 left out of the list.
# plt.subplots returns (fig, ax); unpack it and draw on that axes explicitly.
volume_cols = ['Volume_POL', 'Volume_CC2', 'Volume_CSA', 'Volume_CSAL']
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(df[volume_cols].corr(), cmap='viridis', annot=True, ax=ax);

In [None]:
# Combined CSA + CSAL volume (NaN in either operand propagates).
df['Volume_CSA_CSAL'] = df['Volume_CSA'] + df['Volume_CSAL']

In [None]:
# Combined POL + CC2 volume (NaN in either operand propagates).
df['Volume_POL_CC2'] = df['Volume_POL'] + df['Volume_CC2']

In [None]:
# Remove the volume columns currently listed in `volume_cols`.
df = df.drop(columns=volume_cols)

In [None]:
# Show the column set after combining the volume columns.
df.columns

In [None]:
# Volume columns that remain: the two combined sums plus CC1.
volume_cols = ['Volume_CSA_CSAL','Volume_POL_CC2','Volume_CC1']

In [None]:
# Correlation heatmap of the columns listed in `volume_cols`.
# plt.subplots returns (fig, ax); unpack it and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(df[volume_cols].corr(), cmap='viridis', annot=True, ax=ax);

In [None]:
# Merge the two combined volume columns into one (NaN in either operand propagates).
df['Volume_comb'] = df['Volume_CSA_CSAL'] + df['Volume_POL_CC2']

In [None]:
# The two pairwise sums are now contained in Volume_comb.
df = df.drop(columns=['Volume_CSA_CSAL', 'Volume_POL_CC2'])

In [None]:
# Volume columns that remain at this point.
volume_cols=['Volume_comb','Volume_CC1']

In [None]:
# Volume_CC1 against the combined volume.
# No legend: the plot has no labelled artist to show.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df, x='Volume_CC1',
                y='Volume_comb', ax=ax);

In [None]:
# Correlation heatmap of the columns listed in `volume_cols`.
# plt.subplots returns (fig, ax); unpack it and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(df[volume_cols].corr(), cmap='viridis', annot=True, ax=ax);

In [None]:
# Fold Volume_CC1 into the combined volume (NaN in either operand propagates).
df['Volume_comb'] = df['Volume_comb'] + df['Volume_CC1']

In [None]:
# Volume_CC1 is now part of Volume_comb.
df = df.drop(columns='Volume_CC1')

In [None]:
# Correlation heatmap across every remaining column of df.
# plt.subplots returns (fig, ax); unpack it and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df.corr(), cmap='viridis', annot=True, ax=ax);

All the volume columns have been added together since they are highly correlated. Now let's have a look at the temperature columns.

In [None]:
# Show the current column set.
df.columns

It looks like the temperature columns are still highly correlated, so let's create an average column and delete the individual columns.


In [None]:
# Average of the three temperature stations. The previous apply(sum) stored the
# *sum* under an "avg" name, unlike the rainfall averages, which were divided by
# the station count. skipna=False keeps NaN propagation as before.
df['Temperature_avg'] = df[['Temperature_Monte_Serra', 'Temperature_Ponte_a_Moriano',
                            'Temperature_Orentano']].mean(axis=1, skipna=False)

In [None]:
# The individual temperature stations are now represented by Temperature_avg.
df = df.drop(columns=['Temperature_Orentano', 'Temperature_Monte_Serra', 'Temperature_Ponte_a_Moriano'])

In [None]:
# Re-check missing values per column after the feature engineering above.
df.isnull().sum()

Piaggione is located in the north and Monte_S_Quirico is located in the south. We will now try to predict the north well first, using the features related to the north, and then use this value to predict the south well.

In [None]:
# Target series: depth to groundwater at the SAL well.
y=df['Depth_to_Groundwater_SAL']

In [None]:
# Candidate predictor columns for the SAL well.
X = df[['Hydrometry_Piaggione','Volume_comb','Temperature_avg','north_avg_rainfall',
     'Rainfall_Croce_Arcana','Rainfall_Tereglio_Coreglia_Antelminelli']]

In [None]:
# Count missing values per predictor column.
X.isnull().sum()

# Data Modelling 
We have 3 target columns for this dataset: Depth_to_Groundwater_SAL, Depth_to_Groundwater_CoS and Depth_to_Groundwater_LT2. Of these, Depth_to_Groundwater_SAL and Depth_to_Groundwater_CoS belong to the north region and Depth_to_Groundwater_LT2 belongs to the south region. Since Depth_to_Groundwater_LT2 is partially dependent on the north region, we will first try to predict Depth_to_Groundwater_SAL and Depth_to_Groundwater_CoS and then use these values to predict the south region. For that, let's create a data frame containing only the features that can impact the north region.

In [None]:
# Modelling frame: the SAL target plus the six predictor columns, in this fixed order.
df_north = df[['north_avg_rainfall','Rainfall_Croce_Arcana','Depth_to_Groundwater_SAL',
               'Rainfall_Tereglio_Coreglia_Antelminelli','Volume_comb','Hydrometry_Piaggione','Temperature_avg']]

In [None]:
# Chronological 70/30 split (no shuffling).
# .copy() gives independent frames, so adding a 'prediction' column to `test`
# later is not an assignment into a slice of df_north.
train_len = int(0.7 * len(df_north))
train = df_north.iloc[:train_len].copy()
test = df_north.iloc[train_len:].copy()

In [None]:
# Let pmdarima search for ARIMA orders on the training target; the fitted model is only displayed, not stored.
auto_arima(train['Depth_to_Groundwater_SAL'])

In [None]:
# Fit a univariate ARIMA with order (1,1,2) on the training target and show the fit summary.
# NOTE(review): `ARIMA` comes from `statsmodels.tsa.arima_model`, which I believe recent
# statsmodels releases no longer provide — confirm the installed version, or port this and the
# predict() calls below to `statsmodels.tsa.arima.model.ARIMA`.
model = ARIMA(train['Depth_to_Groundwater_SAL'],order=(1,1,2))
results = model.fit()
results.summary()

In [None]:
# Forecast the whole test span, addressed by integer position after the training rows.
# The series label now matches the fitted order (1,1,2); it previously read (2,1,1).
start = len(train)
end = len(train) + len(test) - 1
predictions = results.predict(start=start, end=end, dynamic=False, typ='levels').rename('ARIMA(1,1,2) Predictions')

In [None]:
# Attach the forecast to the test frame (pandas aligns the Series on the index).
test['prediction']=predictions

In [None]:
# Actual vs. predicted SAL depth over the test period.
test[['Depth_to_Groundwater_SAL','prediction']].plot(legend=True,figsize=(15,10), cmap='Set1')

In [None]:
from statsmodels.tools.eval_measures import mse,rmse

In [None]:
# Report RMSE, the metric used for the other models in this notebook, so the
# scores are directly comparable (this cell previously showed MSE).
rmse(predictions, test['Depth_to_Groundwater_SAL'])

In [None]:
# Single exogenous regressor (north average rainfall) for the train and test spans.
exog_train, exog_test = (part[['north_avg_rainfall']] for part in (train, test))

In [None]:
# Refit the same (1,1,2) order, now with north average rainfall as an exogenous regressor.
model = ARIMA(train['Depth_to_Groundwater_SAL'],exog=exog_train,order=(1,1,2))
results = model.fit()
results.summary()

In [None]:
# Forecast the test span using the test-period rainfall as exogenous input, then score it.
# The series label now matches the fitted order (1,1,2); it previously read (2,1,1).
start = len(train)
end = len(train) + len(test) - 1
predictions = results.predict(start=start, end=end, dynamic=False, exog=exog_test, typ='levels').rename('ARIMA(1,1,2) Predictions')
rmse(predictions, test['Depth_to_Groundwater_SAL'])

In [None]:
# Overwrite the stored forecast with the exogenous-model forecast and plot it against the actuals.
test['prediction']=predictions
test[['Depth_to_Groundwater_SAL','prediction']].plot(legend=True,figsize=(15,10), cmap='Set1')

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import MaxAbsScaler

In [None]:
# Two independent scalers: `scaller` for the full feature matrix, `scaller2` for the target alone
# (so predictions can be inverse-transformed on their own).
scaller = MaxAbsScaler()
scaller2 = MaxAbsScaler()

In [None]:
# Chronological 80/20 split for the LSTM (replaces the 70/30 ARIMA split).
# .copy() gives independent frames, so assigning a 'prediction' column to `test`
# later is not an assignment into a slice of df_north.
l = len(df_north)
split_at = int(.8 * l)
test = df_north.iloc[split_at:].copy()
train = df_north.iloc[:split_at].copy()

In [None]:
# Features are every column of `train` (the target included, as a lagged input);
# the target is also kept as its own one-column frame for a separate scaler.
X_train = train
scalled_X_train = scaller.fit_transform(X_train)
y_train = train[['Depth_to_Groundwater_SAL']]

In [None]:
# Fit the target-only scaler and scale the training target.
scalled_y_train = scaller2.fit_transform(y_train)

In [None]:
# Column order fed to `scaller.transform` must equal the order it was fitted on.
# Derive it from df_north (the frame train/test are cut from) instead of
# repeating the names by hand, so the two cannot drift apart.
test_cols = list(df_north.columns)

In [None]:
# Window length (time steps per sample) and batch size for the sequence generator.
n_input = 10
batch_size = 1

# Keras generator pairing windows of the scaled features with the scaled target.
generator = TimeseriesGenerator(scalled_X_train, scalled_y_train, length=n_input, batch_size=batch_size)

In [None]:
# define model
n_features = 7
model = Sequential()
model.add(LSTM(n_input, activation='relu', input_shape=(n_input, n_features)))
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mse')
model.summary()

In [None]:
# Model.fit accepts generators directly; fit_generator is the deprecated spelling.
model.fit(generator, epochs=15)

In [None]:
# List the metrics recorded during training.
model.history.history.keys()

In [None]:
# Training loss per epoch.
loss_per_epoch = model.history.history['loss']
plt.plot(range(len(loss_per_epoch)),loss_per_epoch)

In [None]:
# Walk forward through the test period one step at a time. Each step predicts
# the (scaled) target from the last n_input rows, then appends a new row made of
# the actual test-period features with the target replaced by that prediction.
#
# The model output is already in scaled units. `scaller` and `scaller2` were both
# fitted on the training target, so the prediction can be written straight into
# the scaled row. Previously it was placed in a raw-unit row and passed through
# scaller.transform, which scaled it a second time.
target_idx = test_cols.index('Depth_to_Groundwater_SAL')
scaled_test = scaller.transform(test[test_cols])  # scale the test features once

test_predictions = []
first_eval_batch = scalled_X_train[-n_input:]
current_batch = first_eval_batch.reshape((1, n_input, n_features))
for i in range(len(test)):

    # get prediction 1 time stamp ahead ([0][0] grabs just the number instead of [[array]])
    current_pred = model.predict(current_batch, verbose=0)[0][0]

    # store prediction
    test_predictions.append(current_pred)

    # update batch to now include prediction and drop first value
    next_step = scaled_test[i].copy()
    next_step[target_idx] = current_pred
    current_batch = np.append(current_batch[:, 1:, :],
                              next_step.reshape(1, 1, n_features), axis=1)

In [None]:
# Turn the list of scaled predictions into a column vector and map it back to original units.
test_predictions = scaller2.inverse_transform(np.array(test_predictions).reshape(-1, 1))

In [None]:
# Store the LSTM forecast next to the actuals and plot both over the test period.
test['prediction'] = test_predictions
test[['Depth_to_Groundwater_SAL','prediction']].plot(figsize=(15,8))


In [None]:
# RMSE of the LSTM forecast on the test period.
rmse(test['prediction'],test['Depth_to_Groundwater_SAL'])