In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The problem of defining the use of water has always been linked to the need to meet the water needs for civil uses, agriculture and industry. 
In Italy the most widely used sources of supply are surface waters that flow into the riverbeds of a hydrographic network, groundwater flowing through permeable areas of the subsoil. In this analysis an attempt is made to reconstruct the hydrographic trend in engineering terms using the data recorded by the pluviographs managed by the Acea company shared in this challenge. 

The analysis of the Datasets is based on the following parts:

* Screening Datasets 
* Climatic evaluation of the areas of interest
* Statistical evaluation by structuring the correlations 
* Data training and error evaluation with MSE and RMSE indexes 
* Estimation of the time of exhaustion of the water resource
* Forecast of the monthly seasonal trend.



# Aquifer Auser

In [None]:

# Read the Auser aquifer table using its 'Date' column as the row index.
df = pd.read_csv(
    '/kaggle/input/acea-water-prediction/Aquifer_Auser.csv',
    index_col='Date',
)
# Swap the string index for a DatetimeIndex so that .resample() works below.
# NOTE(review): no format/dayfirst hint is given to pd.to_datetime; confirm
# the inferred day/month order against the CSV.
df_time = pd.to_datetime(df.index)
datetime_index = pd.DatetimeIndex(df_time.values)
df.index = datetime_index
df.head()

Delete NaN values and Estimated climatic trend:

In [None]:
# Rain-gauge and thermometer columns, named once and reused for both the
# NaN filter and the column selection.
rain_cols = [
    'Rainfall_Gallicano', 'Rainfall_Pontetetto', 'Rainfall_Monte_Serra',
    'Rainfall_Orentano', 'Rainfall_Borgo_a_Mozzano', 'Rainfall_Piaggione',
    'Rainfall_Calavorno', 'Rainfall_Croce_Arcana',
    'Rainfall_Tereglio_Coreglia_Antelminelli',
    'Rainfall_Fabbriche_di_Vallico',
]
temp_cols = [
    'Temperature_Orentano', 'Temperature_Monte_Serra',
    'Temperature_Ponte_a_Moriano', 'Temperature_Lucca_Orto_Botanico',
]

# Keep only the rows in which every rain gauge has a reading.
df1 = df.dropna(subset=rain_cols)
rain_df = df1[rain_cols]
temp_df = df1[temp_cols]


Detection zones chart


![1](https://imagizer.imageshack.com/img922/920/USigXi.png)

Red label: temperature measurement
Yellow Label: Rain Measure
Green label: both measurements

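The water deficit is evaluated with the potential evapotranspiration index, following the S.I.I. method. Month $m$ of year $j$ is in deficit when its rainfall $h_{jm}$ is lower than the potential evapotranspiration $E_{p,jm}$:

$$h_{jm} < E_{p,jm} \quad\Rightarrow\quad \text{deficit}$$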

In [None]:
# Yearly mean rainfall of every rain gauge.
rain_media = rain_df.resample('Y').mean()


# Original author's note (translated): "I sum over all the aquifers that
# influence the water table".
# temp_df is rebuilt here from df1 with the same four temperature columns
# selected in the NaN-filtering cell.
temp_df = df1[['Temperature_Orentano', 'Temperature_Monte_Serra',
       'Temperature_Ponte_a_Moriano', 'Temperature_Lucca_Orto_Botanico']]
# Yearly mean temperature per station and monthly mean rainfall per gauge.
temp_media = temp_df.resample('Y').mean()
h = rain_df.resample('m').mean()
# NOTE(review): temp_magg0 is not read again in the visible part of the notebook.
temp_magg0 = np.array(temp_media[temp_media>0]*4.75,dtype = float)    
# Monthly term of the heat index, summed per calendar year into `i`.
# NOTE(review): as written this evaluates T**1.514 / 5 (power first, then the
# division), and the quadratic coefficient of `a` is positive; confirm both
# against the S.I.I. reference the notebook cites.
app = (temp_df.resample('m').mean())**1.514/5
i = app.resample('Y').sum()
# Yearly exponent used by the evapotranspiration formula in the next cell.
a = 0.49239+0.01792*i+0.0000771*i**2+6.75*10**-7*i**3
kj = np.array([0.82,0.81,1.03,1.11,1.26,1.27,1.29,1.19,1.05,0.93,0.77,0.78],dtype=float)
# kj: 12 sun-exposure coefficients (source: S.I.I.), latitude 42.5°

In [None]:
# Monthly mean temperature, one column per thermometer.
app = temp_df.resample('m').mean()

# Sun-exposure coefficient for every monthly row, looked up from the row's
# calendar month. The previous version tiled kj a fixed number of times, which
# only matched the data when the number of years was a multiple of 3, the
# series started in January, and there were exactly 4 stations.
month_pos = app.index.month.to_numpy() - 1
kpj = np.repeat(kj[month_pos][:, np.newaxis], app.shape[1], axis=1)

# The exponent `a` has one row per year; give every monthly row the value of
# its own calendar year.
a_by_year = a.copy()
a_by_year.index = a.index.year
aa = a_by_year.reindex(index=app.index.year).to_numpy(dtype=float)

#potential evapotranspiration
Epj = kpj*135*(app/26.5)**aa
Epj.head()


In [None]:
import matplotlib.pyplot as plt
# One panel per temperature station: potential evapotranspiration (red)
# against the monthly rainfall column at the same position (blue).
fig = plt.figure(figsize=(20,10))
fig.suptitle('Potenzial Evapotraspiration Aqufer Auser')
T = Epj.iloc[:,:].values
H = h.iloc[:,:].values
for d, station_col in enumerate(temp_media.columns, start=1):
    temp = T[:,d-1]
    alt = H[:,d-1]
    plt.subplot(2,2,d)
    # Characters from position 12 onward of the column name are used as the panel label.
    ylab = 'h + Epj '+ station_col[12:]
    plt.plot(Epj.index,temp,color='red',label='Epj')
    plt.plot(h.index,alt,color='blue',label='h')
    plt.ylabel(ylab)
    plt.xlabel('time')
    plt.legend(loc='best')
    

The graph represents the Water Deficit which can be quantified through the area underlying the two trends.
As can be seen in the datasets, some values relating to the year 2009 are missing.
This water deficit is important in order to estimate the amount of rain reaching the aquifer.

Exploratory Data Analysis (EDA)

Create a dispersion matrix to represent the pairwise correlations between the different characteristics of the datasets.

In [None]:
import seaborn as sns
sns.set(style='whitegrid',context='notebook')
# Depth, volume and hydrometry columns compared pairwise.
cols = ['Depth_to_Groundwater_SAL', 'Depth_to_Groundwater_PAG',
       'Depth_to_Groundwater_CoS', 'Depth_to_Groundwater_DIEC','Volume_POL', 'Volume_CC1', 'Volume_CC2', 'Volume_CSA', 'Volume_CSAL',
       'Hydrometry_Monte_S_Quirico', 'Hydrometry_Piaggione']
# Only rows with no missing value in any column are plotted.
df_drna = df.dropna()
# `height` is the current name of the per-facet size argument (formerly `size`).
sns.pairplot(df_drna[cols],height=2.5)


We can see that there are anomalies in the volume columns of the dataset; moreover, Volume_CSA follows an approximately normal distribution.

To quantify the linear relationship between the features, a correlation matrix is created. The correlation matrix is identical to the covariance matrix computed on standardized data.
The correlation matrix is a square matrix containing Pearson's coefficients 'r' which measure the linear dependence between pairs of characteristics. These coefficients vary between -1 and 1 where:
1 perfect linear correlation;
0 has no correlation;
-1 negative correlation;

In [None]:
# Pearson correlation matrix of the selected columns (one variable per column).
selected_values = df_drna[cols].values
cm = np.corrcoef(selected_values, rowvar=False)
sns.set(font_scale=1.5)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size' : 8},
    yticklabels=cols,
    xticklabels=cols,
)



Apply linear regression to the pairs of features that have a very high Pearson coefficient.

In [None]:
from sklearn.linear_model import LinearRegression
# Simple linear regression of the PAG groundwater depth on the Monte S. Quirico hydrometry.
slt = LinearRegression()
X = df_drna[['Hydrometry_Monte_S_Quirico']].values
y = df_drna['Depth_to_Groundwater_PAG'].values
slt.fit(X,y)
fig=plt.figure(figsize=(7,5))
# regplot draws its own least-squares line; x and y are passed by keyword as
# 1-D arrays. The fitted estimator is not an argument of regplot (it used to
# be passed positionally in the `data` slot).
sns.regplot(x=X.ravel(), y=y)
plt.xlabel('Hydrometry Monte S. Quirico')
plt.ylabel('Depth to Groundwater PAG')

print('slope: %.3f'% slt.coef_[0])
print('Intercept: %.3f' % slt.intercept_)

Performance evaluation of linear regression models

In [None]:

# Predictors: groundwater depths and hydrometry readings.
cols = ['Depth_to_Groundwater_SAL', 'Depth_to_Groundwater_PAG',
       'Depth_to_Groundwater_CoS', 'Depth_to_Groundwater_DIEC',
       'Hydrometry_Monte_S_Quirico', 'Hydrometry_Piaggione']

# Target: row-wise mean of these volume columns.
vol = ['Volume_POL', 'Volume_CC1', 'Volume_CC2', 'Volume_CSA', 'Volume_CSAL']

# Only rows with no missing value in any column are used.
df_drna = df.dropna()

df_new = df_drna[cols]
X = df_new.iloc[:,:].values
y = df_drna[vol].mean(axis=1).values
# Column vector, so that the scaler below accepts it.
y = y.reshape(-1,1)
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
sc_x = StandardScaler()
sc_y = StandardScaler()
# NOTE(review): X_std and y_std are computed here, but the split and the fit
# below use the unscaled X and y.
X_std = sc_x.fit_transform(X)
y_std = sc_y.fit_transform(y)
slr=LinearRegression()
# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3,random_state=0)
slr.fit(X_train,y_train)
y_train_pred = slr.predict(X_train)
y_test_pred = slr.predict(X_test)
# Residual plot: prediction on the x axis, prediction minus observation on the y axis.
plt.scatter(y_train_pred, y_train_pred - y_train, c='blue',marker='o',label='Train data')
plt.scatter(y_test_pred,y_test_pred-y_test, c = 'lightgreen',marker='s',label='Test data')
# Zero-residual reference line; its extent and the x limits are hard-coded.
plt.hlines(0,xmin=-10000,xmax=-6000,lw=2,color='red')
plt.xlim([-10000,-6000])
plt.xlabel('Predicted values')
plt.ylabel('Residual')
plt.legend(loc='best')

Evaluation MSE, RMSE and R ^ 2 estimators

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Error metrics of the multiple linear regression on the train and test splits.
mse_train = mean_squared_error(y_train,y_train_pred)
mse_test = mean_squared_error(y_test,y_test_pred)
print('MSE Train: %.3f , Test: %.3f' % (mse_train,mse_test))
print('RMSE Train: %.3f , Test: %.3f' % (mse_train**0.5,mse_test**0.5))
# R^2 is reported as returned by r2_score. The previous version printed its
# square root under the "R^2" label, which is also undefined for negative scores.
print('R^2 Train: %.3f , Test: %.3f' % (r2_score(y_train,y_train_pred),r2_score(y_test,y_test_pred)))

there are too many anomalies in the datasets.


Now try to apply a non-linear regression and assume a polynomial of 2nd and 3rd degree.

In [None]:
# Compare linear, quadratic and cubic fits of the PAG groundwater depth
# against the Monte S. Quirico hydrometry.
X = df_drna[['Hydrometry_Monte_S_Quirico']].values
y = df_drna['Depth_to_Groundwater_PAG'].values


from sklearn.linear_model import LinearRegression
regr = LinearRegression()

# Build the polynomial feature expansions
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
quadratic = PolynomialFeatures(degree=2)
cubic = PolynomialFeatures(degree=3)
X_quad = quadratic.fit_transform(X)
X_cubic = cubic.fit_transform(X)

# Evaluation grid for the fitted curves. A fixed number of points is used so
# the curve resolution does not depend on the units of X (a step of 1 leaves
# only a handful of points when X spans a few units).
X_fit = np.linspace(X.min(), X.max(), 200)[:, np.newaxis]

#Linear fit
regr = regr.fit(X,y)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y, regr.predict(X))

#Quadratic fit
regr = regr.fit(X_quad,y)
# The expansion was already fitted on X; the grid only needs transform().
y_quad_fit = regr.predict(quadratic.transform(X_fit))
quadratic_r2 = r2_score(y,regr.predict(X_quad))

#cubic fit
regr = regr.fit(X_cubic,y)
y_cubic_fit = regr.predict(cubic.transform(X_fit))
cubic_r2 = r2_score(y, regr.predict(X_cubic))

#plot results
fig=plt.figure(figsize=(15,7))
plt.scatter(X,y,label='punti di addestramento',color='lightgray')
plt.plot(X_fit,y_lin_fit,label='lineare (d=1), $R^2=%.2f$' % linear_r2,color='blue',lw=2,linestyle=':')
plt.plot(X_fit,y_quad_fit,label='quadratic (d=2), $R^2=%.2f$' % quadratic_r2,color='red',lw=2,linestyle='-')
plt.plot(X_fit,y_cubic_fit,label='cubic (d=3), $R^2=%.2f$' % cubic_r2,color='green',lw=2,linestyle='--')
plt.xlabel('Hydrometry_Monte_S_Quirico')
plt.ylabel('Depth_to_groundwater_PAG')
plt.legend(loc='best')
 

Random forest method

In [None]:
# Random-forest regression of the mean extracted volume on the depth and
# hydrometry predictors selected in the multiple-regression cell.
X = df_new.iloc[:,:].values
y = df_drna[vol].mean(axis=1).values

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)
from sklearn.ensemble import RandomForestRegressor
# The split criterion is left at its default (mean squared error): the
# spelling 'mse' is rejected by recent scikit-learn releases, while the
# default works on old and new versions alike.
forest = RandomForestRegressor(n_estimators=1000,random_state=0,n_jobs=-1)
forest.fit(X_train,y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)
# Residual plot: prediction on the x axis, prediction minus observation on the y axis.
plt.scatter(y_train_pred,y_train_pred-y_train,c='black',marker='o',s=35,alpha=0.5,label='data train')
plt.scatter(y_test_pred,y_test_pred-y_test,c='lightgreen',marker='s',s=35,alpha=0.7,label='data test')
plt.hlines(y=0,xmin=-11000,xmax=-6000,color='red')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.title('Regressione a foreste casuali')
plt.legend(loc='best')




In [None]:
# Train/test error metrics of the random-forest model.
mse_train = mean_squared_error(y_train,y_train_pred)
mse_test = mean_squared_error(y_test,y_test_pred)
r2_train = r2_score(y_train,y_train_pred)
r2_test = r2_score(y_test,y_test_pred)
print('MSE Train: %.3f , Test: %.3f' % (mse_train, mse_test))
print('RMSE Train: %.3f , Test: %.3f' % (mse_train**0.5, mse_test**0.5))
print('R^2 Train: %.3f , Test: %.3f' % (r2_train, r2_test))

Using the random forest method results in an improvement in the data

Estimation of the aquifer emptying time

Assume the groundwater rests on a horizontal impermeable plane and has a height H that is practically constant in both time and space, so that velocities and gradients are very small everywhere. Under this hypothesis the law of exhaustion is given, to a good approximation, by the equation:

$$Q = Q_0\, e^{-kt}$$

Here $Q_0$ is the flow rate at the instant chosen as the origin of time, $Q$ is the flow rate at time $t$, and $k$ is a constant characteristic of the aquifer storage, called the exhaustion constant.
We do not have daily measurements of the flow rate, so we force the concept by taking the flow rate to be the daily collected volume divided by 24 h.
$Q$ is in m³/h and $t$ is in h.

The equation can be written in the form:

$$\log(Q) = \log(Q_0) - kt$$



In [None]:

# Fit log(Q) = log(Q0) + slope * t for every extraction volume column.
df_drna = df.dropna()
vol = df_drna[['Volume_POL', 'Volume_CC1', 'Volume_CC2', 'Volume_CSA', 'Volume_CSAL']]

X = vol.iloc[:,:].values
# Daily volume converted to an hourly flow rate; the sign is flipped so that
# the logarithm below is defined (presumably withdrawals are stored as
# negative volumes; confirm against the data).
Q = -X/24

fig = plt.figure(figsize=(20,10))
fig.suptitle('Acqufer Auser')
for i, vol_col in enumerate(vol.columns, start=1):
    q = Q[:,i-1]
    # Keep only strictly positive flow rates: log() is undefined elsewhere.
    # The previous slice q[sum(q==0):] dropped the first k samples rather
    # than the k zero samples.
    q = q[q > 0]
    if np.size(q) < 2:
        # Not enough points to fit a line for this column.
        continue
    # The time axis (24, 48, ... hours) is rebuilt for every column, so a
    # shortened axis from one column can no longer leak into the next.
    t = np.arange(24,(np.size(q)+1)*24,24).reshape(-1,1)
    q = np.log(q)
    slr = LinearRegression()
    slr.fit(t,q)
    plt.subplot(3,2,i)
    ylab = 'log(Q0) '+vol_col[7:]
    # x and y go by keyword as 1-D arrays; regplot draws its own fit and does
    # not take the estimator as an argument.
    sns.regplot(x=t.ravel(),y=q,label='k: %.6f log(Q0): %.3f' % (slr.coef_[0],slr.intercept_))
    plt.xlabel('time')
    plt.ylabel(ylab)
    plt.legend(loc='best')
 

Forecast of the monthly seasonal trend of the Depth to GroundWater

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
import matplotlib.pyplot as plt
# Monthly mean of every depth-to-groundwater series, one panel per well.
fig = plt.figure(figsize=(20,10))
plt.suptitle('Depth to Groundwater')
cols = ['Depth_to_Groundwater_LT2',
       'Depth_to_Groundwater_SAL', 'Depth_to_Groundwater_PAG',
       'Depth_to_Groundwater_CoS', 'Depth_to_Groundwater_DIEC']
for i, col in enumerate(cols, start=1):
    # Work on a copy so the assignment below never touches `df`.
    df1 = df.dropna(subset=[col]).copy()
    # Replace zero readings of the column being plotted with the column mean.
    # The previous version always patched cols[0], whatever column was being
    # plotted, and did so through chained indexing.
    df1.loc[df1[col] == 0, col] = df1[col].mean()
    df_mou = df1.resample('m').mean()
    plt.subplot(3,2,i)
    df_mou[col].plot()
    # Characters from position 21 onward of the column name label the panel.
    plt.ylabel(col[21:])

These graphs represent the monthly trend of the various Depth to GroundWater to roughly verify the stationarity over time.
Now estimate the average delay over time through trend autocorrelation

In [None]:
# Partial autocorrelation of every monthly depth series, one panel per well.
fig = plt.figure(figsize=(20,10))
plt.suptitle('Partial Autocorrelation')
fig.subplots_adjust()
for i, col in enumerate(cols, start=1):
    df_drna = df.dropna(subset=[col]).copy()
    # Zero readings of the analysed column are replaced by the column mean
    # (previously cols[0] was patched on every iteration, via chained indexing).
    df_drna.loc[df_drna[col] == 0, col] = df_drna[col].mean()
    df_mou = df_drna.resample('m').mean()
    ax = fig.add_subplot(3,2,i)
    plot_pacf(df_mou[col],ax,title=None)
    plt.ylabel(col[21:])
    plt.xlabel('lag')
    

The graphs show a low volatility value so it is possible to make a forecast.
Now I separate the datasets into a train and Test part.

In [None]:
# Last timestamp that belongs to the training period.
split = "2019-12-31"
fig = plt.figure(figsize=(20,10))
plt.suptitle('Train - Test')
for i, col in enumerate(cols, start=1):
    df_drna = df.dropna(subset=[col]).copy()
    # Zero readings of the plotted column are replaced by the column mean
    # (previously cols[0] was patched on every iteration, via chained indexing).
    df_drna.loc[df_drna[col] == 0, col] = df_drna[col].mean()
    df_mou = df_drna.resample('m').mean()
    train = df_mou[df_mou.index <= split]
    # Strictly after the split: the month-end row stamped 2019-12-31 used to
    # fall in both the train and the test set.
    test = df_mou[(df_mou.index > split) & (df_mou.index < '2020-07-31')]
    plt.subplot(3,2,i)
    train[col].plot(label='train')
    test[col].plot(label='test')
    plt.ylabel(col[21:])
    plt.legend(loc='best')


The model used is the SARIMAX where the values of AR (p) and MA (q) are estimated through Python's pmAutoArima method and with some seasonal considerations in the data.

In [None]:
# Fit one SARIMAX model per depth series and plot forecast, residuals and in-sample fit.
# A figure is opened per series inside the loop; the extra figure that used to
# be created before the loop stayed empty and has been dropped.
for i, col in enumerate(cols, start=1):
    df_drna = df.dropna(subset=[col]).copy()
    # Replace zero readings with the column mean (label-based assignment
    # instead of chained indexing).
    df_drna.loc[df_drna[col] == 0, col] = df_drna[col].mean()
    df_mou = df_drna.resample('m').mean()
    train = df_mou[df_mou.index <= split]
    # Strictly after the split so the month-end row equal to `split` is not
    # in both sets.
    test = df_mou[(df_mou.index > split) & (df_mou.index < '2020-07-31')]
    model = SARIMAX(train[col],order=(3,1,1),seasonal_order=(3,1,1,12),enforce_stationarity=False)
    fitted_model = model.fit(maxiter=200,disp=True)
    #fitted_model.summary() # analysis summary
    train_residui = fitted_model.resid # residuals of the fitted model on the training series
    in_sample = fitted_model.get_prediction(end=fitted_model.nobs).summary_frame() # in-sample prediction
    # Prediction for the 11 steps following the training series.
    out_of_simple = fitted_model.get_prediction(start = fitted_model.nobs,end=fitted_model.nobs+10).summary_frame()
    fig = plt.figure(figsize=(15,8))
    plt.suptitle('SARIMA '+col)
    # Panel 1: observed series, forecast and its interval.
    plt.subplot(3,1,1)
    train[col].plot(label='train')
    test[col].plot(label='test')
    out_of_simple['mean'].plot(lw=2,label='Prediction')
    plt.fill_between(out_of_simple.index,out_of_simple['mean_ci_lower'],out_of_simple['mean_ci_upper'],color='k',alpha=.10,label='mean error')
    plt.legend(loc='best')
    # Panel 2: training residuals.
    plt.subplot(3,1,2)
    train_residui.plot(label='Resudal')
    plt.legend(loc='best')
    plt.ylim([-2,2])
    # Panel 3: in-sample prediction against the measured values.
    plt.subplot(3,1,3)
    in_sample['mean'].plot(label='Forecasted Values')
    train[col].plot(label='Misured values')
    plt.legend(loc='best')
    plt.ylabel(col)


# Aquifer Doganella




In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the Doganella aquifer table using its 'Date' column as the row index.
df = pd.read_csv(
    '/kaggle/input/acea-water-prediction/Aquifer_Doganella.csv',
    index_col='Date',
)

# Swap the string index for a DatetimeIndex so that .resample() works below.
# NOTE(review): no format/dayfirst hint is given to pd.to_datetime; confirm
# the inferred day/month order against the CSV.
df_time = pd.to_datetime(df.index)
datetime_index = pd.DatetimeIndex(df_time.values)
df.index = datetime_index
df.head()

In this case we don't have enough data so we will use a detailed drop

Detection zones chart

![1](https://imagizer.imageshack.com/img923/9213/sCoyo0.png)

Red label: temperature measurement
Yellow Label: Rain Measure
Green label: both measurements

In [None]:
# Rainfall and temperature series of the two stations.
station_cols = ['Rainfall_Monteporzio', 'Rainfall_Velletri','Temperature_Monteporzio', 'Temperature_Velletri']
df_raitemp = df.loc[:, station_cols]

# Monteporzio: rows where both rainfall and temperature are present.
df_new = df_raitemp.dropna(subset=['Rainfall_Monteporzio','Temperature_Monteporzio'])
# Number of temperature readings available in each year.
df.resample('Y')['Temperature_Monteporzio'].count()

There are few data for some years of observation, there will be a distortion of the evapotranspiration referred to the years reported

In [None]:
def _monthly_epj(temp_raw, temp_monthly, kj):
    """Monthly potential evapotranspiration estimate for one station.

    Parameters
    ----------
    temp_raw : pandas.Series
        Temperature series with a DatetimeIndex; its monthly means feed the
        yearly heat index.
    temp_monthly : pandas.Series
        Monthly mean temperature on which the estimate is evaluated.
    kj : numpy.ndarray
        12 sun-exposure coefficients, January first.

    Returns
    -------
    pandas.Series
        Estimate on the index of ``temp_monthly``; missing values, then
        values above 40, are replaced by the series mean.
    """
    # NOTE(review): the heat-index term is evaluated as T**1.514 / 5 and the
    # quadratic coefficient of `a` is positive, exactly as in the original
    # cell; confirm both against the S.I.I. reference.
    heat_terms = (temp_raw.resample('m').mean())**1.514/5
    i = heat_terms.resample('Y').sum()
    a = 0.49239+0.01792*i+0.0000771*i**2+6.75*10**-7*i**3
    # Coefficient and exponent are looked up from each row's calendar month
    # and year, instead of tiling kj for a hard-coded 17 years.
    a_by_year = pd.Series(a.to_numpy(dtype=float), index=a.index.year)
    kpj = kj[temp_monthly.index.month.to_numpy() - 1]
    aa = a_by_year.reindex(temp_monthly.index.year).to_numpy()
    epj = kpj*135*(temp_monthly/26.5)**aa
    epj[epj.isna()] = epj.mean()
    epj[epj>40] = epj.mean()
    return epj


kj = np.array([0.84,0.83,1.03,1.11,1.24,1.25,1.27,1.18,1.04,0.96,0.83,0.81],dtype=float)
# kj: tabulated S.I.I. coefficients that depend on sun exposure, latitude 42.5°

### Monteporzio
h1 = df_raitemp['Rainfall_Monteporzio'].resample('m').mean()
temp1_media = df_raitemp['Temperature_Monteporzio'].resample('Y').mean()

# Not read again in the visible part of the notebook; kept so the name stays defined.
temp_magg0 = np.array(temp1_media[temp1_media>0]*4.75,dtype = float)

# Estimate of the water deficit (df_new holds the Monteporzio rows selected
# in the previous cell).
Epj1 = _monthly_epj(
    df_raitemp['Temperature_Monteporzio'],
    df_new['Temperature_Monteporzio'].resample('m').mean(),
    kj,
)
ylab = 'h + Epj '+ 'Monteporzio'
plt.subplot(2,1,1)
plt.plot(Epj1.index,Epj1,color='red',label='Epj')
plt.plot(h1.index,h1,color='blue',label='h')
plt.ylabel(ylab)
plt.xlabel('time')
plt.legend(loc='upper left')

### Velletri
df_new = df_raitemp.dropna(subset=['Rainfall_Velletri','Temperature_Velletri'])
h2 = df_raitemp['Rainfall_Velletri'].resample('m').mean()
temp2_media = df_raitemp['Temperature_Velletri'].resample('Y').mean()

# The mask now comes from the Velletri series itself (it used the Monteporzio means).
temp_magg0 = np.array(temp2_media[temp2_media>0]*4.75,dtype = float)

# Estimate of the monthly water deficit over the observed years.
Epj2 = _monthly_epj(
    df_raitemp['Temperature_Velletri'],
    df_new['Temperature_Velletri'].resample('m').mean(),
    kj,
)
ylab = 'h + Epj '+ 'Velletri'
plt.subplot(2,1,2)
plt.plot(Epj2.index,Epj2,color='red',label='Epj')
plt.plot(h2.index,h2,color='blue',label='h')
plt.ylabel(ylab)
plt.xlabel('time')
plt.legend(loc='upper left')


As anticipated, the evapotranspiration values for the years: 2015, 2016, 2017, 2018 have been replaced with the average

Exploratory Data Analysis (EDA)

In [None]:
# Depth and volume columns whose yearly availability is inspected.
depth_volume_cols = [
    'Depth_to_Groundwater_Pozzo_1', 'Depth_to_Groundwater_Pozzo_2',
    'Depth_to_Groundwater_Pozzo_3', 'Depth_to_Groundwater_Pozzo_4',
    'Depth_to_Groundwater_Pozzo_5', 'Depth_to_Groundwater_Pozzo_6',
    'Depth_to_Groundwater_Pozzo_7', 'Depth_to_Groundwater_Pozzo_8',
    'Depth_to_Groundwater_Pozzo_9', 'Volume_Pozzo_1', 'Volume_Pozzo_2',
    'Volume_Pozzo_3', 'Volume_Pozzo_4', 'Volume_Pozzo_5+6',
    'Volume_Pozzo_7', 'Volume_Pozzo_8', 'Volume_Pozzo_9',
]
# Number of non-missing readings per column and year.
app = df[depth_volume_cols].resample('Y').count()

# Show only the years in which every column has at least one reading.
app[app > 0].dropna()

The table reports the number of observations available for each year.
The data are not homogeneous, so there is little overlapping information between Depth and Volume.
We then evaluate the relationships between the extracted volumes.

In [None]:
import seaborn as sns
# Extraction volume columns compared pairwise.
cols = ['Volume_Pozzo_1', 'Volume_Pozzo_2','Volume_Pozzo_3', 'Volume_Pozzo_4', 'Volume_Pozzo_5+6',
       'Volume_Pozzo_7', 'Volume_Pozzo_8', 'Volume_Pozzo_9']

# Rows with a missing value in any volume column are dropped.
df_drna = df.dropna(subset=cols)

sns.set(style='whitegrid',context='notebook')
# `height` is the current name of the per-facet size argument (formerly `size`).
sns.pairplot(df_drna[cols],height=2.5)


Some volumes show a strong linear correlation, so we check the Pearson coefficients.

In [None]:
# Pearson correlation matrix of the volume columns (one variable per column).
volume_values = df_drna[cols].values
cm = np.corrcoef(volume_values, rowvar=False)
sns.set(font_scale=1.5)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size' : 10},
    yticklabels=cols,
    xticklabels=cols,
)


Evaluate 4 linear relations

In [None]:
# Regress Volume_Pozzo_9 on four other wells, one panel per predictor.
fig = plt.figure(figsize = (20,12))
from sklearn.linear_model import LinearRegression
y = df_drna['Volume_Pozzo_9'].values
predictors = ['Volume_Pozzo_2', 'Volume_Pozzo_3', 'Volume_Pozzo_4', 'Volume_Pozzo_7']
for panel, predictor in enumerate(predictors, start=1):
    X = df_drna[[predictor]].values
    # Refit for every predictor: the model used to be fitted once on
    # Volume_Pozzo_2, so all four panels showed that slope and intercept.
    slt = LinearRegression()
    slt.fit(X,y)
    plt.subplot(2,2,panel)
    # x and y go by keyword as 1-D arrays; regplot draws its own fit and does
    # not take the estimator as an argument.
    sns.regplot(x=X.ravel(),y=y,label='slope: %.5f Intercept: %.3f' % (slt.coef_[0],slt.intercept_))
    plt.xlabel(predictor)
    plt.ylabel('Volume_Pozzo_9')
    plt.legend(loc='best')


Estimate a multiple regression and evaluate the MSE RMSE and R ^ 2 coefficients

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Multiple regression of Volume_Pozzo_9 on every other volume column.
slt = LinearRegression()
# All entries of `cols` except the last one are the predictors.
X = df_drna[cols[:-1]]
y = df_drna['Volume_Pozzo_9'].values

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3,random_state=0)
slt.fit(X_train,y_train)
y_train_pred = slt.predict(X_train)
y_test_pred = slt.predict(X_test)

# Residual plot: prediction on the x axis, prediction minus observation on the y axis.
plt.scatter(y_train_pred, y_train_pred - y_train, c='blue',marker='o',label='Train Data')
plt.scatter(y_test_pred,y_test_pred-y_test, c = 'lightgreen',marker='s',label='Test Data')
plt.xlabel('Predicted Values')
plt.ylabel('residual')
plt.legend(loc='best')
# Zero-residual reference line.
plt.hlines(0,0,4000,color='red')

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Train/test error metrics of the multiple linear regression.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print('MSE: Train %.3f Test: %.3f' % (mse_train, mse_test))
print('RMSE: Train %.3f Test: %.3f' % (mse_train**0.5, mse_test**0.5))
print('R^2: Train %.3f Test: %.3f' % (r2_train, r2_test))




Unfortunately, the linear regression doesn't involve an overfitting.



Random Forest Method:


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random-forest regression on the same train/test split as the linear model.
# The split criterion is left at its default (mean squared error): the
# spelling 'mse' is rejected by recent scikit-learn releases, while the
# default works on old and new versions alike.
forest = RandomForestRegressor(n_estimators=1000,random_state=0,n_jobs=-1)
forest.fit(X_train,y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)
# Residual plot: prediction on the x axis, prediction minus observation on the y axis.
plt.scatter(y_train_pred,y_train_pred-y_train,c='black',marker='o',s=35,alpha=0.5,label='Train Data')
plt.scatter(y_test_pred,y_test_pred-y_test,c='lightgreen',marker='s',s=35,alpha=0.7,label='Test Data')
plt.xlabel('Predicted Values')
plt.ylabel('Residual')
plt.title('Random Forest Method')
plt.legend(loc='best')
# Zero-residual reference line.
plt.hlines(0,0,4500,color='red')

In [None]:
# Train/test error metrics of the random-forest model.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print('MSE: Train %.3f Test: %.3f' % (mse_train, mse_test))
print('RMSE: Train %.3f Test: %.3f' % (mse_train**0.5, mse_test**0.5))
print('R^2: Train %.3f Test: %.3f' % (r2_train, r2_test))



The regression to random forests led to an improvement




Calculate of the flow rate of aquifer exhaustion

In [None]:
# Fit log(Q) = log(Q0) + slope * t for every volume column in `cols`.
X = df_drna[cols].iloc[:,:].values

# Daily volume converted to an hourly flow rate.
Q = X/24

fig = plt.figure(figsize=(40,20))
fig.suptitle('Aqufer Doganella')
for i, col in enumerate(cols, start=1):
    q = Q[:,i-1]
    # Keep only strictly positive flow rates: log() is undefined elsewhere.
    q = q[q > 0]
    if np.size(q) < 2:
        # Not enough points to fit a line for this column.
        continue
    # The time axis (24, 48, ... hours) is rebuilt for every column. It used
    # to be rebuilt only for columns containing zeros and then reused, with
    # the wrong length, by the columns that followed.
    t = np.arange(24,(np.size(q)+1)*24,24).reshape(-1,1)
    q = np.log(q)
    slr = LinearRegression()
    slr.fit(t,q)
    plt.subplot(4,2,i)
    ylab = 'log(Q0) '+col[7:]
    # x and y go by keyword as 1-D arrays; regplot draws its own fit and does
    # not take the estimator as an argument.
    sns.regplot(x=t.ravel(),y=q,label='alpha: %.6f log(Q0): %.3f' % (slr.coef_[0],slr.intercept_))
    plt.xlabel('time')
    plt.ylabel(ylab)
    plt.legend(loc='best')
 

Forecast Model:

In [None]:
# Monthly mean of every depth-to-groundwater series, one panel per well.
fig = plt.figure(figsize=(20,10))
plt.suptitle('Depth to Groundwater')
cols = ['Depth_to_Groundwater_Pozzo_1', 'Depth_to_Groundwater_Pozzo_2',
       'Depth_to_Groundwater_Pozzo_3', 'Depth_to_Groundwater_Pozzo_4',
       'Depth_to_Groundwater_Pozzo_5', 'Depth_to_Groundwater_Pozzo_6',
       'Depth_to_Groundwater_Pozzo_7', 'Depth_to_Groundwater_Pozzo_8',
       'Depth_to_Groundwater_Pozzo_9']
for i, col in enumerate(cols, start=1):
    # Work on a copy so the assignment below never touches `df`.
    df1 = df.dropna(subset=[col]).copy()
    # Replace zero readings of the plotted column with the column mean
    # (previously cols[0] was patched on every iteration, via chained indexing).
    df1.loc[df1[col] == 0, col] = df1[col].mean()
    # Resample the cleaned frame; the raw `df` used to be resampled here,
    # which discarded the two steps above.
    df_mou = df1.resample('m').mean()
    plt.subplot(3,3,i)
    df_mou[col].plot()
    # Characters from position 21 onward of the column name label the panel.
    plt.ylabel(col[21:])

In this case the graphs do not seem to follow a stationary trend over time

In [None]:
# Partial autocorrelation of every monthly depth series, one panel per well.
fig = plt.figure(figsize=(20,10))
plt.suptitle('Partial Autocorrelation')
fig.subplots_adjust()
for i, col in enumerate(cols, start=1):
    df_drna = df.dropna(subset=[col]).copy()
    # Zero readings of the analysed column are replaced by the column mean
    # (previously cols[0] was patched on every iteration, via chained indexing).
    df_drna.loc[df_drna[col] == 0, col] = df_drna[col].mean()
    df_mou = df_drna.resample('m').mean()
    ax = fig.add_subplot(3,3,i)
    plot_pacf(df_mou[col],ax,title=None)
    plt.ylabel(col[21:])
    plt.xlabel('lag')
    

Noting the lag of the autocorrelation graphs, it can be seen that some trends do not have an acceptable level of significance

In [None]:
# Last timestamp that belongs to the training period.
split = "2019-12-31"
fig = plt.figure(figsize=(20,10))
plt.suptitle('Train - Test')
for i, col in enumerate(cols, start=1):
    df_drna = df.dropna(subset=[col]).copy()
    # Zero readings of the plotted column are replaced by the column mean
    # (previously cols[0] was patched on every iteration, via chained indexing).
    df_drna.loc[df_drna[col] == 0, col] = df_drna[col].mean()
    df_mou = df_drna.resample('m').mean()
    train = df_mou[df_mou.index <= split]
    # Strictly after the split: the month-end row stamped 2019-12-31 used to
    # fall in both the train and the test set.
    test = df_mou[(df_mou.index > split) & (df_mou.index < '2020-07-31')]
    plt.subplot(3,3,i)
    train[col].plot(label='train')
    test[col].plot(label='test')
    plt.ylabel(col[21:])
    plt.legend(loc='best')

In [None]:
# Fit one SARIMAX model per depth series and plot forecast, residuals and in-sample fit.
# A figure is opened per series inside the loop; the extra figure that used to
# be created before the loop stayed empty and has been dropped.
for i, col in enumerate(cols, start=1):
    df_drna = df.dropna(subset=[col]).copy()
    # Replace zero readings with the column mean (label-based assignment
    # instead of chained indexing).
    df_drna.loc[df_drna[col] == 0, col] = df_drna[col].mean()
    df_mou = df_drna.resample('m').mean()
    train = df_mou[df_mou.index <= split]
    # Strictly after the split so the month-end row equal to `split` is not
    # in both sets.
    test = df_mou[(df_mou.index > split) & (df_mou.index < '2020-07-31')]
    model = SARIMAX(train[col],order=(3,1,1),seasonal_order=(3,1,1,12),enforce_stationarity=False)
    fitted_model = model.fit(maxiter=200,disp=True)
    #fitted_model.summary() # analysis summary
    train_residui = fitted_model.resid # residuals of the fitted model on the training series
    in_sample = fitted_model.get_prediction(end=fitted_model.nobs).summary_frame() # in-sample prediction
    # Prediction for the 11 steps following the training series.
    out_of_simple = fitted_model.get_prediction(start = fitted_model.nobs,end=fitted_model.nobs+10).summary_frame()
    fig = plt.figure(figsize=(15,8))
    plt.suptitle('SARIMA '+col)
    # Panel 1: observed series, forecast and its interval.
    plt.subplot(3,1,1)
    train[col].plot(label='train')
    test[col].plot(label='test')
    out_of_simple['mean'].plot(lw=2,label='Prediction')
    plt.fill_between(out_of_simple.index,out_of_simple['mean_ci_lower'],out_of_simple['mean_ci_upper'],color='k',alpha=.10,label='mean error')
    plt.legend(loc='best')
    # Panel 2: training residuals.
    plt.subplot(3,1,2)
    train_residui.plot(label='Resudal')
    plt.legend(loc='best')
    # Panel 3: in-sample prediction against the measured values.
    plt.subplot(3,1,3)
    in_sample['mean'].plot(label='Forecasted Values')
    train[col].plot(label='Misured Values')
    plt.legend(loc='best')


In this case, given the volatility of the data, different SARIMA model values should have been expressed for each datasets

# Aquifer Luco

In [None]:
# Read the Luco aquifer table using its 'Date' column as the row index.
df = pd.read_csv(
    '/kaggle/input/acea-water-prediction/Aquifer_Luco.csv',
    index_col='Date',
)
# Swap the string index for a DatetimeIndex so that .resample() works below.
# NOTE(review): no format/dayfirst hint is given to pd.to_datetime; confirm
# the inferred day/month order against the CSV.
df_time = pd.to_datetime(df.index)
datetime_index = pd.DatetimeIndex(df_time.values)
df.index = datetime_index

# Percentage of missing values in every column.
n_rows = len(df)
df.isna().sum()/n_rows*100

represents the percentage of missing data for each characteristic of the datasets.

Evapotranspiration evaluation

In [None]:
rain_temp = df[['Rainfall_Simignano', 'Rainfall_Siena_Poggio_al_Vento',
       'Rainfall_Mensano', 'Rainfall_Montalcinello',
       'Rainfall_Monticiano_la_Pineta', 'Rainfall_Sovicille',
       'Rainfall_Ponte_Orgia', 'Rainfall_Scorgiano', 'Rainfall_Pentolina',
       'Rainfall_Monteroni_Arbia_Biena','Temperature_Siena_Poggio_al_Vento',
       'Temperature_Mensano', 'Temperature_Pentolina',
       'Temperature_Monteroni_Arbia_Biena']]
import seaborn as sns

# 12 sun-exposure coefficients, January first.
kj = np.array([0.84,0.83,1.03,1.11,1.24,1.25,1.27,1.18,1.04,0.96,0.83,0.81],dtype=float)


def _station_epj(frame, temp_col, kj):
    """Monthly potential evapotranspiration estimate for one thermometer.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to use, indexed by a DatetimeIndex.
    temp_col : str
        Name of the temperature column in ``frame``.
    kj : numpy.ndarray
        12 sun-exposure coefficients, January first.

    Returns
    -------
    pandas.Series
        Estimate indexed by month end; NaN where the month has no data.
    """
    temp_monthly = frame[temp_col].resample('m').mean()
    # NOTE(review): the heat-index term is evaluated as T**1.514 / 5 and the
    # quadratic coefficient of `a` is positive, as in the rest of the
    # notebook; confirm both against the S.I.I. reference.
    i = (temp_monthly**1.514/5).resample('Y').sum()
    a = 0.49239+0.01792*i+0.0000771*i**2+6.75*10**-7*i**3
    # Coefficient and exponent are looked up from each row's calendar month
    # and year, instead of tiling kj for a hard-coded number of years.
    a_by_year = pd.Series(a.to_numpy(dtype=float), index=a.index.year)
    kpj = kj[temp_monthly.index.month.to_numpy() - 1]
    aa = a_by_year.reindex(temp_monthly.index.year).to_numpy()
    # The base of the power is the monthly mean temperature, as in the Auser
    # and Doganella sections; this cell used to feed the heat-index term here.
    return kpj*135*(temp_monthly/26.5)**aa


###### Siena ######
df_drna = rain_temp.dropna(subset=['Rainfall_Simignano','Temperature_Siena_Poggio_al_Vento'])
Epj1 = _station_epj(df_drna, 'Temperature_Siena_Poggio_al_Vento', kj)
h1 = df_drna['Rainfall_Simignano'].resample('m').mean()
# Keep only the months with a positive estimate.
h1 = h1[Epj1>0]
Epj1 = Epj1[Epj1>0]

###### Mensano ######
df_drna = rain_temp.dropna(subset=['Rainfall_Montalcinello','Temperature_Mensano'])
Epj2 = _station_epj(df_drna, 'Temperature_Mensano', kj)
h2 = df_drna['Rainfall_Montalcinello'].resample('m').mean()
h2 = h2[Epj2>0]
Epj2 = Epj2[Epj2>0]

###### Pentolina ######
df_drna = rain_temp.dropna(subset=['Rainfall_Montalcinello','Temperature_Pentolina'])
Epj3 = _station_epj(df_drna, 'Temperature_Pentolina', kj)
h3 = df_drna['Rainfall_Montalcinello'].resample('m').mean()

###### Monteroni_Arbia_Biena ######
df_drna = rain_temp.dropna(subset=['Rainfall_Simignano','Temperature_Monteroni_Arbia_Biena'])
# Uses the Monteroni thermometer; this section used to read Temperature_Pentolina.
Epj4 = _station_epj(df_drna, 'Temperature_Monteroni_Arbia_Biena', kj)
h4 = df_drna['Rainfall_Simignano'].resample('m').mean()

### Plot Results
fig = plt.figure(figsize=(20,10))
plt.suptitle('Evapotraspiration - Rainfall')
panels = [
    ('Siena', Epj1, h1),
    ('Mensano', Epj2, h2),
    ('Pentolina', Epj3, h3),
    ('Monteroni', Epj4, h4),
]
for panel, (station, epj, rain) in enumerate(panels, start=1):
    plt.subplot(2,2,panel)
    ylab = 'h + Epj '+ station
    plt.plot(epj.index,epj,color='red',label='Epj')
    plt.plot(rain.index,rain,color='blue',label='h')
    plt.ylabel(ylab)
    plt.xlabel('time')
    plt.legend(loc='upper left')


Estimation of the Rate flow of exhaustion

In [None]:
# Fit log(Q) against a monthly step counter for every Luco volume column.
cols = ['Volume_Pozzo_1', 'Volume_Pozzo_3','Volume_Pozzo_4']

from sklearn.linear_model import LinearRegression

Q = df[cols]
Q = Q.resample('m').mean()
fig = plt.figure(figsize=(20,10))
fig.suptitle('Aquifer Luco')
for i, col in enumerate(cols, start=1):
    slr = LinearRegression()
    X = Q[col].dropna()
    # Only negative monthly means are kept and their sign is flipped, so the
    # logarithm is defined.
    X = X[X < 0]
    X = np.log(-X)
    # One step per retained month: 1, 2, ..., n.
    t = np.arange(1,(np.size(X)+1),1).reshape(-1,1)
    slr.fit(t,X)
    ylab = 'log(Q) ' + col[7:]
    plt.subplot(2,2,i)
    # x and y go by keyword; regplot draws its own fit and does not take the
    # estimator as an argument.
    # NOTE(review): the plotted x axis is an evenly spaced 2015-2020 range,
    # while the coefficients in the label come from the fit on the monthly
    # step counter `t`, and the axis label below says hours; confirm the
    # intended time unit.
    sns.regplot(x=np.linspace(2015,2020,np.size(X,0)),y=X,label='alpha: %.6f log(Q0): %.3f' % (slr.coef_[0],slr.intercept_))
    plt.xlabel('t[h]')
    plt.ylabel(ylab)
    plt.legend(loc='best')

Exploratory Data Analysis (EDA)

In [None]:
cols = ['Depth_to_Groundwater_Podere_Casetta',
       'Depth_to_Groundwater_Pozzo_1', 'Depth_to_Groundwater_Pozzo_3',
       'Depth_to_Groundwater_Pozzo_4','Volume_Pozzo_1', 'Volume_Pozzo_3',
       'Volume_Pozzo_4']

# Rows in which every selected column has a value.
df_drna = df[cols].dropna()

sns.set(style='whitegrid',context='notebook')
# Scatter-plot matrix. pairplot creates its own figure, so the empty
# plt.figure() that used to precede it is dropped; `size` was renamed
# `height` in seaborn 0.9 and the old name is no longer accepted.
sns.pairplot(df_drna,height=2.5)

Pearson's Correlation Coefficient

In [None]:
# Pearson correlation between the columns of df_drna, drawn as an annotated heat map.
fig = plt.figure(figsize=(20,18))
cm = np.corrcoef(df_drna.values, rowvar=False)
sns.set(font_scale=1.5)
axis_labels = df_drna.columns
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size' : 20},
    yticklabels=axis_labels,
    xticklabels=axis_labels,
)


Random Forest Method 


In [None]:
from sklearn.ensemble import RandomForestRegressor
# MSE / RMSE / R^2 are evaluated in the next cell.
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Features: the four depth-to-groundwater series.
# Target: row-wise mean of the three well volumes.
cols = ['Depth_to_Groundwater_Podere_Casetta',
       'Depth_to_Groundwater_Pozzo_1', 'Depth_to_Groundwater_Pozzo_3',
       'Depth_to_Groundwater_Pozzo_4']
df_new = df_drna[cols]
X = df_new.iloc[:,:].values
y = df_drna[['Volume_Pozzo_1', 'Volume_Pozzo_3',
       'Volume_Pozzo_4']].mean(axis=1).values

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)
# Fix: criterion='mse' was removed in scikit-learn 1.2. Squared error is the
# default in every release, so the argument is simply omitted.
forest = RandomForestRegressor(n_estimators=1000,random_state=0,n_jobs=-1)
forest.fit(X_train,y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

# Residuals (prediction - observation) against the predicted value.
plt.scatter(y_train_pred,y_train_pred-y_train,c='black',marker='o',s=35,alpha=0.5,label='dati di addestramento')
plt.scatter(y_test_pred,y_test_pred-y_test,c='lightgreen',marker='s',s=35,alpha=0.7,label='dati di test')
plt.xlabel('valori previsti')
plt.ylabel('residui aleatori')
plt.title('Regressione a foreste casuali')
plt.legend(loc='best')
plt.hlines(0,-220,-140,color='red')


Estimation coefficients MSE RMSE and R^2

In [None]:
# Error metrics of the random forest on the training and on the held-out split.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print('MSE: Train %.3f Test: %.3f' % (mse_train, mse_test))
print('RMSE: Train %.3f Test: %.3f' % (mse_train**0.5, mse_test**0.5))
print('R^2: Train %.3f Test: %.3f' % (r2_train, r2_test))



Forecast Model

In [None]:
# Monthly mean of each depth-to-groundwater series, one panel per well.
fig = plt.figure(figsize=(20,10))
plt.suptitle('Depth to Groundwater')
cols = ['Depth_to_Groundwater_Podere_Casetta',
       'Depth_to_Groundwater_Pozzo_1', 'Depth_to_Groundwater_Pozzo_3',
       'Depth_to_Groundwater_Pozzo_4']
for i, col in enumerate(cols, start=1):
    df1 = df.dropna(subset=[col]).copy()
    # Zero readings are replaced by the column mean. Fix: the replacement now
    # targets the column being plotted (it always hit cols[0]) and uses .loc
    # instead of chained indexing, which is unreliable and warns in pandas.
    df1.loc[df1[col] == 0, col] = df1[col].mean(axis=0)
    df_mou = df1.resample('m').mean()
    plt.subplot(2,2,i)
    df_mou[col].plot()
    plt.ylabel(col[21:])

In [None]:
# Partial autocorrelation of each monthly depth-to-groundwater series.
# Relies on `cols` from the previous cell.
fig = plt.figure(figsize=(20,10))
plt.suptitle('Partial Autocorrelation')
fig.subplots_adjust()
for i, col in enumerate(cols, start=1):
    df_drna = df.dropna(subset=[col]).copy()
    # Fix: zeros are replaced in the analysed column (was always cols[0]),
    # with .loc instead of chained indexing.
    df_drna.loc[df_drna[col] == 0, col] = df_drna[col].mean(axis=0)
    df_mou = df_drna.resample('m').mean()
    ax = fig.add_subplot(2,2,i)
    plot_pacf(df_mou[col],ax=ax,title=None)
    plt.ylabel(col[21:])
    plt.xlabel('lag')
    

In [None]:
# SARIMA fit per depth-to-groundwater series.
# Relies on `cols` from the cell above and on `split` (last date of the
# training window) defined by an earlier cell.
# Fix: the figure created before the loop was never drawn on and has been
# removed; each series gets its own figure below.
for i, col in enumerate(cols, start=1):
    df_drna = df.dropna(subset=[col]).copy()
    # Zero readings are replaced by the column mean (.loc instead of chained indexing).
    df_drna.loc[df_drna[col] == 0, col] = df_drna[col].mean(axis=0)
    df_mou = df_drna.resample('m').mean()
    train = df_mou[df_mou.index <= split]
    test = df_mou[df_mou.index >= split]
    test = test[test.index < '2020-07-31']
    model = SARIMAX(train[col],order=(3,1,1),seasonal_order=(3,1,1,12),enforce_stationarity=False)
    fitted_model = model.fit(maxiter=200,disp=True)
    train_residui = fitted_model.resid  # residuals of the fit on the training data
    in_sample = fitted_model.get_prediction(end=fitted_model.nobs).summary_frame()  # fitted values
    # Prediction for the 11 steps that follow the training window.
    out_of_simple = fitted_model.get_prediction(start = fitted_model.nobs,end=fitted_model.nobs+10).summary_frame()

    fig = plt.figure(figsize=(15,8))
    plt.suptitle('SARIMA '+col)
    plt.subplot(3,1,1)
    train[col].plot(label='train')
    test[col].plot(label='test')
    out_of_simple['mean'].plot(lw=2,label='Prediction')
    plt.fill_between(out_of_simple.index,out_of_simple['mean_ci_lower'],out_of_simple['mean_ci_upper'],color='k',alpha=.10,label='mean error')
    plt.legend(loc='best')
    plt.subplot(3,1,2)
    train_residui.plot(label='Resudal')
    plt.legend(loc='best')
    plt.ylim([-2,2])
    plt.subplot(3,1,3)
    in_sample['mean'].plot(label='Forecasted Values')
    train[col].plot(label='Misured Values')
    plt.legend(loc='best')
    plt.ylabel(col)


Given the scarcity of data, it was not possible to obtain a reliable forecast of the depth to groundwater.

# Aquifer Petrignano

In [None]:
# Read the Petrignano aquifer table with 'Date' as index, then replace that
# index with a DatetimeIndex.
# NOTE(review): pd.to_datetime gets neither dayfirst= nor format= - confirm how
# ambiguous dates such as 05/03/2012 in the CSV are meant to be read.
df = pd.read_csv('/kaggle/input/acea-water-prediction/Aquifer_Petrignano.csv', index_col='Date')
df_time = pd.to_datetime(df.index)
datetime_index = pd.DatetimeIndex(df_time.values)
df.index = datetime_index
# Percentage of missing values per column (shown as the cell output).
df.isna().sum() / len(df) * 100

 Potential Evapotranspiration Estimation 

In [None]:
# Day-length correction factors, January..December.
kj = np.array([0.84,0.83,1.03,1.11,1.24,1.25,1.27,1.18,1.04,0.96,0.83,0.81],dtype=float)


def thornthwaite_pet(temp_daily, k_month):
    """Monthly potential evapotranspiration after Thornthwaite (1948).

    Fixes over the previous inline code:
    * the monthly heat index is (T/5)**1.514, not T**1.514/5;
    * the quadratic term of the exponent `a` is negative;
    * the base of the power is the monthly temperature, not the heat index;
    * the coefficients are picked by calendar month and year taken from the
      index, not by a hard-coded number of complete years.

    Parameters
    ----------
    temp_daily : pandas.Series
        Air temperature (assumed to be in degrees Celsius) on a DatetimeIndex.
    k_month : array-like of 12 floats
        Correction factor for January..December.

    Returns
    -------
    pandas.Series
        One value per calendar month spanned by `temp_daily`.
    """
    # Months with a negative mean temperature contribute nothing.
    t_month = temp_daily.resample('m').mean().clip(lower=0)
    # Annual heat index: yearly sum of (T/5)^1.514, broadcast back to its months.
    heat_index = ((t_month/5)**1.514).groupby(t_month.index.year).transform('sum')
    a = 0.49239 + 0.01792*heat_index - 0.0000771*heat_index**2 + 6.75e-7*heat_index**3
    k = np.asarray(k_month, dtype=float)[np.asarray(t_month.index.month) - 1]
    # 135 mm at 26.5 degrees is the point where Thornthwaite's curves converge.
    return k*135*(t_month/26.5)**a


# Rows in which every column has a value.
df_drna = df.dropna()

###### Bastia Umbra ######
Epj1 = thornthwaite_pet(df_drna['Temperature_Bastia_Umbra'], kj)
h = df_drna['Rainfall_Bastia_Umbra'].resample('m').mean()

#%%######### Petrignano #######
Epj2 = thornthwaite_pet(df_drna['Temperature_Petrignano'], kj)

#%% Plot
# Both panels compare against the Bastia Umbra rainfall `h`.
fig = plt.figure(figsize=(16,8))
plt.suptitle('Evapotraspiration - Rainfall')
for position, (station, epj_series) in enumerate([('Bastia_Umbra', Epj1), ('Petrignano', Epj2)], start=1):
    plt.subplot(2,1,position)
    ylab = 'h + Epj ' + station
    plt.plot(epj_series.index,epj_series,color='red',label='Epj')
    plt.plot(h.index,h,color='blue',label='h')
    plt.ylabel(ylab)
    plt.xlabel('time')
    plt.legend(loc='upper left')



Flow Rate of exhaustion Estimation

In [None]:
import seaborn as sns
from sklearn.linear_model import LinearRegression

# Fit log(Q) = log(Q0) + alpha * t on the monthly mean abstraction volume.
slr = LinearRegression()
Q = df['Volume_C10_Petrignano'].dropna()
Q = Q.resample('m').mean()
fig = plt.figure(figsize=(16,8))
# Fix: the title said 'Aquifer Luco' (copied from the previous section).
fig.suptitle('Aquifer Petrignano')
X = Q.values
# Only negative values are kept; the sign is flipped so the log is defined.
X = X[X < 0]
X = np.log(-X)
# Time axis for the fit: consecutive month numbers 1..n.
t = np.arange(1,(np.size(X)+1),1).reshape(-1,1)

slr.fit(t,X)
# Fix: x/y passed by keyword (seaborn >= 0.12 rejects positional x/y) and the
# fitted model is no longer passed in regplot's `data` slot.
sns.regplot(x=np.linspace(2007,2020,np.size(X,0)), y=X,
            label='alpha: %.6f log(Q0): %.3f' % (slr.coef_[0],slr.intercept_))
plt.xlabel('t[h]')
plt.ylabel('log(Q) Petrignano')
plt.legend(loc='best')

Exploratory Data Analysis (EDA)

In [None]:
cols = ['Depth_to_Groundwater_P24',
       'Depth_to_Groundwater_P25','Volume_C10_Petrignano',
       'Hydrometry_Fiume_Chiascio_Petrignano']

# Rows in which every selected column has a value.
df_drna = df[cols].dropna()

sns.set(style='whitegrid',context='notebook')
# Scatter-plot matrix. pairplot creates its own figure, so the empty
# plt.figure() that used to precede it is dropped; `size` was renamed
# `height` in seaborn 0.9 and the old name is no longer accepted.
sns.pairplot(df_drna,height=2.5)

Pearson's Correlation Coefficient

In [None]:
# Pearson correlation between the columns of df_drna, drawn as an annotated heat map.
fig = plt.figure(figsize=(20,18))
cm = np.corrcoef(df_drna.values, rowvar=False)
sns.set(font_scale=1.5)
axis_labels = df_drna.columns
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size' : 20},
    yticklabels=axis_labels,
    xticklabels=axis_labels,
)


Random Forest Method

In [None]:
from sklearn.ensemble import RandomForestRegressor
# MSE / RMSE / R^2 are evaluated in the next cell.
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Fix: 'Volume_C10_Petrignano' was listed twice, so the target was also a
# feature (the forest predicted the volume from itself) and the duplicated
# column name made `y` a two-column array. Features and target are now disjoint.
features = ['Depth_to_Groundwater_P24',
            'Depth_to_Groundwater_P25',
            'Hydrometry_Fiume_Chiascio_Petrignano']
target = 'Volume_C10_Petrignano'
cols = features + [target]
df_new = df_drna[cols]

X = df_new[features].values
y = df_new[target].values

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)
# criterion='mse' was removed in scikit-learn 1.2; squared error is the default.
forest = RandomForestRegressor(n_estimators=1000,random_state=0,n_jobs=-1)
forest.fit(X_train,y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

# Residuals (prediction - observation) against the predicted value.
plt.scatter(y_train_pred,y_train_pred-y_train,c='black',marker='o',s=35,alpha=0.5,label='dati di addestramento')
plt.scatter(y_test_pred,y_test_pred-y_test,c='lightgreen',marker='s',s=35,alpha=0.7,label='dati di test')
plt.xlabel('valori previsti')
plt.ylabel('residui aleatori')
plt.title('Regressione a foreste casuali')
plt.legend(loc='best')
# The zero line spans the predicted range. The former fixed limits
# (ylim +-100, x from -40000 to 0) were tuned to the leaking model.
plt.hlines(0,min(y_train_pred.min(),y_test_pred.min()),max(y_train_pred.max(),y_test_pred.max()),color='red')



Estimation coefficients MSE RMSE and R^2

In [None]:
# Error metrics of the random forest on the training and on the held-out split.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print('MSE: Train %.3f Test: %.3f' % (mse_train, mse_test))
print('RMSE: Train %.3f Test: %.3f' % (mse_train**0.5, mse_test**0.5))
print('R^2: Train %.3f Test: %.3f' % (r2_train, r2_test))


Forecast Model

In [None]:
# Monthly mean of each depth-to-groundwater series, one panel per well.
fig = plt.figure(figsize=(20,10))
plt.suptitle('Depth to Groundwater')
cols = ['Depth_to_Groundwater_P24',
       'Depth_to_Groundwater_P25']
for i, col in enumerate(cols, start=1):
    # Fix: work on a per-column copy. The loop used to reassign the global
    # `df`, permanently dropping rows for every later cell.
    df1 = df.dropna(subset=[col]).copy()
    # Fix: zeros are replaced in the plotted column (was always cols[0]),
    # with .loc instead of chained indexing.
    df1.loc[df1[col] == 0, col] = df1[col].mean(axis=0)
    df_mou = df1.resample('m').mean()
    plt.subplot(2,1,i)
    df_mou[col].plot()
    plt.ylabel(col[21:])

In [None]:
# Partial autocorrelation of each monthly depth-to-groundwater series.
# Relies on `cols` from the previous cell.
fig = plt.figure(figsize=(20,10))
plt.suptitle('Partial Autocorrelation')
fig.subplots_adjust()
for i, col in enumerate(cols, start=1):
    df_drna = df.dropna(subset=[col]).copy()
    # Fix: zeros are replaced in the analysed column (was always cols[0]),
    # with .loc instead of chained indexing.
    df_drna.loc[df_drna[col] == 0, col] = df_drna[col].mean(axis=0)
    df_mou = df_drna.resample('m').mean()
    ax = fig.add_subplot(3,2,i)
    plot_pacf(df_mou[col],ax=ax,title=None)
    plt.ylabel(col[21:])
    plt.xlabel('lag')
    

In [None]:
# SARIMA fit per depth-to-groundwater series.
# Relies on `cols` from the cell above and on `split` (last date of the
# training window) defined by an earlier cell.
# Fix: the figure created before the loop was never drawn on and has been
# removed; each series gets its own figure below.
for i, col in enumerate(cols, start=1):
    df_drna = df.dropna(subset=[col]).copy()
    # Zero readings are replaced by the column mean (.loc instead of chained indexing).
    df_drna.loc[df_drna[col] == 0, col] = df_drna[col].mean(axis=0)
    df_mou = df_drna.resample('m').mean()
    train = df_mou[df_mou.index <= split]
    test = df_mou[df_mou.index >= split]
    test = test[test.index < '2020-07-31']
    model = SARIMAX(train[col],order=(3,1,1),seasonal_order=(3,1,1,12),enforce_stationarity=False)
    fitted_model = model.fit(maxiter=200,disp=True)
    train_residui = fitted_model.resid  # residuals of the fit on the training data
    in_sample = fitted_model.get_prediction(end=fitted_model.nobs).summary_frame()  # fitted values
    # Prediction for the 11 steps that follow the training window.
    out_of_simple = fitted_model.get_prediction(start = fitted_model.nobs,end=fitted_model.nobs+10).summary_frame()

    fig = plt.figure(figsize=(15,8))
    plt.suptitle('SARIMA '+col)
    plt.subplot(3,1,1)
    train[col].plot(label='train')
    test[col].plot(label='test')
    out_of_simple['mean'].plot(lw=2,label='Prediction')
    plt.fill_between(out_of_simple.index,out_of_simple['mean_ci_lower'],out_of_simple['mean_ci_upper'],color='k',alpha=.10,label='mean error')
    plt.legend(loc='upper left')
    plt.subplot(3,1,2)
    train_residui.plot(label='Resudal')
    plt.legend(loc='best')
    plt.ylim([-2,2])
    plt.subplot(3,1,3)
    in_sample['mean'].plot(label='Forecasted Values')
    train[col].plot(label='Misured Values')
    plt.legend(loc='best')
    plt.ylabel(col)


# Lake Bilancino

In [None]:
# Read the Lake Bilancino table with 'Date' as index, then replace that index
# with a DatetimeIndex.
# NOTE(review): pd.to_datetime gets neither dayfirst= nor format= - confirm how
# ambiguous dates such as 05/03/2012 in the CSV are meant to be read.
df = pd.read_csv('/kaggle/input/acea-water-prediction/Lake_Bilancino.csv', index_col='Date')
df_time = pd.to_datetime(df.index)
datetime_index = pd.DatetimeIndex(df_time.values)
df.index = datetime_index
# Percentage of missing values per column (shown as the cell output).
df.isna().sum() / len(df) * 100

The table above shows the percentage of missing data in each column.

Potential Evapotranspiration Evaluation

In [None]:
###### Le Croci ######

# Day-length correction factors, January..December.
kj = np.array([0.84,0.83,1.03,1.11,1.24,1.25,1.27,1.18,1.04,0.96,0.83,0.81],dtype=float)


def thornthwaite_pet(temp_daily, k_month):
    """Monthly potential evapotranspiration after Thornthwaite (1948).

    Fixes over the previous inline code:
    * the monthly heat index is (T/5)**1.514, not T**1.514/5;
    * the quadratic term of the exponent `a` is negative;
    * the base of the power is the monthly temperature, not the heat index;
    * the coefficients are picked by calendar month and year taken from the
      index, not by a hard-coded number of complete years.

    Parameters
    ----------
    temp_daily : pandas.Series
        Air temperature (assumed to be in degrees Celsius) on a DatetimeIndex.
    k_month : array-like of 12 floats
        Correction factor for January..December.

    Returns
    -------
    pandas.Series
        One value per calendar month spanned by `temp_daily`.
    """
    # Months with a negative mean temperature contribute nothing.
    t_month = temp_daily.resample('m').mean().clip(lower=0)
    # Annual heat index: yearly sum of (T/5)^1.514, broadcast back to its months.
    heat_index = ((t_month/5)**1.514).groupby(t_month.index.year).transform('sum')
    a = 0.49239 + 0.01792*heat_index - 0.0000771*heat_index**2 + 6.75e-7*heat_index**3
    k = np.asarray(k_month, dtype=float)[np.asarray(t_month.index.month) - 1]
    # 135 mm at 26.5 degrees is the point where Thornthwaite's curves converge.
    return k*135*(t_month/26.5)**a


# Rows in which every column has a value.
df_drna = df.dropna()

Epj = thornthwaite_pet(df_drna['Temperature_Le_Croci'], kj)
h = df_drna['Rainfall_Le_Croci'].resample('m').mean()

fig = plt.figure(figsize=(16,8))
ylab = 'h + Epj '+ 'Le_Croci'
plt.plot(Epj.index,Epj,color='red',label='Epj')
plt.plot(h.index,h,color='blue',label='h')
plt.ylabel(ylab)
plt.xlabel('time')
plt.legend(loc='upper left')


Flow Rate Exhaustion Estimation

In [None]:
import seaborn as sns
from sklearn.linear_model import LinearRegression

# Fit log(Q) = log(Q0) + alpha * t on the monthly mean flow rate.
slr = LinearRegression()
Q = df['Flow_Rate'].dropna()
Q = Q.resample('m').mean()
fig = plt.figure(figsize=(16,8))
fig.suptitle('Lake Bilancino')
X = Q.values
# Keep strictly positive values so the log is defined, then scale by the
# number of seconds in a day (3600*24).
X = X[X > 0]
X = X*3600*24
X = np.log(X)
# Time axis for the fit: consecutive month numbers 1..n.
t = np.arange(1,(np.size(X)+1),1).reshape(-1,1)
slr.fit(t,X)
# Fix: x/y passed by keyword (seaborn >= 0.12 rejects positional x/y) and the
# fitted model is no longer passed in regplot's `data` slot.
sns.regplot(x=np.linspace(2007,2020,np.size(X,0)), y=X,
            label='alpha: %.6f log(Q0): %.3f' % (slr.coef_[0],slr.intercept_))
plt.xlabel('time[g]')
plt.ylabel('log(Q)')
plt.legend(loc='best')

Exploratory Data Analysis (EDA)

In [None]:
cols = ['Lake_Level','Flow_Rate']
# Rows in which both columns have a value.
df_drna = df[cols].dropna()

sns.set(style='whitegrid',context='notebook')
# Scatter-plot matrix. pairplot creates its own figure, so the empty
# plt.figure() that used to precede it is dropped; `size` was renamed
# `height` in seaborn 0.9 and the old name is no longer accepted.
sns.pairplot(df_drna,height=2.5)

Pearson's Correlation Coefficient

In [None]:
# Pearson correlation between the columns of df_drna, drawn as an annotated heat map.
cm = np.corrcoef(df_drna.values, rowvar=False)
sns.set(font_scale=1.5)
axis_labels = df_drna.columns
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size' : 20},
    yticklabels=axis_labels,
    xticklabels=axis_labels,
)


Non linear regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

# Flow rate as a polynomial (degree 1, 2, 3) function of the lake level.
X = df_drna[['Lake_Level']].values
y = df_drna['Flow_Rate'].values

regr = LinearRegression()

# Polynomial feature expansions of the predictor.
quadratic = PolynomialFeatures(degree=2)
cubic = PolynomialFeatures(degree=3)
X_quad = quadratic.fit_transform(X)
X_cubic = cubic.fit_transform(X)

# Grid (step 1) over the observed range, used to draw the fitted curves.
X_fit = np.arange(X.min(),X.max(),1)[:, np.newaxis]

# Linear fit
regr = regr.fit(X,y)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y, regr.predict(X))

# Quadratic fit (the grid is only transformed; the expansion is already fitted)
regr = regr.fit(X_quad,y)
y_quad_fit = regr.predict(quadratic.transform(X_fit))
quadratic_r2 = r2_score(y,regr.predict(X_quad))

# Cubic fit
regr = regr.fit(X_cubic,y)
y_cubic_fit = regr.predict(cubic.transform(X_fit))
cubic_r2 = r2_score(y, regr.predict(X_cubic))

# Plot results
plt.scatter(X,y,label='punti di addestramento',color='lightgray')
plt.plot(X_fit,y_lin_fit,label='lineare (d=1), $R^2=%.2f$' % linear_r2,color='blue',lw=2,linestyle=':')
plt.plot(X_fit,y_quad_fit,label='quadratic (d=2), $R^2=%.2f$' % quadratic_r2,color='red',lw=2,linestyle='-')
plt.plot(X_fit,y_cubic_fit,label='cubic (d=3), $R^2=%.2f$' % cubic_r2,color='green',lw=2,linestyle='--')
# Fix: the axis labels named columns of another dataset
# ('Hydrometry_Monte_S_Quirico' / 'Depth_to_groundwater_PAG').
plt.xlabel('Lake_Level')
plt.ylabel('Flow_Rate')
plt.legend(loc='best')
 

Forecast Model

In [None]:
cols = ['Lake_Level', 'Flow_Rate']

fig = plt.figure(figsize=(16,10))
plt.suptitle('Lake Bilancino')

# Monthly means. This does not depend on the column being plotted, so it is
# computed once instead of once per loop iteration.
df_mou = df.resample('m').mean()
for i, col in enumerate(cols, start=1):
    plt.subplot(2,1,i)
    df_mou[col].plot()
    plt.ylabel(col)

In [None]:
# Partial autocorrelation of the monthly lake level and flow rate.
# Relies on `cols` and `df_mou` from the previous cell.
fig = plt.figure(figsize=(20,10))
plt.suptitle('Partial Autocorrelation')
fig.subplots_adjust()
for i, col in enumerate(cols, start=1):
    ax = fig.add_subplot(2,1,i)
    plot_pacf(df_mou[col],ax=ax,title=None)
    # Fix: the label was col[21:], which is an empty string for these short
    # column names; the full column name is used instead.
    plt.ylabel(col)
    plt.xlabel('lag')
  

In this case we have lake level and flow rate which are two completely different quantities, so we will apply the SARIMA method twice to best calibrate the parameters.

In [None]:
#%%  SARIMA   LAKE LEVEL
# Relies on `cols` and `df_mou` (monthly means) from the cells above.
split = "2019-12-31"  # last date of the training window
fig = plt.figure(figsize=(15,8))
plt.suptitle('LAKE LEVEL')
train = df_mou[df_mou.index <= split]
test = df_mou[df_mou.index >= split]
test = test[test.index < '2020-07-31']
model = SARIMAX(train[cols[0]],order=(2,1,0),seasonal_order=(4,1,0,5),enforce_stationarity=False)
fitted_model = model.fit(maxiter=200,disp=True)
train_residui = fitted_model.resid  # residuals of the fit on the training data
in_sample = fitted_model.get_prediction(end=fitted_model.nobs).summary_frame()  # fitted values
# Prediction for the 11 steps after the training window (it used to be
# computed twice with identical arguments).
out_of_simple = fitted_model.get_prediction(start = fitted_model.nobs,end=fitted_model.nobs+10).summary_frame()

plt.subplot(3,1,1)
train[cols[0]].plot(label='train')
test[cols[0]].plot(label='test')
out_of_simple['mean'].plot(lw=2,label='Prediction')
plt.fill_between(out_of_simple.index,out_of_simple['mean_ci_lower'],out_of_simple['mean_ci_upper'],color='k',alpha=.10,label='mean error')
plt.legend(loc='upper left')
plt.subplot(3,1,2)
train_residui.plot(label='Resudal')
plt.legend(loc='best')
plt.ylim([-50,50])
plt.ylabel('Residual')
plt.subplot(3,1,3)
train[cols[0]].plot(label='misure Value')
in_sample['mean'].plot(label='Forecasted Values')
plt.legend(loc='best')
plt.ylim([200,300])


In [None]:
#%%  SARIMA   FLOW RATE
# Relies on `cols`, `train` and `test` from the lake-level cell above.
fig = plt.figure(figsize=(15,8))
plt.suptitle('FOLW RATE')
model = SARIMAX(train[cols[1]],order=(1,1,1),seasonal_order=(1,2,1,10),enforce_stationarity=False)
fitted_model = model.fit(maxiter=200,disp=True)
train_residui = fitted_model.resid  # residuals of the fit on the training data
in_sample = fitted_model.get_prediction(end=fitted_model.nobs).summary_frame()  # fitted values
# Prediction for the 11 steps after the training window (it used to be
# computed twice with identical arguments).
out_of_simple = fitted_model.get_prediction(start = fitted_model.nobs,end=fitted_model.nobs+10).summary_frame()

plt.subplot(3,1,1)
train[cols[1]].plot(label='train')
test[cols[1]].plot(label='test')
out_of_simple['mean'].plot(lw=2,label='Prediction')
plt.fill_between(out_of_simple.index,out_of_simple['mean_ci_lower'],out_of_simple['mean_ci_upper'],color='k',alpha=.10,label='mean error')
plt.legend(loc='upper left')
plt.subplot(3,1,2)
train_residui.plot(label='Resudal')
plt.legend(loc='best')
plt.ylim([-50,50])
plt.ylabel('Residual')
plt.subplot(3,1,3)
train[cols[1]].plot(label='misure Value')
in_sample['mean'].plot(label='Forecasted Values')
plt.legend(loc='best')


In this case, the forecast led to excellent results

# River Arno

In [None]:
# Read the River Arno table with 'Date' as index, then replace that index with
# a DatetimeIndex.
# NOTE(review): pd.to_datetime gets neither dayfirst= nor format= - confirm how
# ambiguous dates such as 05/03/2012 in the CSV are meant to be read.
df = pd.read_csv('/kaggle/input/acea-water-prediction/River_Arno.csv', index_col='Date')
df_time = pd.to_datetime(df.index)
datetime_index = pd.DatetimeIndex(df_time.values)
df.index = datetime_index
# Percentage of missing values per column (shown as the cell output).
df.isna().sum() / len(df) * 100

A large share of the data is missing, and the gaps are scattered throughout the series.

In [None]:
Potential Evapotranspiration Estimation

In [None]:
# Day-length correction factors, January..December.
kj = np.array([0.84,0.83,1.03,1.11,1.24,1.25,1.27,1.18,1.04,0.96,0.83,0.81],dtype=float)


def thornthwaite_pet(temp_daily, k_month):
    """Monthly potential evapotranspiration after Thornthwaite (1948).

    Fixes over the previous inline code:
    * the monthly heat index is (T/5)**1.514, not T**1.514/5;
    * the quadratic term of the exponent `a` is negative;
    * the coefficients are picked by calendar month and year taken from the
      index, not by a hard-coded number of complete years.

    Parameters
    ----------
    temp_daily : pandas.Series
        Air temperature (assumed to be in degrees Celsius) on a DatetimeIndex.
    k_month : array-like of 12 floats
        Correction factor for January..December.

    Returns
    -------
    pandas.Series
        One value per calendar month spanned by `temp_daily`.
    """
    # Months with a negative mean temperature contribute nothing.
    t_month = temp_daily.resample('m').mean().clip(lower=0)
    # Annual heat index: yearly sum of (T/5)^1.514, broadcast back to its months.
    heat_index = ((t_month/5)**1.514).groupby(t_month.index.year).transform('sum')
    a = 0.49239 + 0.01792*heat_index - 0.0000771*heat_index**2 + 6.75e-7*heat_index**3
    k = np.asarray(k_month, dtype=float)[np.asarray(t_month.index.month) - 1]
    # 135 mm at 26.5 degrees is the point where Thornthwaite's curves converge.
    return k*135*(t_month/26.5)**a


# Restrict the analysis to 2004-2016.
df_drna = df.loc['2004-01-01':'2016-12-31']
Epj = thornthwaite_pet(df_drna['Temperature_Firenze'], kj)

# Monthly mean rainfall at the five gauges compared against Epj.
rain = ['Rainfall_Le_Croci', 'Rainfall_Cavallina', 'Rainfall_S_Agata',
       'Rainfall_Mangona', 'Rainfall_S_Piero']
h = df_drna[rain].resample('m').mean()

fig = plt.figure(figsize=(20,10))
for d, gauge in enumerate(rain, start=1):
    plt.subplot(3,2,d)
    ylab = 'h + Epj '+ gauge[9:]
    plt.plot(Epj.index,Epj,color='red',label='Epj')
    plt.plot(h.index,h[gauge],color='blue',label='h')
    plt.ylabel(ylab)
    plt.xlabel('time')
    plt.legend(loc='best')
    

In [None]:
Variation of the Hydrometric Level over Time

In [None]:
# Hydrometric level at Nave di Rosano. dropna() already removes the missing
# values, so the second null filter that followed it was redundant and is gone.
imetro = df['Hydrometry_Nave_di_Rosano'].dropna()

# Monthly means above 1; the months above 2 are highlighted with markers.
imetro_mean = imetro.resample('m').mean()
imetro_mean = imetro_mean[imetro_mean > 1]
mag = imetro_mean[imetro_mean > 2]

# A single figure: the extra plt.figure() call only produced an empty plot.
fig = plt.figure(figsize=(15,7))
plt.plot(imetro_mean.index,imetro_mean)
plt.plot(mag.index,mag,'o')
# Red line: mean of all non-missing readings (not of the monthly means).
plt.hlines(imetro.mean(),xmin=imetro_mean.index[0],xmax=imetro_mean.index[-1],color='red',label='mean')
plt.ylabel('Hydrometry_Nave_di_Rosano')
plt.legend(loc='best')


Forecast Model

In [None]:
# Monthly mean of the non-missing hydrometric readings, shown as a line plot.
hydrometry_column = 'Hydrometry_Nave_di_Rosano'
idrometro = df[hydrometry_column].dropna()
idrometro_mon = idrometro.resample('m').mean()
idrometro_mon.plot.line()

In [None]:
# Partial autocorrelation of the monthly hydrometric level. plot_pacf returns
# the Figure; the trailing semicolon keeps Jupyter from echoing it as the cell
# result, which would render the same plot a second time.
plot_pacf(idrometro_mon);

In [None]:
# SARIMA fit of the monthly hydrometric level (`idrometro_mon` from the cell above).
split = "2019-12-31"  # last date of the training window
fig = plt.figure(figsize=(15,8))
plt.suptitle('Hydrometry River Arno')
train = idrometro_mon[idrometro_mon.index <= split]
test = idrometro_mon[idrometro_mon.index >= split]
test = test[test.index < '2020-07-31']
model = SARIMAX(train,order=(2,1,3),seasonal_order=(2,1,3,6),enforce_stationarity=False)
fitted_model = model.fit(maxiter=200,disp=True)
train_residui = fitted_model.resid  # residuals of the fit on the training data
in_sample = fitted_model.get_prediction(end=fitted_model.nobs).summary_frame()  # fitted values
# Prediction for the 11 steps after the training window (it used to be
# computed twice with identical arguments).
out_of_simple = fitted_model.get_prediction(start = fitted_model.nobs,end=fitted_model.nobs+10).summary_frame()

plt.subplot(3,1,1)
train.plot(label='train')
test.plot(label='test')
out_of_simple['mean'].plot(lw=2,label='Prediction')
plt.fill_between(out_of_simple.index,out_of_simple['mean_ci_lower'],out_of_simple['mean_ci_upper'],color='k',alpha=.10,label='mean error')
plt.legend(loc='upper left')
plt.subplot(3,1,2)
train_residui.plot(label='Resudal')
plt.legend(loc='best')
plt.ylabel('Residual')
plt.subplot(3,1,3)
train.plot(label='misure Value')
in_sample['mean'].plot(label='Forecasted Values')
plt.legend(loc='best')


# Water Spring Amiata

In [None]:
# Read the Amiata water-spring table with 'Date' as index, then replace that
# index with a DatetimeIndex.
# NOTE(review): pd.to_datetime gets neither dayfirst= nor format= - confirm how
# ambiguous dates such as 05/03/2012 in the CSV are meant to be read.
df = pd.read_csv('/kaggle/input/acea-water-prediction/Water_Spring_Amiata.csv', index_col='Date')
df_time = pd.to_datetime(df.index)
datetime_index = pd.DatetimeIndex(df_time.values)
df.index = datetime_index
# Percentage of missing values per column (shown as the cell output).
df.isna().sum() / len(df) * 100

Potential Evapotranspiration Evaluation

In [None]:
# Day-length correction factors, January..December.
kj = np.array([0.84,0.83,1.03,1.11,1.24,1.25,1.27,1.18,1.04,0.96,0.83,0.81],dtype=float)


def thornthwaite_pet(temp_daily, k_month):
    """Monthly potential evapotranspiration after Thornthwaite (1948).

    Fixes over the previous inline code:
    * the monthly heat index is (T/5)**1.514, not T**1.514/5;
    * the quadratic term of the exponent `a` is negative;
    * the base of the power is the monthly temperature, not the heat index;
    * the coefficients are picked by calendar month and year taken from the
      index, not by a hard-coded number of complete years.

    Parameters
    ----------
    temp_daily : pandas.Series
        Air temperature (assumed to be in degrees Celsius) on a DatetimeIndex.
    k_month : array-like of 12 floats
        Correction factor for January..December.

    Returns
    -------
    pandas.Series
        One value per calendar month spanned by `temp_daily`.
    """
    # Months with a negative mean temperature contribute nothing.
    t_month = temp_daily.resample('m').mean().clip(lower=0)
    # Annual heat index: yearly sum of (T/5)^1.514, broadcast back to its months.
    heat_index = ((t_month/5)**1.514).groupby(t_month.index.year).transform('sum')
    a = 0.49239 + 0.01792*heat_index - 0.0000771*heat_index**2 + 6.75e-7*heat_index**3
    k = np.asarray(k_month, dtype=float)[np.asarray(t_month.index.month) - 1]
    # 135 mm at 26.5 degrees is the point where Thornthwaite's curves converge.
    return k*135*(t_month/26.5)**a


cols = ['Rainfall_Abbadia_S_Salvatore',
       'Rainfall_S_Fiora', 'Rainfall_Laghetto_Verde',
       'Temperature_Abbadia_S_Salvatore', 'Temperature_S_Fiora',
       'Temperature_Laghetto_Verde']

# Number of non-missing readings per year and column.
df_new = df[cols].resample('Y').count()

### Abbadia S. Salvatore ####
# Days on which both rainfall and temperature were recorded at the station.
df_Abbiata = df[['Rainfall_Abbadia_S_Salvatore','Temperature_Abbadia_S_Salvatore']]
df_drna = df_Abbiata.dropna()
Epj1 = thornthwaite_pet(df_drna['Temperature_Abbadia_S_Salvatore'], kj)
h1 = df_drna['Rainfall_Abbadia_S_Salvatore'].resample('m').mean()

### Fiora ####
df_Fiora = df[['Rainfall_S_Fiora','Temperature_S_Fiora']]
df_drna = df_Fiora.dropna()
Epj2 = thornthwaite_pet(df_drna['Temperature_S_Fiora'], kj)
h2 = df_drna['Rainfall_S_Fiora'].resample('m').mean()

### Laghetto Verde ###
df_Laghetto = df[['Rainfall_Laghetto_Verde','Temperature_Laghetto_Verde']]
df_drna = df_Laghetto.dropna()
Epj3 = thornthwaite_pet(df_drna['Temperature_Laghetto_Verde'], kj)
h3 = df_drna['Rainfall_Laghetto_Verde'].resample('m').mean()

# One panel per station: evapotranspiration (red) against rainfall (blue).
fig = plt.figure(figsize=(20,10))
plt.suptitle('Evapotranspiration - Rainfall')
panels = [('Abbadia S. Salvatore', Epj1, h1),
          ('S_Fiora', Epj2, h2),
          (' Lghetto_Verde', Epj3, h3)]
for position, (station, epj_series, rain_series) in enumerate(panels, start=1):
    plt.subplot(2,2,position)
    ylab = 'h + Epj ' + station
    plt.plot(epj_series.index,epj_series,color='red',label='Epj')
    plt.plot(rain_series.index,rain_series,color='blue',label='h')
    plt.ylabel(ylab)
    plt.xlabel('time')
    plt.legend(loc='upper left')



Flow Rate Depletion Evaluation

In [None]:
from sklearn.linear_model import LinearRegression
import seaborn as sns

# Fit log(Q) = log(Q0) + alpha * t on the monthly mean flow rate of each spring.
cols = ['Flow_Rate_Bugnano', 'Flow_Rate_Arbure',
       'Flow_Rate_Ermicciolo', 'Flow_Rate_Galleria_Alta']
Q = df[cols]
Q = Q.resample('m').mean()

fig = plt.figure(figsize=(20,10))
fig.suptitle('Water Spring Amiata')
for i, col in enumerate(cols, start=1):
    slr = LinearRegression()
    X = Q[col].dropna()
    # Only values below -0.3 are kept; the sign is flipped so the log is defined.
    X = X[X < -0.3]
    X = np.log(-X)
    # Time axis for the fit: consecutive month numbers 1..n.
    t = np.arange(1,(np.size(X)+1),1).reshape(-1,1)
    slr.fit(t,X)
    ylab = 'log(Q) ' + col[10:]
    plt.subplot(2,2,i)
    # Fix: x/y passed by keyword (seaborn >= 0.12 rejects positional x/y) and
    # the fitted model is no longer passed in regplot's `data` slot.
    sns.regplot(x=np.linspace(2014,2020,np.size(X,0)), y=X.to_numpy(),
                label='alpha: %.6f log(Q0): %.3f' % (slr.coef_[0],slr.intercept_))
    plt.xlabel('t[Y]')
    plt.ylabel(ylab)
    plt.legend(loc='best')

Exploratory Data Analysis (EDA)

In [None]:
sns.set(style='whitegrid',context='notebook')
cols=['Depth_to_Groundwater_S_Fiora_8', 'Depth_to_Groundwater_S_Fiora_11bis',
       'Depth_to_Groundwater_David_Lazzaretti','Flow_Rate_Bugnano', 'Flow_Rate_Arbure',
       'Flow_Rate_Ermicciolo', 'Flow_Rate_Galleria_Alta']
# Rows in which every selected column has a value.
df_drna = df[cols].dropna()
# Scatter-plot matrix. pairplot creates its own figure, so the empty
# plt.figure() that used to precede it is dropped; `size` was renamed
# `height` in seaborn 0.9 and the old name is no longer accepted.
sns.pairplot(df_drna,height=2.5)

Pearson's Correlation Coefficient

In [None]:
# Pearson correlation between the columns of df_drna, drawn as an annotated heat map.
fig = plt.figure(figsize=(20,18))
cm = np.corrcoef(df_drna.values, rowvar=False)
sns.set(font_scale=1.5)
axis_labels = df_drna.columns
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size' : 20},
    yticklabels=axis_labels,
    xticklabels=axis_labels,
)

Non Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

# Depth to groundwater (S_Fiora_8) as a polynomial (degree 1, 2, 3) function
# of the Galleria Alta flow rate.
X = df_drna[['Flow_Rate_Galleria_Alta']].values
y = df_drna['Depth_to_Groundwater_S_Fiora_8'].values

regr = LinearRegression()

# Polynomial feature expansions of the predictor.
quadratic = PolynomialFeatures(degree=2)
cubic = PolynomialFeatures(degree=3)
X_quad = quadratic.fit_transform(X)
X_cubic = cubic.fit_transform(X)

# Grid (step 1) over the observed range, used to draw the fitted curves.
X_fit = np.arange(X.min(),X.max(),1)[:, np.newaxis]

# Linear fit
regr = regr.fit(X,y)
y_lin_fit = regr.predict(X_fit)
linear_r2 = r2_score(y, regr.predict(X))

# Quadratic fit (the grid is only transformed; the expansion is already fitted)
regr = regr.fit(X_quad,y)
y_quad_fit = regr.predict(quadratic.transform(X_fit))
quadratic_r2 = r2_score(y,regr.predict(X_quad))

# Cubic fit
regr = regr.fit(X_cubic,y)
y_cubic_fit = regr.predict(cubic.transform(X_fit))
cubic_r2 = r2_score(y, regr.predict(X_cubic))

# Plot results
fig = plt.figure(figsize=(15,8))
plt.scatter(X,y,label='punti di addestramento',color='lightgray')
plt.plot(X_fit,y_lin_fit,label='lineare (d=1), $R^2=%.2f$' % linear_r2,color='blue',lw=2,linestyle=':')
plt.plot(X_fit,y_quad_fit,label='quadratic (d=2), $R^2=%.2f$' % quadratic_r2,color='red',lw=2,linestyle='-')
plt.plot(X_fit,y_cubic_fit,label='cubic (d=3), $R^2=%.2f$' % cubic_r2,color='green',lw=2,linestyle='--')
# Fix: the axis labels named columns of another dataset
# ('Hydrometry_Monte_S_Quirico' / 'Depth_to_groundwater_PAG').
plt.xlabel('Flow_Rate_Galleria_Alta')
plt.ylabel('Depth_to_Groundwater_S_Fiora_8')
plt.legend(loc='best')
 

In [None]:
from sklearn.ensemble import RandomForestRegressor
# MSE / RMSE / R^2 are evaluated in the next cell.
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Features: the three depth-to-groundwater series.
# Target: the Galleria Alta flow rate.
cols = ['Depth_to_Groundwater_S_Fiora_8', 'Depth_to_Groundwater_S_Fiora_11bis',
       'Depth_to_Groundwater_David_Lazzaretti']
df_new = df_drna[cols]
X = df_new.iloc[:,:].values
y = df_drna['Flow_Rate_Galleria_Alta'].values

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)
# Fix: criterion='mse' was removed in scikit-learn 1.2. Squared error is the
# default in every release, so the argument is simply omitted.
forest = RandomForestRegressor(n_estimators=1000,random_state=0,n_jobs=-1)
forest.fit(X_train,y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

# Residuals (prediction - observation) against the predicted value.
fig=plt.figure(figsize=(15,8))
plt.scatter(y_train_pred,y_train_pred-y_train,c='black',marker='o',s=35,alpha=0.5,label='dati di addestramento')
plt.scatter(y_test_pred,y_test_pred-y_test,c='lightgreen',marker='s',s=35,alpha=0.7,label='dati di test')
plt.xlabel('valori previsti')
plt.ylabel('residui aleatori')
plt.title('Regressione a foreste casuali')
plt.legend(loc='best')
plt.hlines(0,-26,-16,color='red')

MSE RMSE and R^2 coefficient Estimation

In [None]:
# Error metrics of the random forest on the training and on the held-out split.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print('MSE: Train %.3f Test: %.3f' % (mse_train, mse_test))
print('RMSE: Train %.3f Test: %.3f' % (mse_train**0.5, mse_test**0.5))
print('R^2: Train %.3f Test: %.3f' % (r2_train, r2_test))

Forecast Model

In [None]:
cols = ['Flow_Rate_Bugnano', 'Flow_Rate_Arbure',
       'Flow_Rate_Ermicciolo', 'Flow_Rate_Galleria_Alta']

# Monthly mean flow rate of each spring, one panel per spring.
fig = plt.figure(figsize=(20,10))
plt.suptitle('Folw Rate')

for i, col in enumerate(cols, start=1):
    df1 = df.dropna(subset=[col]).copy()
    # Zero readings are replaced by the column mean (.loc instead of chained indexing).
    df1.loc[df1[col] == 0, col] = df1[col].mean(axis=0)
    # Fix: the monthly mean was taken from the raw `df`, so the cleaning
    # above never reached the plot; it now uses the cleaned copy.
    df_mou = df1.resample('m').mean()
    plt.subplot(2,2,i)
    df_mou[col].plot()
    plt.ylabel(col[10:])

In [None]:
# Partial autocorrelation of each monthly flow-rate series.
# Relies on `cols` from the previous cell.
fig = plt.figure(figsize=(20,10))
plt.suptitle('Partial Autocorrelation')
fig.subplots_adjust()
for i, col in enumerate(cols, start=1):
    df_drna = df.dropna(subset=[col]).copy()
    # Zero readings are replaced by the column mean (.loc instead of chained indexing).
    df_drna.loc[df_drna[col] == 0, col] = df_drna[col].mean(axis=0)
    df_mou = df_drna.resample('m').mean()
    ax = fig.add_subplot(2,2,i)
    plot_pacf(df_mou[col],ax=ax,title=None)
    # Fix: the label was col[21:], which is empty for these column names;
    # the 'Flow_Rate_' prefix (10 characters) is stripped instead.
    plt.ylabel(col[10:])
    plt.xlabel('lag')
    

In [None]:
# SARIMA fit per flow-rate series. Relies on `cols` from the cells above.
split = "2019-12-31"  # last date of the training window
# Fix: the figure created before the loop was never drawn on and has been
# removed; each series gets its own figure below.
for i, col in enumerate(cols, start=1):
    df_drna = df.dropna(subset=[col]).copy()
    # Zero readings are replaced by the column mean (.loc instead of chained indexing).
    df_drna.loc[df_drna[col] == 0, col] = df_drna[col].mean(axis=0)
    df_mou = df_drna.resample('m').mean()
    train = df_mou[df_mou.index <= split]
    test = df_mou[df_mou.index >= split]
    test = test[test.index < '2020-07-31']
    model = SARIMAX(train[col],order=(1,1,1),seasonal_order=(4,1,2,4),enforce_stationarity=False)
    fitted_model = model.fit(maxiter=200,disp=True)
    train_residui = fitted_model.resid  # residuals of the fit on the training data
    in_sample = fitted_model.get_prediction(end=fitted_model.nobs).summary_frame()  # fitted values
    # Prediction for the 11 steps that follow the training window.
    out_of_simple = fitted_model.get_prediction(start = fitted_model.nobs,end=fitted_model.nobs+10).summary_frame()

    fig = plt.figure(figsize=(15,8))
    plt.suptitle('SARIMA '+col)
    plt.subplot(3,1,1)
    train[col].plot(label='train')
    test[col].plot(label='test')
    out_of_simple['mean'].plot(lw=2,label='Prediction')
    plt.fill_between(out_of_simple.index,out_of_simple['mean_ci_lower'],out_of_simple['mean_ci_upper'],color='k',alpha=.10,label='mean error')
    plt.legend(loc='best')
    plt.subplot(3,1,2)
    train_residui.plot(label='Resudal')
    plt.legend(loc='best')
    plt.subplot(3,1,3)
    in_sample['mean'].plot(label='Forecasted Values')
    plt.legend(loc='best')
    plt.ylabel(col)


# Water Spring Lupa

In [None]:
# Read the Lupa water-spring table with 'Date' as index, then replace that
# index with a DatetimeIndex.
# NOTE(review): pd.to_datetime gets neither dayfirst= nor format= - confirm how
# ambiguous dates such as 05/03/2012 in the CSV are meant to be read.
df = pd.read_csv('/kaggle/input/acea-water-prediction/Water_Spring_Lupa.csv', index_col='Date')
df_time = pd.to_datetime(df.index)
datetime_index = pd.DatetimeIndex(df_time.values)
df.index = datetime_index
# Percentage of missing values per column (shown as the cell output).
df.isna().sum() / len(df) * 100

Flow Rate depletion Estimation 

In [None]:
from sklearn.linear_model import LinearRegression
import seaborn as sns

# Estimate the depletion coefficient of the flow rate by regressing log(Q) on
# a running sample counter. Later cells keep working on the NaN-free `df`.
df = df.dropna()

# Second column of `df`.
# NOTE(review): presumably 'Flow_Rate_Lupa' — confirm against the CSV column order.
X = df.iloc[:, 1].values
q = -X          # flip the sign so the samples kept below are positive
q = q[q > 0]    # log() is only defined for strictly positive values
q = np.log(q)
# Build the time axis AFTER filtering so it always has exactly one entry per
# retained sample; sizing it from the unfiltered column only matched by
# coincidence and otherwise made fit() fail on inconsistent lengths.
y = np.arange(1, np.size(q) + 1, 1).reshape(-1, 1)

slr = LinearRegression()
slr.fit(y, q)

fig = plt.figure()
# regplot takes its vectors through the x=/y= keywords; the fitted estimator
# is only used to build the legend text.
sns.regplot(
    x=y.ravel(),
    y=q,
    label='alpha: %.6f log(Q0): %.3f' % (slr.coef_[0], slr.intercept_),
)
plt.ylabel('Log(Q)')
plt.xlabel('time')
plt.legend(loc='best')


In [None]:
# Monthly mean of the Terni rainfall series, drawn together with a horizontal
# line at its overall mean.
rain = df['Rainfall_Terni'].resample('m').mean()
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(rain.index, rain)
ax.hlines(rain.mean(), rain.index[0], rain.index[-1], color='red', label='mean')
ax.legend(loc='best')
ax.set_ylabel('rain')

Forecast Model

In [None]:
# Build the monthly mean series of the Lupa flow rate used by the forecast model.
cols = ['Flow_Rate_Lupa']
df = df.dropna().copy()
# Replace exact zeros with the column mean (computed before the replacement).
# A single .loc assignment is used because chained indexing is not guaranteed
# to write back into the frame.
df.loc[df['Flow_Rate_Lupa'] == 0, 'Flow_Rate_Lupa'] = df['Flow_Rate_Lupa'].mean(axis=0)
df_mou = df.resample('m').mean()
df_mou['Flow_Rate_Lupa'].plot()
# Pass the column name itself; passing the list would label the axis with its repr.
plt.ylabel(cols[0])


In [None]:
# Partial autocorrelation of the monthly mean flow rate, i.e. the same series
# the SARIMAX model below is fitted on.
# The trailing semicolon stops the notebook from rendering the returned Figure twice.
plot_pacf(df_mou['Flow_Rate_Lupa']);

In [None]:
# Fit a seasonal ARIMA model on the monthly Lupa flow rate and draw three
# panels: forecast vs. hold-out data, training residuals, in-sample predictions.
split = "2019-12-31"  # last month-end label that belongs to the training set
idrometro_mon = df_mou['Flow_Rate_Lupa']

train = idrometro_mon[idrometro_mon.index <= split]
# Strictly after the split, so the last training month is not repeated in the
# hold-out set.
test = idrometro_mon[(idrometro_mon.index > split) & (idrometro_mon.index < '2020-07-31')]

model = SARIMAX(
    train,
    order=(2, 1, 3),
    seasonal_order=(2, 1, 3, 6),
    enforce_stationarity=False,
)
fitted_model = model.fit(maxiter=200, disp=True)
train_residui = fitted_model.resid  # random residuals (train - train_predicted)
# Predictions from the start of the sample up to index `nobs`.
in_sample = fitted_model.get_prediction(end=fitted_model.nobs).summary_frame()
# Predictions for indices nobs .. nobs+10 (computed once).
out_of_simple = fitted_model.get_prediction(
    start=fitted_model.nobs, end=fitted_model.nobs + 10
).summary_frame()

fig, (ax_forecast, ax_resid, ax_fit) = plt.subplots(3, 1, figsize=(15, 8))
fig.suptitle('Flow Rate Lupa')

train.plot(ax=ax_forecast, label='train')
test.plot(ax=ax_forecast, label='test')
out_of_simple['mean'].plot(ax=ax_forecast, lw=2, label='Prediction')
ax_forecast.fill_between(
    out_of_simple.index,
    out_of_simple['mean_ci_lower'],
    out_of_simple['mean_ci_upper'],
    color='k',
    alpha=.10,
    label='mean error',
)
ax_forecast.legend(loc='upper left')

train_residui.plot(ax=ax_resid, label='Resudal')
ax_resid.legend(loc='best')
ax_resid.set_ylabel('Residual')

train.plot(ax=ax_fit, label='misure Value')
in_sample['mean'].plot(ax=ax_fit, label='Forecasted Values')
ax_fit.legend(loc='best')


# Water Spring Madonna di Canneto

In [None]:
# Read the Water Spring Madonna di Canneto table, keyed by its 'Date' column,
# and replace the index with parsed timestamps.
# NOTE(review): pd.to_datetime infers the date format here; if the CSV stores
# day-first dates, confirm they are parsed as intended.
df = pd.read_csv(
    '/kaggle/input/acea-water-prediction/Water_Spring_Madonna_di_Canneto.csv',
    index_col='Date',
)
df_time = pd.to_datetime(df.index)
datetime_index = pd.DatetimeIndex(df_time.values)
df.index = datetime_index
# Percentage of missing values in each column.
df.isna().sum() / df.shape[0] * 100

Potential Evapotranspiration Evaluation.

In [None]:
# Monthly potential evapotranspiration (Epj) compared with the monthly mean
# rainfall (h) at Settefrati.
df_drna = df.dropna(subset=['Rainfall_Settefrati', 'Temperature_Settefrati'])

# Monthly term derived from the monthly mean temperature, and its yearly sum.
# NOTE(review): `**` binds tighter than `/`, so this evaluates T**1.514 / 5 and
# not (T/5)**1.514 — confirm which one is intended.
app = (df_drna['Temperature_Settefrati'].resample('m').mean())**1.514/5
i = app.resample('Y').sum()
# Cubic polynomial of the yearly sum, used as the exponent below.
# NOTE(review): confirm the sign of the quadratic term against the reference formula.
a = 0.49239+0.01792*i+0.0000771*i**2+6.75*10**-7*i**3

# One coefficient per calendar month, January first.
kj = np.array([0.84,0.83,1.03,1.11,1.24,1.25,1.27,1.18,1.04,0.96,0.83,0.81],dtype=float)

# Align the monthly coefficient and the yearly exponent with every month of
# `app` through its own index, instead of assuming exactly 7 full years
# (84 months) starting in January. For such a span the result is unchanged.
month_pos = app.index.month.to_numpy() - 1
year_pos = app.index.year.to_numpy() - i.index.year.to_numpy()[0]
kpj = kj[month_pos]
aa = a.to_numpy(dtype=float)[year_pos]

Epj = kpj*135*(app/26.5)**aa

h = df_drna['Rainfall_Settefrati'].resample('m').mean()

ylab = 'h + Epj '+ 'Settefrati'
fig = plt.figure(figsize=(15,8))
plt.suptitle('Evapotranspiration - Rainfall')

plt.plot(Epj.index, Epj, color='red', label='Epj')
plt.plot(h.index, h, color='blue', label='h')
plt.ylabel(ylab)
plt.xlabel('time')
plt.legend(loc='upper left')


Flow Rate Depletion Evaluation

In [None]:
from sklearn.linear_model import LinearRegression
import seaborn as sns

# Estimate the depletion coefficient of the flow rate by regressing log(Q) on
# a running sample counter.
df_drna = df.dropna(subset=['Flow_Rate_Madonna_di_Canneto'])

# Last column of the frame.
# NOTE(review): presumably 'Flow_Rate_Madonna_di_Canneto' — confirm column order.
X = df_drna.iloc[:, -1].values
# Keep strictly positive samples only: log() of zero or a negative value gives
# -inf/NaN, which the regression cannot be fitted on. With all-positive data
# this filter changes nothing.
X = X[X > 0]

q = np.log(X)
t = np.arange(1, np.size(q, 0) + 1, 1).reshape(-1, 1)

slr = LinearRegression()
slr.fit(t, q)

fig = plt.figure(figsize=(15, 8))
# regplot takes its vectors through the x=/y= keywords; the fitted estimator
# is only used to build the legend text.
sns.regplot(
    x=t.ravel(),
    y=q,
    label='alpha: %.6f log(Q0): %.3f' % (slr.coef_[0], slr.intercept_),
)
plt.xlabel('time')
plt.ylabel('log(Q) Settefrati')
plt.legend(loc='best')


In [None]:
# Build the monthly mean series of the Madonna di Canneto flow rate used by
# the forecast model.
df = df.dropna().copy()
# Replace exact zeros with the column mean (computed before the replacement).
# A single .loc assignment is used because chained indexing is not guaranteed
# to write back into the frame.
df.loc[
    df['Flow_Rate_Madonna_di_Canneto'] == 0, 'Flow_Rate_Madonna_di_Canneto'
] = df['Flow_Rate_Madonna_di_Canneto'].mean(axis=0)
df_mou = df.resample('m').mean()
df_mou['Flow_Rate_Madonna_di_Canneto'].plot()
plt.ylabel('Flow_Rate')

In [None]:
# Partial autocorrelation of the monthly mean flow rate.
# The trailing semicolon stops the notebook from rendering the returned Figure twice.
plot_pacf(df_mou['Flow_Rate_Madonna_di_Canneto']);

In [None]:
# Fit a seasonal ARIMA model on the whole monthly flow-rate series (no
# hold-out set is used in this section) and draw three panels: forecast,
# training residuals, in-sample predictions.
idrometro_mon = df_mou['Flow_Rate_Madonna_di_Canneto']
train = idrometro_mon

model = SARIMAX(
    train,
    order=(2, 1, 3),
    seasonal_order=(2, 1, 3, 6),
    enforce_stationarity=False,
)
fitted_model = model.fit(maxiter=200, disp=True)
train_residui = fitted_model.resid  # random residuals (train - train_predicted)
# Predictions from the start of the sample up to index `nobs`.
in_sample = fitted_model.get_prediction(end=fitted_model.nobs).summary_frame()
# Predictions for indices nobs .. nobs+10 (computed once).
out_of_simple = fitted_model.get_prediction(
    start=fitted_model.nobs, end=fitted_model.nobs + 10
).summary_frame()

fig, (ax_forecast, ax_resid, ax_fit) = plt.subplots(3, 1, figsize=(15, 8))
fig.suptitle('Flow Rate')

train.plot(ax=ax_forecast, label='train')
out_of_simple['mean'].plot(ax=ax_forecast, lw=2, label='Prediction')
ax_forecast.fill_between(
    out_of_simple.index,
    out_of_simple['mean_ci_lower'],
    out_of_simple['mean_ci_upper'],
    color='k',
    alpha=.10,
    label='mean error',
)
ax_forecast.legend(loc='upper left')

train_residui.plot(ax=ax_resid, label='Resudal')
ax_resid.legend(loc='best')
ax_resid.set_ylabel('Residual')

train.plot(ax=ax_fit, label='misure Value')
in_sample['mean'].plot(ax=ax_fit, label='Forecasted Values')
ax_fit.legend(loc='best')


In the evaluation of the Datasets it has been shown that some machine learning algorithms can improve the predictions for the Italian supply sources. In the future it would be interesting to study hydrological phenomena in more depth, using data from pluviographs to better estimate the pluviometric probability curves and, in turn, the flow rates in aquifers, in springs or, even better, in Italian rivers.
Thanks to Acea for this challenge.
Gianmario Farina.