In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
base_dir='/kaggle/input/acea-water-prediction'

In [None]:
os.listdir(base_dir)

In [None]:
aq_d=pd.read_csv(os.path.join(base_dir,'Aquifer_Doganella.csv'))
aq_d.head()

In [None]:
def find_missing_values(df):
    """Count the missing values in every column of ``df``.

    Returns a DataFrame indexed by the column names of ``df`` with a single
    ``Count`` column holding the number of null entries per column.
    """
    null_counts=df.isna().sum()
    return pd.DataFrame({'Count': null_counts})

In [None]:
def display_missing_val(missing_val_tbl, title):
    """Draw a horizontal bar chart of missing-value counts.

    Parameters
    ----------
    missing_val_tbl : pandas.DataFrame
        Table with a ``Count`` column (as produced by ``find_missing_values``).
    title : str
        Title shown above the chart.
    """
    # Create the axes explicitly and hand them to pandas: DataFrame.plot opens
    # its own figure when no ``ax`` is given, which left the 15x15 figure empty
    # and drew the bars on a default-sized one instead.
    fig, ax=plt.subplots(figsize=(15,15))
    missing_val_tbl.sort_values(by='Count',ascending=True).plot(kind='barh', ax=ax)
    ax.set_title(title)
    plt.show()

In [None]:
display_missing_val(find_missing_values(aq_d),'Missing values for Aquifer_Doganella')

In [None]:
aq_a=pd.read_csv(os.path.join(base_dir,'Aquifer_Auser.csv'))
aq_a.head()

In [None]:
display_missing_val(find_missing_values(aq_a),'Missing values for Aquifer_Auser')

In [None]:
aq_l=pd.read_csv(os.path.join(base_dir,'Aquifer_Luco.csv'))
aq_l.head()

In [None]:
display_missing_val(find_missing_values(aq_l),'Missing values for Aquifer_Luco')

In [None]:
aq_p=pd.read_csv(os.path.join(base_dir,'Aquifer_Petrignano.csv'))
aq_p.head()

In [None]:
display_missing_val(find_missing_values(aq_p),'Missing values for Aquifer_Petrignano')

# Water Spring

In [None]:
ws_a=pd.read_csv(os.path.join(base_dir,'Water_Spring_Amiata.csv'))
ws_a.head()

In [None]:
display_missing_val(find_missing_values(ws_a),'Missing values for Water_Spring_Amiata')

In [None]:
ws_m=pd.read_csv(os.path.join(base_dir,'Water_Spring_Madonna_di_Canneto.csv'))
ws_m.head()

In [None]:
display_missing_val(find_missing_values(ws_m),'Missing values for Water_Spring_Madonna_di_Canneto')

In [None]:
ws_l=pd.read_csv(os.path.join(base_dir,'Water_Spring_Lupa.csv'))
ws_l.head()

In [None]:
# Plot the Lupa frame (ws_l); the previous call reused ws_m (Madonna di Canneto)
# under the Lupa title.
display_missing_val(find_missing_values(ws_l),'Missing values for Water_Spring_Lupa')

In [None]:
all_ws={'Amiata':ws_a,'Madonna_di_Canneto':ws_m,'Lupa':ws_l}

In [None]:
def analyse_heatmap(src,title='', thres=.6):
    """Plot the upper-triangle correlation heatmap of ``src``.

    Only numeric (and boolean) columns are correlated. Cells on or below the
    diagonal, and cells whose absolute correlation is below ``thres``, are
    masked out so only the stronger pairwise relationships are drawn.

    Parameters
    ----------
    src : pandas.DataFrame
        Frame whose columns are correlated.
    title : str
        Label printed above the figure.
    thres : float
        Minimum absolute correlation for a cell to be shown.
    """
    print(f'Heatmap for {title}')
    fig, ax=plt.subplots(1,1,figsize=(15,12))

    # Drop non-numeric columns (e.g. a string Date column) before correlating;
    # recent pandas versions raise instead of silently skipping them.
    corr=src.select_dtypes(include=['number','bool']).corr()
    # Start with the diagonal and lower triangle hidden (they duplicate the
    # upper triangle), then also hide every weak correlation.
    mask=np.tril(np.ones(corr.shape, dtype=bool))
    mask|=corr.abs().to_numpy()<thres
    sns.heatmap(corr,mask=mask,ax=ax,vmin=-1, vmax=1, annot=True)
    # Grid lines follow the correlation matrix size, not the source frame's
    # column count; vertical lines span the y-limits, horizontal the x-limits.
    grid=range(corr.shape[0]+1)
    ax.hlines(grid,*ax.get_xlim(), lw=1)
    ax.vlines(grid,*ax.get_ylim(), lw=1)
    plt.show()

In [None]:
analyse_heatmap(ws_a, 'Amiata' )

In [None]:
analyse_heatmap(ws_m,'Madonna_di_Canneto',0)

In [None]:
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
aq_d.info()

In [None]:
aq_d.shape

In [None]:
aq_d.columns

# Rivers

In [None]:
river_a=pd.read_csv(os.path.join(base_dir,'River_Arno.csv'))
river_a.head()

In [None]:
display_missing_val(find_missing_values(river_a),'Missing values for River_Arno')

In [None]:
fig, ax=plt.subplots(1,1,figsize=(15,12))

# Correlate numeric columns only: a non-numeric column (such as a string Date
# column) makes DataFrame.corr raise on recent pandas versions.
corr=river_a.select_dtypes(include=['number','bool']).corr()
# Hide the diagonal/lower triangle and every pair with |corr| below 0.6.
mask=np.tril(np.ones(corr.shape, dtype=bool))
mask|=corr.abs().to_numpy()<.6
sns.heatmap(corr,mask=mask,ax=ax,vmin=-1, vmax=1, annot=True)
# Horizontal lines span the x-limits, vertical lines span the y-limits.
ax.hlines(range(0, corr.shape[0]+1),*ax.get_xlim(), lw=1)
ax.vlines(range(0, corr.shape[1]+1), *ax.get_ylim(), lw=1)
plt.show()

In [None]:
# Collect every column pair above the diagonal whose absolute correlation
# exceeds 0.6, as (row_column_name, col_column_name) tuples.
corr_pos=corr.abs()
row_pos, col_pos=np.where(corr_pos>.6)
hi_corr_val=[]
for r, c in zip(row_pos, col_pos):
    if r<c:
        hi_corr_val.append((corr_pos.columns[r], corr_pos.columns[c]))

The rainfall measurements from the different areas are strongly correlated with each other (absolute correlation above 0.6).

In [None]:
# Plot each highly correlated pair (first 15 at most) on its own panel of a
# 5x3 grid, restricted to the rows between the two date labels below.
river_a.index=river_a['Date']
df=river_a.loc['01/01/2020':'01/05/2020']
fig, ax=plt.subplots(nrows=5, ncols=3, figsize=(20,20))
ax=ax.ravel()
for panel, (left, right) in zip(ax, hi_corr_val[:15]):
    df[[left, right, 'Date']].plot(x='Date', ax=panel)
    panel.set_title(left+' vs '+right)
plt.subplots_adjust(wspace=.3, hspace=.3)
plt.show()


In [None]:
river_a.index=river_a.Date
river_a=river_a.drop(columns=['Date'], axis=1)

In [None]:

# One panel per column, laid out on a 4-row grid.
riv_cols=river_a.columns
n_rows=4
# Ceiling division: flooring produced too few panels (IndexError) whenever the
# number of columns was not a multiple of 4.
n_cols=max(1, -(-len(riv_cols)//n_rows))
fig , ax= plt.subplots(n_rows,n_cols,figsize=(30,25), squeeze=False)
axes=ax.ravel()

for  i in range(len(riv_cols)):
    axes[i]=river_a[riv_cols[i]].plot(ax=axes[i])
    axes[i].set_title(riv_cols[i])
# Hide panels left over when the grid has more cells than columns.
for spare in axes[len(riv_cols):]:
    spare.set_visible(False)
plt.subplots_adjust(wspace=.3, hspace=.3)
plt.show()

In [None]:
river_a.Hydrometry_Nave_di_Rosano.rolling(window=12).mean().plot(figsize=(10,8))

In [None]:
df_r=river_a.Hydrometry_Nave_di_Rosano

In [None]:
# Split by position at the first test label so train and test are contiguous.
# The previous label slice ended training on 30/05/2020 and started testing on
# 01/06/2020, which left 31/05/2020 in neither set.
train_start=df_r.index.get_loc('01/01/2020')
test_start=df_r.index.get_loc('01/06/2020')
train=df_r.iloc[train_start:test_start]
test=df_r.iloc[test_start:]

In [None]:
train=train.dropna()

In [None]:
# Fit an AR model
import warnings
# Silences every warning for the rest of the session, including deprecation
# notices raised by the libraries used in the following cells.
warnings.filterwarnings("ignore")
# NOTE(review): `AR` is understood to be deprecated/removed in newer
# statsmodels releases in favour of `AutoReg` — confirm against the installed
# version. `ARResults` appears unused in the visible cells.
from statsmodels.tsa.ar_model import AR, ARResults

In [None]:
model=AR(train)


In [None]:
ARfit=model.fit(method='mle')
print(f'Lag: {ARfit.k_ar}')
print(f'Coeff: {ARfit.params}')

In [None]:
start=len(train)
end=len(train)+len(test)-1
preds=ARfit.predict(start=start, end=end, dynamic=False).rename('AR_Preds_River_Level')

In [None]:
train.plot(legend=True)
preds.plot(legend=True, figsize=(10,8))

In [None]:
# Evaluate the model
from sklearn.metrics import mean_squared_error
print(f'MSE:{mean_squared_error(test, preds)}')

In [None]:
# Forecast using stat models
df_r=df_r.dropna()
#df_r.index.freq='MS'
model=AR(df_r)
# `method` selects the estimator; the previous `model='mle'` keyword was not a
# fit() argument and was silently ignored, so the default estimator was used.
ARfit=model.fit(maxlag=13, method='mle')
# Predict 481 steps beyond the end of the observed series.
fcast=ARfit.predict(start=len(df_r), end=len(df_r)+480, dynamic=False).rename('Forecast')
df_r.plot(legend=True)
fcast.plot(legend=True, figsize=(10,8));

 # Test for stationarity
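A time series is (weakly) stationary if its mean and variance stay constant over time and the covariance between two observations depends only on the lag separating them, not on when they were observed.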

Augmented Dickey-Fuller test: the null hypothesis is that the series is non-stationary. A small p-value (below 0.05 here) rejects the null hypothesis, which is evidence that the series is stationary.

In [None]:
from statsmodels.tsa.stattools import adfuller,grangercausalitytests
from statsmodels.tools.eval_measures import mse, rmse, meanabs

In [None]:
def adf_test(series, title=''):
    """Run an Augmented Dickey-Fuller test on ``series`` and print a report.

    Missing values are dropped before testing. The report lists the test
    statistic, p-value, number of lags used, number of observations and the
    critical values, followed by a verdict at the 5% significance level.

    Parameters
    ----------
    series : pandas.Series
        Series to test.
    title : str
        Label appended to the report header.
    """
    print(f'Augmented Dickey-Fuller Test:{title}')
    adf_series=adfuller(series.dropna(), autolag='AIC')
    labels=['ADF Test Statistic', 'p-value','lags_used', '# of observations']
    adf_series_result=pd.Series(adf_series[0:4],index=labels)
    for k,v in adf_series[4].items():
        adf_series_result[f'critical value: ({k})']=v
    print(adf_series_result)
    # Look the p-value up by label: integer keys on a string-labelled Series
    # are a deprecated positional fallback in recent pandas versions.
    if adf_series_result['p-value']<.05:
        print("Strong evidence against null hypothesis; Data is stationary")
    else:
        print("Weak evidence against null hypothesis; Data is non-stationary")

In [None]:
adf_test(df_r, 'Rivers')

In this case the p-value is very low, so we reject the null hypothesis and conclude that the data is stationary.

# Granger Causality Tests

In [None]:
gct_df=river_a.copy()
# The index labels are day-first strings (e.g. '30/05/2020'); without
# dayfirst=True ambiguous dates such as '01/05/2020' are read month-first.
gct_df.index=pd.to_datetime(gct_df.index, dayfirst=True)
target='Hydrometry_Nave_di_Rosano'
# Every column except the target is a candidate predictor.
cols=[name for name in gct_df.columns if name!=target]
for col in cols:
    gct_df[[col,target]].plot(figsize=(10,8))


In [None]:
# grangercausalitytests checks whether the SECOND column Granger-causes the
# FIRST, so the target goes first to test whether each predictor helps
# forecast the river level (the previous order tested the reverse direction).
for col in cols:
    print(f'Granger Causality Tests for {col}')
    grangercausalitytests(gct_df[[target,col]].dropna(), maxlag=3)
    print()

We are looking at extremely low p-values.

# Revealing seasonality with month plots and quarter plots

In [None]:
from statsmodels.graphics.tsaplots import month_plot, quarter_plot
# Index labels are day-first strings (e.g. '30/05/2020'); parse them as such so
# ambiguous dates like '01/05/2020' are not read month-first.
df_r.index=pd.to_datetime(df_r.index, dayfirst=True)

In [None]:
# Resample to create monthly data
m_dfr=df_r.resample(rule='M').mean()
month_plot(m_dfr);

In [None]:
# Resample to create quarterly data
q_dfr=df_r.resample(rule='Q').mean()
quarter_plot(q_dfr);

# Lake

In [None]:
lake_b=pd.read_csv(os.path.join(base_dir,'Lake_Bilancino.csv'))
lake_b.head()

In [None]:
# Plot the lake frame (lake_b); the previous call reused river_a under the
# Lake_Bilancino title.
display_missing_val(find_missing_values(lake_b),'Missing values for Lake_Bilancino')

In [None]:
lake_b.index=lake_b.Date
#lake_b.drop(columns=['Date'], axis=1, inplace=True)

In [None]:
analyse_heatmap(lake_b, 'Bilancino',.6)

In [None]:
def get_correlated_columns(corr, thres=.6):
    """List the column pairs whose absolute correlation exceeds ``thres``.

    Parameters
    ----------
    corr : pandas.DataFrame
        Square correlation matrix (e.g. the result of ``DataFrame.corr``).
    thres : float
        Strict lower bound on the absolute correlation; defaults to 0.6.

    Returns
    -------
    list of tuple
        ``(column_a, column_b)`` pairs taken from above the diagonal, in
        row-major order, so each pair appears once and self-pairs are skipped.
    """
    strong=corr.abs().to_numpy()>thres
    # k=1 keeps only cells strictly above the diagonal.
    rows, cols=np.where(np.triu(strong, k=1))
    return [(corr.columns[r], corr.columns[c]) for r, c in zip(rows, cols)]

In [None]:
# Correlate numeric columns only: lake_b still holds its string Date column,
# which makes DataFrame.corr raise on recent pandas versions.
lake_hicorrvals=get_correlated_columns(lake_b.select_dtypes(include=['number','bool']).corr())
print(lake_hicorrvals)

In [None]:
# plotting correlated features
def plot_correlated_features(wb,hi_corr_val, title='', start='01/01/2020', end='01/05/2020', max_plots=15):
    """Plot each correlated column pair against ``Date`` on its own panel.

    Parameters
    ----------
    wb : pandas.DataFrame
        Frame with a ``Date`` column plus the columns named in ``hi_corr_val``.
        It is no longer modified in place; a re-indexed copy is used instead.
    hi_corr_val : sequence of (str, str)
        Column-name pairs to plot (e.g. from ``get_correlated_columns``).
    title : str
        Label printed above the figure.
    start, end : str
        ``Date`` labels bounding the rows that are plotted (inclusive).
    max_plots : int
        Upper bound on the number of pairs drawn.
    """
    print(f'plot correlated features for {title}')
    pairs=list(hi_corr_val)[:max_plots]
    if not pairs:
        # Nothing to draw; a zero-column subplot grid would raise.
        print('No correlated feature pairs to plot')
        return
    # Keep Date as a column too, because it is used as the x-axis below.
    df=wb.set_index('Date', drop=False).loc[start:end]
    n_rows=5
    # Ceiling division so every pair gets a panel even when the number of
    # pairs is not a multiple of the row count.
    n_cols=-(-len(pairs)//n_rows)
    fig, ax=plt.subplots(n_rows,n_cols, figsize=(20,20), squeeze=False)
    ax=ax.ravel()
    for panel,(left,right) in zip(ax,pairs):
        df[[left,right,'Date']].plot(x='Date', ax=panel)
        panel.set_title(left+' vs '+right)
    # Hide the panels that did not receive a pair.
    for spare in ax[len(pairs):]:
        spare.set_visible(False)
    plt.subplots_adjust(wspace=.3, hspace=.3)
    plt.show()


In [None]:
plot_correlated_features(lake_b,lake_hicorrvals, 'Lake')

# WIP

In [None]:
adf_test(lake_b.Lake_Level, 'Lake')

In [None]:
adf_test(lake_b.Flow_Rate, 'Lake')

In [None]:
# NOTE(review): the Date labels are assumed to be day-first (dd/mm/yyyy), as in
# the river data — confirm against the CSV. Without dayfirst=True, ambiguous
# dates would be read month-first.
lake_b.index=pd.to_datetime(lake_b.index, dayfirst=True)

In [None]:
# Resample to create monthly data
m_lk_lvl=lake_b.Lake_Level.resample(rule='M').mean()
month_plot(m_lk_lvl);

In [None]:
# Resample to create quarterly data
q_lk_lvl=lake_b.Lake_Level.resample(rule='Q').mean()
quarter_plot(q_lk_lvl);

In [None]:
! pip install pmdarima

In [None]:
from pmdarima import auto_arima

In [None]:
auto_arima(lake_b.Lake_Level.dropna(),error_action='ignore').summary()