In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.stats import boxcox

# plotting
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px
#pd.options.plotting.backend = "plotly"

# settings
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# newer releases no longer accept the old name, so use whichever one this
# installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.rcParams["figure.figsize"] = (16, 8)

import tensorflow as tf

pd.options.display.max_columns = 500

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# References

Here you can find very useful resources and notebooks that I've read and used to build this notebook:

1. [Intro to Time Series Forecasting by Leonie](https://www.kaggle.com/iamleonie/intro-to-time-series-forecasting)
2. [EDA: Quenching the Thirst for insights by Leonie](https://www.kaggle.com/iamleonie/eda-quenching-the-thirst-for-insights) 
3. [Acea - EDA: Deep Water by Luca31394](https://www.kaggle.com/luca31394/acea-eda-deep-water)
4. [(Arno) - EDA + Data Enrichment](https://www.kaggle.com/radema/arno-eda-data-enrichment)
5. [How to Develop Convolutional Neural Network Models for Time Series Forecasting by Machine Learning Mastery](https://machinelearningmastery.com/how-to-develop-convolutional-neural-network-models-for-time-series-forecasting/?fbclid=IwAR0tzQcUWavNeMXnor2AwUCMZzse6SkMqY5_m9uZ81_gjP2b7HB2jBrU03Q)
6. [How to use xgboost for Time Series Forecasting by Machine Learning Mastery](https://machinelearningmastery.com/xgboost-for-time-series-forecasting/?fbclid=IwAR0mL6rC6fpWtlw8JpL9WwyLPKETz589aleaLfkoXbMJiouZID5ekDZkvgE)

# Introduction

In this notebook I'll try to approach the Acea Smart Water Analytics competition. 
The Acea Group is one of the leading Italian multiutility operators. The company manages and develops water and electricity networks and environmental services. I'm a bit familiar with Acea and the names in the datasets since I live in Italy. 
\
In this notebook, I don't plan to address all the waterbodies but I plan to outline a precise pipeline (see their schema below):
* to analyze the different datasets grouped by waterbody type
* clean the data
* build new features
* train and validate forecasting model(s)
\
\
![image](https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F6195295%2Fcca952eecc1e49c54317daf97ca2cca7%2FAcea-Input.png?generation=1606932492951317&alt=media)
\
\
I'll try to keep it as simple as possible. In particular I will start with simple EDAs, simple models and then try to add complexity. 

# Retrieve source datasets

Let's start by retrieving the datasets. To differentiate the sources by waterbody type, I've decided to organize the datasets in dictionaries.

In [None]:
from datetime import datetime
# Day-first date parser; no longer passed to read_csv but kept so the name
# stays defined for any other cell that refers to it.
custom_date_parser = lambda x: datetime.strptime(x, '%d/%m/%Y')

df = {}
bodytype = {'Aquifer','Water','Lake','River'}
River = {}

# Load every River_*.csv below the Kaggle input directory into `River`, keyed
# by the file name without its 'River_' prefix and '.csv' extension.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if filename.endswith('.csv') and filename.startswith('River'):
            river_name = filename[:-4].replace('River_', '')
            river = pd.read_csv(os.path.join(dirname, filename))
            # read_csv's `date_parser` argument is deprecated in pandas 2.x;
            # converting with an explicit format works on every version.
            river['Date'] = pd.to_datetime(river['Date'], format='%d/%m/%Y')
            River[river_name] = river

In the following, you can find the details about the different waterbodies:
* River Arno: the Arno is the second largest river in peninsular Italy and the main waterway in Tuscany and it  has a relatively torrential regime, due to the nature of the surrounding soils (marl and impermeable clays). Output = Hydrometry
    
### River Arno

We start with the (maybe) simplest one: river Arno. Indeed, we have just one river and we need to forecast a single attribute (hydrometer). 

In [None]:
# Count the non-null values of each Arno column per calendar year and show the
# result as a heatmap (rows = columns of the dataset, columns = years).
counted_columns = [
    'Date',
    'Rainfall_Le_Croci', 'Rainfall_Cavallina', 'Rainfall_S_Agata',
    'Rainfall_Mangona', 'Rainfall_S_Piero', 'Rainfall_Vernio',
    'Rainfall_Stia', 'Rainfall_Consuma', 'Rainfall_Incisa',
    'Rainfall_Montevarchi', 'Rainfall_S_Savino', 'Rainfall_Laterina',
    'Rainfall_Bibbiena', 'Rainfall_Camaldoli', 'Temperature_Firenze',
    'Hydrometry_Nave_di_Rosano',
]
nulls = (
    River['Arno']
    .groupby(River['Arno'].Date.dt.year)
    .agg({name: 'count' for name in counted_columns})
    .transpose()
)
sns.heatmap(nulls)

From the previous pivot table, we see that:
* for year 1998, we have just the Hydrometry. All the metrics are null => I decide to drop this year
* from 1999 to 2003, we have values on temperature but not Rainfall(s) => for the moment, I want to drop these years
* several attributes for specific location are available only in a precise time range.

The last point is interesting. I can hypothesize the following explanations:
* the weather reports from these locations are outdated, so they are not used anymore for forecasting
* the weather reports are missing in this export, so we could try to retrieve these reports from open datasets

For the moment, since there're several years missing and the rainfalls are probably correlated, I prefer to focus on the following:
* Rainfall_Le_Croci	
* Rainfall_Cavallina
* Rainfall_S_Agata
* Rainfall_Mangona
* Rainfall_S_Piero
* Rainfall_Vernio

We have missing Temperature values (almost 2 years). However, I expect this attribute to have a strong seasonality. Let's see more deeply...
 

# Feature Engineering

In [None]:
def get_time_features(df):
    """Return a copy of ``df`` with calendar features derived from its ``Date`` column.

    Added columns: ``year``, ``month``, ``day_in_year``, ``week_in_year``
    (ISO week number as int) and ``year sin`` / ``year cos``, the sine and
    cosine of the day of the year mapped onto a 365.25-day cycle.
    The input frame is left untouched.
    """
    calendar = df.Date.dt
    day_of_year = calendar.dayofyear
    # Position within the year expressed as an angle, so that 31 December and
    # 1 January end up close to each other.
    angle = 2 * np.pi * day_of_year / 365.25
    return df.assign(**{
        'year': calendar.year,
        'month': calendar.month,
        'day_in_year': day_of_year,
        'week_in_year': calendar.isocalendar().week.astype(int),
        'year sin': np.sin(angle),
        'year cos': np.cos(angle),
    })

In [None]:
# Replace the Arno frame with a copy that also carries the calendar columns
# produced by get_time_features (year, month, day/week in year, yearly sin/cos).
River['Arno'] = get_time_features(River['Arno'])

In [None]:
# Predictors: the rainfall stations retained for the analysis plus the
# Firenze temperature.
features = [
    'Rainfall_Le_Croci','Rainfall_Cavallina',
    'Rainfall_S_Agata','Rainfall_Mangona','Rainfall_S_Piero',
    'Rainfall_Vernio','Rainfall_Incisa', 'Rainfall_Bibbiena',
    'Temperature_Firenze'
]

# Target to forecast.
labels = [
    'Hydrometry_Nave_di_Rosano'
]

# Derived from the two lists above so the three can never drift apart;
# the resulting order is Date, predictors, target.
columns = ['Date'] + features + labels

# Keep only the years after 2003. The explicit .copy() makes `data` an
# independent frame, so later cells can add or overwrite columns without
# triggering pandas' SettingWithCopy behaviour.
data = River['Arno'].loc[River['Arno'].year > 2003, columns].copy()


print(data.describe().transpose())
print('\n')
# One line chart per retained series (everything except Date).
for column in columns[1:]:
    data.plot.line(x='Date',y=column,figsize = (10,4),title = column.replace('_',' '))

## (Temperature) Enrich datasets with external datasources

At some point in my analysis, I had the following idea: what if there's an open dataset on the Region [website](http://sir.toscana.it/idrometria-pub) about this data?
After a bit of research, I found them (at least I think so)! Be aware that this data should be validated and eventually cleaned; however, I just want to show you how we can use this external dataset to enrich ours. This method is much simpler than building an LSTM model to...forecast missing data. 

To do: optimize the following cell

In [None]:
# Download the daily min/max temperatures published by the regional service
# and use them to fill the gaps in Temperature_Firenze.
temperature = pd.read_csv("http://sir.toscana.it/archivio/download.php?IDST=termo_csv&IDS=TOS01001095",
                          sep =';',skiprows=18).rename(
    columns = {'gg/mm/aaaa':'Date', 'Max [°C]':'Max', 'Min [°C]':'Min'}
)
temperature['Date'] = pd.to_datetime(temperature['Date'], format='%d/%m/%Y')

# Weighted daily average used as a stand-in for the missing measurements.
temperature['Avg'] = temperature.Max*0.65+temperature.Min*0.35

# Join on the date itself and bring in only the column that is needed, so no
# helper key or extra column from the download has to be dropped afterwards.
data = data.merge(temperature[['Date', 'Avg']], on='Date', how='left')
# Assign the result back: fillna(inplace=True) on `data.Temperature_Firenze`
# is a chained write that pandas does not guarantee to reach `data`.
data['Temperature_Firenze'] = data['Temperature_Firenze'].fillna(data['Avg'])
data = data.drop(columns='Avg')

print(data.groupby(data.Date.dt.year).agg({'Date':'count', 'Temperature_Firenze':'count'
                                  }).transpose())
data.reset_index().plot.line('index','Temperature_Firenze')

In [None]:
def replace_implausible_zeros(col, df=None):
    """Return column ``col`` with every run of more than two consecutive zeros set to NaN.

    Parameters
    ----------
    col : str
        Name of the column to clean.
    df : pandas.DataFrame, optional
        Frame holding the column. Defaults to the notebook-level ``data``
        frame, which is what the original one-argument call used.

    Returns
    -------
    pandas.Series
        A new series with the same index as ``df[col]``; the input frame is
        not modified. Isolated zeros and runs of one or two zeros are kept.
    """
    if df is None:
        df = data
    series = df[col].copy()
    # A new run starts wherever the value differs from the previous row.
    run_id = series.ne(series.shift(1)).cumsum()
    run_length = series.groupby(run_id).transform('size')
    implausible = series.eq(0) & run_length.gt(2)
    if not implausible.any():
        # Nothing to blank out: return the column with its dtype untouched.
        return series
    return series.mask(implausible)

# Hyperparameters handed to lgb.train inside predict_missing_temperature:
# an L1 (MAE) regression objective evaluated with MAE, shallow trees, a very
# small learning rate, a fixed seed and logging switched off.
# NOTE(review): 'verbosity' and 'verbose' both carry -1; they look like two
# spellings of the same setting - confirm against the LightGBM parameter docs.
params = {'num_leaves': 40,
          'objective': 'regression_l1',
          'max_depth': 4,
          'learning_rate': 0.001,
          "metric": 'mae',
          "verbosity": -1,
          'verbose': -1,
          'seed' : 42
         }

def predict_missing_temperature(data,target, labels):
    """Fill the missing values of ``data[target]`` in place with a LightGBM model.

    Every column of ``data`` except ``Date``, the ``labels`` columns and
    ``target`` itself is used as a predictor. The model is fitted on the rows
    where ``target`` is known (80/20 random split for early stopping) using
    the notebook-level ``params`` dict. Despite its name the function works
    for any numeric column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Date`` column; ``data[target]`` is overwritten.
    target : str
        Column whose NaN values are replaced by model predictions.
    labels : list of str
        Columns that must not be used as predictors.

    Returns
    -------
    None. A comparison plot of the original and imputed series is shown.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_absolute_error
    import lightgbm as lgb

    rolling_window = 28

    excluded = set(labels) | {'Date', target}
    predictors = [name for name in data.columns if name not in excluded]

    known = data[data[target].notna()]
    X = known[predictors]
    y = known[[target]]

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    dtrain = lgb.Dataset(X_train, y_train, params= {'verbose': -1})
    dvalid = lgb.Dataset(X_valid, y_valid, params= {'verbose': -1})

    # For analysis set 'verbose_eval' to 200.
    # NOTE(review): recent LightGBM releases appear to have replaced the
    # `verbose_eval` / `early_stopping_rounds` keywords with callbacks -
    # confirm against the installed version before upgrading.
    clf = lgb.train(params, dtrain, 10000, valid_sets = [dtrain, dvalid], verbose_eval=False,  early_stopping_rounds=100)

    # train_test_split shuffles the rows, so a rolling mean over the raw
    # validation split would average unrelated rows. Re-sort both series by
    # their index (the row order of `data`, assumed chronological) first so
    # the reported figure really is the MAE of 28-row rolling means.
    valid_true = y_valid[target].sort_index()
    valid_pred = pd.Series(clf.predict(X_valid), index=X_valid.index).sort_index()
    rolling_mae = mean_absolute_error(
        valid_true.rolling(rolling_window).mean().iloc[rolling_window:],
        valid_pred.rolling(rolling_window).mean().iloc[rolling_window:],
    )

    # Predictions for every row; only those at missing positions are used.
    y_preds = clf.predict(data[predictors])

    f, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 4))
    ax.set_title(f'Rolling Mean Temperatures for {target} \n(Rolling Window: 28 Days, MAE: {rolling_mae})', fontsize=16)
    # Keep the pre-imputation series for the plot; gaps become inf so they
    # are not drawn as ordinary values.
    old = data[target].copy().replace({np.nan : np.inf})
    # In-place update: callers rely on `data` being modified.
    data[target] = np.where(data[target].isna(), y_preds, data[target])
    sns.lineplot(x=data.Date, y=data[target].rolling(rolling_window).mean(), label='Imputed', color='darkorange')
    sns.lineplot(x=data.Date, y=old, label='Original', color='dodgerblue')
    ax.set_xlabel('Date', fontsize=14)
    ax.set_xlim([data.Date.to_list()[0],
                 data.Date.to_list()[-1]]
                 )
    plt.show()

# Sort temperature columns according to amount of missing values.
# Fill NaN values for features with least missing values first

#example_col = 'Temperature_Velletri'
#predict_missing_temperature(example_col)

In [None]:
# Step 1: blank out implausible zero runs in every predictor.
for col in features:
    data[col] = replace_implausible_zeros(col)

# Step 2: impute the predictors one by one, starting with the column that has
# the fewest gaps so that later models can use the already completed ones.
missing_per_feature = data[features].isna().sum(axis=0)
sorted_cols = missing_per_feature.sort_values().index

for feature in sorted_cols:
    predict_missing_temperature(data, feature , labels)

In [None]:
# Non-null counts per column and year after imputation, shown as a heatmap.
nulls = data.groupby(data.Date.dt.year).count().transpose()
sns.heatmap(nulls)
print('\n')
# Re-draw every retained series to inspect the imputed values.
for column in columns[1:]:
    data.plot.line(x='Date', y=column, figsize=(10, 4), title=column.replace('_', ' '))

In [None]:
# Add seasonal components
from statsmodels.tsa import seasonal


# Split the temperature into trend, yearly seasonality and residual.
decomposed = seasonal.seasonal_decompose(
    data['Temperature_Firenze'], period = 366, extrapolate_trend = 'freq'
)
decomposed.plot()
data['Temperature_Trend'] = decomposed.trend
data['Temperature_Season'] = decomposed.seasonal
data['Temperature_Resid'] = decomposed.resid

# Put the three components in front of the existing predictors. Filtering
# them out first keeps the list free of duplicates when this cell is re-run.
decomposition_columns = ['Temperature_Trend','Temperature_Season','Temperature_Resid']
features = decomposition_columns + [name for name in features if name not in decomposition_columns]

# Model Engineering (XGBoost)

In [None]:
# Re-index on a gap-free daily calendar so that missing days become explicit
# rows of NaN.
start = data.Date.min()
end  = data.Date.max()

index_col = pd.date_range(start = start, end = end, freq = 'D')
data = data.set_index('Date').reindex(index_col)

# Treat zero readings of the target as missing. A single .loc assignment is
# used because the chained form `data[col][mask] = ...` is not guaranteed to
# write into `data`.
data.loc[data['Hydrometry_Nave_di_Rosano'] == 0, 'Hydrometry_Nave_di_Rosano'] = np.nan

In [None]:
# The title is spelled out instead of being taken from `column`, a loop
# variable left over from an earlier cell.
data.reset_index().plot.line(x='index',y='Hydrometry_Nave_di_Rosano',figsize = (20,9),title = 'Hydrometry Nave di Rosano')

In [None]:
# Supervised frame: today's target next to yesterday's predictors and target
# (suffix '_shifted_1'); rows with any missing value are discarded.
lagged = data[features + labels].shift(1).add_suffix('_shifted_1')
train = pd.concat([data[labels], lagged], axis=1).dropna()
# Model the target on a shifted log scale; predictions are mapped back with
# exp(value - 1).
train['Hydrometry_Nave_di_Rosano'] = 1 + np.log(train.Hydrometry_Nave_di_Rosano)

In [None]:
# Restore Date as a column, add the calendar features, then hold out
# everything from 2019 onwards for validation.
dated = get_time_features(train.reset_index().rename(columns={'index': 'Date'}))
is_validation = dated.year >= 2019
val = dated[is_validation]
train = dated[~is_validation]
train.info()

In [None]:
# xgboost
import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split, GridSearchCV


# define model
model = xgb.XGBRegressor()

# Series used with a one-day lag. Rainfall_Vernio, Rainfall_Incisa and
# Rainfall_Bibbiena are deliberately left out.
lagged_inputs = [
    'Temperature_Season',
    'Rainfall_Le_Croci',
    'Rainfall_Cavallina',
    'Rainfall_S_Agata',
    'Rainfall_Mangona',
    'Rainfall_S_Piero',
    'Hydrometry_Nave_di_Rosano',
]
model_features = [f'{name}_shifted_1' for name in lagged_inputs] + ['year sin', 'year cos']
label = ['Hydrometry_Nave_di_Rosano']

X = train[model_features]
Y = train[label]

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_tweedie_deviance

# Grid search over tree depth, number of trees and L1 regularisation, scored
# with the (negated) mean Poisson deviance on 7 unshuffled folds.
print("Parameter optimization")
xgb_model = xgb.XGBRegressor(n_jobs=1)
clf = GridSearchCV(xgb_model,scoring = 'neg_mean_poisson_deviance',
                   param_grid = {'max_depth': [2, 4, 6],
                                 'n_estimators': [50, 100, 200],
                                 'alpha': [0,0.01,0.001]
                                 ,'objective': ['reg:tweedie']
                                 ,'tweedie_variance_power':[1]
                                }, verbose=1, n_jobs=1, cv = KFold(n_splits=7, shuffle=False))
clf.fit(X.values, Y.values)
print(clf.best_score_)
print(clf.best_params_)
param = clf.best_params_

# Refit the best configuration on each half of the data and report the errors
# on the other half, after undoing the 1 + log transform of the target.
kf = KFold(n_splits=2, shuffle=False)
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBRegressor(**clf.best_params_, n_jobs=1).fit(X.iloc[train_index,:].values, Y.iloc[train_index,:].values)
    predictions = xgb_model.predict(X.iloc[test_index,:].values)
    actuals = Y.iloc[test_index,:].values.ravel()
    actual_level = np.exp(actuals-1)
    predicted_level = np.exp(predictions-1)
    print('MSE:\n')
    print(mean_squared_error(actual_level, predicted_level))
    print('MAE:\n')
    print(mean_absolute_error(actual_level, predicted_level))
    # mean_tweedie_deviance with power=1 is the Poisson deviance, not a MAPE,
    # so it is labelled as such.
    print('Poisson deviance:\n')
    print(mean_tweedie_deviance(actual_level, predicted_level,power = 1))

xgb.plot_importance(xgb_model)

In [None]:
model_features = ['Temperature_Season_shifted_1'
    ,'Rainfall_Le_Croci_shifted_1'
    ,'Rainfall_Cavallina_shifted_1'
    ,'Rainfall_S_Agata_shifted_1'
    ,'Rainfall_Mangona_shifted_1'
    ,'Rainfall_S_Piero_shifted_1'
    ,'Hydrometry_Nave_di_Rosano_shifted_1'
    ,'year sin'
    ,'year cos'
           ]

# Build the prediction frame from the real (re-indexed) dataset: today's
# target next to yesterday's predictors and target. Rows with gaps are kept.
df = pd.concat([data[labels]
           ,data[features+labels].shift(1).add_suffix('_shifted_1')
          ]
          ,axis = 1)
df['Hydrometry_Nave_di_Rosano'] = 1+  np.log(df.Hydrometry_Nave_di_Rosano)
df = get_time_features(df.reset_index().rename(columns={'index':'Date'}))
# Drop the first row: shift(1) leaves all of its lagged columns null.
df = df.iloc[1:].copy()
# Float column from the start, so storing predictions never changes its dtype.
df['predicted'] = np.nan

# Walk through the rows in order. Whenever yesterday's hydrometry is missing,
# feed the model its own previous prediction instead.
lag_col = 'Hydrometry_Nave_di_Rosano_shifted_1'
last_prediction = np.nan
for i, row in df.iterrows():
    inputs = row[model_features].astype(float)
    # `value == np.nan` is always False, so the check must use pd.isna.
    if pd.isna(inputs[lag_col]) and not pd.isna(last_prediction):
        # The model outputs 1 + log(level) while the lagged feature is on the
        # raw scale, so the previous prediction is mapped back first.
        inputs[lag_col] = np.exp(last_prediction - 1)
    last_prediction = float(xgb_model.predict(inputs.to_numpy().reshape(1, -1))[0])
    df.loc[i,'predicted'] = last_prediction

# Observed values where available, model predictions elsewhere.
df['Imputed'] = df.Hydrometry_Nave_di_Rosano.fillna(df.predicted)
df.plot.line(x = 'Date', y = ['predicted','Hydrometry_Nave_di_Rosano'])

In [None]:
# Bar chart of the importances of the last fitted XGBoost model, one bar per
# entry of model_features.
importances = xgb_model.feature_importances_

fig, ax = plt.subplots()
fig1 = ax.bar(
    x=range(len(importances)),
    height=importances,
    tick_label=model_features,
)
ax.set_title('Feature Importances')
ax.set_ylabel('Importances')
plt.xticks(rotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Model Engineering (LSTM) (Removed)

In [None]:
# Fill the target with an order-5 spline and every remaining gap linearly.
# Results are assigned back explicitly: `inplace=True` on a selected column is
# a chained write that pandas does not guarantee to reach `data`.
data['Hydrometry_Nave_di_Rosano'] = data['Hydrometry_Nave_di_Rosano'].interpolate(method='spline',order = 5)
data = data.interpolate(method='linear')

In [None]:
# Title written out explicitly: the leftover loop variable `column` named a
# different series than the one plotted here.
data.reset_index().plot.line(x='index',y='Temperature_Firenze',figsize = (20,9),title = 'Temperature Firenze')

In [None]:

# Decompose the (now gap-free) temperature again and refresh its components.
decomposed = seasonal.seasonal_decompose(
    data['Temperature_Firenze'], period=366, extrapolate_trend='freq'
)
decomposed.plot()
data['Temperature_Trend'] = decomposed.trend
data['Temperature_Season'] = decomposed.seasonal
data['Temperature_Resid'] = decomposed.resid

# Four stacked panels drawn against the row position:
# (series to draw, y-axis label, extra line options).
positions = data.reset_index().index
panels = [
    ('Hydrometry_Nave_di_Rosano', 'Hydrometry', {'color': 'dodgerblue', 'label': 'Hydrometry'}),
    ('Temperature_Season', 'Temperature', {'color': 'dodgerblue', 'label': 'Seasonal'}),
    ('Temperature_Trend', 'Temperature', {'color': 'dodgerblue', 'label': 'Trend'}),
    ('Rainfall_Le_Croci', 'Rainfall', {'color': 'lightblue'}),
]

f, ax = plt.subplots(nrows=4, ncols=1, figsize=(15, 12))
f.suptitle('Seasonal Components of Features', fontsize=16)
for panel_ax, (series_name, y_label, line_options) in zip(ax, panels):
    sns.lineplot(x=positions, y=data[series_name], ax=panel_ax, **line_options)
    panel_ax.set_ylabel(ylabel=y_label, fontsize=14)
    panel_ax.set_xlim([positions[0], positions[-1]])

plt.tight_layout()
plt.show()

In [None]:
df = get_time_features(data.reset_index().rename(columns = {'index':'Date'}))
features = ['Hydrometry_Nave_di_Rosano',
            'Rainfall_Le_Croci', 'Rainfall_Cavallina', 'Rainfall_S_Agata',
            'Rainfall_Mangona', 'Rainfall_S_Piero',
            'Temperature_Season',
           'year sin','year cos'
           ]
# .loc + .copy() yields an independent frame, so the column writes below
# never land on a view of the larger frame.
df = df.loc[df.year<2020, features].copy()
df['Hydrometry_Nave_di_Rosano'] = 1 + np.log(df['Hydrometry_Nave_di_Rosano'])

print(df.columns)
column_indices = {name: i for i, name in enumerate(df.columns)}
num_features = df.shape[1]

# Columns that receive the Box-Cox transform (rainfall plus the target).
rainfall_columns = [
    'Rainfall_Le_Croci', 'Rainfall_Cavallina', 'Rainfall_S_Agata',
       'Rainfall_Mangona', 'Rainfall_S_Piero',
    'Hydrometry_Nave_di_Rosano',
]

# With a fixed lambda the Box-Cox transform works value by value, so it is
# applied once to the whole frame before splitting. The three splits come out
# the same as transforming each one separately, and `df` itself is now on the
# scale that train_mean / train_std describe (needed for the violin plot).
for feature in rainfall_columns:
    df[feature] = boxcox(df[feature] + 0.5,-1)

# Chronological 70 / 20 / 10 split.
n = len(df)
train_df = df.iloc[0:int(n*0.7)]
val_df = df.iloc[int(n*0.7):int(n*0.9)]
test_df = df.iloc[int(n*0.9):]

# Standardise every split with the statistics of the training part only.
train_mean = train_df.mean()
train_std= train_df.std()

train_df = (train_df - train_mean) / train_std
val_df = (val_df - train_mean) / train_std
test_df = (test_df - train_mean) / train_std

# Distribution of every normalised column over the whole period.
df_std = (df - train_mean) / train_std
df_std = df_std.melt(var_name='Column', value_name='Normalized')
plt.figure(figsize=(12, 6))
ax = sns.violinplot(x='Column', y='Normalized', data=df_std)
_ = ax.set_xticklabels(df.keys(), rotation=90)

In [None]:
## Forecasting

In this section we focus on building a forecasting model for our scenario. I've decided to use Tensorflow (because I'm studying it). In particular, I've largely used and experimented what is written in the [official tutorial](https://www.tensorflow.org/tutorials/structured_data/time_series). The following functions and classes are retrieved from this tutorial. 

I want to build a single-step model to predict what happens next, and a multi-step model, i.e. given data about past events, I want to predict a sequence of future values.

In [None]:
class WindowGenerator():
    """Describes how consecutive rows are cut into (inputs, labels) windows.

    A window spans ``input_width + shift`` rows: the first ``input_width``
    rows are the inputs and the last ``label_width`` rows are the labels.
    The train/validation/test frames default to the notebook-level ones that
    exist when the class is defined.
    """

    def __init__(self, input_width, label_width, shift,
                 train_df=train_df, val_df=val_df, test_df=test_df,
                 label_columns=None):
        # Raw data.
        self.train_df, self.val_df, self.test_df = train_df, val_df, test_df

        # Position of each label column within the labels, and of every
        # column within the frames.
        self.label_columns = label_columns
        if label_columns is not None:
            self.label_columns_indices = {
                name: position for position, name in enumerate(label_columns)
            }
        self.column_indices = {
            name: position for position, name in enumerate(train_df.columns)
        }

        # Window geometry.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift
        self.total_window_size = input_width + shift
        window_positions = np.arange(self.total_window_size)

        self.input_slice = slice(0, input_width)
        self.input_indices = window_positions[self.input_slice]

        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = window_positions[self.labels_slice]

    def __repr__(self):
        return (
            f'Total window size: {self.total_window_size}\n'
            f'Input indices: {self.input_indices}\n'
            f'Label indices: {self.label_indices}\n'
            f'Label column name(s): {self.label_columns}'
        )
    
def split_window(self, features):
    """Split a batch of windows into its input part and its label part.

    ``features`` is indexed as [batch, time, column]. The inputs are the rows
    selected by ``self.input_slice``; the labels are the rows selected by
    ``self.labels_slice``, restricted to ``self.label_columns`` when that is
    set. Returns the pair ``(inputs, labels)``.
    """
    inputs = features[:, self.input_slice, :]
    labels = features[:, self.labels_slice, :]

    wanted = self.label_columns
    if wanted is not None:
        selected = [labels[:, :, self.column_indices[name]] for name in wanted]
        labels = tf.stack(selected, axis=-1)

    # Slicing doesn't preserve static shape information, so the time
    # dimension is set by hand; this keeps the datasets easy to inspect.
    for tensor, width in ((inputs, self.input_width), (labels, self.label_width)):
        tensor.set_shape([None, width, None])

    return inputs, labels
    
WindowGenerator.split_window = split_window

def plot(self, model=None, plot_col='Hydrometry_Nave_di_Rosano', max_subplots=3):
    """Plot inputs, labels and (optionally) predictions for a few example windows.

    Parameters
    ----------
    model : callable, optional
        When given, it is called once on the example inputs and its output is
        drawn next to the labels.
    plot_col : str
        Column to display; must be a key of ``self.column_indices``.
    max_subplots : int
        Upper bound on the number of windows shown, one subplot per window.
    """
    inputs, labels = self.example
    plt.figure(figsize=(12, 8))
    plot_col_index = self.column_indices[plot_col]
    max_n = min(max_subplots, len(inputs))

    # Position of the plotted column inside the labels; None when the column
    # is not one of the label columns.
    if self.label_columns:
        label_col_index = self.label_columns_indices.get(plot_col, None)
    else:
        label_col_index = plot_col_index

    # The predictions do not depend on the subplot, so compute them once
    # instead of once per panel.
    predictions = None
    if model is not None and label_col_index is not None and max_n > 0:
        predictions = model(inputs)

    for n in range(max_n):
        # Size the grid from the number of panels actually drawn, so values
        # of max_subplots other than 3 work as well.
        plt.subplot(max_n, 1, n+1)
        plt.ylabel(f'{plot_col} [normed]')
        plt.plot(self.input_indices, inputs[n, :, plot_col_index],
                 label='Inputs', marker='.', zorder=-10)

        if label_col_index is None:
            continue

        plt.scatter(self.label_indices, labels[n, :, label_col_index],
                    edgecolors='k', label='Labels', c='#2ca02c', s=64)
        if predictions is not None:
            plt.scatter(self.label_indices, predictions[n, :, label_col_index],
                        marker='X', edgecolors='k', label='Predictions',
                        c='#ff7f0e', s=64)

        if n == 0:
            plt.legend()

    plt.xlabel('Time')

WindowGenerator.plot = plot

def make_dataset(self, data):
    """Turn a frame into a shuffled, batched dataset of (inputs, labels) windows.

    The rows are converted to float32, cut into windows of
    ``self.total_window_size`` consecutive rows (stride 1, batches of 128)
    and each batch is passed through ``self.split_window``.
    """
    values = np.array(data, dtype=np.float32)
    windows = tf.keras.preprocessing.timeseries_dataset_from_array(
        data=values,
        targets=None,
        sequence_length=self.total_window_size,
        sequence_stride=1,
        shuffle=True,
        batch_size=128,
    )
    return windows.map(self.split_window)

WindowGenerator.make_dataset = make_dataset

@property
def train(self):
    """Windowed dataset built from ``self.train_df`` (rebuilt on every access)."""
    frame = self.train_df
    return self.make_dataset(frame)

@property
def val(self):
    """Windowed dataset built from ``self.val_df`` (rebuilt on every access)."""
    frame = self.val_df
    return self.make_dataset(frame)

@property
def test(self):
    """Windowed dataset built from ``self.test_df`` (rebuilt on every access)."""
    frame = self.test_df
    return self.make_dataset(frame)

@property
def example(self):
    """Return one batch of `inputs, labels` for plotting, cached after first use."""
    cached = getattr(self, '_example', None)
    if cached is not None:
        return cached
    # Nothing cached yet: take the first batch of the `.train` dataset and
    # remember it for next time.
    self._example = next(iter(self.train))
    return self._example

WindowGenerator.train = train
WindowGenerator.val = val
WindowGenerator.test = test
WindowGenerator.example = example

MAX_EPOCHS = 50

def compile_and_fit(model, window, learning_rate = 0.01, epochs = MAX_EPOCHS, patience=2):
    """Compile ``model`` with an MAE loss, fit it on ``window`` and plot the curves.

    Training uses Adam with the given learning rate and stops early once
    ``val_loss`` has not improved for ``patience`` epochs. MAE, MAPE and
    Poisson are tracked as metrics; one small chart per tracked quantity
    compares the training and validation curves.

    Returns the Keras ``History`` object produced by ``model.fit``.
    """
    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=patience, mode='min')

    model.compile(
        loss=tf.losses.MAE,
        optimizer=tf.optimizers.Adam(learning_rate = learning_rate),
        metrics=[
            tf.metrics.MeanAbsoluteError(),
            tf.metrics.MeanAbsolutePercentageError(),
            tf.metrics.Poisson(),
        ],
    )

    history = model.fit(
        window.train,
        epochs=epochs,
        validation_data=window.val,
        callbacks=[stop_early],
    )

    # Training versus validation curve for the loss and every metric.
    curves = pd.DataFrame(history.history)
    for name in ('loss', 'mean_absolute_error', 'mean_absolute_percentage_error', 'poisson'):
        curves.plot.line(y=[name, 'val_' + name], figsize=(5, 3))

    return history

In [None]:
class Baseline(tf.keras.Model):
    """Model without trainable logic that returns its inputs as the prediction.

    With ``label_index`` set, only that column of the inputs is returned
    (keeping a trailing axis of size one); otherwise the inputs are returned
    unchanged.
    """

    def __init__(self, label_index=None):
        super().__init__()
        self.label_index = label_index

    def call(self, inputs):
        if self.label_index is not None:
            selected = inputs[:, :, self.label_index]
            return selected[:, :, tf.newaxis]
        return inputs

In [None]:
# Windows of three inputs whose labels are the same series one step later.
single_step_window = WindowGenerator(
    input_width=3, label_width=3, shift=1,
    label_columns=['Hydrometry_Nave_di_Rosano'],
)

# The baseline returns the hydrometry column of its inputs as the prediction.
baseline = Baseline(label_index=column_indices['Hydrometry_Nave_di_Rosano'])
baseline.compile(
    loss=tf.losses.MAE,
    metrics=[tf.metrics.MeanAbsoluteError(), tf.metrics.MeanAbsolutePercentageError()],
)

# Results per model name; further models add their own entry later.
val_performance = {'Baseline': baseline.evaluate(single_step_window.val)}
performance = {'Baseline': baseline.evaluate(single_step_window.test, verbose=0)}

In [None]:
# One input step, one label step, one step ahead.
wide_window = WindowGenerator(
    input_width=1, label_width=1, shift=1,
    label_columns=['Hydrometry_Nave_di_Rosano'],
)

lstm_model = tf.keras.models.Sequential()
# Shape [batch, time, features] => [batch, time, lstm_units]
lstm_model.add(tf.keras.layers.LSTM(32, return_sequences=True))
# Shape => [batch, time, features]
lstm_model.add(tf.keras.layers.Dense(units=1))

history = compile_and_fit(lstm_model, wide_window, learning_rate = 0.001)

# Store the scores next to the baseline's and plot example predictions.
val_performance['LSTM'] = lstm_model.evaluate(wide_window.val)
performance['LSTM'] = lstm_model.evaluate(wide_window.test, verbose=0)

wide_window.plot(lstm_model)

In [None]:

# Grouped bars: validation and test MAE for every evaluated model.
x = np.arange(len(performance))
width = 0.3
metric_name = 'mean_absolute_error'
# Position of the MAE inside the list returned by evaluate().
metric_index = lstm_model.metrics_names.index(metric_name)
val_mae = [v[metric_index] for v in val_performance.values()]
test_mae = [v[metric_index] for v in performance.values()]

plt.ylabel('mean_absolute_error [Hydrometry, normalized]')
plt.bar(x - 0.17, val_mae, width, label='Validation')
plt.bar(x + 0.17, test_mae, width, label='Test')
plt.xticks(ticks=x, labels=performance.keys(),
           rotation=45)
_ = plt.legend()


# Print the same metric that is plotted; the looked-up position is used
# instead of a hard-coded 1 so the table and the chart always agree.
for name, value in performance.items():
    print(f'{name:12s}: {value[metric_index]:0.4f}')

In [None]:
# First weight array of the LSTM layer, fetched once; each row is summarised
# by its mean (bar height) and its variance (error bar).
kernel = lstm_model.layers[0].weights[0].numpy()

plt.bar(
    x=range(0, kernel.shape[0]),
    height=np.average(kernel, axis=1),
    yerr=np.var(kernel, axis=1),
    align='center', ecolor='black', capsize=10,
)
axis = plt.gca()
axis.set_xticks(range(len(train_df.columns)))
_ = axis.set_xticklabels(train_df.columns, rotation=90)