In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.stats import boxcox

# plotting
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px
#pd.options.plotting.backend = "plotly"

# settings
# Matplotlib 3.6 renamed the bundled seaborn style sheet to 'seaborn-v0_8' and
# newer releases no longer accept the old name, so use the new name when this
# installation provides it and fall back to the legacy name otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.rcParams["figure.figsize"] = (16, 8)

import tensorflow as tf

# Display up to 500 columns when rendering wide DataFrames.
pd.options.display.max_columns = 500

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

In this notebook I'll try to approach the Acea Smart Water Analytics competition. 
The Acea Group is one of the leading Italian multiutility operators. The company manages and develops water and electricity networks and environmental services. I'm a bit familiar with Acea and the names in the datasets since I live in Italy. 
\
In this notebook, I don't plan to address all the waterbodies, but I plan to outline a precise pipeline (see their schema below):
* to analyze the different datasets grouped by waterbody type
* clean the data
* build new features
* train and validate forecasting model(s)
\
\
![image](https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F6195295%2Fcca952eecc1e49c54317daf97ca2cca7%2FAcea-Input.png?generation=1606932492951317&alt=media)
\
\
I'll try to keep it as simple as possible. In particular I will start with simple EDAs, simple models and then try to add complexity. 

# Retrieve source datasets

Let's start by retrieving the datasets. To differentiate the sources by waterbody type, I've decided to organize the datasets in dictionaries.

In [None]:
from datetime import datetime

# Parses a single 'dd/mm/YYYY' string. Kept so that any other cell relying on
# this name keeps working; the loader below converts whole columns instead.
custom_date_parser = lambda x: datetime.strptime(x, '%d/%m/%Y')

df = {}
bodytype = {'Aquifer','Water','Lake','River'}
# One dictionary per waterbody type, keyed by the waterbody name taken from the file name.
Aquifer = {}
Water = {}
Lake = {}
River = {}

# filename prefix -> (target dict, text removed from the key,
#                     extra read_csv arguments, explicit 'Date' format or None)
# Aquifer and Water 'Date' columns are left as plain strings, as before.
# NOTE(review): the Lake file is assumed to use day-first dates like the River
# file; without dayfirst=True an ambiguous date such as 03/06/2002 would be
# read month-first.
_SOURCES = (
    ('Aquifer', Aquifer, 'Aquifer_', {}, None),
    ('Water', Water, 'Water_Spring_', {}, None),
    ('Lake', Lake, 'Lake_', {'parse_dates': ['Date'], 'dayfirst': True}, None),
    ('River', River, 'River_', {}, '%d/%m/%Y'),
)

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if filename[-4:] != '.csv':
            continue
        stem = filename[:-4]
        for prefix, registry, key_prefix, read_kwargs, date_format in _SOURCES:
            if not filename.startswith(prefix):
                continue
            frame = pd.read_csv(os.path.join(dirname, filename), **read_kwargs)
            if date_format is not None:
                # Convert after reading: read_csv's `date_parser` argument is
                # deprecated, while pd.to_datetime with an explicit format
                # yields the same datetime column.
                frame['Date'] = pd.to_datetime(frame['Date'], format=date_format)
            registry[stem.replace(key_prefix, '')] = frame
            break
        # Printed for every CSV file, whether or not it matched a known prefix.
        print('\nDone File: ',stem)

Here we have all the available Aquifers, Waters, Lake (just one) and River (just one).

In [None]:
# Print the keys of each waterbody dictionary under its own heading,
# with a blank separator between consecutive groups (none after the last).
waterbody_groups = [
    ('Available Aquifers:', Aquifer),
    ('Available Waters:', Water),
    ('Available Lakes:', Lake),
    ('Available River:', River),
]
for position, (heading, registry) in enumerate(waterbody_groups):
    if position > 0:
        print('\n')
    print(heading)
    for key in registry.keys():
        print(key)

In the following, you can find the details about the different waterbodies:
* River Arno: the Arno is the second largest river in peninsular Italy and the main waterway in Tuscany and it  has a relatively torrential regime, due to the nature of the surrounding soils (marl and impermeable clays). Output = Hydrometry
* Lake Bilancino: it is an artificial lake in Mugello, in the province of Florence. It has a maximum depth of thirty-one metres and a surface area of 5 square kilometres. Output(s): Lake level, flow rate
* Aquifer: Output(s): Depth to Groundwater
    * Auser: this water body consists of two subsystems, that we call NORTH and SOUTH, where the former partly influences the behaviour of the latter. The levels of the NORTH sector are represented by the values of the SAL, PAG, CoS and DIEC wells, while the levels of the SOUTH sector by the LT2 well.
    * Luco: it is an underground aquifer not fed by rivers or lakes but fed by meteoric infiltration and it is accessed through wells called Pozzo_1, Pozzo_3 and Pozzo_4.
    * Petrignano: it is fed by three underground aquifers separated by low permeability septa; the water table can be considered groundwater and is also fed by the Chiascio river.
    * Doganella: the Doganella well field is fed by two underground aquifers: the upper stratum is a water table with a thickness of about 30m while the lower one is a semi-confined artesian aquifer with a thickness of 50m.
* WaterSprings: Output(s): Flow rate
    * Amiata : This aquifer is accessed through the Ermicciolo, Arbure, Bugnano and Galleria Alta springs. The levels and volumes of the four springs are influenced by the parameters: pluviometry, sub-gradation, hydrometry, temperatures and drainage volumes.
    * Madonna di Canneto: The Madonna di Canneto spring is situated at an altitude of 1010m above sea level in the Canneto valley. It does not consist of an aquifer and its source is supplied by the water catchment area of the river Melfa
    * Lupa: It is located in the Arrone area and is used for drinking use.

### River Arno

We start with the (maybe) simplest one: river Arno. Indeed, we have just one river and we need to forecast a single attribute (hydrometer). 

In [None]:
# Show column dtypes and non-null counts for the Arno table.
River['Arno'].info()

In [None]:
# Summary statistics, transposed so that each column of the table becomes one row.
River['Arno'].describe().transpose()

In [None]:
# Derive calendar helpers from the 'Date' column:
#   yyyymm -> year*100 + month, yyyy -> year.
arno_date_parts = River['Arno'].Date.dt
River['Arno']['yyyymm'] = (arno_date_parts.year)*100 + arno_date_parts.month
River['Arno']['yyyy'] = arno_date_parts.year

In [None]:
# Count the non-null values of every listed column per year; the result is
# transposed so that rows are columns of the table and columns are years.
counted_columns = [
    'Date',
    'Rainfall_Le_Croci', 'Rainfall_Cavallina', 'Rainfall_S_Agata',
    'Rainfall_Mangona', 'Rainfall_S_Piero', 'Rainfall_Vernio',
    'Rainfall_Stia', 'Rainfall_Consuma', 'Rainfall_Incisa',
    'Rainfall_Montevarchi', 'Rainfall_S_Savino', 'Rainfall_Laterina',
    'Rainfall_Bibbiena', 'Rainfall_Camaldoli', 'Temperature_Firenze',
    'Hydrometry_Nave_di_Rosano',
]
count_spec = {name: 'count' for name in counted_columns}
River['Arno'].groupby('yyyy').agg(count_spec).transpose()

From the previous pivot table, we see that:
* for year 1998, we have just the Hydrometry. All the metrics are null => I decide to drop this year
* from 1999 to 2003, we have values for temperature but not for the Rainfall(s) => for the moment, I want to drop these years
* several attributes for specific location are available only in a precise time range.

The last point is interesting. I can hypothesize the following explanations:
* the weather reports from these locations are outdated, so they are not used anymore for forecasting
* the weather reports are missing in this export, so we could try to retrieve these reports from open datasets

For the moment, since there're several years missing and the rainfalls are probably correlated, I prefer to focus on the following:
* Rainfall_Le_Croci	
* Rainfall_Cavallina
* Rainfall_S_Agata
* Rainfall_Mangona
* Rainfall_S_Piero
* Rainfall_Vernio

We have missing Temperature values (almost 2 years). However, I expect this attribute to have a strong seasonality. Let's see more deeply...
 

In [None]:
# Columns kept for the analysis: the date, five rainfall stations,
# the temperature and the hydrometry target.
columns = [
    'Date','Rainfall_Le_Croci','Rainfall_Cavallina',
    'Rainfall_S_Agata','Rainfall_Mangona','Rainfall_S_Piero',
    'Temperature_Firenze',
    'Hydrometry_Nave_di_Rosano'
]

# Keep only the rows after 2003. Selecting with .loc and taking an explicit
# copy makes `data` an independent frame, so adding the 'Rainfall' column below
# neither raises SettingWithCopyWarning nor depends on view/copy semantics.
data = River['Arno'].loc[River['Arno'].yyyy>2003, columns].copy()

# Total rainfall over the five selected stations (missing values are skipped by sum).
data['Rainfall'] = data[['Rainfall_Le_Croci','Rainfall_Cavallina',
    'Rainfall_S_Agata','Rainfall_Mangona','Rainfall_S_Piero',]].sum(axis = 1)

print(data.describe().transpose())
print('\n')
# One line chart per selected column (everything except 'Date').
for column in columns[1:]:
    data.plot.line(x='Date',y=column,figsize = (20,9),title = column.replace('_',' '))


From the previous plots we see that:
* Temperature and Hydrometry are strongly seasonal (and probably correlated)
* We could "forecast" the temperature for the missing years
* The hydrometer seems to have some "wrong" values equal to 0. 

In [None]:
# Column dtypes and non-null counts of the selected working table.
data.info()

In [None]:
# Temperature against the day of the year, one coloured line per calendar year.
temperature_ax = sns.lineplot(
    x=data.Date.dt.dayofyear,
    y=data.Temperature_Firenze,
    hue=data.Date.dt.year,
    legend='full',
)

# Title for the chart.
temperature_ax.set_title('Seasonal plot')

# Place the legend just outside the right-hand edge of the axes.
temperature_ax.legend(bbox_to_anchor=(1.05, 1), loc=2);

In [None]:
# Hydrometry against the row index of `data`, coloured by calendar year.
hydrometry_ax = sns.lineplot(
    x=data.index,
    y=data.Hydrometry_Nave_di_Rosano,
    hue=data.Date.dt.year,
    legend='full',
)

# Title for the chart.
hydrometry_ax.set_title('Seasonal plot')

# Place the legend just outside the right-hand edge of the axes.
hydrometry_ax.legend(bbox_to_anchor=(1.05, 1), loc=2);

In [None]:
# Re-check dtypes and non-null counts of `data` before enriching it with external data.
data.info()

### Enrich datasets with external datasources

At some point in my analysis, I had the following idea: what if there's an open dataset on the Region [website](http://sir.toscana.it/idrometria-pub) about this data?
After a bit of research, I found them (at least I think so)! Be aware that this data should be validated and eventually cleaned; however, I just want to show you how we can use this external dataset to enrich ours. This method is much simpler than building an LSTM model to...forecast missing data. 

In [None]:
# Download the temperature series published by the regional service.
# The file is ';'-separated and its first 18 lines are skipped before the
# header row. No date-parsing options are passed to read_csv: the previous
# `parse_dates=True` only targets the index (none is set here) and
# `infer_datetime_format` is deprecated, so the 'Date' column is converted
# explicitly right after loading instead.
temperature = pd.read_csv(
    "http://sir.toscana.it/archivio/download.php?IDST=termo_csv&IDS=TOS01001095",
    sep=';',
    skiprows=18,
).rename(
    columns = {'gg/mm/aaaa':'Date', 'Max [°C]':'Max', 'Min [°C]':'Min'}
)
# Dates are written day/month/year in the source file.
temperature['Date'] = pd.to_datetime(temperature['Date'], format='%d/%m/%Y')
temperature

In [None]:
# Give both tables the same integer join key in YYYYMMDD form.
for dated_frame in (temperature, data):
    dated_frame['data_int'] = dated_frame.Date.dt.strftime('%Y%m%d').astype(int)

# Weighted blend of the daily extremes: 65% of the maximum plus 35% of the minimum.
temperature['Avg'] = temperature.Max*0.65+temperature.Min*0.35

In [None]:
# Attach the external temperatures by the YYYYMMDD key. The right-hand 'Date'
# column is dropped before merging so that `data` keeps a single 'Date' column
# and no '_x'/'_y' suffixes have to be cleaned up afterwards.
data = data.merge(temperature.drop(columns='Date'), on ='data_int', how = 'left')

# Fill the gaps of the original series with the external estimate. The result
# is assigned back to the column: an in-place fillna on `data.Temperature_Firenze`
# acts on an intermediate Series and is not guaranteed to update `data`.
data['Temperature_Firenze'] = data['Temperature_Firenze'].fillna(data['Avg'])

# Non-null counts per year, to confirm that the temperature gaps were filled.
print(data.groupby(data.Date.dt.year).agg({'Date':'count', 'Temperature_Firenze':'count'
                                  }).transpose())
data.reset_index().plot.line('index','Temperature_Firenze')

In [None]:
# Encode the position within the year as a point on the unit circle
# (day of year divided by 365.2425 days, expressed in radians).
year_angle = (data.Date.dt.dayofyear/365.2425)*2*np.pi
data['Year sin'] = np.sin(year_angle)
data['Year cos'] = np.cos(year_angle)
data['Year'] = data.Date.dt.year
#data.plot.line(x='Date', y=['Year sin','Year cos'])

In [None]:
# Re-index the table on a gap-free daily calendar spanning the observed dates;
# days missing from the source appear as rows of NaN.
start, end = data.Date.min(), data.Date.max()
index_col = pd.date_range(start = start, end = end, freq = 'D')

data = data.set_index('Date').reindex(index_col)

In [None]:
# Treat zero hydrometry readings as missing, then rebuild them with an
# order-5 spline. Both steps assign their result back to the column: the
# chained forms (`data[col][mask] = ...` and `data[col].interpolate(inplace=True)`)
# write to an intermediate Series and are not guaranteed to update `data`.
hydrometry = data['Hydrometry_Nave_di_Rosano']
hydrometry = hydrometry.mask(hydrometry == 0)
data['Hydrometry_Nave_di_Rosano'] = hydrometry.interpolate(method='spline', order=5)

# Fill the remaining gaps in every column by linear interpolation.
data = data.interpolate(method='linear')

# The title is spelled out instead of being derived from a loop variable
# left over from another cell.
data.reset_index().plot.line(x='index',y='Hydrometry_Nave_di_Rosano',figsize = (20,9),
                             title = 'Hydrometry Nave di Rosano')

In [None]:
# Plot the gap-filled temperature series. The title names the plotted column
# explicitly; deriving it from a leftover loop variable labelled this chart
# with the name of a different column.
data.reset_index().plot.line(x='index',y='Temperature_Firenze',figsize = (20,9),title = 'Temperature Firenze')

In [None]:
# Autocorrelation of the temperature series across lags.
pd.plotting.autocorrelation_plot(data['Temperature_Firenze'])

## Last Point EDA

In [None]:
# Remove the join key, the external temperature columns, the plain year and the
# per-station rainfall columns, then plot pairwise scatter plots of what is left.
helper_columns = ['data_int', 'Max', 'Min', 'Avg', 'Year']
station_columns = [
    'Rainfall_Le_Croci', 'Rainfall_Cavallina',
    'Rainfall_S_Agata', 'Rainfall_Mangona', 'Rainfall_S_Piero',
]
data = data.drop(helper_columns + station_columns, axis=1)
pd.plotting.scatter_matrix(data, figsize = (20,16))

In [None]:
def _fourth_root_boxcox(frame, columns):
    """Return a copy of `frame` whose `columns` are reshaped to reduce skew.

    Each listed column is replaced by its fourth root and then passed through
    `boxcox(value + 0.5, -1)`. Working on a copy keeps the caller's frame
    untouched and avoids assigning into a slice of another DataFrame.
    """
    transformed = frame.copy()
    transformed[columns] = transformed[columns].apply(lambda x: np.power(x,1/4))
    for feature in columns:
        transformed[feature] = boxcox(transformed[feature] + 0.5,-1)
    return transformed


# Only the rows before 2020 are used for modelling.
df = data[data.index.year<2020]
print(df.columns)
column_indices = {name: i for i, name in enumerate(df.columns)}
num_features = df.shape[1]

# Chronological 70% / 20% / 10% split into train / validation / test.
n = len(df)
train_end = int(n*0.7)
val_end = int(n*0.9)

# Columns that receive the skew-reducing transform.
rainfall_columns = [
    'Hydrometry_Nave_di_Rosano',
    'Rainfall'
]

train_df = _fourth_root_boxcox(df[0:train_end], rainfall_columns)
val_df = _fourth_root_boxcox(df[train_end:val_end], rainfall_columns)
test_df = _fourth_root_boxcox(df[val_end:], rainfall_columns)

# Standardise every split with statistics computed on the training split only.
train_mean = train_df.mean()
train_std= train_df.std()

train_df = (train_df - train_mean) / train_std
val_df = (val_df - train_mean) / train_std
test_df = (test_df - train_mean) / train_std

# Distribution of every standardised feature over the whole period. The same
# transform is applied first: `train_mean`/`train_std` describe transformed
# values, so standardising the untransformed frame would mix two scales.
df_std = (_fourth_root_boxcox(df, rainfall_columns) - train_mean) / train_std
df_std = df_std.melt(var_name='Column', value_name='Normalized')
plt.figure(figsize=(12, 6))
ax = sns.violinplot(x='Column', y='Normalized', data=df_std)
_ = ax.set_xticklabels(df.keys(), rotation=90)

# Forecasting Models

In this section we focus on building a forecasting model for our scenario. I've decided to use Tensorflow (because I'm studying it). In particular, I've largely used and experimented what is written in the [official tutorial](https://www.tensorflow.org/tutorials/structured_data/time_series). The following functions and classes are retrieved from this tutorial. 

I want to build a single-step model to predict what happens next and a multi-step model, i.e. given data about past events, I want to predict a sequence of future values.

In [None]:
class WindowGenerator():
    """Describes how a time series is cut into (inputs, labels) windows.

    A window spans `input_width + shift` consecutive rows. The first
    `input_width` rows are the inputs and the last `label_width` rows are the
    labels. The train/validation/test frames default to the notebook-level
    frames that exist when this class is defined.
    """

    def __init__(self, input_width, label_width, shift,
                 train_df=train_df, val_df=val_df, test_df=test_df,
                 label_columns=None):
        # Keep references to the three data splits.
        self.train_df, self.val_df, self.test_df = train_df, val_df, test_df

        # Map label names and feature names to their positions.
        self.label_columns = label_columns
        if label_columns is not None:
            self.label_columns_indices = {}
            for position, name in enumerate(label_columns):
                self.label_columns_indices[name] = position
        self.column_indices = dict(
            (name, position) for position, name in enumerate(train_df.columns))

        # Window geometry.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift
        self.total_window_size = input_width + shift

        # Inputs occupy the head of the window ...
        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        # ... and labels occupy its tail.
        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def __repr__(self):
        summary = [
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column name(s): {self.label_columns}',
        ]
        return '\n'.join(summary)

In [None]:
def split_window(self, features):
    """Split a batch of windows into an `(inputs, labels)` pair.

    `features` is indexed as [batch, time, feature]. Inputs are the time steps
    selected by `self.input_slice`, labels those selected by
    `self.labels_slice`. When `self.label_columns` is set, only those feature
    columns are kept in the labels, in the listed order.
    """
    inputs = features[:, self.input_slice, :]
    labels = features[:, self.labels_slice, :]

    if self.label_columns is not None:
        selected = []
        for name in self.label_columns:
            selected.append(labels[:, :, self.column_indices[name]])
        labels = tf.stack(selected, axis=-1)

    # Slicing loses the static shape information, so restore the known time
    # dimension by hand; this keeps the resulting datasets easy to inspect.
    inputs.set_shape([None, self.input_width, None])
    labels.set_shape([None, self.label_width, None])

    return inputs, labels

# Attach the function defined above to WindowGenerator so instances can call it as a method.
WindowGenerator.split_window = split_window

In [None]:
def plot(self, model=None, plot_col='Hydrometry_Nave_di_Rosano', max_subplots=3):
    """Plot inputs, labels and optional predictions for a few example windows.

    Args:
        model: optional callable applied to the example inputs; its output is
            drawn as 'Predictions'.
        plot_col: name of the feature column to draw.
        max_subplots: maximum number of example windows to draw, one per row.
    """
    inputs, labels = self.example
    plt.figure(figsize=(12, 8))
    plot_col_index = self.column_indices[plot_col]
    max_n = min(max_subplots, len(inputs))

    # Position of the plotted column inside the labels; None when the window
    # has explicit label columns and `plot_col` is not one of them.
    if self.label_columns:
        label_col_index = self.label_columns_indices.get(plot_col, None)
    else:
        label_col_index = plot_col_index

    # The predictions do not depend on the subplot, so run the model once
    # instead of once per row (and only when they will actually be drawn).
    predictions = None
    if model is not None and label_col_index is not None and max_n > 0:
        predictions = model(inputs)

    for n in range(max_n):
        # Size the grid from the number of rows actually drawn; a fixed
        # 3-row grid fails as soon as `max_subplots` exceeds 3.
        plt.subplot(max_n, 1, n+1)
        plt.ylabel(f'{plot_col} [normed]')
        plt.plot(self.input_indices, inputs[n, :, plot_col_index],
                 label='Inputs', marker='.', zorder=-10)

        if label_col_index is None:
            continue

        plt.scatter(self.label_indices, labels[n, :, label_col_index],
                    edgecolors='k', label='Labels', c='#2ca02c', s=64)
        if predictions is not None:
            plt.scatter(self.label_indices, predictions[n, :, label_col_index],
                        marker='X', edgecolors='k', label='Predictions',
                        c='#ff7f0e', s=64)

        if n == 0:
            plt.legend()

    plt.xlabel('Time')

WindowGenerator.plot = plot

In [None]:
def make_dataset(self, data, shuffle=True, batch_size=256):
    """Build a batched dataset of `(inputs, labels)` windows from `data`.

    Args:
        data: table of features, converted to a float32 array.
        shuffle: whether the windows are emitted in shuffled order. Pass
            False when the output must stay aligned with the row order of
            `data`, e.g. when predictions are matched back to dates.
        batch_size: number of windows per batch.

    Returns:
        A dataset of sliding windows of length `self.total_window_size`
        (stride 1), each split by `self.split_window`.
    """
    data = np.array(data, dtype=np.float32)
    ds = tf.keras.preprocessing.timeseries_dataset_from_array(
        data=data,
        targets=None,
        sequence_length=self.total_window_size,
        sequence_stride=1,
        shuffle=shuffle,
        batch_size=batch_size)

    ds = ds.map(self.split_window)

    return ds

WindowGenerator.make_dataset = make_dataset

In [None]:
@property
def train(self):
    """Windowed dataset built from the training frame."""
    return self.make_dataset(self.train_df)

@property
def val(self):
    """Windowed dataset built from the validation frame."""
    return self.make_dataset(self.val_df)

@property
def test(self):
    """Windowed dataset built from the test frame."""
    return self.make_dataset(self.test_df)

@property
def example(self):
    """Return one batch of `inputs, labels` for plotting, caching it on first use."""
    cached = getattr(self, '_example', None)
    if cached is not None:
        return cached
    # Nothing cached yet: take the first batch of the training dataset and
    # remember it for later calls.
    self._example = next(iter(self.train))
    return self._example

# Attach the properties to the class defined in an earlier cell.
WindowGenerator.train = train
WindowGenerator.val = val
WindowGenerator.test = test
WindowGenerator.example = example

In [None]:
MAX_EPOCHS = 50

def compile_and_fit(model, window, learning_rate = 0.001, epochs = MAX_EPOCHS, patience=2):
    """Compile `model`, train it on `window` and plot the learning curves.

    The model is compiled with a mean-squared-error loss, the Adam optimiser
    and MAE/MSE metrics. Training uses `window.train`, validates on
    `window.val` and stops early when `val_loss` has not improved for
    `patience` epochs.

    Returns:
        The history object returned by `model.fit`.
    """
    model.compile(
        loss=tf.losses.MeanSquaredError(),
        optimizer=tf.optimizers.Adam(learning_rate = learning_rate),
        metrics=[tf.metrics.MeanAbsoluteError(),tf.metrics.MeanSquaredError()],
    )

    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=patience, mode='min')

    history = model.fit(
        window.train,
        epochs=epochs,
        validation_data=window.val,
        callbacks=[stop_early],
    )

    # One small chart for the loss and one for the mean absolute error,
    # each comparing training against validation.
    curves = pd.DataFrame(history.history)
    for metric_pair in (['loss','val_loss'],
                        ['mean_absolute_error','val_mean_absolute_error']):
        curves.plot.line(y=metric_pair, figsize = (5,3))

    return history

In [None]:
# Number of future steps predicted by the multi-step models.
OUT_STEPS = 3
# Window with 7 input steps followed by 3 label steps; no label columns are
# selected, so every feature is a label.
w1 = WindowGenerator(input_width=7, label_width=OUT_STEPS, shift=3
                     #,label_columns=['Hydrometry_Nave_di_Rosano']
                    )
w1

In [None]:
# Evaluation results of the multi-step models, keyed by model name:
# validation split and test split respectively.
multi_val_performance = {}
multi_performance = {}

## Multi-step models

### Baseline Models

In this subsection, we build a baseline model which predicts the future values by duplicating the last observed value.

In [None]:
class MultiStepLastBaseline(tf.keras.Model):
    """Baseline that repeats the last input time step OUT_STEPS times."""

    def call(self, inputs):
        final_step = inputs[:, -1:, :]
        return tf.tile(final_step, [1, OUT_STEPS, 1])

last_baseline = MultiStepLastBaseline()
last_baseline.compile(
    loss=tf.losses.MeanSquaredError(),
    metrics=[tf.metrics.MeanAbsoluteError(), tf.metrics.MeanSquaredError()],
)

# Record the baseline scores on the validation and test splits, then plot examples.
multi_val_performance['Last'] = last_baseline.evaluate(w1.val)
multi_performance['Last'] = last_baseline.evaluate(w1.test, verbose=0)
w1.plot(last_baseline)

### Multistep LSTM Model

In [None]:
# Layer stack of the multi-step LSTM:
#   [batch, time, features] -> LSTM    -> [batch, lstm_units]
#                           -> Dense   -> [batch, out_steps*features]
#                           -> Reshape -> [batch, out_steps, features]
# Adding more `lstm_units` just overfits more quickly.
multi_lstm_layers = [
    tf.keras.layers.LSTM(32, return_sequences=False, dropout=0.2, recurrent_dropout=0.2),
    tf.keras.layers.Dense(OUT_STEPS*num_features,
                          kernel_initializer=tf.initializers.zeros),
    tf.keras.layers.Reshape([OUT_STEPS, num_features]),
]
multi_lstm_model = tf.keras.Sequential(multi_lstm_layers)

history = compile_and_fit(multi_lstm_model, w1)

# Record the scores on the validation and test splits, then plot examples.
multi_val_performance['M-LSTM'] = multi_lstm_model.evaluate(w1.val)
multi_performance['M-LSTM'] = multi_lstm_model.evaluate(w1.test, verbose=0)
w1.plot(multi_lstm_model)

### Single Step Models

In [None]:
class Baseline(tf.keras.Model):
    """Baseline that returns its inputs unchanged as the prediction.

    With `label_index` set, only that feature is returned, with a trailing
    axis of size one; otherwise every feature is returned.
    """

    def __init__(self, label_index=None):
        super().__init__()
        self.label_index = label_index

    def call(self, inputs):
        if self.label_index is not None:
            selected = inputs[:, :, self.label_index]
            return selected[:, :, tf.newaxis]
        return inputs

In [None]:
# Window with 7 input steps and a single hydrometry label one step ahead.
single_step_window = WindowGenerator(
    input_width=7,
    label_width=1,
    shift=1,
    label_columns=['Hydrometry_Nave_di_Rosano'],
)

baseline = Baseline(label_index=column_indices['Hydrometry_Nave_di_Rosano'])
baseline.compile(
    loss=tf.losses.MeanSquaredError(),
    metrics=[tf.metrics.MeanAbsoluteError(),tf.metrics.MeanSquaredError()],
)

# Registries of single-step results (validation / test), seeded with the baseline.
val_performance = {'Baseline': baseline.evaluate(single_step_window.val)}
performance = {'Baseline': baseline.evaluate(single_step_window.test, verbose=0)}

In [None]:
# Width of the convolution kernel; the window feeds exactly that many input steps.
CONV_WIDTH = 2
conv_window = WindowGenerator(
    input_width=CONV_WIDTH, label_width=1, shift=1,
    label_columns=['Hydrometry_Nave_di_Rosano'])

# One convolution over the input steps followed by a single-unit output layer.
conv_layers = [
    tf.keras.layers.Conv1D(filters=16,
                           kernel_size=(CONV_WIDTH,),
                           activation='relu'),
    tf.keras.layers.Dense(units=1),
]
conv_model = tf.keras.Sequential(conv_layers)

history = compile_and_fit(conv_model, conv_window)

# Record the scores on the validation and test splits.
val_performance['Conv'] = conv_model.evaluate(conv_window.val)
performance['Conv'] = conv_model.evaluate(conv_window.test, verbose=0)

In [None]:
# Build a window whose input is just long enough for the convolution to
# produce LABEL_WIDTH outputs: INPUT_WIDTH = LABEL_WIDTH + (CONV_WIDTH - 1).
SHIFT = 1
CONV_WIDTH = 2
LABEL_WIDTH = SHIFT
INPUT_WIDTH = LABEL_WIDTH + (CONV_WIDTH - 1)
wide_conv_window = WindowGenerator(
    input_width=INPUT_WIDTH,
    label_width=LABEL_WIDTH,
    shift=SHIFT,
    label_columns=['Hydrometry_Nave_di_Rosano'])

# Plot example windows together with the convolutional model's predictions.
wide_conv_window.plot(conv_model)

In [None]:
# 21 input steps and 21 label steps, offset by one step.
wide_window = WindowGenerator(
    input_width=21,
    label_width=21,
    shift=1,
    label_columns=['Hydrometry_Nave_di_Rosano'],
)

# [batch, time, features] -> LSTM -> [batch, time, lstm_units] -> Dense -> [batch, time, 1]
lstm_layers = [
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.Dense(units=1),
]
lstm_model = tf.keras.models.Sequential(lstm_layers)

history = compile_and_fit(lstm_model, wide_window, learning_rate = 0.003)

# Record the scores on the validation and test splits, then plot examples.
val_performance['LSTM'] = lstm_model.evaluate(wide_window.val)
performance['LSTM'] = lstm_model.evaluate(wide_window.test, verbose=0)

wide_window.plot(lstm_model)

In [None]:
# Linear model: a single Dense unit with no activation.
linear = tf.keras.Sequential([
    tf.keras.layers.Dense(units=1)
])

# Train on the single-step window with more epochs and a higher learning rate than the defaults.
history = compile_and_fit(linear, single_step_window, epochs = 100, learning_rate = 0.01)

# Record the scores on the validation and test splits.
val_performance['Linear'] = linear.evaluate(single_step_window.val)
performance['Linear'] = linear.evaluate(single_step_window.test, verbose=0)

In [None]:
# Bar chart of the linear model's weights, one bar per input feature.
feature_names = train_df.columns
bar_positions = range(len(feature_names))
linear_weights = linear.layers[0].kernel[:,0].numpy()

plt.bar(x = bar_positions, height=linear_weights)
axis = plt.gca()
axis.set_xticks(bar_positions)
_ = axis.set_xticklabels(feature_names, rotation=90)

In [None]:
# Compare validation and test mean absolute error for every single-step model.
metric_name = 'mean_absolute_error'
metric_index = lstm_model.metrics_names.index(metric_name)
val_mae = [scores[metric_index] for scores in val_performance.values()]
test_mae = [scores[metric_index] for scores in performance.values()]

# One pair of bars per model: validation to the left, test to the right.
x = np.arange(len(performance))
width = 0.3
plt.ylabel('mean_absolute_error [Hydrometry, normalized]')
plt.bar(x - 0.17, val_mae, width, label='Validation')
plt.bar(x + 0.17, test_mae, width, label='Test')
plt.xticks(ticks=x, labels=performance.keys(), rotation=45)
_ = plt.legend()

In [None]:
# Print, per model, the entry at index 1 of its test evaluation result
# (presumably the mean absolute error, the first metric passed to compile — TODO confirm).
for name, value in performance.items():
    print(f'{name:12s}: {value[1]:0.4f}')

# An interpretable model

We want to understand which is the weight for each feature using the lstm model. Similarly to the linear model, we can study the weights (mean and variance) from the first layer of the LSTM model.

In [None]:
# Mean (bar height) and variance (error bar) of the first weight matrix of the
# LSTM layer, computed along axis 1, i.e. one bar per row of the matrix.
input_kernel = lstm_model.layers[0].weights[0].numpy()

plt.bar(
    x = range(0, input_kernel.shape[0]),
    height = np.average(input_kernel, axis = 1),
    yerr = np.var(input_kernel, axis = 1),
    align='center', ecolor='black', capsize=10,
)
axis = plt.gca()
axis.set_xticks(range(len(train_df.columns)))
_ = axis.set_xticklabels(train_df.columns, rotation=90)

In [None]:
# Rebuild the modelling frame (rows before 2020) as an independent copy so the
# transforms below do not assign into a slice of `data`.
df = data[data.index.year<2020].copy()
print(df.columns)
column_indices = {name: i for i, name in enumerate(df.columns)}
num_features = df.shape[1]
n = len(df)

# Columns that receive the skew-reducing transform.
rainfall_columns = [
    'Rainfall',
    'Hydrometry_Nave_di_Rosano'
]

# Fourth root followed by boxcox(value + 0.5, -1), then standardisation with
# the training statistics.
df[rainfall_columns] = df[rainfall_columns].apply(lambda x: np.power(x,1/4))

for feature in rainfall_columns:
    df[feature] = boxcox(df[feature] + 0.5,-1)

df_std = (df - train_mean) / train_std

# Build the windows WITHOUT shuffling. The predictions are matched to dates by
# position further down, so they must come out in chronological order; a
# shuffled dataset would pair each prediction with an unrelated date.
ordered_windows = tf.keras.preprocessing.timeseries_dataset_from_array(
    data=np.array(df_std, dtype=np.float32),
    targets=None,
    sequence_length=wide_window.total_window_size,
    sequence_stride=1,
    shuffle=False,
    batch_size=256).map(wide_window.split_window)

# Keep the prediction for the last time step of every window.
x = lstm_model.predict(ordered_windows)[:,-1,0]

# Undo the standardisation (values stay on the transformed scale, like `df`).
x = x*train_std['Hydrometry_Nave_di_Rosano'] + train_mean['Hydrometry_Nave_di_Rosano']

p = len(x)

# The p predictions line up with the last p rows of `df`; copy the slice
# before adding a column to it.
lstm_df = df[-p:].copy()
lstm_df['predict'] = x

In [None]:
# Compare the hydrometry values with the LSTM predictions over the first 800 rows.
lstm_df[:800].reset_index().plot.line(x = 'index',y=['Hydrometry_Nave_di_Rosano','predict'])

In [None]:
# Pairwise correlations between the columns of the prediction table.
lstm_df.corr()

In [None]:
# Pairwise correlations between the columns of the Arno table.
River['Arno'].corr()

In [None]:
# Sum the rainfall stations into two groups, one total per row and group.
zona_a_stations = [
    'Rainfall_Le_Croci', 'Rainfall_Cavallina', 'Rainfall_S_Agata',
    'Rainfall_Mangona', 'Rainfall_S_Piero', 'Rainfall_Vernio',
]
zona_b_stations = [
    'Rainfall_Stia', 'Rainfall_Consuma', 'Rainfall_Incisa', 'Rainfall_Montevarchi',
    'Rainfall_S_Savino', 'Rainfall_Laterina', 'Rainfall_Bibbiena', 'Rainfall_Camaldoli',
]
River['Arno']['Rainfall_Zona_A'] = River['Arno'][zona_a_stations].sum(axis = 1)
River['Arno']['Rainfall_Zona_B'] = River['Arno'][zona_b_stations].sum(axis = 1)

In [None]:
# Heatmap of non-null counts: one row per column of the table, one column per year.
sns.heatmap(River['Arno'].groupby(River['Arno'].Date.dt.year).count().transpose())

In [None]:
# Plot the two zone rainfall totals over time.
River['Arno'].plot.line(x='Date',y=['Rainfall_Zona_A','Rainfall_Zona_B'])