In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.stats import boxcox

# plotting
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px
#pd.options.plotting.backend = "plotly"

# settings
# The bare 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old alias was later removed, so use whichever name this install offers.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.rcParams["figure.figsize"] = (16, 8)

import tensorflow as tf

pd.options.display.max_columns = 500

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

In this notebook I'll try to approach the Acea Smart Water Analytics competition. 
The Acea Group is one of the leading Italian multiutility operators. The company manages and develops water and electricity networks and environmental services. I'm a bit familiar with Acea and the names in the datasets since I live in Italy. 

In this notebook, I don't plan to address all the waterbodies but I plan to outline a precise pipeline (see their schema below):
* to analyze the different datasets grouped by waterbody type
* clean the data
* build new features
* train and validate forecasting model(s)
\
\
![image](https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F6195295%2Fcca952eecc1e49c54317daf97ca2cca7%2FAcea-Input.png?generation=1606932492951317&alt=media)
\
\
I'll try to keep it as simple as possible. In particular I will start with simple EDAs, simple models and then try to add complexity. 



# Retrieve source datasets

In this notebook I want to focus on a specific waterbody: Lake Bilancino.
\
In the following, I plan to test and experiment building a multi target model. Indeed, the related dataset has two attributes that need to be predicted:
* the Flow Rate
* the Lake Level

In the next cells, I will implement a rather simple data pipeline to clean data and build features; then, I will build a multioutput model. 

In [None]:
from datetime import datetime

# Load every "Lake_*.csv" file found under the Kaggle input directory, keyed by
# the file name without its "Lake_" prefix and ".csv" suffix.
Lake = {}

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if filename.endswith('.csv') and filename.startswith('Lake'):
            lake_name = filename[:-4].replace('Lake_', '')
            lake_frame = pd.read_csv(os.path.join(dirname, filename))
            # Dates are stored as dd/mm/yyyy. Parsing after the read avoids the
            # `date_parser` argument (deprecated in pandas 2.x) and turns missing
            # dates into NaT instead of raising inside strptime.
            lake_frame['Date'] = pd.to_datetime(lake_frame['Date'], format='%d/%m/%Y')
            Lake[lake_name] = lake_frame

# Fail with an explicit message rather than a bare KeyError when the file is absent.
if 'Bilancino' not in Lake:
    raise FileNotFoundError("No 'Lake_Bilancino.csv' file was found under /kaggle/input")

lake_df = Lake['Bilancino']
del Lake

In [None]:
# Summarise the columns, dtypes and non-null counts of the loaded Lake Bilancino frame.
lake_df.info()

In [None]:
# Input variables: rainfall columns plus one temperature column.
features = [
    'Rainfall_S_Piero',
    'Rainfall_Mangona',
    'Rainfall_S_Agata',
    'Rainfall_Cavallina',
    'Rainfall_Le_Croci',
    'Temperature_Le_Croci',
]
# Target variables to be predicted.
labels = ['Lake_Level', 'Flow_Rate']

# Work on a copy so the loaded frame stays untouched.
data = lake_df.copy()

# Descriptive statistics with one row per column, followed by a blank separator.
print(data.describe().T)
print('\n')

# One line chart per input and target, plotted against the date.
for column in [*features, *labels]:
    data.plot.line(
        x='Date',
        y=column,
        figsize=(20, 9),
        title=column.replace('_', ' '),
    )

# Feature Engineering

In this section, you can find the code related to data exploration, data cleansing and feature engineering.

Let's start by building time features related to the Date column (format: dd/mm/yyyy), in particular:
* year (integer)
* month (integer)
* day_in_year (integer from 1 to 366)
* week_in_year (integer from 1 to 53, ISO week number)
* year sin (float), defined as the sin(2 * pi * day_in_year / 365.25)
* year cos (float), defined as the cos(2 * pi * day_in_year / 365.25)

year sin/cos are particularly useful to easily capture seasonality. 

In [None]:
# TODO: adjust the index to use the datetime column
def get_time_features(df):
    """Return a copy of ``df`` extended with calendar features from its ``Date`` column.

    Added columns, in order: ``year``, ``month``, ``day_in_year``,
    ``week_in_year`` (ISO week as int), ``year sin`` and ``year cos``
    (sine/cosine of ``2*pi*day_in_year/365.25``). The input frame is not modified.
    """
    dates = df['Date'].dt
    day_of_year = dates.dayofyear
    # Position within the year expressed as an angle, so that seasons wrap around.
    angle = 2 * np.pi * day_of_year / 365.25
    return df.assign(**{
        'year': dates.year,
        'month': dates.month,
        'day_in_year': day_of_year,
        'week_in_year': dates.isocalendar().week.astype(int),
        'year sin': np.sin(angle),
        'year cos': np.cos(angle),
    })

# Rebind `data` to a copy that also carries the derived calendar columns.
data = get_time_features(data)

Let's get a clear visualization of missing values and attribute quality.

In [None]:
# check for nulls
# Count the non-null values of every column per year (columns as rows, years as
# columns); unusually low counts in the heatmap reveal gaps in the data.
non_null_by_year = data.groupby('year').count().T
sns.heatmap(non_null_by_year)
del non_null_by_year

We see that :
* there are missing values in 2002 and 2003
* we have only half (the first six months) of the values for 2020

I decided to restrict this analysis to the years from 2004 to 2019.

In [None]:
# Keep only the years 2004-2019. The explicit .copy() detaches the result from the
# unfiltered frame, so the assignment below writes to `data` itself and not to a
# temporary object.
data = data[(data.year > 2003) & (data.year < 2020)].copy()

# Replace the remaining missing temperature readings with the constant 6.
# A single .loc assignment is used because chained indexing
# (data.col[mask] = value) triggers SettingWithCopyWarning and is silently
# ignored when pandas Copy-on-Write is enabled.
data.loc[data['Temperature_Le_Croci'].isna(), 'Temperature_Le_Croci'] = 6

## Temperature Seasonality

In the following we decompose the Temperature series. In the following cells, we visualize:
* an autocorrelation plot to detect possible period values
* Temperature Trend, Seasonality and Residual components

In [None]:
# check seasonality, trend and residual of the attributes
from statsmodels.tsa.seasonal import seasonal_decompose

temperature = data['Temperature_Le_Croci']

# Autocorrelation plot, used to spot candidate period lengths.
pd.plotting.autocorrelation_plot(temperature)
plt.show()

# Split the series into trend, seasonal and residual parts using a 365-sample period.
decomposed = seasonal_decompose(temperature, period=365, extrapolate_trend='freq')

decomposed.plot()
plt.show()

In [None]:
# Store each component of the temperature decomposition as its own column.
for column_name, component in (
    ('Temperature_Trend', decomposed.trend),
    ('Temperature_Season', decomposed.seasonal),
    ('Temperature_Resid', decomposed.resid),
):
    data[column_name] = component

In [None]:
# fix the flow rate

#data['Flow_Rate'] = np.abs(data['Flow_Rate'])

In [None]:
# Pairwise scatter plots of the raw inputs, the engineered temperature/time
# features and both targets.
scatter_columns = (
    features
    + ['Temperature_Trend', 'Temperature_Season', 'Temperature_Resid', 'year sin', 'year cos']
    + labels
)
pd.plotting.scatter_matrix(data[scatter_columns], figsize=(20, 20))
plt.show()

# Targets Visualization

With the following visualizations, we focus on the target variables and their scaled versions. These are useful to decide which model we can opt for each variable.

In [None]:
# 2x2 grid: top row shows the raw target distributions, bottom row the
# transformed ones; left column is Lake Level, right column is Flow Rate.
fig, ax = plt.subplots(2, 2)

ax[0, 0].set_title('Lake Level')

ax[0, 0].hist(data.Lake_Level, bins=50, density=True)
ax[0, 0].set_ylabel('Lake_Level')

# boxcox with lambda = 0 is the natural log. The series is mirrored around its
# maximum and offset by 1 so that every argument is >= 1.
ax[1, 0].hist(boxcox(1 - data.Lake_Level + data.Lake_Level.max(), 0), bins=50, density=True)
# Label the bottom-left panel (previously this overwrote the top-left label).
ax[1, 0].set_ylabel('Scaled Lake_Level')


ax[0, 1].set_title('Flow Rate')

ax[0, 1].hist(data.Flow_Rate, bins=50, density=True)
ax[0, 1].set_ylabel('Flow_Rate')

ax[1, 1].hist(1 + np.log(1 + data.Flow_Rate), bins=50, density=True)
ax[1, 1].set_ylabel('Scaled Flow Rate')


plt.show()

In [None]:
def split_data(df, frac=0.8):
    """Split ``df`` into two consecutive parts after sorting it by index.

    The first returned frame holds the first ``int(len(df) * frac)`` rows of the
    index-sorted data, the second holds the rest. ``df`` itself is left untouched.
    """
    ordered = df.sort_index()
    cut = int(len(ordered) * frac)
    return ordered[:cut], ordered[cut:]

In [None]:
# `boxcox` is already imported in the first cell, so it is not re-imported here.
train, val = split_data(data, 0.8)

# Index by date without mutating in place: `train` is a slice returned by
# split_data, and in-place edits on slices trigger copy/view warnings.
train = train.set_index('Date')

# Standardise every column with statistics computed on the training rows only;
# train_mean / train_std are reused later to scale the full dataset.
train_mean = train.mean()
train_std = train.std()
train_df = (train - train_mean) / train_std

# Series.max() skips NaN, whereas the builtin max() gives an order-dependent
# result as soon as a NaN is present.
maxima = train_df['Lake_Level'].max()

# Mirror the standardised lake level around its maximum (offset by 1 so that the
# argument is >= 1) and take the log (boxcox with lambda = 0).
train_df['Lake_Level'] = boxcox(1 - train_df['Lake_Level'] + maxima, 0)
# Shifted log transform of the standardised flow rate.
train_df['Flow_Rate'] = 1 + np.log(1 + train_df['Flow_Rate'])

In [None]:
#pd.plotting.scatter_matrix(train_df[features + ['Temperature_Trend','Temperature_Season','Temperature_Resid','year sin','year cos'] + labels ], figsize = (20,20))
#plt.show()

In [None]:
# Correlation between today's targets and the inputs observed `num_days` earlier.
num_days = 1

lagged_inputs = (
    train_df[features + ['Temperature_Trend', 'Temperature_Season', 'Temperature_Resid', 'year sin', 'year cos']]
    .shift(num_days)
    .add_suffix('_shifted')
)
corr_matrix = pd.concat([train_df[labels], lagged_inputs], axis=1).dropna().corr()

sns.heatmap(corr_matrix, annot=True)
# plt.show() replaces the former plt.plot(), which drew nothing and only echoed
# an empty list as the cell output.
plt.show()

We build the dataset as follows:
* target variables : Flow_Rate and Lake_Level
* Rainfall features shifted by 1 (day): Rainfall_S_Piero_shifted, Rainfall_Mangona_shifted, Rainfall_S_Agata_shifted, Rainfall_Cavallina_shifted, Rainfall_Le_Croci_shifted
* Lake_Level and Flow_Rate shifted by 30 (days)
* Temperature features shifted by 1 (day): Temperature_Season_shifted, Temperature_Trend_shifted
* Time features: year sin, year cos
\
\
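I decided to opt for Lake_Level and Flow_Rate shifted by 30 because I assume that this information is available in "production". (Note: in the code below these lagged targets are currently commented out, so the models only use the rainfall, temperature and time features.)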

In [None]:
# Columns fed to the models; every input is lagged by one day.
# 'Lake_Level_shifted' and 'Flow_Rate_shifted' (lagged targets) are currently disabled.
model_features = [
    'Rainfall_S_Piero_shifted',
    'Rainfall_Mangona_shifted',
    'Rainfall_S_Agata_shifted',
    'Rainfall_Cavallina_shifted',
    'Rainfall_Le_Croci_shifted',
    'year sin_shifted',
    'year cos_shifted',
    'Temperature_Trend_shifted',
    'Temperature_Season_shifted',
]

# Inputs shifted down by one row, renamed with the "_shifted" suffix.
lagged_inputs = (
    train_df[features + ['Temperature_Trend', 'Temperature_Season', 'Temperature_Resid', 'year sin', 'year cos']]
    .shift(1)
    .add_suffix('_shifted')
)

# Targets next to the lagged inputs; rows with any NaN are dropped, then the
# first remaining row is skipped as well.
train_df = pd.concat([train_df[labels], lagged_inputs], axis=1).dropna().iloc[1:]

# Multitarget Model (XGBRegressor)

I found three different possibilities to build a multioutput model:
* use [MultiOutputRegressor from sklearn.multioutput](https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputRegressor.html) but I couldn't opt for different estimators
* use a multistep Tensorflow model for forecasting (see [here](https://www.tensorflow.org/tutorials/structured_data/time_series))
* build from scratch, so that I can adopt different models to adapt better to target distributions

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_tweedie_deviance

# One estimator, hyper-parameter grid and CV scoring metric per target, so that
# each target can use an objective suited to its distribution. The fitted
# GridSearchCV object is stored later under the 'clf' key.
model = {
    'Lake_Level': {
        'estimator': xgb.XGBRegressor(objective='reg:squarederror', n_jobs=1),
        'param_grid': {
            'max_depth': [2, 3, 4],
            'n_estimators': [50, 100],
            'alpha': [0, 0.01, 0.001],
        },
        'scoring': 'neg_mean_absolute_error',
    },
    'Flow_Rate': {
        'estimator': xgb.XGBRegressor(objective='reg:tweedie', n_jobs=1),
        'param_grid': {
            'max_depth': [2, 3, 4],
            'n_estimators': [50, 100],
            'alpha': [0, 0.01, 0.001],
            'objective': ['reg:tweedie'],
            'tweedie_variance_power': [1, 2],
        },
        'scoring': 'neg_mean_gamma_deviance',
    },
}


X, Y = train_df[model_features], train_df[labels]


for label in labels:
    # Exhaustive grid search with 7 contiguous (unshuffled) folds.
    # NOTE(review): plain KFold lets later observations train models that are
    # validated on earlier ones; TimeSeriesSplit may be more appropriate - confirm.
    clf = GridSearchCV(
        model[label]['estimator'],
        scoring=model[label]['scoring'],
        param_grid=model[label]['param_grid'],
        verbose=1, n_jobs=1, cv=KFold(n_splits=7, shuffle=False)
    )
    clf.fit(X.values, Y[label].values)
    print('\nGridSearchCV for ' + label)
    print('\nBest Score:')
    print(clf.best_score_)
    print('\nBest Parameters:')
    print(clf.best_params_)
    model[label]['clf'] = clf

    # The best estimator is the same object for every fold, so fetch it once.
    # This also makes `reg` available to the importance plot below without
    # relying on a variable leaked from the loop body.
    reg = clf.best_estimator_

    # NOTE(review): with GridSearchCV's default refit, best_estimator_ has been
    # fitted on all of X, so the metrics on the two halves below are in-sample
    # diagnostics rather than held-out validation scores.
    kf = KFold(n_splits=2, shuffle=False)
    for train_index, test_index in kf.split(X):
        predictions = reg.predict(X.iloc[test_index, :].values)
        actuals = Y[label].iloc[test_index].values
        print('\n' + label + ' MSE:')
        print(mean_squared_error(actuals, predictions))
        print('\n' + label + ' MAE:')
        print(mean_absolute_error(actuals, predictions))
        # Both series are offset by 1 before the deviances are computed.
        print('\n' + label + ' Poisson Deviance (p = 1):')
        print(mean_tweedie_deviance(1 + actuals, 1 + predictions, power=1))
        # Added the missing space so the header reads like the other metrics.
        print('\n' + label + ' Gamma Deviance (p = 2):')
        print(mean_tweedie_deviance(1 + actuals, 1 + predictions, power=2))

    # Bar chart of the feature importances of the selected model.
    fig, ax = plt.subplots()

    ax.bar(
        x=range(len(reg.feature_importances_)),
        height=reg.feature_importances_,
        tick_label=model_features
    )
    ax.set_ylabel('Importances')
    ax.set_title('Model ' + label + ': Features Importances')
    plt.xticks(rotation=45)
    fig.tight_layout()

    plt.show()

In [None]:
from scipy.special import inv_boxcox

# Build the scoring dataset from the real (cleaned) data.
df = data.set_index('Date')
df = df[features + ['Temperature_Trend', 'Temperature_Season', 'Temperature_Resid', 'year sin', 'year cos'] + labels]
# Normalise the dataset with the parameters computed on the training split.
df = (df - train_mean) / train_std

# Apply the same target transformations that were used for training.
df['Lake_Level'] = boxcox(1 - df['Lake_Level'] + maxima, 0)
df['Flow_Rate'] = 1 + np.log(1 + df['Flow_Rate'])

# Lag the inputs by one day, then drop the first 30 rows.
# NOTE(review): the 30-row cut is presumably a leftover from the disabled
# 30-day lag of the targets; with a 1-day lag only the first row is incomplete.
df = pd.concat(
    [
        df[labels],
        df[features + ['Temperature_Trend', 'Temperature_Season', 'Temperature_Resid', 'year sin', 'year cos']]
        .shift(1)
        .add_suffix('_shifted'),
    ],
    axis=1,
).iloc[30:]

# Score all rows with one predict call per target. The former row-by-row loop
# produced the same values (each row is predicted independently) but was far
# slower and wrote floats into integer-initialised columns.
# NOTE: if lagged targets are re-enabled and missing lags must be filled with
# earlier predictions, prediction has to become sequential again.
X_all = df[model_features].values
df['Lake_Level_predicted'] = np.asarray(
    model['Lake_Level']['clf'].best_estimator_.predict(X_all), dtype='float64'
)
df['Flow_Rate_predicted'] = np.asarray(
    model['Flow_Rate']['clf'].best_estimator_.predict(X_all), dtype='float64'
)

# Undo the target transformations and the standardisation so that actual and
# predicted values are back in their original units.
df['Lake_Level'] = (-inv_boxcox(df['Lake_Level'], 0) + 1 + maxima) * train_std.Lake_Level + train_mean.Lake_Level
df['Lake_Level_predicted'] = (-inv_boxcox(df['Lake_Level_predicted'], 0) + 1 + maxima) * train_std.Lake_Level + train_mean.Lake_Level
df['Flow_Rate'] = (np.exp(df['Flow_Rate'] - 1) - 1) * train_std.Flow_Rate + train_mean.Flow_Rate
df['Flow_Rate_predicted'] = (np.exp(df['Flow_Rate_predicted'] - 1) - 1) * train_std.Flow_Rate + train_mean.Flow_Rate

# Observed values where available, model predictions where the observation is missing.
df['Lake_Level_Imputed'] = df['Lake_Level'].fillna(df['Lake_Level_predicted'])
df['Flow_Rate_Imputed'] = df['Flow_Rate'].fillna(df['Flow_Rate_predicted'])

df.reset_index().plot.line(x='Date', y=['Lake_Level_predicted', 'Lake_Level'])
df.reset_index().plot.line(x='Date', y=['Flow_Rate_predicted', 'Flow_Rate'])
plt.show()

# Work in progress
To do:
* try with a (tensorflow?) model with multiple time steps 
* Poisson Regressor with Tensorflow?
* Probabilistic ML model?