# Intro
Welcome to the [Acea Smart Water Analytics](https://www.kaggle.com/c/acea-water-prediction) competition.
![](https://storage.googleapis.com/kaggle-competitions/kaggle/24191/logos/header.png)

There are different waterbodies with different features. We will consider 
* Aquifer,
* Water Spring,
* Lake,
* River.

The goal is to predict the amount of water in each unique waterbody.

<span style="color: royalblue;">Please vote the notebook up if it helps you. Thank you. </span>

# Libraries
We load some standard libraries and packages of sklearn.

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

# Path
Define input path and show content files.

In [None]:
# Directory holding the competition CSV files (every loader below prepends it).
path = '/kaggle/input/acea-water-prediction/'
# Names of all entries found in that directory.
files = os.listdir(path)
# Bare expression: shown as the cell output.
files

# Waterbodies
Write the waterbodies into a list. 

In [None]:
# One waterbody per CSV file: keep the part of the file name before the first dot.
waterbodies = [name.partition('.')[0] for name in files if '.csv' in name]
print('number of waterbodies:', len(waterbodies))
waterbodies

# Functions
We define some helper functions for plotting, feature engineering and calculations.

In [None]:
def load_data(waterbody):
    """ Read the CSV of the given waterbody into a DataFrame sorted by its index.

    The file is looked up as ``<path><waterbody>.csv`` using the module-level
    ``path``; the first CSV column becomes the index and is parsed as dates.
    """

    csv_file = path + waterbody + '.csv'
    frame = pd.read_csv(csv_file, index_col=0, parse_dates=True)
    return frame.sort_index()

def plot_timeseries(data, feature):
    """ Draw one column of ``data`` against its index as a line chart """

    fig, ax = plt.subplots(figsize=(9, 3))
    ax.plot(data.index, data[feature], label=feature)
    ax.legend()
    ax.set_xlabel('date')
    ax.grid()
    ax.set_title(feature)
    plt.show()
    
def label_missing_values(s):
    """ Map the placeholder value 0 to None; pass every other value through """
    return None if s == 0 else s
    
def plot_compare_before_after(data, y_test, target):
    """ Compare target before and after prediction

    data   -- DataFrame whose index is used as x axis and which holds ``target``
    y_test -- Series of predicted values, indexed by the dates they belong to
    target -- name of the target column in ``data``

    Left panel ('Before'): the target as stored in ``data``.
    Right panel ('After'): the same curve with the predictions overlaid.
    """

    fig, axs = plt.subplots(1, 2, figsize=(18, 4))
    fig.subplots_adjust(hspace=0.5, wspace=0.2)
    axs = axs.ravel()
    x = data.index
    # Every line gets a label: legend() on an axes without labelled artists
    # only yields an empty legend and a "no handles" message.
    axs[0].plot(x, data[target], label=target)
    axs[1].plot(x, data[target], label='train')
    axs[1].plot(y_test.index, y_test.values, label='pred', alpha=0.7)
    for ax in axs:
        ax.grid()
        ax.set_xlabel('date')
        ax.legend()
    axs[0].set_title('Before')
    axs[1].set_title('After')

    plt.show()
    
def plot_corr_matrix(data):
    """ Plot the correlation matrix of all features

    Returns a pandas Styler over ``data.corr()`` with a 'coolwarm' background
    gradient computed over the whole table (``axis=None``) and values shown
    with two decimals.
    """

    corr = data.corr()
    styled = corr.style.background_gradient(cmap='coolwarm', axis=None)
    # Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0;
    # a plain format string works on old and new pandas versions alike.
    return styled.format('{:.2f}')


def timestamp_features(data):
    """ Add 'year', 'month' and 'day' columns taken from the datetime index.

    The frame is modified in place and also returned.
    """

    stamps = data.index
    for part in ('year', 'month', 'day'):
        data[part] = getattr(stamps, part)

    return data

def scale_data(data, targets):
    """ Z-score Normalization of feature data

    data    -- DataFrame; modified in place and returned
    targets -- column names that must NOT be scaled

    Every non-target column becomes (x - mean) / std. A column with zero
    standard deviation (constant values) is only centred, so it ends up as
    all zeros instead of NaN from a division by zero.
    """

    feature_cols = data.columns.difference(targets)
    features = data[feature_cols]
    # Dividing by 1 instead of 0 keeps constant columns finite.
    std = features.std().replace(0, 1.0)
    data[feature_cols] = (features - features.mean()) / std
    return data

def create_train_and_test(data, targets, target, by_value=0):
    """ Create train and test data for modelling and prediction

    data     -- DataFrame holding feature and target columns
    targets  -- all target column names; none of them is used as a feature
    target   -- the single target column to split on
    by_value -- marker of a missing target: None (or NaN) means "target is
                null", any other value means "target equals this value"

    Rows with a missing target form the test set, all other rows the train
    set. Returns (X_train, y_train, X_test, y_test).
    """

    values = data[target]
    # `x != x` is only true for NaN, so a NaN marker is treated like None.
    marker_is_null = by_value is None or (isinstance(by_value, float) and by_value != by_value)
    missing = values.isnull() if marker_is_null else values == by_value
    known = ~missing

    features = data[data.columns.difference(targets)]
    return features[known], values[known], features[missing], values[missing]

def get_best_model_and_accuracy(model, params, X, y):
    """ GridSearch for given model and parameters

    model  -- estimator to tune
    params -- parameter grid handed to GridSearchCV
    X, y   -- training features and target

    Prints and returns [score, time_fit, time_score, best_params], where
    score is the best cross-validated RMSE (sign flipped back to positive)
    and the times are averaged over all parameter combinations.
    """

    # The scorer is a *negated* error, so 0 would be the best possible score.
    # Failed fits are therefore scored NaN; an error_score of 0 would let a
    # crashing parameter combination win the search.
    grid = GridSearchCV(model, params,
                        error_score=np.nan,
                        scoring='neg_root_mean_squared_error')
    grid.fit(X, y)

    score = -grid.best_score_
    time_fit = grid.cv_results_['mean_fit_time'].mean()
    time_score = grid.cv_results_['mean_score_time'].mean()
    best_params = grid.best_params_

    print('Best score:', score)
    print('Best params:', best_params)
    print('Average Time (Fit)', round(time_fit, 3))
    print('Average Time (Score)', round(time_score, 3))

    return [score, time_fit, time_score, best_params]

# Overview
The datafiles are small. So we can load them on demand.

In [None]:
# Per-waterbody summary: number of rows, number of columns and number of
# columns that contain at least one NaN.
compare = pd.DataFrame(index=waterbodies, columns=['samples', 'features', 'features_with_nan'])
for waterbody in waterbodies:
    data = load_data(waterbody)
    compare.loc[waterbody, 'samples'] = data.shape[0]
    compare.loc[waterbody, 'features'] = data.shape[1]
    counter = int(data.isnull().any().sum())
    compare.loc[waterbody, 'features_with_nan'] = counter

As we can see there are a lot of features with nan values. So we have to think about handling missing values. For that we recommend this [notebook](https://www.kaggle.com/drcapa/pima-indians-diabetes-eda-handle-missing-values).

In [None]:
compare

# Targets
Store the targets of every waterbody in a dictionary, as given by the competition definition.

In [None]:
# Maps each waterbody name (as used by load_data) to the list of its target columns.
# All listed columns are excluded from the features when modelling any one of them.
targets = {'Aquifer_Doganella': ['Depth_to_Groundwater_Pozzo_1', 'Depth_to_Groundwater_Pozzo_2', 'Depth_to_Groundwater_Pozzo_3',
                                 'Depth_to_Groundwater_Pozzo_4', 'Depth_to_Groundwater_Pozzo_5', 'Depth_to_Groundwater_Pozzo_6',
                                 'Depth_to_Groundwater_Pozzo_7', 'Depth_to_Groundwater_Pozzo_8', 'Depth_to_Groundwater_Pozzo_9'],
           'Aquifer_Auser': ['Depth_to_Groundwater_SAL', 'Depth_to_Groundwater_CoS', 'Depth_to_Groundwater_LT2'],
           'Water_Spring_Amiata': [ 'Flow_Rate_Bugnano', 'Flow_Rate_Arbure',
                                    'Flow_Rate_Ermicciolo', 'Flow_Rate_Galleria_Alta'],
           'Lake_Bilancino': ['Lake_Level', 'Flow_Rate'],
           'Water_Spring_Madonna_di_Canneto': ['Flow_Rate_Madonna_di_Canneto'],
           'Aquifer_Luco': ['Depth_to_Groundwater_Podere_Casetta'],
           'Aquifer_Petrignano': ['Depth_to_Groundwater_P24', 'Depth_to_Groundwater_P25'],
           'Water_Spring_Lupa': ['Flow_Rate_Lupa'],
           'River_Arno': ['Hydrometry_Nave_di_Rosano']}

# Models
The goal is to generate four mathematical models, one for each category of waterbody (aquifers, water springs, river, lake) that might be applicable to each single waterbody.

## River Arno
The target which we want to predict is Hydrometry_Nave_di_Rosano

In [None]:
# Load the River Arno data (date-indexed, sorted) and display its target column list.
River_Arno = load_data('River_Arno')
targets['River_Arno']

Create new features based on the timestamp index:

In [None]:
# Adds 'year', 'month' and 'day' columns derived from the date index.
River_Arno = timestamp_features(River_Arno)

Plot the target over the time:

In [None]:
# Line chart of the single River Arno target over the date index.
plot_timeseries(River_Arno, targets['River_Arno'][0])

In [None]:
# Show the first rows whose target is exactly 0. The mask has to be a boolean
# Series (one column): indexing with a boolean DataFrame keeps every row and
# merely blanks the non-matching cells to NaN.
River_Arno[River_Arno[targets['River_Arno'][0]] == 0].head()

Plot the correlation matrix:

In [None]:
# Colour-coded correlation table of all River Arno columns (shown as cell output).
plot_corr_matrix(River_Arno)

In [None]:
# Keep only the rows from this date onwards.
date_from = '2005-01-01'
# Explicit label-based slice plus copy: the frame is assigned into in the
# following cells, which must not happen on a slice of the original frame.
River_Arno = River_Arno.loc[date_from:].copy()

Fill missing values with column mean value:

In [None]:
# Features: replace NaN with the mean of the respective column.
feature_cols = River_Arno.columns.difference(targets['River_Arno'])
column_means = River_Arno[feature_cols].mean()
River_Arno[feature_cols] = River_Arno[feature_cols].fillna(column_means)
# Target: mark missing values with the placeholder 0.
target_cols = targets['River_Arno']
River_Arno[target_cols] = River_Arno[target_cols].fillna(0)

Scale data:

In [None]:
# Z-score all non-target columns; the target column keeps its original scale.
River_Arno = scale_data(River_Arno, targets['River_Arno'])

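Define train and test data: rows with a known target are used for training, rows whose target is missing (encoded as 0 above) form the test set: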

In [None]:
# by_value=0: rows whose target equals 0 become the test set, all others the train set.
X_train, y_train, X_test, y_test = create_train_and_test(River_Arno,
                                                         targets['River_Arno'],
                                                         targets['River_Arno'][0],
                                                         by_value=0)

Grid Search with XGB:

In [None]:
# Grid over the number of trees and the tree depth; the helper prints the best
# RMSE, the best parameters and the average fit/score times.
model_XGB = XGBRegressor(random_state=2020)
params_XGB = {'n_estimators': [5, 10, 50, 100],
              'max_depth': [1, 2, 3, 4, 5]} 
score, time_fit, time_score, best_params = get_best_model_and_accuracy(model_XGB, params_XGB,
                                                                       X_train, y_train)

Grid Search with Linear Regression:

In [None]:
# Empty grid: the search only cross-validates the default LinearRegression,
# giving an RMSE comparable to the XGB result above.
# Note: score, time_fit, time_score and best_params from the XGB cell are overwritten here.
model_LR = LinearRegression()
params_LR = {}
score, time_fit, time_score, best_params = get_best_model_and_accuracy(model_LR, params_LR,
                                                                       X_train, y_train)

In [None]:
# Fit on all rows with a known target and predict the rows where it is missing.
# NOTE(review): the linear model is used unconditionally, whatever the two
# grid-search scores were — confirm this is intended.
model_LR.fit(X_train, y_train)
y_pred = model_LR.predict(X_test)

In [None]:
# Overwrite the placeholder values in y_test with the predictions, keeping its date index.
y_test[:]=y_pred

In [None]:
# Turn the placeholder 0 back into NaN so that the gaps show up as gaps in
# the plot. A vectorised replace avoids calling a Python function per row.
River_Arno[targets['River_Arno'][0]] = River_Arno[targets['River_Arno'][0]].replace(0, np.nan)

Plot the timeseries before and after prediction. The image titled "Before" shows the original data with nan values. The "After" image shows the original data together with the predictions for the nan values.

In [None]:
# y_test now holds the predicted values for the dates with a missing target.
plot_compare_before_after(River_Arno, y_test, targets['River_Arno'][0])

## Aquifer_Luco
There is only one target with the name Depth_to_Groundwater_Podere_Casetta.

In [None]:
# Load the Aquifer Luco data (date-indexed, sorted) and display its target column list.
Aquifer_Luco = load_data('Aquifer_Luco')
targets['Aquifer_Luco']

Plot the timeseries with missing values.

In [None]:
# Line chart of the single Aquifer Luco target over the date index.
plot_timeseries(Aquifer_Luco, targets['Aquifer_Luco'][0])

Create new features based on the timestamp index:

In [None]:
# Adds 'year', 'month' and 'day' columns derived from the date index.
Aquifer_Luco = timestamp_features(Aquifer_Luco)

Fill missing feature values by mean.

In [None]:
# Features only: replace NaN with the mean of the respective column.
# The target keeps its NaN values; they mark the rows to predict.
luco_features = Aquifer_Luco.columns.difference(targets['Aquifer_Luco'])
luco_means = Aquifer_Luco[luco_features].mean()
Aquifer_Luco[luco_features] = Aquifer_Luco[luco_features].fillna(luco_means)

Plot the correlation matrix.

In [None]:
# Colour-coded correlation table of all Aquifer Luco columns (shown as cell output).
plot_corr_matrix(Aquifer_Luco)

Scale data:

In [None]:
# Z-score all non-target columns; the target column keeps its original scale.
Aquifer_Luco = scale_data(Aquifer_Luco, targets['Aquifer_Luco'])

Define train and test data: rows with a known target are used for training, rows whose target is NaN form the test set:

In [None]:
# by_value=None: rows with a null target become the test set, all others the train set.
X_train, y_train, X_test, y_test = create_train_and_test(Aquifer_Luco,
                                                         targets['Aquifer_Luco'],
                                                         targets['Aquifer_Luco'][0],
                                                         by_value=None)

Modelling and Prediction:

In [None]:
# Grid over the number of trees and the tree depth. The winning parameters are
# kept in best_params_XGB so the LR search below cannot overwrite them.
model_XGB = XGBRegressor(random_state=2020)
params_XGB = {'n_estimators': [1000, 1500, 2000],
              'max_depth': [1, 2, 3, 4, 5]} 
score, time_fit, time_score, best_params_XGB = get_best_model_and_accuracy(model_XGB, params_XGB,
                                                                           X_train, y_train)

In [None]:
# Empty grid: the search only cross-validates the default LinearRegression,
# giving an RMSE comparable to the XGB result above.
model_LR = LinearRegression()
params_LR = {}
score, time_fit, time_score, best_params = get_best_model_and_accuracy(model_LR, params_LR,
                                                                       X_train, y_train)

In [None]:
# Rebuild the XGB model with the parameters found by the grid search, train it
# on all rows with a known target and predict the rows where it is missing.
model_XGB = XGBRegressor(random_state=2020, **best_params_XGB)
model_XGB.fit(X_train, y_train)
y_pred = model_XGB.predict(X_test)

In [None]:
# Overwrite the NaN values in y_test with the predictions, keeping its date index.
y_test[:]=y_pred

In [None]:
# Only the predictions from 2008-01-01 onwards are overlaid on the original series.
plot_compare_before_after(Aquifer_Luco, y_test['2008-01-01':], targets['Aquifer_Luco'][0])