In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns # Visualization
import matplotlib.pyplot as plt # Visualization

from sklearn.metrics import mean_absolute_error, mean_squared_error
import math

import warnings # Suppress warnings
# NOTE(review): with no category or module given, this filter hides every
# warning raised for the rest of the notebook, including pandas and
# scikit-learn warnings that point at real problems in later cells.
warnings.filterwarnings('ignore')

**Importing Dataset**

In [None]:
# Load the Aquifer_Petrignano CSV into a DataFrame and preview its first rows.
df_Aquifer_Petrignano = pd.read_csv("../input/acea-water-prediction/Aquifer_Petrignano.csv")
df_Aquifer_Petrignano.head()

In [None]:
# Load the Lake_Bilancino CSV into a DataFrame and preview its first rows.
df_Lake_Bilancino = pd.read_csv("../input/acea-water-prediction/Lake_Bilancino.csv")
df_Lake_Bilancino.head()

In [None]:
# Load the River_Arno CSV into a DataFrame and preview its first rows.
df_River_Arno = pd.read_csv("../input/acea-water-prediction/River_Arno.csv")
df_River_Arno.head()

In [None]:
# Load the Water_Spring_Lupa CSV into a DataFrame and preview its first rows.
df_Water_Spring_Lupa = pd.read_csv("../input/acea-water-prediction/Water_Spring_Lupa.csv")
df_Water_Spring_Lupa.head()

**HeatMap**

In [None]:
# Missing-value map for the Petrignano aquifer: every cell of the frame is
# coloured by whether its value is null.
sns.set(rc={'figure.figsize': (11, 8)})
sns.heatmap(
    data=df_Aquifer_Petrignano.isnull(),
    yticklabels=False,
    cbar=False,
    cmap="coolwarm",
)

In [None]:
# Missing-value map for Lake Bilancino: every cell of the frame is coloured
# by whether its value is null.
sns.set(rc={'figure.figsize': (11, 8)})
sns.heatmap(
    data=df_Lake_Bilancino.isnull(),
    yticklabels=False,
    cbar=False,
    cmap="coolwarm",
)

In [None]:
# Missing-value map for the River Arno: every cell of the frame is coloured
# by whether its value is null.
sns.set(rc={'figure.figsize': (11, 8)})
sns.heatmap(
    data=df_River_Arno.isnull(),
    yticklabels=False,
    cbar=False,
    cmap="coolwarm",
)

In [None]:
# Missing-value map for the Lupa water spring: every cell of the frame is
# coloured by whether its value is null.
sns.set(rc={'figure.figsize': (11, 8)})
sns.heatmap(
    data=df_Water_Spring_Lupa.isnull(),
    yticklabels=False,
    cbar=False,
    cmap="coolwarm",
)

**Missing Data Analysis**

Checking missing values

In [None]:
# Number of missing values per column, shown as a one-column table.
(
    df_Aquifer_Petrignano
    .isnull()
    .sum()
    .rename_axis('features/target')
    .to_frame('Missing Value Count')
)

In [None]:
# Number of missing values per column, shown as a one-column table.
(
    df_Lake_Bilancino
    .isnull()
    .sum()
    .rename_axis('features/target')
    .to_frame('Missing Value Count')
)

In [None]:
# Number of missing values per column, shown as a one-column table.
(
    df_River_Arno
    .isnull()
    .sum()
    .rename_axis('features/target')
    .to_frame('Missing Value Count')
)

In [None]:
# Number of missing values per column, shown as a one-column table.
(
    df_Water_Spring_Lupa
    .isnull()
    .sum()
    .rename_axis('features/target')
    .to_frame('Missing Value Count')
)

**Handling Missing Data**

df_Aquifer_Petrignano

In [None]:
# 1. Keep only the rows with a recorded Rainfall_Bastia_Umbra value (intended
#    to drop the data before 2009 for this implementation) and renumber them.
# 2. Drop Depth_to_Groundwater_P24 and Temperature_Petrignano so that the
#    analysis focuses on a single target column.
df_Aquifer_Petrignano = (
    df_Aquifer_Petrignano
    .loc[lambda frame: frame['Rainfall_Bastia_Umbra'].notna()]
    .reset_index(drop=True)
    .drop(columns=['Depth_to_Groundwater_P24', 'Temperature_Petrignano'])
)
df_Aquifer_Petrignano.head()

In [None]:
# Replace the column names with short generic ones. The assignment is
# positional, so it relies on the frame having exactly these six columns in
# this order.
df_Aquifer_Petrignano.columns = [
    'Date',
    'Rainfall',
    'Depth_to_Groundwater',
    'Temperature',
    'Drainage_Volume',
    'River_Hydrometry',
]

# Split the column names into the prediction target and everything else.
targets = ['Depth_to_Groundwater']
features = [column for column in df_Aquifer_Petrignano.columns if column not in targets]
df_Aquifer_Petrignano.head()

In [None]:
# Fill the gaps in each of these series by interpolating between neighbouring
# observations (pandas' default interpolation method).
for column in ('Drainage_Volume', 'River_Hydrometry', 'Depth_to_Groundwater'):
    df_Aquifer_Petrignano[column] = df_Aquifer_Petrignano[column].interpolate()

df_Lake_Bilancino

In [None]:
# Keep only the rows where Temperature_Le_Croci is present and renumber them;
# intended to drop the rows before 2004, which have NaN values in multiple columns.
df_Lake_Bilancino = (
    df_Lake_Bilancino
    .loc[lambda frame: frame['Temperature_Le_Croci'].notna()]
    .reset_index(drop=True)
)
df_Lake_Bilancino.head()

df_River_Arno

In [None]:
# Keep only the rows where Rainfall_Le_Croci is present and renumber them;
# intended to drop the rows before 2004, which have NaN values in multiple columns.
df_River_Arno = (
    df_River_Arno
    .loc[lambda frame: frame['Rainfall_Le_Croci'].notna()]
    .reset_index(drop=True)
)
df_River_Arno.head()

Because many columns have a high share of missing values (over 44%), we'll use only 'Temperature_Firenze' and the 5 rainfall indicators with the fewest missing values for prediction: 'Rainfall_Le_Croci', 'Rainfall_Cavallina', 'Rainfall_S_Agata', 'Rainfall_Mangona', 'Rainfall_S_Piero'. We'll also create the 'rainfall_mean' feature.
These rainfall indicators also have a higher correlation than the remaining (excluded) ones.

In [None]:
# Rainfall indicators kept for modelling; listed once so that the column
# selection and the row-wise mean below cannot drift apart.
rainfall_columns = ['Rainfall_Le_Croci', 'Rainfall_Cavallina',
                    'Rainfall_S_Agata', 'Rainfall_Mangona', 'Rainfall_S_Piero']

# Keep the date, the target (Hydrometry_Nave_di_Rosano), the temperature and
# the selected rainfall columns. `.copy()` makes the result an independent
# frame, so adding a column below is not an assignment onto a slice of the
# previous frame (the resulting SettingWithCopyWarning is otherwise hidden by
# the global warnings filter).
df_River_Arno = df_River_Arno[
    ['Date', 'Hydrometry_Nave_di_Rosano', 'Temperature_Firenze'] + rainfall_columns
].copy()

# Alternative considered but not applied: replace missing values with the
# column means.

# Row-wise mean of the selected rainfall indicators; `mean` skips NaN by
# default, so a row with some missing indicators still gets a value.
df_River_Arno['rainfall_mean'] = df_River_Arno[rainfall_columns].mean(axis=1)

df_River_Arno

In [None]:
# Fill the gaps in the target and temperature series by interpolating between
# neighbouring observations (pandas' default interpolation method).
for column in ('Hydrometry_Nave_di_Rosano', 'Temperature_Firenze'):
    df_River_Arno[column] = df_River_Arno[column].interpolate()

df_Water_Spring_Lupa:<br>
We'll use all the data from 2009 to 2020 for prediction. Missing values will be filled using the 'interpolate' method.

In [None]:
# Replace Flow_Rate_Lupa with its interpolated version (gaps filled from
# neighbouring observations), then display the frame.
df_Water_Spring_Lupa = df_Water_Spring_Lupa.assign(
    Flow_Rate_Lupa=df_Water_Spring_Lupa['Flow_Rate_Lupa'].interpolate()
)
df_Water_Spring_Lupa

In [None]:
# Remove the rows whose target (Flow_Rate_Lupa) is still NaN and renumber
# the remaining rows from zero.
df_Water_Spring_Lupa = (
    df_Water_Spring_Lupa
    .dropna(subset=['Flow_Rate_Lupa'])
    .reset_index(drop=True)
)
df_Water_Spring_Lupa.head()

**Correlation Analysis**

In [None]:
# Correlate the numeric columns only: Date is still a string column at this
# point, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
# The matrix is computed once and reused for both the mask and the plot.
corr_Aquifer_Petrignano = df_Aquifer_Petrignano.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 5))
# Hide the upper triangle (diagonal included); it mirrors the lower one.
mask = np.triu(np.ones_like(corr_Aquifer_Petrignano, dtype=bool))
sns.heatmap(corr_Aquifer_Petrignano, mask = mask, annot=True, cmap='Dark2');

In [None]:
# Correlate the numeric columns only: Date is still a string column at this
# point, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
# The matrix is computed once and reused for both the mask and the plot.
corr_Lake_Bilancino = df_Lake_Bilancino.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 5))
# Hide the upper triangle (diagonal included); it mirrors the lower one.
mask = np.triu(np.ones_like(corr_Lake_Bilancino, dtype=bool))
sns.heatmap(corr_Lake_Bilancino, mask = mask, annot=True, cmap='Dark2');

In [None]:
# Correlate the numeric columns only: Date is still a string column at this
# point, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
# The matrix is computed once and reused for both the mask and the plot.
corr_River_Arno = df_River_Arno.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 5))
# Hide the upper triangle (diagonal included); it mirrors the lower one.
mask = np.triu(np.ones_like(corr_River_Arno, dtype=bool))
sns.heatmap(corr_River_Arno, mask = mask, annot=True, cmap='Dark2');

In [None]:
# Correlate the numeric columns only: Date is still a string column at this
# point, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
# The matrix is computed once and reused for both the mask and the plot.
corr_Water_Spring_Lupa = df_Water_Spring_Lupa.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 5))
# Hide the upper triangle (diagonal included); it mirrors the lower one.
mask = np.triu(np.ones_like(corr_Water_Spring_Lupa, dtype=bool))
sns.heatmap(corr_Water_Spring_Lupa, mask = mask, annot=True, cmap='Dark2');

**Time Series Analysis**

In [None]:
# Parse the day/month/year strings in Date into datetimes for the time series
# analysis, then preview the frame with the Date column highlighted.
from datetime import date, datetime

df_Aquifer_Petrignano['Date'] = pd.to_datetime(
    df_Aquifer_Petrignano['Date'], format='%d/%m/%Y'
)
(
    df_Aquifer_Petrignano
    .head()
    .style
    .set_properties(subset=['Date'], **{'background-color': 'lightblue'})
)

In [None]:
# Parse the day/month/year strings in Date into datetimes for the time series
# analysis, then preview the frame with the Date column highlighted.
from datetime import date, datetime

df_Lake_Bilancino['Date'] = pd.to_datetime(
    df_Lake_Bilancino['Date'], format='%d/%m/%Y'
)
(
    df_Lake_Bilancino
    .head()
    .style
    .set_properties(subset=['Date'], **{'background-color': 'lightblue'})
)

In [None]:
# Parse the day/month/year strings in Date into datetimes for the time series
# analysis, then preview the frame with the Date column highlighted.
from datetime import date, datetime

df_River_Arno['Date'] = pd.to_datetime(
    df_River_Arno['Date'], format='%d/%m/%Y'
)
(
    df_River_Arno
    .head()
    .style
    .set_properties(subset=['Date'], **{'background-color': 'lightblue'})
)

In [None]:
# Parse the day/month/year strings in Date into datetimes for the time series
# analysis, then preview the frame with the Date column highlighted.
from datetime import date, datetime

df_Water_Spring_Lupa['Date'] = pd.to_datetime(
    df_Water_Spring_Lupa['Date'], format='%d/%m/%Y'
)
(
    df_Water_Spring_Lupa
    .head()
    .style
    .set_properties(subset=['Date'], **{'background-color': 'lightblue'})
)

In [None]:
# Print the column dtypes and non-null counts of the cleaned Petrignano frame.
df_Aquifer_Petrignano.info()

In [None]:
# Print the column dtypes and non-null counts of the cleaned Bilancino frame.
df_Lake_Bilancino.info()

In [None]:
# Print the column dtypes and non-null counts of the cleaned Arno frame.
df_River_Arno.info()

In [None]:
# Print the column dtypes and non-null counts of the cleaned Lupa frame.
df_Water_Spring_Lupa.info()

**Training and Test Data Creation**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor , plot_importance, plot_tree
from sklearn.ensemble import RandomForestRegressor

df_Aquifer_Petrignano

In [None]:
# Features are every column except the target; the target is kept as a
# single-column DataFrame.
X_Petrignano = df_Aquifer_Petrignano.drop(columns=['Depth_to_Groundwater'])
y_Petrignano = df_Aquifer_Petrignano[['Depth_to_Groundwater']]

# Chronological split (no shuffling): the first 70% of the rows train the
# models and the remaining rows are held out for testing.
(
    X_Petrignano_train,
    X_Petrignano_test,
    y_Petrignano_train,
    y_Petrignano_test,
) = train_test_split(X_Petrignano, y_Petrignano, train_size=0.7, shuffle=False)

# Move Date into the index so it is not passed to the models as a feature.
X_Petrignano_train = X_Petrignano_train.set_index("Date")
X_Petrignano_test = X_Petrignano_test.set_index("Date")

XGBRegressor

In [None]:
# Hyper-parameters for the XGBoost regressor; random_state pins the row
# subsampling so that repeated runs give the same model.
params = dict(
    n_estimators=100,
    max_depth=4,
    subsample=0.7,
    learning_rate=0.5,
    random_state=0,
)

# Fit on the training part of the Petrignano data.
model_ap = XGBRegressor(**params)
model_ap.fit(X_Petrignano_train, y_Petrignano_train)

In [None]:
# Score the XGBoost model on the held-out Petrignano test split.
y_pred_ap = model_ap.predict(X_Petrignano_test)

score_mae = mean_absolute_error(y_true=y_Petrignano_test, y_pred=y_pred_ap)
score_rmse = math.sqrt(
    mean_squared_error(y_true=y_Petrignano_test, y_pred=y_pred_ap)
)

print('MAE  for Depth_to_Groundwater using XGBRegressor is : {}'.format(score_mae))
print('RMSE for Depth_to_Groundwater using XGBRegressor is : {}'.format(score_rmse))

In [None]:
# Show the per-feature importance scores reported by the fitted model's booster.
model_ap.get_booster().get_score()

In [None]:
# Plot the feature importances of the Petrignano XGBoost model.
plot_importance(model_ap)

RandomForestRegressor

In [None]:
# Shallow random forest (depth-2 trees) with a fixed seed for reproducibility.
regr_ap = RandomForestRegressor(max_depth=2, random_state=0)
# y_Petrignano_train is a single-column DataFrame; flatten it to the 1-D shape
# scikit-learn expects for a single-output regressor. Passing the column
# vector triggers a DataConversionWarning, which the global warnings filter
# would otherwise hide.
regr_ap.fit(X_Petrignano_train, y_Petrignano_train.values.ravel())
regr_ap

In [None]:
# Score the random forest on the held-out Petrignano test split.
y_pred = regr_ap.predict(X_Petrignano_test)

score_mae = mean_absolute_error(y_true=y_Petrignano_test, y_pred=y_pred)
score_rmse = math.sqrt(
    mean_squared_error(y_true=y_Petrignano_test, y_pred=y_pred)
)

print('MAE  for Depth_to_Groundwater using RandomForestRegressor is : {}'.format(score_mae))
print('RMSE for Depth_to_Groundwater using RandomForestRegressor is : {}'.format(score_rmse))

In [None]:
# Label the forest's feature importances with the feature names and draw the
# (up to) 20 largest as a horizontal bar chart.
feat_importances = pd.Series(
    data=regr_ap.feature_importances_,
    index=X_Petrignano_test.columns,
)
feat_importances.nlargest(20).plot.barh()

df_Lake_Bilancino

In [None]:
# Features are every column except the two targets (Lake_Level, Flow_Rate).
X_Bilancino = df_Lake_Bilancino.drop(columns=['Lake_Level', 'Flow_Rate'])
y_Bilancino = df_Lake_Bilancino[['Lake_Level', 'Flow_Rate']]

# Chronological split (no shuffling): the first 70% of the rows train the
# models and the remaining rows are held out for testing.
(
    X_Bilancino_train,
    X_Bilancino_test,
    y_Bilancino_train,
    y_Bilancino_test,
) = train_test_split(X_Bilancino, y_Bilancino, train_size=0.7, shuffle=False)

# Move Date into the index so it is not passed to the models as a feature.
X_Bilancino_train = X_Bilancino_train.set_index("Date")
X_Bilancino_test = X_Bilancino_test.set_index("Date")

In [None]:
# Separate the two targets so each gets its own model; every piece stays a
# single-column DataFrame.
y_train_Lake_Level, y_train_Flow_Rate = (
    y_Bilancino_train[['Lake_Level']],
    y_Bilancino_train[['Flow_Rate']],
)
y_test_Lake_Level, y_test_Flow_Rate = (
    y_Bilancino_test[['Lake_Level']],
    y_Bilancino_test[['Flow_Rate']],
)

XGBRegressor

Lake Level

In [None]:
# Hyper-parameters for the XGBoost regressor; random_state pins the row
# subsampling so that repeated runs give the same model.
params = dict(
    n_estimators=100,
    max_depth=4,
    subsample=0.7,
    learning_rate=0.5,
    random_state=0,
)

# Fit the Lake_Level model on the training part of the Bilancino data.
model_ll = XGBRegressor(**params)
model_ll.fit(X_Bilancino_train, y_train_Lake_Level)

In [None]:
# Score the Lake_Level XGBoost model on the held-out Bilancino test split.
y_pred_ll = model_ll.predict(X_Bilancino_test)

score_mae = mean_absolute_error(y_true=y_test_Lake_Level, y_pred=y_pred_ll)
score_rmse = math.sqrt(
    mean_squared_error(y_true=y_test_Lake_Level, y_pred=y_pred_ll)
)

print('MAE  for Lake_Level using XGBRegressor is : {}'.format(score_mae))
print('RMSE for Lake_Level using XGBRegressor is : {}'.format(score_rmse))

In [None]:
# Show the per-feature importance scores reported by the fitted model's booster.
model_ll.get_booster().get_score()

In [None]:
# Plot the feature importances of the Lake_Level XGBoost model.
plot_importance(model_ll)

Flow Rate

In [None]:
# Fit the Flow_Rate model. `params` is not redefined here: it is the dict
# created in the Lake_Level XGBRegressor cell, so that cell must run first.
model_fr = XGBRegressor(**params)
model_fr.fit(X_Bilancino_train, y_train_Flow_Rate)

In [None]:
# Score the Flow_Rate XGBoost model on the held-out Bilancino test split.
y_pred_fr = model_fr.predict(X_Bilancino_test)

score_mae = mean_absolute_error(y_true=y_test_Flow_Rate, y_pred=y_pred_fr)
score_rmse = math.sqrt(
    mean_squared_error(y_true=y_test_Flow_Rate, y_pred=y_pred_fr)
)

print('MAE  for Flow_Rate using XGBRegressor is : {}'.format(score_mae))
print('RMSE for Flow_Rate using XGBRegressor is : {}'.format(score_rmse))

In [None]:
# Show the per-feature importance scores reported by the fitted model's booster.
model_fr.get_booster().get_score()

In [None]:
# Plot the feature importances of the Flow_Rate XGBoost model (model_fr);
# model_ll is the Lake_Level model and belongs to the previous section.
plot_importance(model_fr)

RandomForestRegressor

Lake Level

In [None]:
# Shallow random forest (depth-2 trees) with a fixed seed for reproducibility.
regr_ll = RandomForestRegressor(max_depth=2, random_state=0)
# y_train_Lake_Level is a single-column DataFrame; flatten it to the 1-D shape
# scikit-learn expects for a single-output regressor. Passing the column
# vector triggers a DataConversionWarning, which the global warnings filter
# would otherwise hide.
regr_ll.fit(X_Bilancino_train, y_train_Lake_Level.values.ravel())
regr_ll

In [None]:
# Score the Lake_Level random forest on the held-out Bilancino test split.
y_pred_reg_ll = regr_ll.predict(X_Bilancino_test)

score_mae = mean_absolute_error(y_true=y_test_Lake_Level, y_pred=y_pred_reg_ll)
score_rmse = math.sqrt(
    mean_squared_error(y_true=y_test_Lake_Level, y_pred=y_pred_reg_ll)
)

print('MAE  for Lake_Level using RandomForestRegressor is : {}'.format(score_mae))
print('RMSE for Lake_Level using RandomForestRegressor is : {}'.format(score_rmse))

In [None]:
# Label the forest's feature importances with the feature names and draw the
# (up to) 20 largest as a horizontal bar chart.
feat_importances = pd.Series(
    data=regr_ll.feature_importances_,
    index=X_Bilancino_test.columns,
)
feat_importances.nlargest(20).plot.barh()

Flow Rate

In [None]:
# Shallow random forest (depth-2 trees) with a fixed seed for reproducibility.
regr_fr = RandomForestRegressor(max_depth=2, random_state=0)
# y_train_Flow_Rate is a single-column DataFrame; flatten it to the 1-D shape
# scikit-learn expects for a single-output regressor. Passing the column
# vector triggers a DataConversionWarning, which the global warnings filter
# would otherwise hide.
regr_fr.fit(X_Bilancino_train, y_train_Flow_Rate.values.ravel())
regr_fr

In [None]:
# Score the Flow_Rate random forest on the held-out Bilancino test split.
y_pred_reg = regr_fr.predict(X_Bilancino_test)

score_mae = mean_absolute_error(y_true=y_test_Flow_Rate, y_pred=y_pred_reg)
score_rmse = math.sqrt(
    mean_squared_error(y_true=y_test_Flow_Rate, y_pred=y_pred_reg)
)

print('MAE  for Flow_Rate using RandomForestRegressor is : {}'.format(score_mae))
print('RMSE for Flow_Rate using RandomForestRegressor is : {}'.format(score_rmse))

In [None]:
# Label the forest's feature importances with the feature names and draw the
# (up to) 20 largest as a horizontal bar chart.
feat_importances = pd.Series(
    data=regr_fr.feature_importances_,
    index=X_Bilancino_test.columns,
)
feat_importances.nlargest(20).plot.barh()

df_River_Arno

In [None]:
# Features are every column except the target; the target is kept as a Series.
X_Arno = df_River_Arno.drop(columns='Hydrometry_Nave_di_Rosano')
y_Arno = df_River_Arno['Hydrometry_Nave_di_Rosano']

# Chronological split (no shuffling): the first 70% of the rows train the
# models and the remaining rows are held out for testing.
(
    X_Arno_train,
    X_Arno_test,
    y_Arno_train,
    y_Arno_test,
) = train_test_split(X_Arno, y_Arno, train_size=0.7, shuffle=False)

# Move Date into the index so it is not passed to the models as a feature.
X_Arno_train = X_Arno_train.set_index("Date")
X_Arno_test = X_Arno_test.set_index("Date")

XGBRegressor

In [None]:
# Hyper-parameters for the XGBoost regressor; random_state pins the row
# subsampling so that repeated runs give the same model.
params = dict(
    n_estimators=100,
    max_depth=4,
    subsample=0.7,
    learning_rate=0.04,
    random_state=0,
)

# Fit on the training part of the River Arno data.
model_ar = XGBRegressor(**params)
model_ar.fit(X_Arno_train, y_Arno_train)

In [None]:
# Score the XGBoost model on the held-out River Arno test split.
y_pred_ar = model_ar.predict(X_Arno_test)

score_mae = mean_absolute_error(y_Arno_test, y_pred_ar)
score_rmse = math.sqrt(mean_squared_error(y_Arno_test, y_pred_ar))

# The labels name the column y_Arno was taken from (Hydrometry_Nave_di_Rosano);
# Lake_Level is a Lake Bilancino target and does not apply to this dataset.
print('MAE  for Hydrometry_Nave_di_Rosano using XGBRegressor is : {}'.format(score_mae))
print('RMSE for Hydrometry_Nave_di_Rosano using XGBRegressor is : {}'.format(score_rmse))

In [None]:
# Show the per-feature importance scores reported by the fitted model's booster.
model_ar.get_booster().get_score()

In [None]:
# Plot the feature importances of the River Arno XGBoost model.
plot_importance(model_ar)

RandomForestRegressor

In [None]:
# Shallow random forest (depth-2 trees) with a fixed seed, fitted on the
# training part of the River Arno data; the last line displays the estimator.
regr_ar = RandomForestRegressor(
    max_depth=2,
    random_state=0,
)
regr_ar.fit(X=X_Arno_train, y=y_Arno_train)
regr_ar

In [None]:
# Score the random forest on the held-out River Arno test split.
y_pred_reg = regr_ar.predict(X_Arno_test)
score_mae = mean_absolute_error(y_Arno_test, y_pred_reg)
score_rmse = math.sqrt(mean_squared_error(y_Arno_test, y_pred_reg))

# The labels name the column y_Arno was taken from (Hydrometry_Nave_di_Rosano);
# Flow_Rate is a Lake Bilancino target and does not apply to this dataset.
print('MAE  for Hydrometry_Nave_di_Rosano using RandomForestRegressor is : {}'.format(score_mae))
print('RMSE for Hydrometry_Nave_di_Rosano using RandomForestRegressor is : {}'.format(score_rmse))

In [None]:
# Label the forest's feature importances with the feature names and draw the
# (up to) 20 largest as a horizontal bar chart.
feat_importances = pd.Series(
    data=regr_ar.feature_importances_,
    index=X_Arno_test.columns,
)
feat_importances.nlargest(20).plot.barh()

df_Water_Spring_Lupa

In [None]:
# Features are every column except the target; the target is kept as a Series.
X_Lupa = df_Water_Spring_Lupa.drop(columns=['Flow_Rate_Lupa'])
y_Lupa = df_Water_Spring_Lupa['Flow_Rate_Lupa']

# Chronological split (no shuffling): the first 70% of the rows train the
# models and the remaining rows are held out for testing.
(
    X_Lupa_train,
    X_Lupa_test,
    y_Lupa_train,
    y_Lupa_test,
) = train_test_split(X_Lupa, y_Lupa, train_size=0.7, shuffle=False)

# Move Date into the index so it is not passed to the models as a feature.
X_Lupa_train = X_Lupa_train.set_index("Date")
X_Lupa_test = X_Lupa_test.set_index("Date")

XGBRegressor

In [None]:
# Hyper-parameters for the XGBoost regressor (more, shallower trees and a
# smaller learning rate than in the cells for the other datasets).
params = dict(
    n_estimators=200,
    max_depth=2,
    subsample=1,
    learning_rate=0.03,
    random_state=0,
)

# Fit on the training part of the Lupa data.
model_lu = XGBRegressor(**params)
model_lu.fit(X_Lupa_train, y_Lupa_train)

In [None]:
# Score the XGBoost model on the held-out Lupa test split.
y_pred_lu = model_lu.predict(X_Lupa_test)

score_mae = mean_absolute_error(y_Lupa_test, y_pred_lu)
score_rmse = math.sqrt(mean_squared_error(y_Lupa_test, y_pred_lu))

# The labels name the column y_Lupa was taken from (Flow_Rate_Lupa);
# Lake_Level is a Lake Bilancino target and does not apply to this dataset.
print('MAE  for Flow_Rate_Lupa using XGBRegressor is : {}'.format(score_mae))
print('RMSE for Flow_Rate_Lupa using XGBRegressor is : {}'.format(score_rmse))

RandomForestRegressor

In [None]:
# Shallow random forest (depth-2 trees) with a fixed seed, fitted on the
# training part of the Lupa data; the last line displays the estimator.
regr_lu = RandomForestRegressor(
    max_depth=2,
    random_state=0,
)
regr_lu.fit(X=X_Lupa_train, y=y_Lupa_train)
regr_lu

In [None]:
# Score the random forest on the held-out Lupa test split.
y_pred_reg = regr_lu.predict(X_Lupa_test)

score_mae = mean_absolute_error(y_true=y_Lupa_test, y_pred=y_pred_reg)
score_rmse = math.sqrt(
    mean_squared_error(y_true=y_Lupa_test, y_pred=y_pred_reg)
)

print('MAE  for Flow_Rate using RandomForestRegressor is : {}'.format(score_mae))
print('RMSE for Flow_Rate using RandomForestRegressor is : {}'.format(score_rmse))

Since there is only one predictor, it is necessarily the only important feature.