# Visual data exploration of different water bodies

Let's explore a representative subset of the water bodies:

[1. Lake Bilancino](#1)

[2. River Arno](#2)

[3. Aquifer Luco](#3)

[4. Water Spring Amiata](#4)

Furthermore, we combine two data sources and try to gain insights from the combined data frame.

[5. Combine River Arno and Lake Bilancino data](#5)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import os
import time

# missing values visualization
import missingno as msno

# plots
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# list every file found below the Kaggle input directory
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

<a id="1"></a>
# 1. Lake Bilancino

Description: Bilancino lake is an artificial lake located in the municipality of Barberino di Mugello (about 50 km from Florence). It is used to refill the Arno river during the summer months. Indeed, during the winter months, the lake is filled up and then, during the summer months, the water of the lake is poured into the Arno river. It has a maximum depth of thirty-one metres and a surface area of 5 square kilometres.

Outputs: Lake_Level, Flow_Rate

In [None]:
# read the Lake Bilancino CSV and show its last rows
df_lake_b = pd.read_csv(
    filepath_or_buffer='../input/acea-water-prediction/Lake_Bilancino.csv'
)
df_lake_b.tail(n=5)

In [None]:
# keep the row count for later use and display (rows, columns)
n_lake_b = len(df_lake_b)
df_lake_b.shape

In [None]:
# parse the Date column, interpreting ambiguous dates as day-first
df_lake_b['Date'] = pd.to_datetime(df_lake_b['Date'], dayfirst=True)
# add calendar month and year as integer columns
df_lake_b['Month'] = df_lake_b['Date'].dt.month.astype('int')
df_lake_b['Year'] = df_lake_b['Date'].dt.year.astype('int')
# summary of all columns; `datetime_is_numeric` was removed in pandas 2.0
# (datetimes are always summarized numerically there), so fall back to the
# plain call when the keyword is rejected
try:
    summary_lake_b = df_lake_b.describe(include='all', datetime_is_numeric=True)
except TypeError:
    summary_lake_b = df_lake_b.describe(include='all')
summary_lake_b

In [None]:
# nullity matrix: shows which entries of each column are missing
msno.matrix(df=df_lake_b)
plt.show()

In [None]:
# columns analysed for Lake Bilancino: five rainfall stations, one
# temperature series and the two outputs (Lake_Level, Flow_Rate)
features_lake_b = [
    *('Rainfall_' + station for station in
      ('S_Piero', 'Mangona', 'S_Agata', 'Cavallina', 'Le_Croci')),
    'Temperature_Le_Croci',
    'Lake_Level',
    'Flow_Rate',
]

# > Distributions and correlation (Lake Bilancino)

In [None]:
# one histogram per feature; the title reports the percentage of missing values
for feature in features_lake_b:
    column = df_lake_b[feature]
    perc_missing = np.round(100 * column.isna().sum() / n_lake_b, 4)
    plt.figure(figsize=(10, 4))
    column.plot(kind='hist', bins=25)
    plt.title(feature + ' - Missing %: ' + str(perc_missing))
    plt.grid()
    plt.show()

In [None]:
# scatterplot matrix of all features, with the elapsed wall-clock time printed
t1 = time.time()
pair_data = df_lake_b[features_lake_b]
sns.pairplot(pair_data)
plt.show()
t2 = time.time()
elapsed = np.round(t2 - t1, 2)
print('Elapsed time:', elapsed, 'secs')

In [None]:
# correlations between the numeric columns only; since pandas 2.0
# DataFrame.corr no longer drops non-numeric columns (such as the datetime
# `Date`) on its own, so restrict the frame explicitly
numeric_lake_b = df_lake_b.select_dtypes(include='number')
corr_pearson = numeric_lake_b.corr(method='pearson')
corr_spearman = numeric_lake_b.corr(method='spearman')

# plot correlation matrices
fig = plt.figure(figsize=(18, 8))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

# Pearson on the left, Spearman on the right; the fixed vmin/vmax keep the
# color scales of both heatmaps identical
for corr_matrix, axis in ((corr_pearson, ax1), (corr_spearman, ax2)):
    sns.heatmap(data=corr_matrix, ax=axis, cbar=True, square=True,
                vmin=-1, vmax=1, cmap='RdYlGn',
                cbar_kws={'shrink': .3}, annot=True)
plt.show()

# > Development over time (Lake Bilancino)

In [None]:
# plot rainfall of all five stations over time
# (Cavallina was missing before: S_Piero had been plotted twice instead)
my_alpha = 0.5
fig, ax = plt.subplots(figsize=(18, 6))
for station in ('Cavallina', 'Le_Croci', 'Mangona', 'S_Agata', 'S_Piero'):
    ax.scatter(df_lake_b['Date'], df_lake_b['Rainfall_' + station],
               alpha=my_alpha, label=station)
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
plt.grid()
ax.legend(loc='upper left')
plt.show()

In [None]:
# time series of the Le Croci temperature
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(df_lake_b['Date'], df_lake_b['Temperature_Le_Croci'])
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.set_title('Development of Temperature [Le Croci]')
ax.grid()
plt.show()

In [None]:
# time series of the lake level
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(df_lake_b['Date'], df_lake_b['Lake_Level'])
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.set_title('Development of Lake Level')
ax.grid()
plt.show()

In [None]:
# row-to-row (additive) differences of the lake level
lake_level_change = df_lake_b['Lake_Level'].diff()
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(df_lake_b['Date'], lake_level_change, color='darkgreen')
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.set_title('Development of Lake Level - Day to Day differences')
ax.grid()
plt.show()

In [None]:
# time series of the flow rate
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(df_lake_b['Date'], df_lake_b['Flow_Rate'])
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.set_title('Development of Flow Rate')
ax.grid()
plt.show()

In [None]:
# row-to-row (additive) differences of the flow rate
flow_rate_change = df_lake_b['Flow_Rate'].diff()
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(df_lake_b['Date'], flow_rate_change, color='darkgreen')
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.set_title('Development of Flow Rate - Day to Day differences')
ax.grid()
plt.show()

In [None]:
# mean lake level and mean flow rate per calendar year
df_lake_by_year = (
    df_lake_b
    .groupby(['Year'])
    .agg(LakeLevelMean=('Lake_Level', 'mean'),
         FlowRateMean=('Flow_Rate', 'mean'))
    .reset_index()
)

# one small line chart per aggregated output
for mean_col, chart_title in (('LakeLevelMean', 'Lake Level by Year'),
                              ('FlowRateMean', 'Flow Rate by Year')):
    fig, ax = plt.subplots(figsize=(7, 5))
    ax.plot(df_lake_by_year['Year'], df_lake_by_year[mean_col])
    ax.grid()
    ax.set_title(chart_title)
    plt.show()

In [None]:
# mean lake level and mean flow rate per calendar month (all years pooled)
df_lake_by_month = (
    df_lake_b
    .groupby(['Month'])
    .agg(LakeLevelMean=('Lake_Level', 'mean'),
         FlowRateMean=('Flow_Rate', 'mean'))
    .reset_index()
)

# one small line chart per aggregated output
for mean_col, chart_title in (('LakeLevelMean', 'Lake Level by Month'),
                              ('FlowRateMean', 'Flow Rate by Month')):
    fig, ax = plt.subplots(figsize=(7, 5))
    ax.plot(df_lake_by_month['Month'], df_lake_by_month[mean_col])
    ax.grid()
    ax.set_title(chart_title)
    plt.show()

<a id="2"></a>
# 2. River Arno

Description: The Arno is the second largest river in peninsular Italy and the main waterway in Tuscany. It has a relatively torrential regime, due to the nature of the surrounding soils (marl and impermeable clays). The Arno is the main source of water supply of the metropolitan area of Florence-Prato-Pistoia. The availability of water for this waterbody is evaluated by checking the hydrometric level of the river at the section of Nave di Rosano.

Output: Hydrometry_Nave_di_Rosano

In [None]:
# read the River Arno CSV and show its last rows
df_river = pd.read_csv(
    filepath_or_buffer='../input/acea-water-prediction/River_Arno.csv'
)
df_river.tail(n=5)

In [None]:
# keep the row count for later use and display (rows, columns)
n_river = len(df_river)
df_river.shape

In [None]:
# parse the Date column, interpreting ambiguous dates as day-first
df_river['Date'] = pd.to_datetime(df_river['Date'], dayfirst=True)
# add calendar month and year as integer columns
df_river['Month'] = df_river['Date'].dt.month.astype('int')
df_river['Year'] = df_river['Date'].dt.year.astype('int')
# summary of all columns; `datetime_is_numeric` was removed in pandas 2.0
# (datetimes are always summarized numerically there), so fall back to the
# plain call when the keyword is rejected
try:
    summary_river = df_river.describe(include='all', datetime_is_numeric=True)
except TypeError:
    summary_river = df_river.describe(include='all')
summary_river

In [None]:
# nullity matrix: shows which entries of each column are missing
msno.matrix(df=df_river)
plt.show()

In [None]:
# columns analysed for the River Arno: fourteen rainfall stations, the
# Firenze temperature and the output (Hydrometry_Nave_di_Rosano)
features_river = [
    *('Rainfall_' + station for station in
      ('Le_Croci', 'Cavallina', 'S_Agata', 'Mangona', 'S_Piero', 'Vernio',
       'Stia', 'Consuma', 'Incisa', 'Montevarchi', 'S_Savino', 'Laterina',
       'Bibbiena', 'Camaldoli')),
    'Temperature_Firenze',
    'Hydrometry_Nave_di_Rosano',
]

# > Distributions and correlation (River Arno)

In [None]:
# one histogram per feature; the title reports the percentage of missing values
for feature in features_river:
    column = df_river[feature]
    perc_missing = np.round(100 * column.isna().sum() / n_river, 4)
    plt.figure(figsize=(10, 4))
    column.plot(kind='hist', bins=25)
    plt.title(feature + ' - Missing %: ' + str(perc_missing))
    plt.grid()
    plt.show()

In [None]:
# scatterplot matrix of all features (slow), with the elapsed time printed
t1 = time.time()
pair_data = df_river[features_river]
sns.pairplot(pair_data)
plt.show()
t2 = time.time()
elapsed = np.round(t2 - t1, 2)
print('Elapsed time:', elapsed, 'secs')

In [None]:
# correlations between the numeric columns only; since pandas 2.0
# DataFrame.corr no longer drops non-numeric columns (such as the datetime
# `Date`) on its own, so restrict the frame explicitly
numeric_river = df_river.select_dtypes(include='number')
corr_pearson = numeric_river.corr(method='pearson')
corr_spearman = numeric_river.corr(method='spearman')

In [None]:
# annotated heatmap of the Pearson correlation matrix, colors fixed to [-1, 1]
fig = plt.figure(figsize=(12, 9))
sns.heatmap(data=corr_pearson, vmin=-1, vmax=+1, cmap='RdYlGn', annot=True)
plt.title('Pearson correlation')
plt.show()

In [None]:
# annotated heatmap of the Spearman correlation matrix, colors fixed to [-1, 1]
# (the stray 'Pearson correlation' title that was immediately overwritten
# has been removed)
fig = plt.figure(figsize=(12, 9))
sns.heatmap(corr_spearman, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1)
plt.title('Spearman (rank) correlation')
plt.show()

# > Development over time (River Arno)

In [None]:
# rainfall of all fourteen stations over time, one scatter series per station
my_alpha = 0.5
rain_stations = ('Bibbiena', 'Camaldoli', 'Cavallina', 'Consuma', 'Incisa',
                 'Laterina', 'Le_Croci', 'Mangona', 'Montevarchi', 'S_Agata',
                 'S_Piero', 'S_Savino', 'Stia', 'Vernio')
fig, ax = plt.subplots(figsize=(18, 6))
for station in rain_stations:
    ax.scatter(df_river['Date'], df_river['Rainfall_' + station],
               alpha=my_alpha, label=station)
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.grid()
ax.legend(loc='upper left')
plt.show()

In [None]:
# time series of the Firenze temperature
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(df_river['Date'], df_river['Temperature_Firenze'])
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.set_title('Development of Temperature [Firenze]')
ax.grid()
plt.show()

In [None]:
# time series of the hydrometry at Nave di Rosano
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(df_river['Date'], df_river['Hydrometry_Nave_di_Rosano'])
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.set_title('Development of Hydrometry [Nave di Rosano]')
ax.grid()
plt.show()

In [None]:
# row-to-row (additive) differences of the hydrometry at Nave di Rosano
hydrometry_change = df_river['Hydrometry_Nave_di_Rosano'].diff()
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(df_river['Date'], hydrometry_change, color='darkgreen')
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.set_title('Development of Hydrometry [Nave di Rosano] - Day to Day difference')
ax.grid()
plt.show()

In [None]:
# mean hydrometry per calendar year
df_river_by_year = (
    df_river
    .groupby(['Year'])
    .agg(HydrometryMean=('Hydrometry_Nave_di_Rosano', 'mean'))
    .reset_index()
)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(df_river_by_year['Year'], df_river_by_year['HydrometryMean'])
ax.grid()
ax.set_title('Hydrometry [Nave di Rosano] by Year')
plt.show()

In [None]:
# mean hydrometry per calendar month (all years pooled)
df_river_by_month = (
    df_river
    .groupby(['Month'])
    .agg(HydrometryMean=('Hydrometry_Nave_di_Rosano', 'mean'))
    .reset_index()
)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(df_river_by_month['Month'], df_river_by_month['HydrometryMean'])
ax.grid()
ax.set_title('Hydrometry [Nave di Rosano] by Month')
plt.show()

<a id="3"></a>
# 3. Aquifer Luco

Description: The Luco wells field is fed by an underground aquifer. This aquifer is not fed by rivers or lakes but by meteoric infiltration at the extremes of the impermeable sedimentary layers. The aquifer is accessed through wells called Well 1, Well 3 and Well 4 (Pozzo_1, Pozzo_3 and Pozzo_4) and is influenced by the following parameters: rainfall, depth to groundwater, temperature and drainage volumes.

Output: Depth_to_Groundwater_Podere_Casetta

In [None]:
# read the Aquifer Luco CSV and show its last rows
df_aqui_luco = pd.read_csv(
    filepath_or_buffer='../input/acea-water-prediction/Aquifer_Luco.csv'
)
df_aqui_luco.tail(n=5)

In [None]:
# keep the row count for later use and display (rows, columns)
n_aqui_luco = len(df_aqui_luco)
df_aqui_luco.shape

In [None]:
# parse the Date column, interpreting ambiguous dates as day-first
df_aqui_luco['Date'] = pd.to_datetime(df_aqui_luco['Date'], dayfirst=True)
# add calendar month and year as integer columns
df_aqui_luco['Month'] = df_aqui_luco['Date'].dt.month.astype('int')
df_aqui_luco['Year'] = df_aqui_luco['Date'].dt.year.astype('int')
# summary of all columns; `datetime_is_numeric` was removed in pandas 2.0
# (datetimes are always summarized numerically there), so fall back to the
# plain call when the keyword is rejected
try:
    summary_aqui_luco = df_aqui_luco.describe(include='all', datetime_is_numeric=True)
except TypeError:
    summary_aqui_luco = df_aqui_luco.describe(include='all')
summary_aqui_luco

In [None]:
# nullity matrix: shows which entries of each column are missing
msno.matrix(df=df_aqui_luco)
plt.show()

In [None]:
# columns analysed for Aquifer Luco, grouped by measurement type:
# rainfall, depth to groundwater, temperature and drainage volume
features_aqui_luco = (
    ['Rainfall_' + place for place in
     ('Simignano', 'Siena_Poggio_al_Vento', 'Mensano', 'Montalcinello',
      'Monticiano_la_Pineta', 'Sovicille', 'Ponte_Orgia', 'Scorgiano',
      'Pentolina', 'Monteroni_Arbia_Biena')]
    + ['Depth_to_Groundwater_' + place for place in
       ('Podere_Casetta', 'Pozzo_1', 'Pozzo_3', 'Pozzo_4')]
    + ['Temperature_' + place for place in
       ('Siena_Poggio_al_Vento', 'Mensano', 'Pentolina',
        'Monteroni_Arbia_Biena')]
    + ['Volume_' + well for well in ('Pozzo_1', 'Pozzo_3', 'Pozzo_4')]
)

# > Distributions and correlation (Aquifer Luco)

In [None]:
# one histogram per feature; the title reports the percentage of missing values
for feature in features_aqui_luco:
    column = df_aqui_luco[feature]
    perc_missing = np.round(100 * column.isna().sum() / n_aqui_luco, 4)
    plt.figure(figsize=(10, 4))
    column.plot(kind='hist', bins=25)
    plt.title(feature + ' - Missing %: ' + str(perc_missing))
    plt.grid()
    plt.show()

In [None]:
# scatterplot matrix of all features (slow), with the elapsed time printed
t1 = time.time()
pair_data = df_aqui_luco[features_aqui_luco]
sns.pairplot(pair_data)
plt.show()
t2 = time.time()
elapsed = np.round(t2 - t1, 2)
print('Elapsed time:', elapsed, 'secs')

In [None]:
# correlations between the numeric columns only; since pandas 2.0
# DataFrame.corr no longer drops non-numeric columns (such as the datetime
# `Date`) on its own, so restrict the frame explicitly
numeric_aqui_luco = df_aqui_luco.select_dtypes(include='number')
corr_pearson = numeric_aqui_luco.corr(method='pearson')
corr_spearman = numeric_aqui_luco.corr(method='spearman')

In [None]:
# heatmap of the Pearson correlation matrix (no cell annotations), colors
# fixed to [-1, 1]; the title is now set once instead of twice
fig = plt.figure(figsize=(12, 9))
sns.heatmap(corr_pearson, annot=False, cmap='RdYlGn', vmin=-1, vmax=+1)
plt.title('Pearson correlation')
plt.show()

In [None]:
# heatmap of the Spearman correlation matrix (no cell annotations), colors fixed to [-1, 1]
fig = plt.figure(figsize=(12, 9))
sns.heatmap(data=corr_spearman, vmin=-1, vmax=+1, cmap='RdYlGn', annot=False)
plt.title('Spearman (rank) correlation')
plt.show()

# > Development over time (Aquifer Luco)

In [None]:
# rainfall of all ten locations over time, one scatter series per location
my_alpha = 0.5
rain_places = ('Mensano', 'Montalcinello', 'Monteroni_Arbia_Biena',
               'Monticiano_la_Pineta', 'Pentolina', 'Ponte_Orgia', 'Scorgiano',
               'Siena_Poggio_al_Vento', 'Simignano', 'Sovicille')
fig, ax = plt.subplots(figsize=(18, 6))
for place in rain_places:
    # legend labels are the column suffixes with spaces instead of underscores
    ax.scatter(df_aqui_luco['Date'], df_aqui_luco['Rainfall_' + place],
               alpha=my_alpha, label=place.replace('_', ' '))

ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.grid()
ax.legend(loc='upper left')
plt.show()

In [None]:
# temperatures of the four locations over time, one scatter series each
my_alpha = 0.5
temp_places = ('Mensano', 'Monteroni_Arbia_Biena', 'Pentolina',
               'Siena_Poggio_al_Vento')
fig, ax = plt.subplots(figsize=(18, 6))
for place in temp_places:
    # legend labels are the column suffixes with spaces instead of underscores
    ax.scatter(df_aqui_luco['Date'], df_aqui_luco['Temperature_' + place],
               alpha=my_alpha, label=place.replace('_', ' '))
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.set_title('Development of Temperatures')
ax.grid()
ax.legend(loc='upper left')
plt.show()

In [None]:
# time series of the target (depth to groundwater at Podere Casetta)
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(df_aqui_luco['Date'], df_aqui_luco['Depth_to_Groundwater_Podere_Casetta'])
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.set_title('Development of Depth to Groundwater Podere Casetta')
ax.grid()
plt.show()

We see quite a lot of missing values here!

In [None]:
# plot the row-to-row (additive) change of the target; the title now says
# so, instead of repeating the title of the plain level plot
fig, ax = plt.subplots(figsize=(18,6))
ax.plot(df_aqui_luco.Date, df_aqui_luco.Depth_to_Groundwater_Podere_Casetta.diff(), c='darkgreen')
ax.xaxis.set_major_locator(plt.MaxNLocator(20)) # reduce number of x-axis labels
plt.xticks(rotation=90)
plt.title('Development of Depth to Groundwater Podere Casetta - Day to Day difference')
plt.grid()
plt.show()

In [None]:
# mean depth to groundwater (Podere Casetta) per calendar year
df_aqui_loco_by_year = (
    df_aqui_luco
    .groupby(['Year'])
    .agg(DepthGWMean=('Depth_to_Groundwater_Podere_Casetta', 'mean'))
    .reset_index()
)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(df_aqui_loco_by_year['Year'], df_aqui_loco_by_year['DepthGWMean'])
ax.grid()
ax.set_title('Depth to Groundwater Podere Casetta by Year')
plt.show()

In [None]:
# mean depth to groundwater (Podere Casetta) per calendar month (all years pooled)
df_aqui_loco_by_month = (
    df_aqui_luco
    .groupby(['Month'])
    .agg(DepthGWMean=('Depth_to_Groundwater_Podere_Casetta', 'mean'))
    .reset_index()
)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(df_aqui_loco_by_month['Month'], df_aqui_loco_by_month['DepthGWMean'])
ax.grid()
ax.set_title('Depth to Groundwater Podere Casetta by Month')
plt.show()

<a id="4"></a>
# 4. Water Spring Amiata

Description: The Amiata waterbody is composed of a volcanic aquifer not fed by rivers or lakes but fed by meteoric infiltration. This aquifer is accessed through Ermicciolo, Arbure, Bugnano and Galleria Alta water springs. The levels and volumes of the four sources are influenced by the parameters: rainfall, depth to groundwater, hydrometry, temperatures and drainage volumes.

Outputs: Flow_Rate_Bugnano, Flow_Rate_Arbure, Flow_Rate_Ermicciolo, Flow_Rate_Galleria_Alta

In [None]:
# read the Water Spring Amiata CSV and show its last rows
df_spring_amiata = pd.read_csv(
    filepath_or_buffer='../input/acea-water-prediction/Water_Spring_Amiata.csv'
)
df_spring_amiata.tail(n=5)

In [None]:
# keep the row count for later use and display (rows, columns)
n_spring_amiata = len(df_spring_amiata)
df_spring_amiata.shape

In [None]:
# parse the Date column, interpreting ambiguous dates as day-first
df_spring_amiata['Date'] = pd.to_datetime(df_spring_amiata['Date'], dayfirst=True)
# add calendar month and year as integer columns
df_spring_amiata['Month'] = df_spring_amiata['Date'].dt.month.astype('int')
df_spring_amiata['Year'] = df_spring_amiata['Date'].dt.year.astype('int')
# summary of all columns; `datetime_is_numeric` was removed in pandas 2.0
# (datetimes are always summarized numerically there), so fall back to the
# plain call when the keyword is rejected
try:
    summary_spring_amiata = df_spring_amiata.describe(include='all', datetime_is_numeric=True)
except TypeError:
    summary_spring_amiata = df_spring_amiata.describe(include='all')
summary_spring_amiata

In [None]:
# nullity matrix: shows which entries of each column are missing
msno.matrix(df=df_spring_amiata)
plt.show()

In [None]:
# columns analysed for Water Spring Amiata, grouped by measurement type:
# rainfall, depth to groundwater, temperature and the four flow-rate outputs
features_spring_amiata = (
    ['Rainfall_' + place for place in
     ('Castel_del_Piano', 'Abbadia_S_Salvatore', 'S_Fiora', 'Laghetto_Verde',
      'Vetta_Amiata')]
    + ['Depth_to_Groundwater_' + place for place in
       ('S_Fiora_8', 'S_Fiora_11bis', 'David_Lazzaretti')]
    + ['Temperature_' + place for place in
       ('Abbadia_S_Salvatore', 'S_Fiora', 'Laghetto_Verde')]
    + ['Flow_Rate_' + spring for spring in
       ('Bugnano', 'Arbure', 'Ermicciolo', 'Galleria_Alta')]
)

# > Distributions and correlation (Water Spring Amiata)

In [None]:
# one histogram per feature; the title reports the percentage of missing values
for feature in features_spring_amiata:
    column = df_spring_amiata[feature]
    perc_missing = np.round(100 * column.isna().sum() / n_spring_amiata, 4)
    plt.figure(figsize=(10, 4))
    column.plot(kind='hist', bins=25)
    plt.title(feature + ' - Missing %: ' + str(perc_missing))
    plt.grid()
    plt.show()

In [None]:
# scatterplot matrix of all features (slow), with the elapsed time printed
t1 = time.time()
pair_data = df_spring_amiata[features_spring_amiata]
sns.pairplot(pair_data)
plt.show()
t2 = time.time()
elapsed = np.round(t2 - t1, 2)
print('Elapsed time:', elapsed, 'secs')

In [None]:
# correlations between the numeric columns only; since pandas 2.0
# DataFrame.corr no longer drops non-numeric columns (such as the datetime
# `Date`) on its own, so restrict the frame explicitly
numeric_spring_amiata = df_spring_amiata.select_dtypes(include='number')
corr_pearson = numeric_spring_amiata.corr(method='pearson')
corr_spearman = numeric_spring_amiata.corr(method='spearman')

In [None]:
# annotated heatmap of the Pearson correlation matrix, colors fixed to [-1, 1]
fig = plt.figure(figsize=(12, 9))
sns.heatmap(data=corr_pearson, vmin=-1, vmax=+1, cmap='RdYlGn', annot=True)
plt.title('Pearson correlation')
plt.show()

In [None]:
# annotated heatmap of the Spearman correlation matrix, colors fixed to [-1, 1]
fig = plt.figure(figsize=(12, 9))
sns.heatmap(data=corr_spearman, vmin=-1, vmax=+1, cmap='RdYlGn', annot=True)
plt.title('Spearman (rank) correlation')
plt.show()

# > Development over time (Water Spring Amiata)

In [None]:
# rainfall of the five locations over time, one scatter series per location
my_alpha = 0.5
rain_places = ('Abbadia_S_Salvatore', 'Castel_del_Piano', 'Laghetto_Verde',
               'S_Fiora', 'Vetta_Amiata')
fig, ax = plt.subplots(figsize=(18, 6))
for place in rain_places:
    # legend labels are the column suffixes with spaces instead of underscores
    ax.scatter(df_spring_amiata['Date'], df_spring_amiata['Rainfall_' + place],
               alpha=my_alpha, label=place.replace('_', ' '))
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.grid()
ax.legend(loc='upper left')
plt.show()

In [None]:
# temperatures of the three locations over time, one scatter series each
my_alpha = 0.5
temp_places = ('Abbadia_S_Salvatore', 'Laghetto_Verde', 'S_Fiora')
fig, ax = plt.subplots(figsize=(18, 6))
for place in temp_places:
    # legend labels are the column suffixes with spaces instead of underscores
    ax.scatter(df_spring_amiata['Date'], df_spring_amiata['Temperature_' + place],
               alpha=my_alpha, label=place.replace('_', ' '))
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.set_title('Development of Temperatures')
ax.grid()
ax.legend(loc='lower left')
plt.show()

In [None]:
# depth to groundwater at the three measuring points over time
my_alpha = 0.5
depth_points = ('David_Lazzaretti', 'S_Fiora_11bis', 'S_Fiora_8')
fig, ax = plt.subplots(figsize=(18, 6))
for point in depth_points:
    # legend labels are the column suffixes with spaces instead of underscores
    ax.scatter(df_spring_amiata['Date'],
               df_spring_amiata['Depth_to_Groundwater_' + point],
               alpha=my_alpha, label=point.replace('_', ' '))
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.set_title('Development of Depth to Groundwater')
ax.grid()
ax.legend(loc='center left')
plt.show()

In [None]:
# flow rates of the four springs (the outputs) over time
my_alpha = 0.5
springs = ('Arbure', 'Bugnano', 'Ermicciolo', 'Galleria_Alta')
fig, ax = plt.subplots(figsize=(18, 6))
for spring in springs:
    # legend labels are the column suffixes with spaces instead of underscores
    ax.scatter(df_spring_amiata['Date'], df_spring_amiata['Flow_Rate_' + spring],
               alpha=my_alpha, label=spring.replace('_', ' '))
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.set_title('Development of Flow Rates')
ax.grid()
ax.legend(loc='center left')
plt.show()

In [None]:
# mean flow rate of each spring per calendar year
springs = ('Arbure', 'Bugnano', 'Ermicciolo', 'Galleria_Alta')
# named aggregations: MeanFlowRate_<spring> = mean of Flow_Rate_<spring>
yearly_means = {'MeanFlowRate_' + spring: ('Flow_Rate_' + spring, 'mean')
                for spring in springs}
df_spring_amiata_by_year = (
    df_spring_amiata.groupby(['Year']).agg(**yearly_means).reset_index()
)

# all four springs in one line chart
fig, ax = plt.subplots(figsize=(8, 6))
for spring in springs:
    ax.plot(df_spring_amiata_by_year['Year'],
            df_spring_amiata_by_year['MeanFlowRate_' + spring],
            label=spring.replace('_', ' '))
ax.grid()
ax.set_title('Flow Rates by Year')
ax.legend(loc='center left')
plt.show()

In [None]:
# mean flow rate of each spring per calendar month (all years pooled)
springs = ('Arbure', 'Bugnano', 'Ermicciolo', 'Galleria_Alta')
# named aggregations: MeanFlowRate_<spring> = mean of Flow_Rate_<spring>
monthly_means = {'MeanFlowRate_' + spring: ('Flow_Rate_' + spring, 'mean')
                 for spring in springs}
df_spring_amiata_by_month = (
    df_spring_amiata.groupby(['Month']).agg(**monthly_means).reset_index()
)

# all four springs in one line chart
fig, ax = plt.subplots(figsize=(8, 6))
for spring in springs:
    ax.plot(df_spring_amiata_by_month['Month'],
            df_spring_amiata_by_month['MeanFlowRate_' + spring],
            label=spring.replace('_', ' '))
ax.grid()
ax.set_title('Flow Rates by Month')
ax.legend(loc='center left')
plt.show()

In [None]:
# scatterplot matrix of the four flow-rate outputs, with the elapsed time printed
flow_rate_cols = ['Flow_Rate_Bugnano', 'Flow_Rate_Arbure',
                  'Flow_Rate_Ermicciolo', 'Flow_Rate_Galleria_Alta']
t1 = time.time()
sns.pairplot(df_spring_amiata[flow_rate_cols])
plt.show()
t2 = time.time()
elapsed = np.round(t2 - t1, 2)
print('Elapsed time:', elapsed, 'secs')

In [None]:
# Pearson and Spearman correlations between the four flow-rate outputs
flow_rates = df_spring_amiata[['Flow_Rate_Bugnano', 'Flow_Rate_Arbure',
                               'Flow_Rate_Ermicciolo', 'Flow_Rate_Galleria_Alta']]
corr_out_pearson = flow_rates.corr(method='pearson')
corr_out_spearman = flow_rates.corr(method='spearman')

# two heatmaps next to each other
fig = plt.figure(figsize=(18, 8))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

# Pearson on the left, Spearman on the right; the fixed vmin/vmax keep the
# color scales of both heatmaps identical
for corr_matrix, axis in ((corr_out_pearson, ax1), (corr_out_spearman, ax2)):
    sns.heatmap(data=corr_matrix, ax=axis, cbar=True, square=True,
                vmin=-1, vmax=1, cmap='RdYlGn',
                cbar_kws={'shrink': .3}, annot=True)
plt.show()

<a id="5"></a>
# 5. Combine River Arno and Lake Bilancino data

In [None]:
# join river and lake data on Date (pandas' default inner join); columns
# present in both frames keep their name for the river side and get a '_y'
# suffix for the lake side
df_combo = df_river.merge(df_lake_b, on='Date', suffixes=(None, '_y'))
# drop the suffixed lake-side duplicates
duplicated_cols = [name + '_y' for name in
                   ('Rainfall_S_Piero', 'Rainfall_Mangona', 'Rainfall_S_Agata',
                    'Rainfall_Cavallina', 'Rainfall_Le_Croci',
                    'Month', 'Year')]
df_combo = df_combo.drop(columns=duplicated_cols)
# list the remaining columns
print(df_combo.columns.tolist())

In [None]:
# nullity matrix: shows which entries of each column are missing
msno.matrix(df=df_combo)
plt.show()

# > Look at targets

In [None]:
# the three outputs of the combined data: river hydrometry plus the two lake outputs
targets = ['Hydrometry_Nave_di_Rosano', 'Lake_Level', 'Flow_Rate']
# date columns followed by the targets
selection = ['Date', 'Month', 'Year', *targets]
# scatterplot matrix of the targets
target_data = df_combo[targets]
sns.pairplot(target_data)
plt.show()

In [None]:
# interactive plot using color and size for additional dimensions
df_combo_clean = df_combo[~df_combo.Flow_Rate.isna()]
fig = px.scatter(df_combo_clean, x='Lake_Level', y='Hydrometry_Nave_di_Rosano', 
                 size='Flow_Rate',
                 color='Month', opacity=0.5)
fig.update_layout(title='Hydrometry [Nave_di_Rosano] vs Lake Level')
fig.show()

In [None]:
# Pearson and Spearman correlations between the three targets
target_data = df_combo[targets]
corr_pearson = target_data.corr(method='pearson')
corr_spearman = target_data.corr(method='spearman')

# two heatmaps next to each other
fig = plt.figure(figsize=(16, 8))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

# Pearson on the left, Spearman on the right; the fixed vmin/vmax keep the
# color scales of both heatmaps identical
for corr_matrix, axis in ((corr_pearson, ax1), (corr_spearman, ax2)):
    sns.heatmap(data=corr_matrix, ax=axis, cbar=True, square=True,
                vmin=-1, vmax=1, cmap='RdYlGn',
                cbar_kws={'shrink': .3}, annot=True)
plt.show()

In [None]:
# plot development over time
fig, ax = plt.subplots(figsize=(18,6))
ax.scatter(df_combo.Date, df_combo.Hydrometry_Nave_di_Rosano, alpha=0.5, label='Hydrometry_Nave_di_Rosano')
ax.scatter(df_combo.Date, df_combo.Lake_Level, alpha=0.5, label='Lake_Level')
ax.scatter(df_combo.Date, df_combo.Flow_Rate, alpha=0.5, label='Flow_Rate')
ax.xaxis.set_major_locator(plt.MaxNLocator(20)) # reduce number of x-axis labels
plt.xticks(rotation=90)
plt.grid()
ax.legend(loc='center left')
plt.show()

#### The targets have very different value ranges. To get a nicer plot let's standardize the targets before plotting.

In [None]:
# add standardized versions of targets to data frame
for t in targets:
    x = df_combo[t].values
    new_col = 'STD_' + t
    df_combo[new_col] = (x - np.nanmean(x)) / np.nanstd(x) # ignore NaNs for mean / std calc!

In [None]:
# plot development over time for standardized targets
fig, ax = plt.subplots(figsize=(18,6))
ax.scatter(df_combo.Date, df_combo.STD_Hydrometry_Nave_di_Rosano, alpha=0.5, label='Hydrometry_Nave_di_Rosano [std]')
ax.scatter(df_combo.Date, df_combo.STD_Lake_Level, alpha=0.5, label='Lake_Level [std]')
ax.scatter(df_combo.Date, df_combo.STD_Flow_Rate, alpha=0.5, label='Flow_Rate [std]')
ax.xaxis.set_major_locator(plt.MaxNLocator(20)) # reduce number of x-axis labels
plt.xticks(rotation=90)
plt.grid()
ax.legend(loc='upper left')
plt.show()

#### Zoom in to the last 3 years only:

In [None]:
# keep the last 3*365 rows of the combined frame
# NOTE(review): this is "the last 3 years" only if there is one row per day
# in chronological order - confirm
df_combo_recent = df_combo.tail(365 * 3)

# the three standardized targets over time, restricted to that window
fig, ax = plt.subplots(figsize=(18, 6))
for target_name in ('Hydrometry_Nave_di_Rosano', 'Lake_Level', 'Flow_Rate'):
    ax.scatter(df_combo_recent['Date'], df_combo_recent['STD_' + target_name],
               alpha=0.5, label=target_name + ' [std]')
ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
plt.xticks(rotation=90)
ax.grid()
ax.legend(loc='upper left')
plt.show()

# > Look at features

In [None]:
# compare the two temperature series: Firenze on x, Le Croci on y
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(df_combo['Temperature_Firenze'], df_combo['Temperature_Le_Croci'],
           c='blue', alpha=0.1)
ax.set_title('Temperature Le Croci vs. Firenze')
ax.set_xlabel('Temperature Firenze')
ax.set_ylabel('Temperature Le Croci')
ax.grid()
plt.show()

In [None]:
# new column: Le Croci temperature minus Firenze temperature
df_combo['DiffTemp'] = (
    df_combo['Temperature_Le_Croci'] - df_combo['Temperature_Firenze']
)
# time series of that difference
fig, ax = plt.subplots(figsize=(18, 6))
ax.plot(df_combo['Date'], df_combo['DiffTemp'])
ax.set_title('Temperature Difference Le Croci vs. Firenze')
ax.set_xlabel('Date')
ax.set_ylabel('Temperature Difference')
ax.grid()
plt.show()

In [None]:
# lake level against the temperature difference; the correlation
# (Series.corr default, rounded to 4 decimals) is shown in the title
temp_diff = df_combo['DiffTemp']
lake_level = df_combo['Lake_Level']
cor_temp = round(temp_diff.corr(lake_level), 4)
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(temp_diff, lake_level, c='blue', alpha=0.15)
ax.set_title('Lake Level vs. Temperature Difference - Correlation:' + str(cor_temp))
ax.set_xlabel('Temperature Difference')
ax.set_ylabel('Lake Level')
ax.grid()
plt.show()

#### We can see a moderate negative correlation.

In [None]:
# write the combined frame (including its index) to the working directory
# so that it can be downloaded
df_combo.to_csv(path_or_buf='df_combo.csv')