In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from urllib import request
from itertools import product
import pickle

import scipy.stats as ss
import missingno as msno

%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import HTML
import seaborn as sns
import cufflinks as cf
import plotly
import plotly.express as px
import plotly.graph_objects as go


from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
init_notebook_mode(connected=True)

# import plotly.offline
# cf.go_offline()
# cf.set_config_file(offline=False, world_readable=True)
pd.options.plotting.backend =  'matplotlib'#"plotly"
def plot(fig):
    """Wrap a figure's HTML export so it renders inline in the notebook.

    Parameters
    ----------
    fig : any object exposing a ``to_html()`` method.

    Returns
    -------
    The ``HTML`` display object built from ``fig.to_html()``.

    Note: this definition rebinds the name ``plot`` that the same cell
    imported from ``plotly.offline``.
    """
    markup = fig.to_html()
    return HTML(markup)


import IPython
def display(*dfs):
    """Show every positional argument, in order, with IPython's rich display."""
    remaining = list(dfs)
    while remaining:
        IPython.display.display(remaining.pop(0))

In [None]:
# Upgrade pip itself and then scikit-learn in the notebook environment.
# NOTE(review): no versions are pinned, so the installed releases can differ between runs.
!python -m pip install --upgrade pip
!pip install -U scikit-learn

# Datasets

In [None]:
def _read_aquifer(csv_name):
    """Read one aquifer CSV, indexed by its day-first ``date`` column in chronological order."""
    raw = pd.read_csv(f'/kaggle/input/acea-water-prediction/{csv_name}', parse_dates=['Date'], dayfirst=True)
    renamed = raw.rename(columns={'Date': 'date'})
    return renamed.set_index('date').sort_index()


auser = _read_aquifer('Aquifer_Auser.csv')
petrignano = _read_aquifer('Aquifer_Petrignano.csv')
doganella = _read_aquifer('Aquifer_Doganella.csv')
luco = _read_aquifer('Aquifer_Luco.csv')

In [None]:
def _load_pickle(path):
    """Unpickle the object stored at ``path``, closing the file afterwards.

    NOTE(review): unpickling can execute arbitrary code; only load files
    from a trusted dataset.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-computed window dictionaries, one per feature family.
_WINDOWS_DIR = '/kaggle/input/water-italy-aquifers-windows-for-ewm'
rainfall_windows = _load_pickle(f'{_WINDOWS_DIR}/rainfall_windows.pkl')
volume_windows = _load_pickle(f'{_WINDOWS_DIR}/volume_windows.pkl')
temperature_windows = _load_pickle(f'{_WINDOWS_DIR}/temperature_windows.pkl')
hydrometry_windows = _load_pickle(f'{_WINDOWS_DIR}/hydro_windows.pkl')
depth_windows = _load_pickle(f'{_WINDOWS_DIR}/depth_windows.pkl')


# Every 'Depth' column across the four aquifer frames.
depth = []
for df in [auser, doganella, luco, petrignano]:
    depth.extend([f for f in df.columns if 'Depth' in f])

# Sanity checks: the pickles must match the datasets loaded above.
assert len(rainfall_windows) == len(depth), 'rainfall_windows: one entry per depth column expected'
assert len(volume_windows) == len(depth), 'volume_windows: one entry per depth column expected'
assert len(temperature_windows) == len(depth), 'temperature_windows: one entry per depth column expected'
assert len(hydrometry_windows) == 7, 'hydrometry_windows: 7 entries expected'
assert len(depth_windows) == 4, 'depth_windows: 4 entries expected'

In [None]:
def get_column_category(x):
    """Return the feature category of a column name.

    The name is checked against a list of markers in a fixed order and the
    category of the first marker contained in ``x`` is returned. A name that
    contains none of the markers is returned unchanged.
    """
    ordered_rules = (
        ('Date', 'Date'),
        ('Rainfall', 'Rainfall'),
        ('Depth', 'Depth to Groundwater'),
        ('Temperature', 'Temperature'),
        ('Volume', 'Volume'),
        ('Hydrometry', 'Hydrometry'),
        ('Lake_Level', 'Lake Level'),
        ('Flow_Rate', 'Flow Rate'),
    )
    for marker, category in ordered_rules:
        if marker in x:
            return category
    return x

# One row per (aquifer, column). DataFrame.append was removed in pandas 2.0,
# so the per-aquifer frames are combined with a single pd.concat.
# Only the four aquifer datasets are loaded in this notebook; further water
# bodies can be added to this list once their frames exist.
temp_df = pd.concat([
    pd.DataFrame({'column_name': frame.columns, 'waterbody_type': label})
    for label, frame in [('auser', auser),
                         ('doganella', doganella),
                         ('luco', luco),
                         ('petrignano', petrignano)]
])

temp_df['column_category'] = temp_df.column_name.apply(get_column_category)

# Count columns per category and aquifer, then spread the categories over columns.
temp_df = temp_df.groupby('waterbody_type').column_category.value_counts().to_frame()
temp_df.columns = ['counts']
temp_df = temp_df.reset_index(drop=False)
temp_df = temp_df.pivot(index='waterbody_type', columns='column_category')['counts']
temp_df['n_features'] = temp_df.sum(axis=1)
# The two lists below are positional: they follow the pivot's row order
# (auser, doganella, luco, petrignano).
temp_df['n_predict'] = [3, 9, 1, 2]
temp_df['fed'] = [None, 'meteoric infiltration', 'meteoric infiltration', 'Chiascio river']

f, ax = plt.subplots(1, 1, figsize=(12, 5))
sns.heatmap(temp_df.drop('fed', axis=1), cmap='Blues', linewidth=1, ax=ax, vmin=0, vmax=10, annot=True)
ax.set_ylabel('')
ax.set_xlabel('')
ax.set_title('Features, Number of Columns and Target Variables', fontsize=16)
# Tick.label was removed from Matplotlib; tick_params sets the same label size and rotation.
ax.tick_params(axis='x', labelsize=14)
ax.tick_params(axis='y', labelsize=14, labelrotation=45)

In [None]:
# Type of each well and what feeds each aquifer - taken from the datasets info

aquifer_type = pd.DataFrame([{'aquifer': 'auser', 'name': 'SAL', 'type': 'unconfined'}, # 1
                {'aquifer': 'auser', 'name': 'PAG', 'type': 'unconfined'}, # 2
                {'aquifer': 'auser', 'name': 'COS', 'type': 'unconfined'}, # 3
                {'aquifer': 'auser', 'name': 'DIEC', 'type': 'unconfined'}, # 4
                {'aquifer': 'auser', 'name': 'LT2', 'type': 'confined'},   # 5
                
                {'aquifer': 'petrignano', 'name': 'P24', 'type': 'unconfined'}, 
                {'aquifer': 'petrignano', 'name': 'P25', 'type': 'unconfined'}, 
                
                {'aquifer': 'doganella', 'name': 'Pozzo_1', 'type': 'semi-confined'},
                {'aquifer': 'doganella', 'name': 'Pozzo_2', 'type': 'semi-confined'},
                {'aquifer': 'doganella', 'name': 'Pozzo_3', 'type': 'semi-confined'},
                {'aquifer': 'doganella', 'name': 'Pozzo_4', 'type': 'semi-confined'},
                {'aquifer': 'doganella', 'name': 'Pozzo_5', 'type': 'semi-confined'},
                {'aquifer': 'doganella', 'name': 'Pozzo_6', 'type': 'semi-confined'},
                {'aquifer': 'doganella', 'name': 'Pozzo_7', 'type': 'semi-confined'},
                {'aquifer': 'doganella', 'name': 'Pozzo_8', 'type': 'semi-confined'},
                {'aquifer': 'doganella', 'name': 'Pozzo_9', 'type': 'semi-confined'},
                
                {'aquifer': 'luco', 'name': 'Podere_Casetta', 'type': None},
                {'aquifer': 'luco', 'name': 'Pozzo_1', 'type': None},
                {'aquifer': 'luco', 'name': 'Pozzo_3', 'type': None},
                {'aquifer': 'luco', 'name': 'Pozzo_4', 'type': None}
               ]
                           )
# Attach the 'fed' column of temp_df by matching 'aquifer' against temp_df's index.
aquifer_type = aquifer_type.merge(temp_df.fed, left_on='aquifer',right_index=True, how='outer')
# For every row, store the column Index of the frame whose name equals the row's aquifer.
aquifer_type['features'] = aquifer_type.apply(lambda x: [df.columns for (df,name) in zip([auser, doganella, luco, petrignano],
                                                                                         ['auser', 'doganella', 'luco', 'petrignano'])
                                                         if x.aquifer == name][0],
                                              axis=1)

def color_none_red(val):
    """Return the CSS colour rule for a cell: red when ``val`` is ``None``, black otherwise."""
    if val is None:
        return 'color: red'
    return 'color: black'

# Display every column except the last one, colouring cells that are None in red.
aquifer_type.iloc[:, :-1].style.applymap(color_none_red)

In [None]:
# Keep the first row of each distinct ('type', 'fed') combination.
aquifer_type.drop_duplicates(['type', 'fed'])

# Location features map

In [None]:
from sklearn.preprocessing import LabelEncoder
from geopy.geocoders import Nominatim
import folium

# Station coordinates keyed by the station part of the column names
# (the text after 'Rainfall_' / 'Temperature_'); each entry holds 'lat' and 'lon'.
# NOTE(review): values look like decimal degrees - confirm the source of the coordinates.
locations = {}

locations['Settefrati'] = {'lat' : 41.669624, 'lon' : 13.850011 }
locations['Velletri'] = {'lat' : 41.6867015, 'lon' : 12.7770433 }
locations['Petrignano'] = {'lat' : 43.1029282, 'lon' : 12.5237369 }
locations['Piaggione'] = {'lat' : 43.936794, 'lon' : 10.5040929 }
locations['S_Fiora'] = {'lat' : 42.854, 'lon' : 11.556 }
locations['Abbadia_S_Salvatore'] = {'lat' : 42.8809724, 'lon' : 11.6724203 }
locations['Vetta_Amiata'] = {'lat' : 42.8908958, 'lon' : 11.6264863 }
locations['Castel_del_Piano'] = {'lat' : 42.8932352, 'lon' : 11.5383804 }
locations['Terni'] = {'lat' : 42.6537515, 'lon' : 12.43981163 }
locations['Bastia_Umbra'] = {'lat' : 43.0677554, 'lon' : 12.5495816  }
locations['S_Savino'] = {'lat' : 43.339, 'lon' : 11.742 }
locations['Monteroni_Arbia_Biena'] = {'lat' : 43.228279, 'lon' : 11.4021433 }
locations['Monticiano_la_Pineta'] = {'lat' : 43.1335066 , 'lon' : 11.2408464 }
locations['Montalcinello'] = {'lat' : 43.1978783, 'lon' : 11.0787906 }
locations['Sovicille'] = {'lat' : 43.2806018, 'lon' : 11.2281756 }
locations['Simignano'] = {'lat' : 43.2921965, 'lon' : 11.1680079 }
locations['Mensano'] = {'lat' : 43.3009594 , 'lon' : 11.0548528 }
locations['Siena_Poggio_al_Vento'] = {'lat' : 43.1399762, 'lon' : 11.3832092 }
locations['Scorgiano'] = {'lat' : 43.3521445 , 'lon' : 11.15867 }
locations['Ponte_Orgia'] = {'lat' : 43.2074581 , 'lon' : 11.2504416 }
locations['Pentolina'] = {'lat' : 43.1968029, 'lon' : 11.1754672 }
locations['Montevarchi'] = {'lat' : 43.5234999, 'lon' : 11.5675911 }
locations['Incisa'] = {'lat' : 43.6558723, 'lon' : 11.4526838 }
locations['Camaldoli'] = {'lat' : 43.7943293, 'lon' : 11.8199481 }
locations['Bibbiena'] = {'lat' : 43.6955475, 'lon' : 11.817341 }
locations['Stia'] = {'lat' : 43.801537, 'lon' : 11.7067347 }
locations['Laterina'] = {'lat' : 43.5081823, 'lon' : 11.7102588 }
locations['Monteporzio'] = {'lat' : 41.817251, 'lon' : 12.7050839 }
locations['Pontetetto'] = {'lat' : 43.8226294, 'lon' : 10.4940843 }
locations['Ponte_a_Moriano'] = {'lat' : 43.9083609 , 'lon' : 10.5342488 }
locations['Calavorno'] = {'lat' : 44.0217216, 'lon' : 10.5297323 }
locations['Borgo_a_Mozzano'] = {'lat' : 43.978948, 'lon' : 10.545703  }
locations['Gallicano'] = {'lat' : 44.0606512, 'lon' : 10.435668  }
locations['Tereglio_Coreglia_Antelminelli'] = {'lat' : 44.0550548 , 'lon' : 10.5623594 }
locations['Lucca_Orto_Botanico'] = {'lat' : 43.84149865, 'lon' : 10.51169066 }
locations['Orentano'] = {'lat' : 43.7796506, 'lon' : 10.6583892 }
locations['Fabbriche_di_Vallico'] = {'lat' : 43.997647, 'lon' : 10.4279  }
locations['Monte_Serra'] = {'lat' : 43.750833, 'lon' : 10.555278 }
locations['Mangona'] = {'lat' : 44.0496863, 'lon' : 11.1958797 }
locations['Le_Croci'] = {'lat' : 44.0360503, 'lon' : 11.2675661 }
locations['Cavallina'] = {'lat' : 43.9833515, 'lon' : 11.2323312 }
locations['S_Agata'] = {'lat' : 43.9438247, 'lon' : 11.3089835 }
locations['Firenze'] = {'lat' : 43.7698712, 'lon' : 11.2555757 }
locations['S_Piero'] = {'lat' : 43.9637372, 'lon' : 11.3182991 }
locations['Vernio'] = {'lat' : 44.0440508 , 'lon' : 11.1498804  }
locations['Consuma'] = {'lat' : 43.784, 'lon' : 11.585 }
locations['Croce_Arcana']  = {'lat' : 44.1323056, 'lon' : 10.7689152 }
locations['Laghetto_Verde']  = {'lat' :   42.883, 'lon' : 11.662  }

locations_df = pd.DataFrame(columns=['city', 'lat', 'lon'] )

def get_location_coordinates(df, column_type, cluster, target_df):
    """Add one coordinate row per ``column_type`` column of ``df`` to ``target_df``.

    Parameters
    ----------
    df : DataFrame whose column names are scanned.
    column_type : column prefix including the trailing underscore,
        e.g. ``'Rainfall_'``; the part after the prefix is the station name.
    cluster : label stored in the ``cluster`` column of the new rows.
    target_df : DataFrame the new rows are appended to (not modified in place).

    Returns
    -------
    A new DataFrame: ``target_df`` followed by the new rows, with a fresh
    0..n-1 index. ``target_df`` itself is returned when no column matches.

    Raises
    ------
    KeyError
        If a station name has no entry in the module-level ``locations`` dict.
    """
    records = []
    for column in df.columns[df.columns.str.startswith(column_type)]:
        # Strip only the leading prefix, so a station name that happens to
        # contain the prefix text again is kept intact.
        station = column[len(column_type):]
        coordinates = locations[station]
        records.append({
            'city': station,
            'cluster': cluster,
            'type': column_type[:-1],
            'lat': coordinates['lat'],
            'lon': coordinates['lon'],
        })

    if not records:
        return target_df

    # DataFrame.append was removed in pandas 2.0; build all rows first and
    # concatenate once instead of growing the frame row by row.
    return pd.concat([target_df, pd.DataFrame(records)], ignore_index=True)

# Collect temperature and rainfall station coordinates for each aquifer,
# temperature first, in the order auser, doganella, luco, petrignano.
for frame, cluster_label in [(auser, 'auser_df'),
                             (doganella, 'doganella_df'),
                             (luco, 'luco_df'),
                             (petrignano, 'petrignano_df')]:
    for prefix in ('Temperature_', 'Rainfall_'):
        locations_df = get_location_coordinates(frame, prefix, cluster_label, locations_df)

# locations_df = get_location_coordinates(lake_biliancino_df, 'Temperature_', 'lake_biliancino_df', locations_df)
# locations_df = get_location_coordinates(lake_biliancino_df, 'Rainfall_', 'lake_biliancino_df', locations_df)

# locations_df = get_location_coordinates(river_arno_df, 'Temperature_', 'river_arno_df', locations_df)
# locations_df = get_location_coordinates(river_arno_df, 'Rainfall_', 'river_arno_df', locations_df)

# locations_df = get_location_coordinates(water_spring_amiata_df, 'Temperature_', 'water_spring_amiata_df', locations_df)
# locations_df = get_location_coordinates(water_spring_amiata_df, 'Rainfall_', 'water_spring_amiata_df', locations_df)

# locations_df = get_location_coordinates(water_spring_lupa_df, 'Temperature_', 'water_spring_lupa_df', locations_df)
# locations_df = get_location_coordinates(water_spring_lupa_df, 'Rainfall_', 'water_spring_lupa_df', locations_df)

# locations_df = get_location_coordinates(water_spring_madonna_df, 'Temperature_', 'water_spring_madonna_df', locations_df)
# locations_df = get_location_coordinates(water_spring_madonna_df, 'Rainfall_', 'water_spring_madonna_df', locations_df)

# Order by city, remove duplicated rows and renumber the index
locations_df = locations_df.sort_values(by='city')
locations_df = locations_df.drop_duplicates().reset_index(drop=True)

# Label-encode the cluster feature for visualization purposes
le = LabelEncoder()
locations_df['cluster_enc'] = le.fit_transform(locations_df.cluster)

In [None]:
# Base map plus one marker per city.
m = folium.Map(location=[42.6, 12.4], tiles='cartodbpositron', zoom_start=7)

# Marker colour is picked by the encoded cluster; the icon by the measurement type.
colors = ['purple', 'lightred', 'green', 'lightblue', 'red', 'blue', 'darkblue', 'lightgreen',
          'orange', 'darkgreen', 'beige', 'pink', 'darkred', 'darkpurple', 'cadetblue']
icons = {'Temperature': 'certificate', 'Rainfall': 'cloud'}

geolocator = Nominatim(user_agent='myapplication')
for city, gr in locations_df.groupby('city'):
    first_row = gr.iloc[0]
    # Cities with more than one row get the list icon.
    icon = 'th-list' if gr.shape[0] > 1 else icons[first_row['type']]
    marker_icon = folium.Icon(color=colors[first_row.cluster_enc], icon=icon)
    folium.Marker([first_row.lat, first_row.lon], popup=city, icon=marker_icon).add_to(m)

m

# Get additional geo data
The JSON file was downloaded from the official website https://www.sir.toscana.it/consistenza-rete

In [None]:
import json

# Station metadata; the handle is closed as soon as the JSON is parsed.
with open('/kaggle/input/geo-data-water-italia/geo_data.json', 'rb') as geo_handle:
    geo_file = json.load(geo_handle)['features']

# Station name -> id, coordinates and elevation.
geo_dict = {}
for el in geo_file:
    # The elevation is the token following '[m]</b>' in the description.
    tokens = el['description'].split()
    try:
        sea_level = float(tokens[tokens.index('[m]</b>') + 1].replace('<br', ''))
    except (ValueError, IndexError):
        # Marker missing, nothing after it, or a non-numeric value.
        sea_level = None
    geo_dict[el['name']] = dict(ids=el['id'], lat=el['lat'], lon=el['lon'], latlon=(el['lat'], el['lon']), sea_level=sea_level)

geo_data = []
for df, name in zip([auser, doganella, luco, petrignano],
                    ['auser', 'doganella', 'luco', 'petrignano']):
    # Turn column names into station names: drop the feature prefix, then
    # use spaces instead of underscores.
    features = df.columns
    features = (features.str.replace('Rainfall_', '').str.replace('Depth_to_Groundwater_', '')
                .str.replace('Temperature_', '').str.replace('Volume_', '').str.replace('Hydrometry_', '')
                .str.replace('_', ' '))
    # Align the spelling with the names used as keys of geo_dict.
    # NOTE(review): a further replace of 'Rainfall_Monticiano_la_Pineta' was dropped:
    # it could never match once prefixes and underscores were removed. If that
    # station is spelled differently in the JSON, a rename on the cleaned name
    # ('Monticiano la Pineta') is needed here - confirm.
    features = (features.str.replace('Tereglio Coreglia Antelminelli', 'Tereglio')
                .str.replace('Lucca Orto Botanico', 'Lucca (Orto Botanico)')
                .str.replace('Monte S Quirico', 'Monte S.Quirico'))

    # Keep the originating column next to each matched station, so no
    # reverse lookup by substring is needed afterwards.
    for column, f in zip(df.columns, features):
        if f in geo_dict:
            geo_data.append(dict(aquifer=name, name=f, **geo_dict[f], feature_name=column))

geo_data = pd.DataFrame(geo_data)
# '(', ')' and '.' are regex metacharacters: replace them literally.
geo_data['name'] = (geo_data['name'].str.replace('(', '', regex=False)
                    .str.replace(')', '', regex=False)
                    .str.replace('.', ' ', regex=False))

# Feature family = first underscore-separated token of the column name.
geo_data['type'] = geo_data.feature_name.str.split('_').apply(lambda x: x[0])
geo_data

In [None]:
# def autolabel(xx, yy, names):
#     """Attach a text label above each bar in *rects*, displaying its height."""
#     for x, y, ann in zip(xx, yy, names):
#         ax.annotate('{}'.format(ann),
#                     xy=(x, y),
#                     xytext=(0, 3),  # 3 points vertical offset
#                     textcoords="offset points",
#                     ha='center', va='bottom')
        
        
# fig, ax = plt.subplots(figsize = (8,8))
# borders = (10.4, 10.8, 43.7, 44.2)
# ax.set_xlim(borders[0],borders[1])
# ax.set_ylim(borders[2],borders[3])
# for (i, gr), c, add in zip(geo_data.groupby('type'), ('black', 'r', 'b'), (0, 0.0025, 0.005)):
#     ax.scatter(gr.lon+add, gr.lat, zorder=1, alpha=0.5, c=c, s=30, label=i)
    
    
# autolabel(geo_data.lon, geo_data.lat, geo_data['name'])
# plt.imshow(plt.imread('/kaggle/input/geo-data-water-italia/map.png'), 
#            zorder=0, extent=borders, aspect='equal',
#            alpha=0.75)
# ax.legend(loc='upper left')

In [None]:
# Every dataset must be sampled at one constant step: the gap between a date
# and the previous date of the SAME frame may take only a single value.
for df in [auser, doganella, luco, petrignano]:
    step = df.index.to_series().diff()
    assert step.value_counts().shape[0] == 1, 'dates are not evenly spaced'

# Corr matrix

In [None]:
def plot_corr_all(features_pattern):
    """Draw one correlation heatmap per aquifer on a 2x2 grid.

    Parameters
    ----------
    features_pattern : str or list of str
        Substrings selecting the columns to correlate with the 'Depth'
        columns. If the first pattern is ``'all'`` every column is used.

    Only the petrignano panel gets a colour bar; all panels are annotated.
    """
    if isinstance(features_pattern, str):
        features_pattern = [features_pattern]

    fig, axes = plt.subplots(2, 2, figsize=(15*2, 12*2))
    fig.subplots_adjust(wspace=0.3, hspace=0.3)

    for ax, df, name in zip(axes.ravel(), [auser, doganella, luco, petrignano],
                            ['auser', 'doganella', 'luco', 'petrignano']):
        if features_pattern and features_pattern[0] == 'all':
            features = list(df.columns)
        else:
            selected = [f for f in df.columns if 'Depth' in f]
            for pattern in features_pattern:
                selected.extend(f for f in df.columns if pattern in f)
            # A column matched by several patterns must appear only once,
            # otherwise the correlation matrix contains duplicated rows/columns.
            features = list(dict.fromkeys(selected))

        sns.heatmap(df[features].corr(), annot=True, ax=ax, cmap='coolwarm',
                    cbar=(name == 'petrignano'), vmin=-1, vmax=1)
        ax.set_title(name)

plot_corr_all('all')

# Missings

In [None]:
# One missing-value matrix per aquifer, side by side.
fig, axes = plt.subplots(1, 4, figsize=(8*4, 5))

named_frames = zip(['auser', 'doganella', 'luco', 'petrignano'],
                   [auser, doganella, luco, petrignano])
for ax, (name, df) in zip(axes.ravel(), named_frames):
    ax.set_title(name)
    msno.matrix(df, figsize=(10,8), fontsize=10, ax=ax)

# Time

In [None]:
# First and last date covered by each dataset.
for name, df in [('auser', auser), ('doganella', doganella), ('luco', luco), ('petrignano', petrignano)]:
    first_day = df.index.min().date()
    last_day = df.index.max().date()
    print(f'{name}:\t{first_day} - {last_day}')

In [None]:
# features_rainfall = [f for f in auser.columns if 'Rainfall' in f]
# features_depth = [f for f in auser.columns if 'Depth' in f]
# features_volume = [f for f in auser.columns if 'Volume' in f]
# features_temp = [f for f in auser.columns if 'Temperature' in f]
# features_hydro = [f for f in auser.columns if 'Hydrometry' in f]

# start_date = auser[features_rainfall][auser[features_rainfall].isna().all(axis=1)]\
#                     .index[-1]
# auser = auser[auser.index > start_date]

In [None]:
def get_time_features(df):
    """Add calendar columns derived from the frame's datetime index, in place.

    Adds ``year`` plus the categorical columns ``month``, ``weekofyear``
    (ISO week), ``day``, ``dayofyear`` and ``quarter``. Returns ``None``.
    """
    stamps = df.index
    df['year'] = stamps.year
    categorical_parts = (
        ('month', stamps.month),
        ('weekofyear', stamps.isocalendar().week),
        ('day', stamps.day),
        ('dayofyear', stamps.dayofyear),
        ('quarter', stamps.quarter),
    )
    for column, values in categorical_parts:
        df[column] = values.astype('category')
    

In [None]:
# Add the calendar columns to each aquifer frame (the frames are modified in place).
for aquifer_frame in (auser, doganella, luco, petrignano):
    get_time_features(aquifer_frame)

# Targets

In [None]:
def plot_data_by_year(auser, f, tresholds):
    """Plot column ``f`` year by year on a two-column grid of panels.

    Parameters
    ----------
    auser : DataFrame with a ``year`` column and the column ``f``.
    f : name of the column to plot; also used as the figure title.
    tresholds : iterable of date suffixes; for every year ``y`` a dashed
        vertical line is drawn at ``pd.Timestamp(f'{y}-{th}')``.
    """
    col = 2
    row = auser.year.nunique() // 2 + 1
    fig, axes = plt.subplots(row, col, figsize=(10*col, 2*row))
    panels = axes.ravel()
    yearly = list(auser.groupby('year')[f])

    for (y, gr), ax in zip(yearly, panels):
        gr.plot(ax=ax, label=y)
        ax.legend()

    for th in tresholds:
        for (y, _), ax in zip(yearly, panels):
            ax.axvline(x=pd.Timestamp(f'{y}-{th}'), color='black', linestyle='--')

    fig.suptitle(f)
    fig.tight_layout()
    fig.subplots_adjust(top=0.95)

In [None]:
# auser dataset: every 'Depth' column from 2005 onwards, one panel per column
features_depth = [col for col in auser.columns if 'Depth' in col]
auser.loc[auser.index.year >= 2005, features_depth].plot(subplots=True, layout=(3,2), figsize=(10*2,3*3));

There are some outliers where the depth jumps to 0. Find these points and replace them with NaNs.

In [None]:
# Number of readings equal to 0 in each depth column.
(auser[features_depth] == 0).sum()

In [None]:
# Treat depth readings of exactly 0 as missing values.
auser[features_depth] = auser[features_depth].replace(0, np.nan)

In [None]:
# Same plot as before, now that the zero readings have been replaced.
features_depth = [col for col in auser.columns if 'Depth' in col]
auser.loc[auser.index.year >= 2005, features_depth].plot(subplots=True, layout=(3,2), figsize=(10*2,3*3));

In [None]:
# doganella dataset: every 'Depth' column from 2012 onwards, one panel per column
features_depth = [col for col in doganella.columns if 'Depth' in col]
doganella.loc[doganella.index.year >= 2012, features_depth].plot(subplots=True, layout=(3,3), figsize=(10*3,3*3));

In [None]:
# Zoom on the second depth column: rows from 2020 on, skipping the first 100 of them.
features_depth = [col for col in doganella.columns if 'Depth' in col]
doganella.loc[doganella.index.year >= 2020, features_depth[1]].iloc[100:].plot(subplots=True, layout=(3,3), figsize=(10*3,3*3), style='-o');

In [None]:
# Zoom on the second-to-last depth column: rows from 2020 on, skipping the first 100 of them.
features_depth = [col for col in doganella.columns if 'Depth' in col]
doganella.loc[doganella.index.year >= 2020, features_depth[-2]].iloc[100:].plot(subplots=True, layout=(3,3), figsize=(10*3,3*3), style='-o');

In [None]:
# Preview only: column names with 'Groundwater_Pozzo' replaced by 'Groundwater_Pozzo_luco' (luco itself is not changed here).
luco.rename(columns=lambda x: str(x).replace('Groundwater_Pozzo', 'Groundwater_Pozzo_luco')).columns

In [None]:
# luco dataset
# Tag luco's 'Pozzo' wells with '_luco'. Names that already carry the tag are
# left untouched, so running this cell again does not produce '_luco_luco'.
# NOTE(review): presumably the tag avoids a clash with doganella's 'Pozzo' columns - confirm.
luco.columns = [
    str(col) if 'Groundwater_Pozzo_luco' in str(col)
    else str(col).replace('Groundwater_Pozzo', 'Groundwater_Pozzo_luco')
    for col in luco.columns
]
features_depth = [f for f in luco.columns if 'Depth' in f]
luco[luco.index.year >= 2008][features_depth].plot(subplots=True, layout=(2, 2), figsize=(10*2,2*3));

In [None]:
# Number of readings equal to 0 in each luco depth column.
(luco[features_depth] == 0).sum()

In [None]:
# Treat depth readings of exactly 0 as missing, then plot the depth columns again.
luco[features_depth] = luco[features_depth].replace(0, np.nan)
features_depth = [col for col in luco.columns if 'Depth' in col]
luco.loc[luco.index.year >= 2008, features_depth].plot(subplots=True, layout=(2, 2), figsize=(10*2,2*3));

In [None]:
# petrignano dataset: every 'Depth' column, one panel per column
# NOTE(review): the `>= 1800` year filter presumably keeps every row - confirm whether a later start year was intended.
features_depth = [f for f in petrignano.columns if 'Depth' in f]
petrignano[petrignano.index.year >= 1800][features_depth].plot(subplots=True, layout=(1, 2), figsize=(10*2,1*3));

# Features preprocessing

**Rainfall**

Two rainfall features of the luco dataset (`Rainfall_Scorgiano` and `Rainfall_Pentolina`) contain wrong data in the initial dataset. Let's download fresh rainfall time series and replace these features with the new ones. More explanation is [here](https://www.kaggle.com/declot/water-italy-aquifers-windows-for-ewm#Common-rainfalls-analysis)

In [None]:
# for 0-24 hours range
features = ['Rainfall_Scorgiano', 'Rainfall_Pentolina']

downloaded_features0_24 = pd.DataFrame(columns=['date'])
for ids in geo_data[geo_data.feature_name.isin(features)].ids:
    # e.g. https://www.sir.toscana.it/archivio/download.php?IDST=pluvio0_24&IDS=TOS03002742
    url = f'https://www.sir.toscana.it/archivio/download.php?IDST=pluvio0_24&IDS={ids}'
    # Close the connection once the body is read, and do not wait forever.
    with request.urlopen(url, timeout=60) as response:
        context = response.read().decode('utf-8')

    # The table starts one character before the 'gg/mm/aaaa' header.
    header_pos = context.find("gg/mm/aaaa")
    if header_pos < 0:
        # Without this check find() == -1 would silently slice from the end of the text.
        raise ValueError(f'Station {ids}: "gg/mm/aaaa" header not found in the downloaded file')
    table_text = context[max(header_pos - 1, 0):].replace('@', '').replace(',','.')
    to_add = pd.DataFrame([x.split(';') for x in table_text.split('\r\n')])
    # Drop the first row and the last column of the parsed table.
    to_add = to_add.iloc[1:, :-1]
    to_add.columns = ['date', f'{geo_data.loc[geo_data.ids==ids, "feature_name"].iloc[0]}']

    downloaded_features0_24 = downloaded_features0_24.merge(to_add, on='date', how='outer')

downloaded_features0_24.date = pd.to_datetime(downloaded_features0_24.date, dayfirst=True)
downloaded_features0_24.set_index('date', inplace=True)
downloaded_features0_24.sort_index(inplace=True)
# Drop the last row after sorting.
# NOTE(review): presumably the undated row produced by the trailing line break - confirm.
downloaded_features0_24 = downloaded_features0_24.iloc[:-1]
downloaded_features0_24 = downloaded_features0_24.replace('', np.nan).astype('float')

# Replace the luco columns that contain outliers with the downloaded series
for f in features:
    luco = luco.merge(downloaded_features0_24[f], left_index=True, right_index=True, how='left', suffixes=['_drop', ''])
    luco.drop(f'{f}_drop', axis=1, inplace=True)

**Volume**

In [None]:
# Replace 0 with NaN
features_volume = [f for f in auser.columns if 'Volume' in f]
auser[features_volume] = auser[features_volume].replace(0, np.nan)

# doganella volumes are positive. Make them negative, like other volumes in datasets.
# -abs() gives the same result on the first run and keeps the values negative
# when the cell is executed again (a plain sign flip would turn them positive).
features_volume = [f for f in doganella.columns if 'Volume' in f]
doganella[features_volume] = -doganella[features_volume].abs()

**Temperature**

In [None]:
def _repeated_zero_mask(series):
    """Boolean array marking zeros that belong to a run of two or more zeros.

    Missing values are skipped when looking for neighbours, so two zeros
    separated only by NaNs count as consecutive. Every zero of a run is
    flagged, including the first one.
    """
    observed = series.dropna()
    is_zero = observed.eq(0)
    # Compare values directly instead of averaging, so that a pair such as
    # -1 / +1 (mean 0) is not mistaken for two zeros.
    in_run = is_zero & (is_zero.shift(1, fill_value=False) | is_zero.shift(-1, fill_value=False))
    mask = np.zeros(len(series), dtype=bool)
    mask[series.notna().to_numpy()] = in_run.to_numpy()
    return mask


# Replace 0 with NaN when two or more zeros follow each other
for df, name in zip([auser, doganella, luco, petrignano],
                    ['auser', 'doganella', 'luco', 'petrignano']):
    features_temp = [f for f in df.columns if 'Temperature' in f]
    for f in features_temp:
        # .loc writes into the frame itself; chained indexing (df[f][mask] = ...)
        # may write to a temporary copy instead.
        df.loc[_repeated_zero_mask(df[f]), f] = np.nan

**Hydrometry**

In [None]:
# Treat hydrometry readings of exactly 0 in petrignano as missing values.
# The columns are read from petrignano itself rather than from whatever frame
# a previous cell's loop variable happens to reference.
features_hydrometry = [f for f in petrignano.columns if 'Hydrometry' in f]
petrignano[features_hydrometry] = petrignano[features_hydrometry].replace(0, np.nan)

**Exp smoothing**

The full analysis for this topic is [here](https://www.kaggle.com/declot/water-italy-aquifers-windows-for-ewm#Common-rainfalls-analysis)

In [None]:
def get_ewm_dataset(df, name, delta_win=0):
    """Build a long table of exponentially smoothed features, one slice per target.

    For every 'Depth' column of ``df`` a frame is created with the column as
    ``target``, the constants ``dataset`` (= ``name``) and ``target_name``, and
    one smoothed column per feature listed for that target in the module-level
    window dictionaries (rainfall, volume, temperature, hydrometry, depth).

    Parameters
    ----------
    df : source DataFrame.
    name : dataset label stored in the ``dataset`` column.
    delta_win : offset added to every window before smoothing. A negative
        offset is limited per feature so that ``window + offset`` never drops
        below 0; the limit applied to one feature does not affect the others.

    Returns
    -------
    The per-target frames stacked vertically, or an empty DataFrame when
    ``df`` has no 'Depth' column.
    """
    window_tables = [rainfall_windows, volume_windows, temperature_windows,
                     hydrometry_windows, depth_windows]
    frames = []

    for f_main in [f for f in df.columns if 'Depth' in f]:
        temp = df[f_main].rename('target').to_frame()
        temp['dataset'] = name
        temp['target_name'] = f_main
        for win_for_features in window_tables:
            if f_main not in win_for_features:  # feature family (like hydrometry) not available for this target
                continue

            for f_ewm, window in win_for_features[f_main].items():
                if f_ewm not in df.columns:
                    continue
                # Clamp locally; overwriting delta_win would change the offset
                # of every feature processed afterwards.
                offset = max(delta_win, -window) if delta_win < 0 else delta_win
                # The value is the centre of mass (`com`) of the smoothing.
                temp[f_ewm] = df[f_ewm].ewm(com=window + offset, min_periods=65).mean()

        frames.append(temp)

    if not frames:
        return pd.DataFrame()
    # A single concat instead of growing the result inside the loop.
    return pd.concat(frames)


def plot_all_features_grouped_target(df_total):
    """Plot, for each ``target_name`` group, all its columns on a three-column grid.

    The number of rows is sized for the column count minus two (the
    ``dataset`` and ``target_name`` label columns), rounded up.
    """
    ncols = 3
    for _, gr in df_total.groupby('target_name'):
        nrows, leftover = divmod(gr.shape[1] - 2, ncols)
        if leftover != 0:
            nrows += 1
        fig, ax = plt.subplots(figsize=(ncols*7, nrows*1.5))
        gr.plot(subplots=True, layout=(nrows, ncols), ax=ax)

In [None]:
def _rename_window_keys(key_map):
    """Rename feature keys inside every per-target dict of ``depth_windows``; absent keys are skipped."""
    for per_target in depth_windows.values():
        for old_key, new_key in key_map.items():
            if old_key in per_target:
                per_target[new_key] = per_target.pop(old_key)


# auser: drop the 'Depth_to_' prefix from two wells, in the frame and in the window dicts.
temp = {'Depth_to_Groundwater_PAG': 'Groundwater_PAG',
        'Depth_to_Groundwater_DIEC': 'Groundwater_DIEC'}
auser.rename(columns=temp, inplace=True)
_rename_window_keys(temp)


# luco: same for three wells; the window dicts are looked up by the names without '_luco'.
temp = {'Depth_to_Groundwater_Pozzo_luco_1': 'Groundwater_Pozzo_luco_1',
        'Depth_to_Groundwater_Pozzo_luco_3': 'Groundwater_Pozzo_luco_3',
        'Depth_to_Groundwater_Pozzo_luco_4': 'Groundwater_Pozzo_luco_4'}
luco.rename(columns=temp, inplace=True)
_rename_window_keys({old.replace('_luco', ''): new for old, new in temp.items()})

In [None]:
# Smoothed feature table for auser (default delta_win), plotted per target.
auser_df = get_ewm_dataset(auser, 'auser')
plot_all_features_grouped_target(auser_df)

In [None]:
# Smoothed feature table for doganella (default delta_win), plotted per target.
doganella_df = get_ewm_dataset(doganella, 'doganella')
plot_all_features_grouped_target(doganella_df)

In [None]:
# Smoothed feature table for luco (default delta_win), plotted per target.
luco_df = get_ewm_dataset(luco, 'luco')
plot_all_features_grouped_target(luco_df)

In [None]:
# Smoothed feature table for petrignano (default delta_win), plotted per target.
petrignano_df = get_ewm_dataset(petrignano, 'petrignano')
plot_all_features_grouped_target(petrignano_df)

# Models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit

In [None]:
# Put every smoothed table in index order (sorted in place).
for smoothed in (auser_df, doganella_df, luco_df, petrignano_df):
    smoothed.sort_index(inplace=True)

In [None]:
# One horizontal segment per target, from its first to its last non-missing value.
fig, ax = plt.subplots(figsize=(15,5))
for df in [auser_df, doganella_df, luco_df, petrignano_df]:
    for i, gr in df.groupby('target_name'):
        temp = gr.target.dropna()
        span = [temp.index.min(), temp.index.max()]
        ax.plot(span, [i, i], marker='|')

In [None]:
# First non-missing date of the last target group versus the last non-missing
# date overall, followed by the span between the two.
first_valid_by_target = luco_df.groupby('target_name').apply(lambda x: x.target.dropna().index.min())
last_valid = luco_df[luco_df.target.notna()].index.max()
# .iloc picks by position; plain [-1] on a label-indexed Series is deprecated.
print(first_valid_by_target.iloc[-1], last_valid)
(last_valid - first_valid_by_target.iloc[-1])

In [None]:
# Scratch calculation: 1033 days expressed in years.
# NOTE(review): 1033 is presumably the day span shown by the previous cell - confirm.
1033/365

In [None]:
# Scratch calculation: the same span minus 181 days, in years.
# NOTE(review): 181 presumably stands for the test period held out later - confirm.
(1033-181)/365

In [None]:
def split_by_period(df, test_period = 181):
    """Split every target's rows into a train part and a trailing test part.

    Parameters
    ----------
    df : DataFrame with a datetime index and a ``target_name`` column.
    test_period : length of the test window in days, counted back from the
        last index value of each ``target_name`` group.

    Returns
    -------
    (train, test) : rows strictly before / at or after the group's test start,
        stacked group by group. Both are empty DataFrames when ``df`` has no groups.
    """
    train_parts, test_parts = [], []
    for _, gr in df.groupby('target_name'):
        test_start = gr.index.max() - pd.Timedelta(test_period, unit='d')
        train_parts.append(gr[gr.index < test_start])
        test_parts.append(gr[gr.index >= test_start])

    # Concatenate once per output instead of growing the frames inside the loop.
    train = pd.concat(train_parts) if train_parts else pd.DataFrame()
    test = pd.concat(test_parts) if test_parts else pd.DataFrame()
    return train, test
        
# Train/test split of every smoothed table with the default test period.
auser_train, auser_test = split_by_period(auser_df)
doganella_train, doganella_test = split_by_period(doganella_df)
luco_train, luco_test = split_by_period(luco_df)
petrignano_train, petrignano_test = split_by_period(petrignano_df)

In [None]:
# Install the client that provides the `neptune` module imported in the next cell.
# NOTE(review): the version is not pinned.
!pip install neptune-client

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

import lightgbm as lgb

from statsmodels.stats.weightstats import _tconfint_generic

import neptune
# from neptunecontrib.monitoring.lightgbm import neptune_monitor
import pickle
# Read the API token from the private dataset; the file is closed right after loading.
with open('/kaggle/input/tokens/neptune_token.pkl', 'rb') as token_file:
    neptune_token = pickle.load(token_file)
neptune.init(project_qualified_name='declot/Water-Italy-aquifer', # change this to your `workspace_name/project_name`
             api_token=neptune_token, # change this to your api token
            )
# neptune.init(project_qualified_name='declot/Water-Italy-aquifer', # change this to your `workspace_name/project_name`
#              api_token='ANONYMOUS'#neptune_token, # change this to your api token
#             )

In [None]:
def plot_eval_hist(ax, metric_by_iter_val, metric_by_iter_train, n_estim):
    """Draw the train and validation metric curves on ``ax``.

    Parameters
    ----------
    ax : axes-like object
        Must provide ``plot``, ``set_xlabel``, ``set_ylabel`` and ``legend``.
    metric_by_iter_val, metric_by_iter_train : sequence
        Metric values, one per estimator / iteration.
    n_estim : int
        Number of points on the x-axis (``range(n_estim)``).
    """
    iterations = range(n_estim)
    # Train is drawn first, validation second.
    curves = (('train', metric_by_iter_train), ('val', metric_by_iter_val))
    for curve_label, curve_values in curves:
        ax.plot(iterations, curve_values, label=curve_label)
    ax.set_xlabel('n_estimators')
    ax.set_ylabel('rmse')
    ax.legend()
                   
def plot_eval_history_rf(ax, model, n_estim, X_train, y_train ,X_val, y_val):
    """Plot the MAPE of individual forest trees on the train and validation data.

    Every tree in ``model.estimators_`` is scored on its own, so the curves
    show per-tree error, not the error of the growing ensemble.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    model : fitted ensemble exposing ``estimators_``
        Each element must implement ``predict``.
    n_estim : int
        Number of trees to evaluate; clamped to the number of fitted trees.
    X_train, y_train, X_val, y_val
        Features / targets of the two evaluation sets.
    """
    # Slicing clamps to the trees that actually exist instead of raising IndexError.
    trees = model.estimators_[:n_estim]
    metric_by_iter_val = [mean_absolute_percentage_error(y_val, tree.predict(X_val))
                          for tree in trees]
    metric_by_iter_train = [mean_absolute_percentage_error(y_train, tree.predict(X_train))
                            for tree in trees]
    plot_eval_hist(ax, metric_by_iter_val, metric_by_iter_train, len(trees))
    # The shared helper labels the y-axis 'rmse'; the values drawn here are MAPE.
    ax.set_ylabel('mape')
    
def plot_eval_history_lgb(ax, eval_hist):
    """Plot the recorded LightGBM RMSE history for the 'val' and 'train' sets.

    ``eval_hist`` is the dict filled by ``lgb.record_evaluation``; it must
    contain ``eval_hist['val']['rmse']`` and ``eval_hist['train']['rmse']``.
    The number of plotted iterations is the length of the validation curve.
    """
    val_curve = eval_hist['val']['rmse']
    train_curve = eval_hist['train']['rmse']
    plot_eval_hist(ax, val_curve, train_curve, len(val_curve))
       

            
            
# def plot_eval_history_lgb(ax, model, n_estim, X_train, y_train ,X_val, y_val):
    
#     ax.annotate(f_name, xy=(0, 0.5), xytext=(-ax.yaxis.labelpad - 5, 0),
#         xycoords=ax.yaxis.label, textcoords='offset points',
#         size='large', ha='right', va='center')
#     ax.plot(range(n_estim, metric_by_iter_train, label='train')
#     ax.plot(range(n_estim), metric_by_iter_val, label='val')
#     ax.set_xlabel('n_estimators')
#     ax.set_ylabel('mape')
#     ax.legend()
            
def plot_res_cv_model(res_cv, cv, gr, axes, tags, f_name):
    """Refit the last CV estimator on the last CV split and draw four diagnostic panels.

    Panels: ``axes[0]`` evaluation history, ``axes[1]`` true vs. predicted
    series on the validation fold, ``axes[2]`` predicted-vs-true scatter,
    ``axes[3]`` residuals vs. true value.

    Parameters
    ----------
    res_cv : dict
        Output of ``cross_validate(..., return_estimator=True)``; the keys
        ``'estimator'`` and ``'test_score'`` are read. Scores are negated
        before display.
    cv : splitter
        Must provide ``split`` and ``n_splits``.
    gr : pandas.DataFrame
        Data of one target; must contain ``target``, ``dataset`` and
        ``target_name`` columns, all other columns are used as features.
    axes : sequence of four matplotlib axes
    tags : container of str
        ``'rf'`` selects the random-forest branch, ``'lgb'`` the LightGBM branch.
    f_name : str
        Label written to the left of the first panel.

    Raises
    ------
    ValueError
        If ``tags`` contains neither ``'rf'`` nor ``'lgb'``.
    """
    # Only the last split is visualised.
    train_idx, val_idx = list(cv.split(gr))[-1]
    features = gr.drop(['target', 'dataset', 'target_name'], axis=1)
    X_train, X_val = features.iloc[train_idx], features.iloc[val_idx]
    y_train, y_val = gr.target.iloc[train_idx], gr.target.iloc[val_idx]

    # test_score holds negated errors, so flip the sign for display.
    m = -np.mean(res_cv['test_score'])

    if 'rf' in tags:
        model = res_cv['estimator'][-1].fit(X_train, y_train)
        # Use the fitted forest's own size rather than a global parameter dict.
        plot_eval_history_rf(axes[0], model, len(model.estimators_),
                             X_train, y_train ,X_val, y_val)
    elif 'lgb' in tags:
        eval_hist = {}
        model = res_cv['estimator'][-1].fit(X_train, y_train, 
                                            eval_set=[(X_val, y_val), (X_train, y_train)],
                                            eval_names=('val','train' ), verbose=False,
                                            callbacks=[lgb.record_evaluation(eval_hist)])
        plot_eval_history_lgb(axes[0], eval_hist)
        n_iter = len(eval_hist['val']['rmse'])
        # Horizontal line at the mean CV score with a 95% t-interval band.
        axes[0].plot([0, n_iter], [m]*2, color='black', linestyle='--')
        ci = _tconfint_generic(m, np.std(res_cv['test_score'])/np.sqrt(cv.n_splits), cv.n_splits-1, 0.05, '2s')
        axes[0].fill_between([0, n_iter], [ci[0]]*2, [ci[1]]*2, 
                             color='black', alpha=0.25)
    else:
        raise ValueError(f"tags must contain 'rf' or 'lgb', got {tags!r}")

    y_pred = model.predict(X_val)
    resid = y_val - y_pred

    # Row label (target name) and mean CV score, both placed left of the first panel.
    axes[0].annotate(f_name, xy=(0, 0.5), xytext=(-axes[0].yaxis.labelpad-5, 0),
            xycoords=axes[0].yaxis.label, textcoords='offset points',
            size='large', ha='right', va='center')
    
    axes[0].annotate(f'{round(m, 3)}', 
                     xy=(0, 0.), xytext=(-axes[0].yaxis.labelpad-5, 0),
            xycoords=axes[0].yaxis.label, textcoords='offset points',
            size='large', ha='right', va='center')
    axes[1].plot(y_val, label='val true')
    axes[1].plot(pd.Series(y_pred, index=y_val.index), label='val pred')
    axes[1].legend()

    axes[2].scatter(y_val, y_pred)
    axes[2].plot(y_val, y_val, linestyle='--', c='black')  # identity line
    axes[2].set_xlabel('y_true')
    axes[2].set_title('y_pred')

    axes[3].scatter(y_val, resid)
    axes[3].plot(y_val, [0]*y_val.shape[0], linestyle='--', c='black')  # zero-residual line
    axes[3].set_xlabel('y_true')
    axes[3].set_title('residuals')
            

#     return to_return, list(df.groupby('target_name').indices.keys())



#             for i, (train_idx, val_idx) in enumerate(cv.split(gr), 1):
#                 temp = gr.drop(['target', 'dataset', 'target_name'], axis=1)
#                 X_train, y_train = temp.iloc[train_idx], gr.target.iloc[train_idx]
#                 X_val, y_val = temp.iloc[val_idx], gr.target.iloc[val_idx]
                
#                 train, val = lgb.Dataset(X_train, y_train), lgb.Dataset(X_val, y_val)
#                 res_cv = lgb.train(params, train, 100, verbose_eval=10, 
#                                    valid_sets=[train, val], valid_names=['train','valid'],
# #                                    callbacks=[neptune_monitor(prefix=f'cv{i}_')]
#                                     )


#             train_dataset = lgb.Dataset(gr.drop(['target', 'dataset', 'target_name'], axis=1).iloc[train_idx], 
#                                   gr.target.iloc[train_idx])
#             val_dataset = lgb.Dataset(gr.drop(['target', 'dataset', 'target_name'], axis=1).iloc[val_idx], 
#                                   gr.target.iloc[val_idx], free_raw_data=False)
#             model = lgb.train(params, train_dataset, params['num_iter'], (val_dataset), verbose_eval=0)
#             y_pred = model.predict(val_dataset.get_data())
#             y_true = val_dataset.get_label()
#             resid = y_true - y_pred
    
    

def model_lgb_cv(temp, params, n, cv, categorical='auto'):
    """Run ``lgb.cv`` on one target frame and return its evaluation history.

    The ``target``, ``dataset`` and ``target_name`` columns are dropped to
    form the feature matrix; ``temp.target`` is the label. ``n`` is passed
    to ``lgb.cv`` as its third positional argument and ``cv`` as ``folds``.
    ``categorical`` is accepted but not used by this function.
    """
    feature_frame = temp.drop(['target', 'dataset', 'target_name'], axis=1)
    dataset = lgb.Dataset(feature_frame, temp.target)
    cv_history = lgb.cv(params, dataset, n, folds=cv, shuffle=False,
                        eval_train_metric=True, verbose_eval=0)
    return cv_history


def model_lgb_sklearn_cv(temp, params, cv):
    """Cross-validate an ``LGBMRegressor`` built from ``params`` on one target frame.

    Features are every column except ``target``, ``dataset`` and
    ``target_name``. Returns the ``cross_validate`` result dict, including
    train scores and the fitted estimators; scoring is negative RMSE.
    """
    regressor = lgb.LGBMRegressor(**params)
    feature_frame = temp.drop(['target', 'dataset', 'target_name'], axis=1)
    cv_result = cross_validate(regressor, feature_frame, temp.target,
                               scoring='neg_root_mean_squared_error', cv=cv,
                               return_train_score=True, return_estimator=True, n_jobs=-1)
    return cv_result

def plot_iter_score_cv(ax, res_cv, dataset='train', metric='rmse', n_splits=None):
    """Plot a per-iteration CV metric curve with a 95% confidence band.

    Parameters
    ----------
    ax : matplotlib axes
    res_cv : mapping
        Must contain ``'<dataset> <metric>-mean'`` and
        ``'<dataset> <metric>-stdv'`` sequences of equal length.
    dataset : str, default 'train'
        Prefix of the keys to read; ``'valid'`` additionally draws a dashed
        line and a text label at the final value.
    metric : str, default 'rmse'
    n_splits : int, optional
        Number of CV folds behind the mean / stdv values. When omitted the
        function falls back to a module-level ``cv`` splitter, which is what
        it read before this parameter existed.
    """
    if n_splits is None:
        # Backward-compatible fallback on the global splitter.
        n_splits = cv.get_n_splits()

    template = f'{dataset} {metric}'
    mean_curve = np.asarray(res_cv[f'{template}-mean'])
    std_curve = np.asarray(res_cv[f'{template}-stdv'])
    x = range(len(mean_curve))

    ax.plot(x, mean_curve, label=dataset)
    # Two-sided 95% t-interval around the fold mean at every iteration.
    ci = _tconfint_generic(mean_curve, std_curve/np.sqrt(n_splits),
                           dof=n_splits-1, alpha=0.05, alternative='2s')
    ax.fill_between(x, ci[0], ci[1], alpha=0.25)
    ax.set_title(metric)
    ax.legend()

    if dataset == 'valid':
        y = float(mean_curve[-1])
        ax.plot(x, [y]*len(x), '--')
        ax.text(x[-1]-10, y+0.1*y, f'{round(y, 3)}')

In [None]:
# for df, df_name in zip([auser_train, doganella_train, luco_train, petrignano_train],
#                         ['auser', 'doganella', 'luco', 'petrignano']):

def train(df, dataset_name, params, params_exp,
          name_exp='lgb', tags=['lgb'], plot_res=True):
    """Cross-validate one model per target series; optionally plot and log to Neptune.

    For every ``target_name`` group the data is passed through the global
    ``preprocess`` function, a ``TimeSeriesSplit`` with 181-sample test folds
    (at most 10 splits) is built, and either a ``RandomForestRegressor``
    (``'rf'`` in ``tags``) or an ``LGBMRegressor`` (``'lgb'`` in ``tags``)
    is cross-validated with negative RMSE scoring.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``target``, ``dataset`` and ``target_name`` columns.
    dataset_name : str
        Used in metric / image names and appended to the Neptune tags.
    params : dict
        Keyword arguments for the estimator.
    params_exp : dict
        Extra experiment parameters, only logged to Neptune.
    name_exp : str or None, default 'lgb'
        Neptune experiment name; ``None`` disables all Neptune calls.
    tags : list of str, default ['lgb']
        Selects the model family; never mutated by this function.
    plot_res : bool, default True
        Draw one row of four diagnostic panels per target and print a summary.

    Returns
    -------
    list of three lists (one entry per target):
        mean CV RMSE, standard error of the CV scores, number of CV splits.

    Raises
    ------
    ValueError
        If ``tags`` contains neither ``'rf'`` nor ``'lgb'``.
    """
    log_to_neptune = name_exp is not None
    if log_to_neptune:
        # Build a fresh list so the shared default argument is left untouched.
        tags = list(tags) + [dataset_name]
        neptune.create_experiment(params=dict(**params, **params_exp),
                                  name=name_exp, tags=tags)
    try:
        ncols, nrows = 4, df.target_name.nunique()
        fig = None
        if plot_res:
            # squeeze=False keeps a 2-D (nrows, ncols) array even for a single
            # target, so the zip below always yields one row of four axes.
            fig, axes_all = plt.subplots(nrows, ncols, figsize=(ncols*4, nrows*3),
                                         squeeze=False)
        else:
            # Placeholders so the zip below still pairs every group with something.
            axes_all = np.zeros((nrows,))

        to_return = [[],[], []]
        for (f_name, gr), axes in zip(df.groupby('target_name', sort=False), axes_all):
            f_name = f_name.replace('Depth_to_Groundwater_', '')
            gr = preprocess(gr) # common preprocessing of the data
            n_splits = min(10, (gr.shape[0]-1) // 181)
            cv = TimeSeriesSplit(n_splits=n_splits, test_size=181)

            if 'rf' in tags:
                estimator = RandomForestRegressor(**params, n_jobs=-1)
            elif 'lgb' in tags:
                estimator = lgb.LGBMRegressor(**params)
            else:
                raise ValueError(f"tags must contain 'rf' or 'lgb', got {tags!r}")

            features = gr.drop(['target', 'dataset', 'target_name'], axis=1)
            res_cv = cross_validate(estimator, features, gr.target,
                                    scoring='neg_root_mean_squared_error', cv=cv,
                                    return_train_score=True, return_estimator=True, n_jobs=-1)

            # Scores are negated RMSE values; flip the sign for reporting.
            mean_rmse = -np.mean(res_cv['test_score'])
            to_return[0].append(mean_rmse)
            to_return[1].append(np.std(res_cv['test_score'])/np.sqrt(n_splits))
            to_return[2].append(n_splits)
            if log_to_neptune:
                neptune.log_metric(f'{dataset_name}_{f_name}_rmse', mean_rmse)

            if plot_res:
                print('---------------------------')
                print(f_name)
                print('cv nsplit', n_splits)
                print(mean_rmse)

                plot_res_cv_model(res_cv, cv, gr, axes, tags=tags, f_name=f_name)

        # A figure only exists when plotting was requested.
        if log_to_neptune and fig is not None:
            neptune.log_image(f'{dataset_name}', fig)
    finally:
        # Close the experiment even when cross-validation or plotting fails.
        if log_to_neptune:
            neptune.stop()

    return to_return

In [None]:
# neptune.stop()

In [None]:
# Experiment-level settings; `future_days` is read by the later `preprocess`
# definition as the shift applied to the target.
params_exp = dict(future_days=0)
name = 'auser dropall'
# Keyword arguments forwarded to lgb.LGBMRegressor by `train`.
params_lgb = dict(n_estimators=100,
                  objective='regression_l2',
                  lambda_l2=2,
                  metrics=['rmse', 'mape'],
                 #linear_tree,
                 learning_rate=0.1,
                 num_leaves=31,
                 extra_trees=False,
#                  early_stopping_rounds=5
                 )

# Keyword arguments forwarded to RandomForestRegressor by `train`.
params_rf = dict(n_estimators=100, max_depth=8, max_features=0.3,
                min_samples_leaf=5, ccp_alpha=0)

def preprocess(gr):
    """Prepare one target group for modelling.

    Asserts that resampling to daily frequency leaves the shape unchanged,
    drops every row containing a missing value, lets ``get_time_features``
    add its columns to the frame, and returns the frame without the
    ``year`` column.
    """
    daily_shape = gr.resample('1d').last().shape
    assert gr.shape == daily_shape
    complete_rows = gr.dropna()
    # get_time_features is expected to add columns in place (its return value is unused).
    get_time_features(complete_rows)
    return complete_rows.drop('year', axis=1)
    
    
# LightGBM baseline on the Doganella dataset, logged to Neptune as 'doganella_ini'.
train(doganella_df, 'doganella', params_lgb, params_exp,
      name_exp='doganella_ini', tags=['lgb'], plot_res=True)


In [None]:
# Shallower forest (max_depth=4) than the earlier `params_rf`; this rebinds the global name.
params_rf = dict(n_estimators=100, max_depth=4, max_features=0.3,
                min_samples_leaf=5, ccp_alpha=0)
  
    
# Random-forest baseline on the Doganella dataset, logged to Neptune.
train(doganella_df, 'doganella', params_rf, params_exp,
      name_exp='doganella ini rmse', tags=['rf'], plot_res=True)


In [None]:
from tqdm.notebook import tqdm
from matplotlib import cm

In [None]:
def model_metrics_by_lag_delta_windows(df, dataset_name, lags, delta_windows,
                                      params, params_exp):
    """Grid-evaluate random-forest CV scores over target lags and EWM window deltas.

    For every (lag, window delta) pair the dataset is rebuilt with
    ``get_ewm_dataset(df, dataset_name, delta_win=w)``, the target of each
    ``target_name`` group is shifted by ``-lag``, and ``train`` is run with
    the ``'rf'`` tag, without plotting or Neptune logging.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset; the number of targets is the number of columns whose
        name contains ``'Depth'``.
    dataset_name : str
        Passed to ``get_ewm_dataset`` / ``train`` and used in the output file names.
    lags, delta_windows : array-like of int
        Grid axes.
    params, params_exp : dict
        Forwarded to ``train``.

    Returns
    -------
    (results_mean, results_std, cvs) : tuple of numpy.ndarray
        Arrays of shape ``(n_targets, len(lags), len(delta_windows))`` with
        the first, second and third element of the ``train`` result. The
        same arrays are also pickled to ``results_mean_<dataset_name>.pkl``,
        ``results_std_<dataset_name>.pkl`` and ``cvs_<dataset_name>.pkl`` in
        the working directory.
    """
    lags = np.asarray(lags)
    # Use the argument, not a same-purpose global that may hold a different grid.
    delta_windows = np.asarray(delta_windows)

    n = len([f for f in df.columns if 'Depth' in f])
    grid_shape = (n, lags.shape[0], delta_windows.shape[0])
    results_mean = np.zeros(grid_shape)
    results_std = np.zeros(grid_shape)
    cvs = np.zeros(grid_shape)

    for (i,l),(j,w) in tqdm(product(enumerate(lags), enumerate(delta_windows)), 
                            total=lags.shape[0] * delta_windows.shape[0]):
        temp_df = get_ewm_dataset(df, dataset_name, delta_win=w)
        temp_df.target = temp_df.groupby('target_name', sort=False).target.shift(-l)
        res = train(temp_df, dataset_name, params, params_exp,
                    name_exp=None, tags=['rf'], plot_res=False)

        for f in range(n):
            results_mean[f,i,j], results_std[f,i,j] = res[0][f], res[1][f]
            cvs[f,i,j] = res[2][f]

    # Persist the grids; `with` closes every file handle.
    for prefix, values in (('results_mean', results_mean),
                           ('results_std', results_std),
                           ('cvs', cvs)):
        with open(f'{prefix}_{dataset_name}.pkl', 'wb') as out_file:
            pickle.dump(values, out_file)

    return results_mean, results_std, cvs

In [None]:
def plot3d_metrics_by_lag_delta_window(results_mean, lags, delta_wins, f_names):
    """Draw one 3-D surface per target over the (lag, window) grid.

    Parameters
    ----------
    results_mean : numpy.ndarray
        Shape ``(n_targets, len(lags), len(delta_wins))``.
    lags, delta_wins : 1-D array-like
        Grid axes (x = lag, y = window).
    f_names : sequence of str
        Target names used as subplot titles; at most 5 subplots per row.
    """
    n_targets = results_mean.shape[0]
    col = max(1, min(5, len(f_names)))
    # Ceiling division: no extra empty row when n_targets is a multiple of col.
    row = max(1, -(-n_targets // col))
    fig = plt.figure(figsize=(col*5, row*5))

    # X[i, j] = lags[i], Y[i, j] = delta_wins[j]; this matches results_mean[target].
    Y, X = np.meshgrid(delta_wins, lags)

    for i in range(n_targets):
        ax = fig.add_subplot(row, col,i+1, projection='3d')
        ax.set_title(f_names[i].replace('_to_Groundwater_','_'))
        # NOTE(review): the stored values are negated before plotting although
        # the z-axis is labelled 'rmse' — confirm the sign convention of the input.
        Z = -results_mean[i]
        surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
                           linewidth=0, antialiased=False)

        ax.set_xlabel('lag')
        ax.set_ylabel('window')
        ax.set_zlabel('rmse')        
        
        
def plot2d_metrics_by_lag_delta_window(results_mean, results_std, cvs,
                                       lags, delta_wins, f_names):
    """For every target, plot the metric against the window delta, one panel per lag.

    Parameters
    ----------
    results_mean, results_std, cvs : numpy.ndarray
        Shape ``(n_targets, len(lags), len(delta_wins))``: mean score,
        standard error of the score and number of CV splits.
    lags, delta_wins : numpy.ndarray
        Grid axes; ``delta_wins`` is the x-axis of every panel.
    f_names : sequence of str
        One figure is created per entry; the name is the figure title.
    """
    col=4
    # Ceiling division: no extra empty row when len(lags) is a multiple of col.
    row = max(1, -(-lags.shape[0] // col))
    for f in range(len(f_names)):
        # squeeze=False keeps `axes` a 2-D array whatever the grid size.
        fig, axes = plt.subplots(row, col, figsize=(col*5, row*4), sharey=True,
                                 squeeze=False)
        fig.suptitle(f_names[f])
        fig.subplots_adjust(top=0.95)
        for (i, l), ax in zip(enumerate(lags), axes.ravel()):
            ax.plot(delta_wins, results_mean[f][i])
            # 95% t-interval; degrees of freedom are n_splits - 1, the same
            # convention used by the other confidence intervals in this notebook.
            ci = _tconfint_generic(results_mean[f][i], results_std[f][i],
                                   cvs[f][i] - 1, 0.05, '2s')
            ax.fill_between(delta_wins,ci[0], ci[1], color='black', alpha=0.25 )
            ax.set_title(f'lag={l}')


In [None]:
# Grid used for the lag / window-delta study.
lags = np.array(list(range(0,8)) + list(range(10, 35, 5)))
delta_wins = np.arange(-25, 25, 5)
# results_mean_auser,results_std_auser, cvs_auser  = model_metrics_by_lag_delta_windows(auser, 'auser', 
#                                                         lags, delta_wins,
#                                                         params_rf, params_exp)

# Load the cached grids instead of recomputing them (the call above is
# commented out). `with` closes each file handle after reading.
with open('/kaggle/input/geo-data-water-italia/results_mean_auser.pkl', 'rb') as cache_file:
    results_mean_auser = pickle.load(cache_file)
with open('/kaggle/input/geo-data-water-italia/results_std_auser.pkl', 'rb') as cache_file:
    results_std_auser = pickle.load(cache_file)
with open('/kaggle/input/geo-data-water-italia/cvs_auser.pkl', 'rb') as cache_file:
    cvs_auser = pickle.load(cache_file)

In [None]:
# Target names of the Auser dataset, used as subplot titles for the 3-D surfaces.
f_names = auser_df.target_name.unique()
plot3d_metrics_by_lag_delta_window(results_mean_auser, lags, delta_wins, f_names)

In [None]:
# Per-lag 2-D view of the same Auser grid, with confidence bands.
plot2d_metrics_by_lag_delta_window(results_mean_auser, results_std_auser, cvs_auser,
                                   lags, delta_wins, f_names)

In [None]:
# Grid used for the lag / window-delta study.
lags = np.array(list(range(0, 8)) + list(range(10, 35, 5)))
delta_wins = np.arange(-25, 25, 5)
# results_mean_doganella,results_std_doganella, cvs_doganella = model_metrics_by_lag_delta_windows(doganella, 'doganella', 
#                                                         lags, delta_wins,
#                                                         params_rf, params_exp)

# Load the cached grids instead of recomputing them; `with` closes each file handle.
with open('/kaggle/input/geo-data-water-italia/results_mean_doganella.pkl', 'rb') as cache_file:
    results_mean_doganella = pickle.load(cache_file)
with open('/kaggle/input/geo-data-water-italia/results_std_doganella.pkl', 'rb') as cache_file:
    results_std_doganella = pickle.load(cache_file)
with open('/kaggle/input/geo-data-water-italia/cvs_doganella.pkl', 'rb') as cache_file:
    cvs_doganella = pickle.load(cache_file)

f_names = doganella_df.target_name.unique()
plot3d_metrics_by_lag_delta_window(results_mean_doganella, lags, delta_wins, f_names)
plot2d_metrics_by_lag_delta_window(results_mean_doganella, results_std_doganella, cvs_doganella,
                                   lags, delta_wins, f_names)

In [None]:
# Share of missing values per column of luco_df, in ascending order.
(luco_df.isna().sum()/luco_df.shape[0]).sort_values()

In [None]:
# Columns of luco_df with more than 80% missing values.
to_drop = luco_df.columns[(luco_df.isna().sum()/luco_df.shape[0]) > 0.8]
to_drop

In [None]:
# Grid used for the lag / window-delta study.
lags = np.array(list(range(0, 8)) + list(range(10, 35, 5)))
delta_wins = np.arange(-25, 25, 5)

# results_mean_luco,results_std_luco, cvs_luco = \
#         model_metrics_by_lag_delta_windows(luco.drop(columns=to_drop), 'luco', 
#                                            lags, delta_wins,
#                                            params_rf, params_exp)

# Load the cached grids instead of recomputing them; `with` closes each file handle.
with open('/kaggle/input/geo-data-water-italia/results_mean_luco.pkl', 'rb') as cache_file:
    results_mean_luco = pickle.load(cache_file)
with open('/kaggle/input/geo-data-water-italia/results_std_luco.pkl', 'rb') as cache_file:
    results_std_luco = pickle.load(cache_file)
with open('/kaggle/input/geo-data-water-italia/cvs_luco.pkl', 'rb') as cache_file:
    cvs_luco = pickle.load(cache_file)

f_names = luco_df.target_name.unique()
plot3d_metrics_by_lag_delta_window(results_mean_luco, lags, delta_wins, f_names)
plot2d_metrics_by_lag_delta_window(results_mean_luco, results_std_luco, cvs_luco,
                                   lags, delta_wins, f_names)


In [None]:
# Grid used for the lag / window-delta study.
lags = np.array(list(range(0, 8)) + list(range(10, 35, 5)))
delta_wins = np.arange(-25, 25, 5)
# results_mean_petrignano,results_std_petrignano, cvs_petrignano  = model_metrics_by_lag_delta_windows(petrignano, 'petrignano', 
#                                                         lags, delta_wins,
#                                                         params_rf, params_exp)

# Load the cached grids instead of recomputing them; `with` closes each file handle.
with open('/kaggle/input/geo-data-water-italia/results_mean_petrignano.pkl', 'rb') as cache_file:
    results_mean_petrignano = pickle.load(cache_file)
with open('/kaggle/input/geo-data-water-italia/results_std_petrignano.pkl', 'rb') as cache_file:
    results_std_petrignano = pickle.load(cache_file)
with open('/kaggle/input/geo-data-water-italia/cvs_petrignano.pkl', 'rb') as cache_file:
    cvs_petrignano = pickle.load(cache_file)

f_names = petrignano_df.target_name.unique()
plot3d_metrics_by_lag_delta_window(results_mean_petrignano, lags, delta_wins, f_names)
plot2d_metrics_by_lag_delta_window(results_mean_petrignano, results_std_petrignano, cvs_petrignano,
                                   lags, delta_wins, f_names)

In [None]:
def highlight_min(s):
    """Return one CSS string per element of ``s``, highlighting the minimum.

    Elements equal to ``s.min()`` get ``'background-color: yellow'``; all
    others (including NaN) get an empty string. Intended for
    ``DataFrame.style.apply``.
    """
    lowest = s.min()
    styles = []
    for value in s:
        if value == lowest:
            styles.append('background-color: yellow')
        else:
            styles.append('')
    return styles


# Positions in `lags` of the lag values 0, 7, 15, 20 and 30.
idx = [0,7,9,10,12] 

# One column per (dataset, target): the score averaged over all window deltas
# at the selected lags. Columns are collected first and concatenated once.
mean_columns, std_columns = [], []
for m, st, name in zip((results_mean_auser, results_mean_doganella, results_mean_luco, results_mean_petrignano),
                       (results_std_auser, results_std_doganella, results_std_luco, results_std_petrignano),
                       ('auser', 'doganella', 'luco', 'petrignano')):
    for target in range(m.shape[0]):
        mean_columns.append(pd.Series(m[target].mean(axis=1)[idx],
                                      index=[0,7,15,20,30], name=f'{name}_mean{target}'))
        std_columns.append(pd.Series(st[target].mean(axis=1)[idx],
                                     index=[0,7,15,20,30], name=f'{name}_std{target}'))

res_mean = pd.concat(mean_columns, axis=1)
res_std = pd.concat(std_columns, axis=1)

# Change of the mean score relative to lag 0 (the lag-0 row itself is dropped).
res_mean_diff = (res_mean - res_mean.iloc[0]).drop(index=0)


In [None]:
# Number of CV splits per (dataset, target) column, averaged over the grid.
# Defined here so this cell does not depend on a later cell having been run.
res_cvs = np.concatenate([f.mean(axis=(1,2)).round(0) for f in [cvs_auser, cvs_doganella, cvs_luco, cvs_petrignano]])
# Left: score change vs. lag 0 with the column minimum highlighted.
# Right: half-width of the 95% t-interval (standard error times the t quantile).
display(res_mean_diff.style.apply(highlight_min),
        res_std.drop(index=0) * ss.t.ppf(0.975, res_cvs-1))

In [None]:
# Number of CV splits per (dataset, target) column, averaged over the grid.
res_cvs = np.concatenate([f.mean(axis=(1,2)).round(0) for f in [cvs_auser, cvs_doganella, cvs_luco, cvs_petrignano]])
# Half-width of the 95% t-interval for every (lag, column) cell.
# NOTE(review): `t` must already be in scope (e.g. scipy.stats.t) — confirm the import.
ci_half_width = res_std.drop(index=0).values * t.ppf(0.975, res_cvs-1)
# Blank out differences whose magnitude reaches the interval half-width,
# then highlight the column minimum among the remaining ones.
temp = res_mean_diff.mask(res_mean_diff.abs() >= ci_half_width)
temp.style.apply(highlight_min)


In [None]:
# Number of CV splits per (dataset, target) column (recomputed; not used below).
res_cvs = np.concatenate([f.mean(axis=(1,2)).round(0) for f in [cvs_auser, cvs_doganella, cvs_luco, cvs_petrignano]])
# Blank out differences whose magnitude reaches one standard error,
# then highlight the column minimum among the remaining ones.
std_err = res_std.drop(index=0).values
temp = res_mean_diff.mask(res_mean_diff.abs() >= std_err)
temp.style.apply(highlight_min)


In [None]:

def preprocess(gr):
    """Prepare one target group for modelling, shifting the target by ``future_days``.

    Steps: assert that resampling to daily frequency leaves the shape
    unchanged, shift ``target`` by ``params_exp['future_days']`` rows, keep
    only rows whose shifted target is present, let ``get_time_features`` add
    its columns, and drop the ``year`` column.

    The input frame is not modified. This definition replaces the earlier
    ``preprocess`` of the notebook.

    Returns
    -------
    pandas.DataFrame
        The prepared frame (callers such as ``train`` use the return value).
    """
    assert gr.shape == gr.resample('1d').last().shape
    # Work on a copy so repeated calls do not shift the caller's target again.
    gr = gr.copy()
    # NOTE(review): a positive shift moves past values forward in time —
    # confirm this is the intended direction for `future_days`.
    gr['target'] = gr.target.shift(params_exp['future_days'])
    temp = gr[gr.target.notna()].copy()
    # get_time_features is expected to add columns in place (its return value is unused).
    get_time_features(temp)
    temp = temp.drop('year', axis=1)
    return temp
    
    
# #     temp = gr.dropna()
#     features_rainfall = [f for f in temp.columns if 'Rainfall' in f]
# #     display(temp[features_rainfall].isna().sum().sort_values())
#     temp['rainfall_mean'] = temp[features_rainfall].mean(axis=1)
# #     th = temp.shape[0] * 0.5
# #     for f in features_rainfall:
# #         if temp[f].isna().sum() >= th:
# #             temp.drop(f, axis=1, inplace=True)
    
# #     temp.drop([f for f in temp.columns if 'Groundwater' in f], axis=1, inplace=True)
    
# #     features_volume = [f for  f in temp.columns if 'Volume' in f]
# #     temp = temp[temp[features_volume].notna().any(axis=1)]
#     display(temp.isna().sum())


In [None]:
def plot_df_with_na(df):
    """Plot every float column of ``df`` and mark its missing values.

    Columns are laid out on a grid three panels wide. Each panel shows the
    column as a line plot titled with the column name, plus red dots at
    y = 0 on every index position where the column is NaN.
    """
    ncols = 3
    float_cols = df.select_dtypes('float').columns
    n = len(float_cols)
    # Number of rows needed to fit n panels, three per row.
    nrows, remainder = divmod(n, ncols)
    if remainder != 0:
        nrows += 1
    fig, axes = plt.subplots(nrows,ncols, figsize=(ncols*10, nrows*3))
    for f, ax in zip(float_cols, axes.ravel()):
        series = df[f]
        missing_idx = series.index[series.isna().values]
        series.plot(ax=ax, title=f)
        ax.scatter(missing_idx, [0]*len(missing_idx), c='r')
# luco_df[luco_df.target.notna()].plot(subplots=True, layout=(20,3), ax=ax);

In [None]:
# Inspect missing feature values on the rows of luco_df where the target is present.
plot_df_with_na(luco_df[luco_df.target.notna()])

In [None]:
# One panel per target (9 rows are hard-coded) for Doganella data from 2012 on.
fig, axes = plt.subplots(9,1, figsize=(20,9*4))

for (i, gr), ax in zip(doganella_df[doganella_df.index.year >= 2012].groupby('target_name'),
                      axes.ravel()):
    gr.target.plot(ax=ax)
    # Red dots mark dates with a missing target, drawn at the series' maximum value.
    ax.scatter(gr[gr.target.isna()].index, 
               gr.groupby('target_name').target.transform('max')[gr.target.isna()], c='r')
    

In [None]:
# Same layout as the previous cell, with a 5-row rolling mean of the target overlaid.
fig, axes = plt.subplots(9,1, figsize=(25,9*5))

for (i, gr), ax in zip(doganella_df[doganella_df.index.year >= 2012].groupby('target_name'),
                      axes.ravel()):
    temp = gr.target.rolling(5).mean()
    gr.target.plot(ax=ax, c='black', alpha=0.5)
    temp.plot(ax=ax)
    # Red dots mark dates where the rolling mean is NaN, drawn at the series' maximum value.
    ax.scatter(temp[temp.isna()].index, 
               gr.groupby('target_name').target.transform('max')[temp.isna()], c='r')
    

In [None]:
# The slice [2:6:3] keeps the 'Depth' columns at positions 2 and 5.
features_depth = [f for f in doganella.columns if 'Depth' in f][2:6:3]
# doganella_df
# Plot those two columns for the years 2019 onwards.
doganella[doganella.index.year >= 2019][features_depth].plot()

In [None]:
# Every column of `doganella` from 2019 onwards, one subplot per column on a 20x2 grid.
fig, ax = plt.subplots(figsize=(30, 40))
doganella[doganella.index.year >= 2019].plot(subplots=True, layout=(20,2), ax=ax);

In [None]:
# Every column of `doganella` over its full date range, one subplot per column on a 20x2 grid.
fig, ax = plt.subplots(figsize=(30, 40))
doganella.plot(subplots=True, layout=(20,2), ax=ax);

In [None]:
# Feature plots for the two selected depth targets (`features_depth`), restricted to 2019 onwards.
plot_all_features_grouped_target(doganella_df[(doganella_df.index.year >= 2019)&(doganella_df.target_name.isin(features_depth))])