In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np 
import pandas as pd 
from itertools import product
import pickle
import scipy.stats as ss

import missingno as msno

%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import HTML
import seaborn as sns
pd.options.plotting.backend =  'matplotlib'#"plotly"

import IPython
def display(*dfs):
    """Render every positional argument, in order, with IPython's rich display."""
    render = IPython.display.display
    for obj in dfs:
        render(obj)

# Functions

In [None]:
def get_time_features(df):
    """Add calendar columns derived from the frame's index, in place.

    Reads ``df.index`` (its ``.year``, ``.month``, ``.isocalendar()``, ``.day``,
    ``.dayofyear`` and ``.quarter`` accessors) and writes the columns ``year``,
    ``month``, ``weekofyear``, ``day``, ``dayofyear`` and ``quarter``.
    Returns ``None``.
    """
    stamps = df.index
    calendar_parts = {
        'year': stamps.year,
        'month': stamps.month,
        'weekofyear': stamps.isocalendar().week,
        'day': stamps.day,
        'dayofyear': stamps.dayofyear,
        'quarter': stamps.quarter,
    }
    for column, values in calendar_parts.items():
        df[column] = values

In [None]:
## visualization functions

def autolabel(xx, yy, names, ax):
    """Annotate each point ``(x, y)`` on ``ax`` with the matching entry of ``names``.

    The three sequences are walked in parallel; every label is centred
    horizontally and placed 3 points above its anchor.
    """
    label_style = dict(
        xytext=(0, 3),  # 3 points vertical offset
        textcoords="offset points",
        ha='center',
        va='bottom',
    )
    for point_x, point_y, label in zip(xx, yy, names):
        ax.annotate(f'{label}', xy=(point_x, point_y), **label_style)
        
        
def plot_window_corr(df, features_main, features_lag, trend='positive', ewm_max=7, min_periods=65):
    """Plot target/feature correlations over a range of exponential-smoothing windows.

    For every (target, feature) pair the feature column is smoothed with
    ``Series.ewm(i, min_periods=min_periods).mean()`` for each ``i`` in
    ``range(ewm_max)`` and its Pearson correlation with the unsmoothed target is
    computed on the rows where both values are present. Each pair gets its own
    subplot (targets in rows, features in columns): points with ``p <= 0.05`` are
    black, the rest blue, and the selected window is marked with a vertical
    line and annotated with its value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``features_main`` and ``features_lag``.
    features_main : list of str
        Target columns, one subplot row each.
    features_lag : list of str
        Columns to smooth, one subplot column each.
    trend : str, default 'positive'
        ``'positive'`` selects the window with the largest positive correlation,
        ``'negative'`` the window with the most negative one. With any other
        value only the scatter plots are drawn and no ``'lag'`` is recorded.
    ewm_max : int, default 7
        Windows ``0 .. ewm_max - 1`` are evaluated.
    min_periods : int, default 65
        Passed to ``ewm``; smoothed values before this many observations are
        NaN and therefore dropped from the correlation.

    Returns
    -------
    list of dict
        One record per pair with keys ``'f_main'``, ``'f_lag'`` and (for the two
        supported trends) ``'lag'``. ``'lag'`` is NaN when no window has a
        correlation of the requested sign.
    """
    res_lags = []
    rows, cols = len(features_main), len(features_lag)
    # squeeze=False always returns a (rows, cols) array, even for a single subplot.
    fig, axes = plt.subplots(rows, cols, figsize=(cols*4, rows*4), squeeze=False)

    for f1, ax_row in zip(features_main, axes):
        for f2, ax0 in zip(features_lag, ax_row):
            to_add = {'f_main': f1, 'f_lag': f2}
            pair = df[[f1, f2]]

            res = []
            for i in range(ewm_max):
                temp = pair.copy()
                temp[f2] = temp[f2].ewm(i, min_periods=min_periods).mean()
                temp = temp.dropna()
                corr, p_value = ss.pearsonr(temp[f1], temp[f2])
                res.append((corr, p_value))

            res = pd.DataFrame(res, columns=['corr', 'p'])
            res['significant'] = np.where(res.p <= 0.05, 'black', 'blue')
            res.reset_index().plot(kind='scatter', x='index', y='corr',
                                   color=res['significant'], ax=ax0)

            if trend in ('positive', 'negative'):
                if trend == 'positive':
                    # keep only positive correlations
                    candidates = res['corr'].where(res['corr'] > 0)
                    pick_position, pick_value = candidates.argmax, candidates.max
                else:
                    # keep only negative correlations
                    candidates = res['corr'].where(res['corr'] < 0)
                    pick_position, pick_value = candidates.argmin, candidates.min

                if candidates.notna().any():
                    best = int(pick_position())
                    ax0.plot([best]*2, [candidates.min(), candidates.max()])
                    autolabel([best], [pick_value()], [f'{best}'], ax=ax0)
                    to_add['lag'] = best
                else:
                    # No correlation of the requested sign: argmax/argmin would
                    # return -1 or raise here, so record the gap explicitly.
                    to_add['lag'] = np.nan

            res_lags.append(to_add)
            ax0.set_title(f1)

    # The top row is re-titled with the feature names so columns are identifiable.
    for ax, f2 in zip(axes.ravel()[:cols], features_lag):
        ax.set_title(f2, fontweight='bold')

    return res_lags


def format_res_windows(res_windows, features_depth, features_rainfall):
    """Pivot the window records into a target-by-feature table.

    ``res_windows`` is turned into a DataFrame and pivoted on ``f_main`` (rows)
    and ``f_lag`` (columns) with ``lag`` as the value, then reindexed to the
    order given by ``features_depth`` and ``features_rainfall``.

    Returns the table and a Styler of it with a row-wise ``"Blues_r"``
    background gradient.
    """
    records = pd.DataFrame(res_windows)
    lag_table = pd.pivot_table(records, values='lag', index='f_main', columns='f_lag')
    lag_table = lag_table.reindex(index=features_depth, columns=features_rainfall)
    styled = lag_table.style.background_gradient("Blues_r", axis=1)
    return lag_table, styled


def plot_corr_for_targets(features_depth, features_rainfall, res_windows, auser_df):
    """Draw one heatmap per target with its correlation to each smoothed rainfall column.

    For every (target, rainfall feature) pair the window is looked up in
    ``res_windows`` and the column ``f'{f_lag}_{window}'`` of ``auser_df`` is
    correlated with the target column.

    Parameters
    ----------
    features_depth : list of str
        Target columns of ``auser_df``; one heatmap each.
    features_rainfall : list of str
        Rainfall feature names (without the window suffix).
    res_windows : pandas.DataFrame
        Window table indexed by target with rainfall features as columns.
    auser_df : pandas.DataFrame
        Frame holding the targets and the ``f'{f_lag}_{window}'`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``f_main``, ``f_lag`` (suffixed name) and ``corr``.
    """
    records = []
    for f_main, f_lag in product(features_depth, features_rainfall):
        window = res_windows.loc[f_main, f_lag]
        name = f'{f_lag}_{window}'
        records.append(dict(f_main=f_main, f_lag=name,
                            corr=auser_df[[f_main]].corrwith(auser_df[name]).iloc[0]))

    rainfall_corr = pd.DataFrame(records)

    # squeeze=False keeps `axes` an array even when there is a single target.
    fig, axes = plt.subplots(1, len(features_depth), figsize=(25, 5), squeeze=False)
    fig.subplots_adjust(wspace=1)
    cmap = sns.diverging_palette(230, 20, as_cmap=True)

    for (f_main, gr), ax in zip(rainfall_corr.groupby('f_main'),
                                axes.ravel()):
        # Work on a copy: shortening the labels must not touch the returned frame.
        gr = gr.copy()
        gr['f_lag'] = gr['f_lag'].str.replace('Rainfall_', '').str.replace('_mean', '')
        ax.set_title(f_main)
        sns.heatmap(gr.drop('f_main', axis=1).set_index('f_lag'), vmin=0, vmax=1, annot=True,
                    cmap=cmap, ax=ax, cbar=False)
        ax.set_ylabel(None)

    return rainfall_corr


def plot_dependencies_by_month(features_depth, features_lags, auser_df, rainfall_window):
    """Plot monthly mean target against monthly mean smoothed rainfall, one subplot per pair.

    Parameters
    ----------
    features_depth : list of str
        Target columns; one subplot row each.
    features_lags : list of str
        Only its length is used, to size the subplot grid. The features actually
        plotted for a target are the keys of ``rainfall_window[target]``.
    auser_df : pandas.DataFrame
        Frame with a ``month`` column, the targets, and the
        ``f'{feature}_{window}'`` columns.
    rainfall_window : dict
        ``{target: {feature: window}}`` mapping.

    Each point is labelled with its month number.
    """
    rows, cols = len(features_depth), len(features_lags)
    # squeeze=False always yields a (rows, cols) array, including the 1x1 case.
    fig, axes = plt.subplots(rows, cols, figsize=(cols*4, rows*4), squeeze=False)

    for f_main, axes_depth in zip(features_depth, axes):
        windows = rainfall_window[f_main]
        for f_lag, ax in zip(windows.keys(), axes_depth):
            column = f'{f_lag}_{windows[f_lag]}'
            temp = auser_df.groupby('month')[[f_main, column]].mean().reset_index()
            temp.plot(x=column, y=f_main, ax=ax,
                      colormap='Paired', style='-o')

            # Horizontal label offset: 5% of the x-range, computed once per subplot.
            delta = (temp[column].max() - temp[column].min())*0.05
            for x_val, y_val, month in zip(temp[column], temp[f_main], temp['month']):
                ax.text(x_val+delta, y_val, month,
                        horizontalalignment='left',
                        size='medium', color='black',
                        weight='semibold')


# Datasets

In [None]:
# Every aquifer CSV is read the same way: parse the day-first 'Date' column,
# rename it to 'date', use it as the index and sort chronologically.
auser, petrignano, doganella, luco = (
    pd.read_csv(csv_path, parse_dates=['Date'], dayfirst=True)
      .rename(columns={'Date': 'date'})
      .set_index('date')
      .sort_index()
    for csv_path in (
        '/kaggle/input/acea-water-prediction/Aquifer_Auser.csv',
        '/kaggle/input/acea-water-prediction/Aquifer_Petrignano.csv',
        '/kaggle/input/acea-water-prediction/Aquifer_Doganella.csv',
        '/kaggle/input/acea-water-prediction/Aquifer_Luco.csv',
    )
)

# Location features map
Visualize geo data for all features

In [None]:
from sklearn.preprocessing import LabelEncoder
from geopy.geocoders import Nominatim
import folium

# Station name -> {'lat': ..., 'lon': ...}, built from (name, lat, lon) triples.
locations = {
    city: {'lat': lat, 'lon': lon}
    for city, lat, lon in (
        ('Settefrati', 41.669624, 13.850011),
        ('Velletri', 41.6867015, 12.7770433),
        ('Petrignano', 43.1029282, 12.5237369),
        ('Piaggione', 43.936794, 10.5040929),
        ('S_Fiora', 42.854, 11.556),
        ('Abbadia_S_Salvatore', 42.8809724, 11.6724203),
        ('Vetta_Amiata', 42.8908958, 11.6264863),
        ('Castel_del_Piano', 42.8932352, 11.5383804),
        ('Terni', 42.6537515, 12.43981163),
        ('Bastia_Umbra', 43.0677554, 12.5495816),
        ('S_Savino', 43.339, 11.742),
        ('Monteroni_Arbia_Biena', 43.228279, 11.4021433),
        ('Monticiano_la_Pineta', 43.1335066, 11.2408464),
        ('Montalcinello', 43.1978783, 11.0787906),
        ('Sovicille', 43.2806018, 11.2281756),
        ('Simignano', 43.2921965, 11.1680079),
        ('Mensano', 43.3009594, 11.0548528),
        ('Siena_Poggio_al_Vento', 43.1399762, 11.3832092),
        ('Scorgiano', 43.3521445, 11.15867),
        ('Ponte_Orgia', 43.2074581, 11.2504416),
        ('Pentolina', 43.1968029, 11.1754672),
        ('Montevarchi', 43.5234999, 11.5675911),
        ('Incisa', 43.6558723, 11.4526838),
        ('Camaldoli', 43.7943293, 11.8199481),
        ('Bibbiena', 43.6955475, 11.817341),
        ('Stia', 43.801537, 11.7067347),
        ('Laterina', 43.5081823, 11.7102588),
        ('Monteporzio', 41.817251, 12.7050839),
        ('Pontetetto', 43.8226294, 10.4940843),
        ('Ponte_a_Moriano', 43.9083609, 10.5342488),
        ('Calavorno', 44.0217216, 10.5297323),
        ('Borgo_a_Mozzano', 43.978948, 10.545703),
        ('Gallicano', 44.0606512, 10.435668),
        ('Tereglio_Coreglia_Antelminelli', 44.0550548, 10.5623594),
        ('Lucca_Orto_Botanico', 43.84149865, 10.51169066),
        ('Orentano', 43.7796506, 10.6583892),
        ('Fabbriche_di_Vallico', 43.997647, 10.4279),
        ('Monte_Serra', 43.750833, 10.555278),
        ('Mangona', 44.0496863, 11.1958797),
        ('Le_Croci', 44.0360503, 11.2675661),
        ('Cavallina', 43.9833515, 11.2323312),
        ('S_Agata', 43.9438247, 11.3089835),
        ('Firenze', 43.7698712, 11.2555757),
        ('S_Piero', 43.9637372, 11.3182991),
        ('Vernio', 44.0440508, 11.1498804),
        ('Consuma', 43.784, 11.585),
        ('Croce_Arcana', 44.1323056, 10.7689152),
        ('Laghetto_Verde', 42.883, 11.662),
    )
}
# Empty accumulator; get_location_coordinates() returns extended copies of it.
locations_df = pd.DataFrame(columns=['city', 'lat', 'lon'] )

def get_location_coordinates(df, column_type, cluster, target_df):
    """Append one row per ``column_type`` column of ``df`` to ``target_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset whose column names start with ``column_type`` followed by a
        station name that is a key of the module-level ``locations`` dict.
    column_type : str
        Column prefix including the trailing underscore, e.g. ``'Rainfall_'``.
        The prefix without its last character is stored as ``type``.
    cluster : str
        Label stored in the ``cluster`` column of every added row.
    target_df : pandas.DataFrame
        Frame to extend; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``target_df`` followed by the new rows (``city``, ``cluster``, ``type``,
        ``lat``, ``lon``), or ``target_df`` itself when no column matches.

    Raises
    ------
    KeyError
        If a station name is missing from ``locations``.
    """
    rows = []
    for column in df.columns[df.columns.str.startswith(column_type)]:
        # Strip the prefix only once, even if the text reappears later in the name.
        location = column[len(column_type):]
        coords = locations[location]
        rows.append({
            'city': location,
            'cluster': cluster,
            'type': column_type[:-1],
            'lat': coords['lat'],
            'lon': coords['lon'],
        })

    if not rows:
        return target_df

    # DataFrame.append was removed in pandas 2.0; build all rows, concatenate once.
    return pd.concat([target_df, pd.DataFrame(rows)], ignore_index=True)

# Collect station coordinates aquifer by aquifer: temperature sensors first,
# then rainfall sensors.
for cluster_name, aquifer_df in (('auser_df', auser),
                                 ('doganella_df', doganella),
                                 ('luco_df', luco),
                                 ('petrignano_df', petrignano)):
    for prefix in ('Temperature_', 'Rainfall_'):
        locations_df = get_location_coordinates(aquifer_df, prefix, cluster_name, locations_df)

# Not included (these frames are not loaded in the cells above):
# lake_biliancino_df, river_arno_df, water_spring_amiata_df,
# water_spring_lupa_df, water_spring_madonna_df.

# Drop duplicates
locations_df = (locations_df.sort_values(by='city')
                            .drop_duplicates()
                            .reset_index(drop=True))

# Label-encode the cluster feature for visualization purposes
le = LabelEncoder()
locations_df['cluster_enc'] = le.fit_transform(locations_df.cluster)

In [None]:
m = folium.Map(location=[42.6, 12.4], tiles='cartodbpositron', zoom_start=7)

# Marker colour is chosen by encoded cluster, marker icon by sensor type.
colors = ['purple', 'lightred', 'green', 'lightblue', 'red', 'blue', 'darkblue',
          'lightgreen', 'orange', 'darkgreen', 'beige', 'pink', 'darkred',
          'darkpurple', 'cadetblue']
icons = {'Temperature': 'certificate',
         'Rainfall': 'cloud'}

geolocator = Nominatim(user_agent='myapplication')
for city, gr in locations_df.groupby('city'):
    first = gr.iloc[0]
    # A city with several rows gets a generic list icon instead of a per-type one.
    icon = 'th-list' if gr.shape[0] > 1 else icons[first['type']]
    marker = folium.Marker([first.lat, first.lon],
                           popup=city,
                           icon=folium.Icon(color=colors[first.cluster_enc], icon=icon))
    marker.add_to(m)

m

# Get additional geo data
The JSON file was downloaded from the official website https://www.sir.toscana.it/consistenza-rete

In [None]:
import json

# Station metadata; the context manager closes the file once it is parsed.
with open('/kaggle/input/geo-data-water-italia/geo_data.json', 'rb') as geo_fh:
    geo_file = json.load(geo_fh)['features']

geo_dict = {}
for el in geo_file:
    sea_level = el['description'].split()
    try:
        # The value is the token that follows the '[m]</b>' label in the description.
        sea_level = float(sea_level[sea_level.index('[m]</b>')+1].replace('<br', ''))
    except (ValueError, IndexError):
        # Label missing, nothing after it, or not a number.
        sea_level = None
    geo_dict[el['name']] = dict(ids=el['id'], lat=el['lat'], lon=el['lon'],
                                latlon=(el['lat'], el['lon']), sea_level=sea_level)

geo_data = []
for df, name in zip([auser, doganella, luco, petrignano],
                    ['auser', 'doganella', 'luco', 'petrignano']):
    # Turn column names into station names: drop the measurement prefix and
    # replace underscores with spaces.
    features = df.columns
    features = features.str.replace('Rainfall_', '').str.replace('Depth_to_Groundwater_', '')\
    .str.replace('Temperature_', '').str.replace('Volume_', '').str.replace('Hydrometry_', '')\
    .str.replace('_', ' ')
    # NOTE(review): the last replacement can never match, because prefixes and
    # underscores were already removed above. It is kept unchanged; confirm
    # whether 'Monticiano la Pineta' was the intended pattern.
    features = features.str.replace('Tereglio Coreglia Antelminelli', 'Tereglio')\
                        .str.replace('Lucca Orto Botanico', 'Lucca (Orto Botanico)')\
                        .str.replace('Monte S Quirico', 'Monte S.Quirico')\
                        .str.replace('Rainfall_Monticiano_la_Pineta', 'Rainfall_Monticiano_La_Pineta')

    for f in features:
        # Columns without a matching station (e.g. non-station columns) are skipped.
        if f in geo_dict:
            geo_data.append(dict(aquifer=name, name=f, **geo_dict[f]))

geo_data = pd.DataFrame(geo_data)
# regex=False: '(', ')' and '.' must be treated as literal characters.
geo_data['name'] = geo_data['name'].str.replace('(', '', regex=False)\
                                   .str.replace(')', '', regex=False)\
                                   .str.replace('.', ' ', regex=False)

all_columns = (auser.columns.tolist() + doganella.columns.tolist()
               + luco.columns.tolist() + petrignano.columns.tolist())
for f in geo_data['name']:
    # The number of matching columns must equal the number of rows selected.
    geo_data.loc[geo_data['name']==f, 'feature_name'] = [x for x in all_columns
                                                         if f.replace(' ', '_') in x]

geo_data['type'] = geo_data.feature_name.str.split('_').str[0]
geo_data.head()

# Data preprocessing

In [None]:
# check data on missing days: within each dataset, consecutive dates must
# always be the same distance apart (each frame is compared with itself).
for df in [auser, doganella, luco, petrignano]:
    shift = df.index.to_series().diff()
    assert shift.value_counts().shape[0] == 1

In [None]:
# Add the calendar columns to every aquifer frame (in place).
for aquifer_df in (auser, doganella, luco, petrignano):
    get_time_features(aquifer_df)

In [None]:
# A depth of exactly 0 in the target columns is treated as missing (NaN),
# for Auser and then for Luco.
for aquifer_df in (auser, luco):
    features_depth = [col for col in aquifer_df.columns if 'Depth' in col]
    aquifer_df[features_depth] = aquifer_df[features_depth].replace(0, np.nan)

# Common rainfalls analysis

In [None]:
# One figure per aquifer: a tall subplot grid when there are several rainfall
# columns, a single small plot otherwise.
for df, name in zip((auser, doganella, luco, petrignano),
                    ('auser', 'doganella', 'luco', 'petrignano')):
    features_rainfall = [col for col in df.columns if 'Rainfall' in col]
    several = len(features_rainfall) > 1
    fig, ax = plt.subplots(figsize=(20, 30) if several else (10, 5))
    plot_kwargs = dict(subplots=True, layout=(20, 2)) if several else {}
    df[features_rainfall].plot(ax=ax, **plot_kwargs)

In [None]:
# Seasonal rainfall profile per aquifer: monthly means averaged per calendar month.
aquifers = {'auser': auser, 'doganella': doganella, 'luco': luco, 'petrignano': petrignano}
fig, axes = plt.subplots(1, 4, figsize=(8*4, 5))

for ax, (name, df) in zip(axes.ravel(), aquifers.items()):
    features_rainfall = [col for col in df.columns if 'Rainfall' in col]
    temp = df[features_rainfall].resample('MS').mean()
    temp['month'] = temp.index.month
    monthly_profile = temp.groupby('month').mean()
    monthly_profile.plot(legend=False, title=name, ax=ax)
    ax.set_ylabel('Rainfall')

All curves look similar and share a common pattern: the lowest rainfall occurs in the summer months (6, 7 and 8). However, in the auser dataset there are two lines that differ from the others. Let's find them and try to work out what is wrong.

In [None]:
# Largest monthly mean of every Luco rainfall column, sorted ascending.
# (Only `features_rainfall` is set here; `features_depth` is no longer
# overwritten with rainfall column names.)
features_rainfall = [f for f in luco.columns if 'Rainfall' in f]
temp = luco[features_rainfall].resample('MS').mean()
temp.max().sort_values()

Unusual behaviour for the Pentolina and Scorgiano rainfall series in Luco.

In [None]:
# Summary statistics of the Luco rainfall columns. The list is stored under
# `features_rainfall` only, so `features_depth` is not clobbered.
features_rainfall = [f for f in luco.columns if 'Rainfall' in f]
luco[features_rainfall].describe()

Download the rainfall data for each rainfall station from https://www.sir.toscana.it/consistenza-rete and compare it with our datasets.

In [None]:
from urllib import request

In [None]:
# for 0-24 hours range
downloaded_features0_24 = pd.DataFrame(columns=['date'])
for ids in geo_data[geo_data.type=='Rainfall'].ids:
    # e.g. https://www.sir.toscana.it/archivio/download.php?IDST=pluvio0_24&IDS=TOS03002742
    url = f'https://www.sir.toscana.it/archivio/download.php?IDST=pluvio0_24&IDS={ids}'
    # The context manager closes the HTTP response once the body is read.
    with request.urlopen(url) as response:
        context = response.read().decode('utf-8')

    # The table starts one character before the 'gg/mm/aaaa' header; fields are
    # ';'-separated, '@' markers are dropped and decimal commas become dots.
    to_add = pd.DataFrame([x.split(';') for x in context[context.find("gg/mm/aaaa") -1:].replace('@', '').replace(',','.').split('\r\n')])
    to_add = to_add.iloc[1:, :-1]  # drop the header row and the last column
    to_add.columns = ['date', geo_data.loc[geo_data.ids==ids, "feature_name"].iloc[0]]

    downloaded_features0_24 = downloaded_features0_24.merge(to_add, on='date', how='outer')

downloaded_features0_24

In [None]:
# for 9-9 hours range of day
downloaded_features9_9 = pd.DataFrame(columns=['date'])
for ids in geo_data[geo_data.type=='Rainfall'].ids:
    # The query string needs 'IDST=pluvio'; the previous 'IDSTpluvio' had no '='
    # and so never actually set the IDST parameter.
    url = f'https://www.sir.toscana.it/archivio/download.php?IDST=pluvio&IDS={ids}'
    # The context manager closes the HTTP response once the body is read.
    with request.urlopen(url) as response:
        context = response.read().decode('utf-8')

    # The table starts one character before the 'gg/mm/aaaa' header; fields are
    # ';'-separated, '@' markers are dropped and decimal commas become dots.
    to_add = pd.DataFrame([x.split(';') for x in context[context.find("gg/mm/aaaa") -1:].replace('@', '').replace(',','.').split('\r\n')])
    to_add = to_add.iloc[1:, :-1]  # drop the header row and the last column
    to_add.columns = ['date', geo_data.loc[geo_data.ids==ids, "feature_name"].iloc[0]]

    downloaded_features9_9 = downloaded_features9_9.merge(to_add, on='date', how='outer')

downloaded_features9_9

In [None]:
# Same clean-up for both downloads: parse day-first dates, index and sort by
# date, drop the last row after sorting, and convert the remaining text cells
# (empty string -> NaN) to float.
downloaded_features0_24 = (
    downloaded_features0_24
    .assign(date=pd.to_datetime(downloaded_features0_24.date, dayfirst=True))
    .set_index('date')
    .sort_index()
    .iloc[:-1]
    .replace('', np.nan)
    .astype('float')
)

downloaded_features9_9 = (
    downloaded_features9_9
    .assign(date=pd.to_datetime(downloaded_features9_9.date, dayfirst=True))
    .set_index('date')
    .sort_index()
    .iloc[:-1]
    .replace('', np.nan)
    .astype('float')
)

In [None]:
# Compare three versions of the Pentolina rainfall: the 0-24 download, the
# 9-9 download, and the series shipped with the Luco dataset.
station = 'Rainfall_Pentolina'
downloaded_pair = downloaded_features0_24[[station]].merge(
    downloaded_features9_9[[station]], left_index=True, right_index=True)
downloaded_pair.columns = ['0_24', '9_9']
temp = downloaded_pair.merge(luco[station], left_index=True, right_index=True)
temp.plot(subplots=True)
temp.plot();

In [None]:
# Difference between our series and each downloaded one (ours minus download),
# averaged per (year, month).
temp_diff = temp[['0_24', '9_9']].rsub(temp.Rainfall_Pentolina, axis=0)
month_keys = [temp.index.year, temp.index.month]
temp2 = temp_diff.groupby(month_keys).mean().dropna(how='all')
nonzero = temp2[temp2 != 0]
# the 9-9 download must match our data exactly
assert nonzero['9_9'].value_counts().shape[0] == 0
nonzero['0_24'].plot.hist()

In [None]:
# find rainfall features for which our data differ from the downloaded data:
# a feature is reported when the monthly mean difference reaches `treshold`
treshold = 20
aquifer_frames = [auser, doganella, luco, petrignano]

for f in downloaded_features0_24.columns:
    # The first dataset containing the column supplies "our" series.
    source = next((df for df in aquifer_frames if f in df.columns), None)
    if source is None:
        # Previously this fell through to an opaque KeyError on temp[f].
        print(f'{f} not found in any aquifer dataset, skipped')
        continue

    temp = downloaded_features0_24[[f]].merge(downloaded_features9_9[f], left_index=True, right_index=True)
    temp.columns = ['0_24', '9_9']
    temp = temp.merge(source[f], left_index=True, right_index=True)

    temp_diff = pd.concat(((temp[f] - temp['0_24']), (temp[f] - temp['9_9'])), axis=1)
    temp_diff.columns = ['0_24', '9_9']
    temp2 = temp_diff.groupby([temp.index.year, temp.index.month]).mean().dropna(how='all')
    for new_f in ['0_24', '9_9']:
        if temp2[temp2.abs() >= treshold][new_f].dropna().shape[0] > 0:
            print(f'{f} differs from {new_f}')

In [None]:
# Replace the two outlier-ridden Luco rainfall columns with the downloaded
# 0-24 series, aligned on the date index (each column ends up last).
for f in ['Rainfall_Scorgiano', 'Rainfall_Pentolina']:
    luco = luco.drop(columns=f).join(downloaded_features0_24[f], how='left')

In [None]:
# Seasonal rainfall profile per aquifer, redrawn after the Luco replacement.
aquifers = {'auser': auser, 'doganella': doganella, 'luco': luco, 'petrignano': petrignano}
fig, axes = plt.subplots(1, 4, figsize=(8*4, 5))

for ax, (name, df) in zip(axes.ravel(), aquifers.items()):
    features_rainfall = [col for col in df.columns if 'Rainfall' in col]
    temp = df[features_rainfall].resample('MS').mean()
    temp['month'] = temp.index.month
    monthly_profile = temp.groupby('month').mean()
    monthly_profile.plot(legend=False, title=name, ax=ax)

# Windows for ewm in Corr Rainfall

#### Auser

In [None]:
# Raw rainfall columns only: derived columns (cumsum / sum / ratio) are excluded.
# The filter now tests the column name `f`; it used to test the literal 'f'.
features_rainfall = [f for f in auser.columns
                     if 'Rainfall' in f and 'cumsum' not in f and 'sum' not in f and 'ratio' not in f]
features_depth = [f for f in auser.columns if 'Depth' in f]
res_windows = plot_window_corr(auser, features_depth, features_rainfall, ewm_max=300)

In [None]:
_ = plot_window_corr(auser, [features_depth[0]], features_rainfall, ewm_max=3000)

In [None]:
res_windows, style = format_res_windows(res_windows,features_depth, features_rainfall)
res_windows.iloc[0] = pd.DataFrame(_).lag.values
style

In [None]:
# Depth targets plus calendar columns.
auser_df = auser[features_depth].assign(year=auser.index.year, month=auser.index.month)

# save rainfall feature names (with window size) for different wells
rainfall_windows = {}
for f_main in features_depth:
    rainfall_windows[f_main] = {f_lag: res_windows.loc[f_main, f_lag]
                                for f_lag in features_rainfall}

# one smoothed rainfall column per (target, rainfall feature) pair
for f_main in features_depth:
    for f_lag in features_rainfall:
        window = res_windows.loc[f_main, f_lag]
        auser_df[f'{f_lag}_{window}'] = auser[f_lag].ewm(window, min_periods=65).mean()

In [None]:
# Heatmaps of target vs. smoothed-rainfall correlation for Auser; the returned
# frame is reused by the map cell below.
rainfall_corr = plot_corr_for_targets(features_depth, features_rainfall, res_windows, auser_df)

In [None]:
img = plt.imread('/kaggle/input/geo-data-water-italia/map.png')
fig, axes = plt.subplots(1, 5, figsize = (8*5,8))
# fig.subplots_adjust(wspace=0.01)

# (lon_min, lon_max, lat_min, lat_max) of the background map image
borders = (10.4, 10.8, 43.7, 44.2)
my_cmap = sns.light_palette("Navy", as_cmap=True) #sns.color_palette("coolwarm", as_cmap=True)

geo_rainfall = geo_data[(geo_data['type']=='Rainfall') & (geo_data.aquifer =='auser')]
for f_main, ax in zip(features_depth, axes.ravel()):
    ax.set_xlim(borders[0],borders[1])
    ax.set_ylim(borders[2],borders[3])

    lons, lats, corrs, labels  = [], [], [], []
    for i,point in geo_rainfall.iterrows():
        lons.append(point.lon)
        lats.append(point.lat)
        # rows of rainfall_corr for this target whose suffixed name contains the station's column name
        idx = (rainfall_corr.f_main == f_main) & (rainfall_corr['f_lag'].apply(lambda x: x.find(point.feature_name)!=-1))  # take current f_lag
        corrs.append(rainfall_corr[idx]['corr'].iloc[0])
        labels.append(i)

    # marker colour and size both scale with the correlation
    ax.scatter(lons, lats, #zorder=1,
               c=5000*np.array(corrs), s=5000*np.array(corrs), label=labels, cmap=my_cmap)
    # autolabel requires the target axes; it was previously called without `ax`.
    autolabel(geo_rainfall.lon, geo_rainfall.lat, geo_rainfall['name'], ax=ax)
    autolabel(lons, np.array(lats) - 0.015, [round(corr, 2) for corr in corrs], ax=ax)

    ax.set_title(f_main)
    ax.imshow(img, zorder=0, extent=borders, aspect='equal',
               alpha=0.75)


Now one can see the difference in how rainfall influences depth for the south well (LT2) and the north wells (all others).

In [None]:
# temp = rainfall_corr.copy()
# temp['f_lag'] = temp.f_lag.str.split('_').apply(lambda x: '_'.join(x[:-1]))

# temp_merge = res_windows.unstack().reset_index().rename(columns={0:'window'}).merge(temp, on=['f_main', 'f_lag'], how='outer')
# temp_merge['f_main'] = temp_merge['f_main'].astype('category').cat.codes
# temp_merge.plot(kind='scatter', x='window', y='corr', c='f_main', colormap='viridis', xlim=(40, 200))

In [None]:
# Dependence of depth on rainfall by month: monthly mean depth plotted against
# monthly mean smoothed rainfall, one subplot per (target, rainfall feature) pair.
plot_dependencies_by_month(features_depth, features_rainfall, auser_df, rainfall_windows)

Column 8 (Rainfall Croce Arcana): the dependencies differ from the others, with a large difference between months 12 and 1 (December and January).

## Doganella

In [None]:
# Raw rainfall columns only: derived columns (cumsum / sum / ratio) are excluded.
# The filter now tests the column name `f`; it used to test the literal 'f'.
features_rainfall = [f for f in doganella.columns
                     if 'Rainfall' in f and 'cumsum' not in f and 'sum' not in f and 'ratio' not in f]
features_depth = [f for f in doganella.columns if 'Depth' in f]
res_windows = plot_window_corr(doganella, features_depth, features_rainfall, ewm_max=2000)

In [None]:
# Pivot the window records into a target x rainfall-feature table; `style` is
# the gradient-coloured view displayed as the cell output.
res_windows, style = format_res_windows(res_windows,features_depth, features_rainfall)
style

In [None]:
# Depth targets plus calendar columns.
doganella_df = doganella[features_depth].assign(year=doganella.index.year,
                                                month=doganella.index.month)

# one smoothed rainfall column per (target, rainfall feature) pair
for f_main in features_depth:
    for f_lag in features_rainfall:
        window = res_windows.loc[f_main, f_lag]
        doganella_df[f'{f_lag}_{window}'] = doganella[f_lag].ewm(window, min_periods=65).mean()

# save rainfall feature names (with window size) for different wells
doganella_windows = {}
for target in features_depth:
    doganella_windows[target] = {feature: res_windows.loc[target, feature]
                                 for feature in features_rainfall}
rainfall_windows.update(doganella_windows)

In [None]:
# Heatmaps of target vs. smoothed-rainfall correlation for Doganella.
rainfall_corr = plot_corr_for_targets(features_depth, features_rainfall, res_windows, doganella_df)

In [None]:
# plot_dependencies_by_month(features_depth, features_rainfall, doganella_df, rainfall_windows)

## Luco

In [None]:
# Raw rainfall columns only: derived columns (cumsum / sum / ratio) are excluded.
# The filter now tests the column name `f`; it used to test the literal 'f'.
features_rainfall = [f for f in luco.columns
                     if 'Rainfall' in f and 'cumsum' not in f and 'sum' not in f and 'ratio' not in f]
features_depth = [f for f in luco.columns if 'Depth' in f]
res_windows = plot_window_corr(luco, features_depth, features_rainfall, ewm_max=1500)

In [None]:
# Pivot the window records into a target x rainfall-feature table.
res_windows, style = format_res_windows(res_windows,features_depth, features_rainfall)
# NOTE(review): manual override that sets the window of the last target / second
# rainfall column to 0; the reason is not visible in this notebook. Confirm.
res_windows.iloc[-1,1] = 0
style

In [None]:
# Depth targets plus calendar columns.
luco_df = luco[features_depth].assign(year=luco.index.year, month=luco.index.month)

# one smoothed rainfall column per (target, rainfall feature) pair
for f_main in features_depth:
    for f_lag in features_rainfall:
        window = res_windows.loc[f_main, f_lag]
        luco_df[f'{f_lag}_{window}'] = luco[f_lag].ewm(window, min_periods=65).mean()

# save rainfall feature names (with window size) for different wells;
# 'Pozzo' in the target name is stored as 'Pozzo_luco' in the shared dict
luco_windows = {}
for target in features_depth:
    luco_windows[target.replace('Pozzo', 'Pozzo_luco')] = {
        feature: res_windows.loc[target, feature] for feature in features_rainfall
    }
rainfall_windows.update(luco_windows)

In [None]:
# Heatmaps of target vs. smoothed-rainfall correlation for Luco.
rainfall_corr = plot_corr_for_targets(features_depth, features_rainfall, res_windows, luco_df)

In [None]:
# temp = rainfall_corr.copy()
# temp['f_lag'] = temp.f_lag.str.split('_').apply(lambda x: '_'.join(x[:-1]))

# temp_merge = res_windows.unstack().reset_index().rename(columns={0:'window'}).merge(temp, on=['f_main', 'f_lag'], how='outer')
# temp_merge['f_main'] = temp_merge['f_main'].astype('category').cat.codes
# temp_merge.plot(kind='scatter', x='window', y='corr', c='f_main', colormap='viridis')

In [None]:
# plot_dependencies_by_month(features_depth, features_rainfall, luco_df, rainfall_windows)

## Petrignano

In [None]:
# Raw rainfall columns only: derived columns (cumsum / sum / ratio) are excluded.
# The filter now tests the column name `f`; it used to test the literal 'f'.
features_rainfall = [f for f in petrignano.columns
                     if 'Rainfall' in f and 'cumsum' not in f and 'sum' not in f and 'ratio' not in f]
features_depth = [f for f in petrignano.columns if 'Depth' in f]
res_windows =  plot_window_corr(petrignano, features_depth, features_rainfall, ewm_max=200)

In [None]:
# Pivot the window records into a target x rainfall-feature table; `style` is
# the gradient-coloured view displayed as the cell output.
res_windows, style = format_res_windows(res_windows,features_depth, features_rainfall)
style

In [None]:
# Depth targets plus calendar columns.
petrignano_df = petrignano[features_depth].assign(year=petrignano.index.year,
                                                  month=petrignano.index.month)

# one smoothed rainfall column per (target, rainfall feature) pair
for f_main in features_depth:
    for f_lag in features_rainfall:
        window = res_windows.loc[f_main, f_lag]
        petrignano_df[f'{f_lag}_{window}'] = petrignano[f_lag].ewm(window, min_periods=65).mean()

# save rainfall feature names (with window size) for different wells
petrignano_windows = {}
for target in features_depth:
    petrignano_windows[target] = {feature: res_windows.loc[target, feature]
                                  for feature in features_rainfall}
rainfall_windows.update(petrignano_windows)

In [None]:
# Heatmaps of target vs. smoothed-rainfall correlation for Petrignano; the
# returned frame is deliberately discarded.
_ = plot_corr_for_targets(features_depth, features_rainfall, res_windows, petrignano_df)

In [None]:
# plot_dependencies_by_month(features_depth, features_rainfall, petrignano_df, rainfall_windows)

In [None]:
import pickle

# Persist the {target: {rainfall feature: window}} mapping; the context manager
# flushes and closes the file.
with open('rainfall_windows.pkl', 'wb') as windows_file:
    pickle.dump(rainfall_windows, windows_file)

# Common volume analysis

In [None]:
# One figure per aquifer: a tall subplot grid when there are several volume
# columns, a single small plot otherwise.
for df, name in zip((auser, doganella, luco, petrignano),
                    ('auser', 'doganella', 'luco', 'petrignano')):
    features_volume = [col for col in df.columns if 'Volume' in col]
    several = len(features_volume) > 1
    fig, ax = plt.subplots(figsize=(20, 30) if several else (10, 5))
    plot_kwargs = dict(subplots=True, layout=(20, 2)) if several else {}
    df[features_volume].plot(ax=ax, **plot_kwargs)

In [None]:
# Treat a volume of exactly 0 in the Auser volume columns as missing (NaN).
features_volume = [f for f in auser.columns if 'Volume' in f]
auser[features_volume] = auser[features_volume].replace(0, np.nan)

In [None]:
import seaborn as sns

# Box plot of every volume column, one panel per aquifer.
aquifers = {'auser': auser, 'doganella': doganella, 'luco': luco, 'petrignano': petrignano}
fig, axes = plt.subplots(1, 4, figsize=(8*4, 5))
for ax, (name, df) in zip(axes.ravel(), aquifers.items()):
    features_volume = [col for col in df.columns if 'Volume' in col]
    # long format: 'level_0' holds the column name, 0 holds the value
    stacked = df[features_volume].unstack().reset_index()
    sns.boxplot(data=stacked, x=0, y='level_0', ax=ax)
    ax.set_title(name)

In [None]:
# Flip the sign of the Doganella volume columns.
# NOTE(review): not idempotent; re-running this cell flips the sign back.
features_volume = [f for f in doganella.columns if 'Volume' in f]
doganella[features_volume] = -doganella[features_volume]

In [None]:
# Probability plot (scipy.stats.probplot, normal distribution by default) for
# every Volume column of the four datasets, laid out on a 5-column grid.
# `ss` is scipy.stats, imported in the notebook's first cell.
#
# Each column is paired with the frame that owns it, so the cell neither
# depends on an `axes` array left behind by an earlier cell nor has to probe
# every frame and catch KeyError to find the column.
volume_sources = [(df, f)
                  for df in [auser, doganella, luco, petrignano]
                  for f in df.columns if 'Volume' in f]
volume = [f for _, f in volume_sources]

cols = 5
# Ceiling division; keep at least one row so plt.subplots does not fail when
# no Volume column exists.
rows = max(1, -(-len(volume) // cols))

fig, axes = plt.subplots(rows, cols, figsize=(cols*3, rows*3))
for ax, (df, v) in zip(axes.ravel(), volume_sources):
    ss.probplot(df[v].dropna(), plot=ax)
    ax.set_title(f'{v.replace("Volume_", "")}')

In [None]:
# Seasonal profile of Volume per dataset: resample to month-start bins
# ('MS') with the mean, then average those monthly means per calendar month.
fig, axes = plt.subplots(1, 4, figsize=(8*4, 5))

for ax, df, name in zip(axes.ravel(),
                        [auser, doganella, luco, petrignano],
                        ['auser', 'doganella', 'luco', 'petrignano']):
    features_volume = [column for column in df.columns if 'Volume' in column]
    monthly_volume = df[features_volume].resample('MS').mean()
    temp = monthly_volume.assign(month=monthly_volume.index.month)
    volume_by_month = temp.groupby('month').mean()
    volume_by_month.plot(legend=True, title=name, ax=ax)
    ax.set_ylabel('Volume')


In [None]:
# Median Volume per calendar month for each dataset. Relies on a 'month'
# column already being present on every frame (it is not created here).
fig, axes = plt.subplots(1, 4, figsize=(8*4, 5))

for ax, df, name in zip(axes.ravel(),
                        [auser, doganella, luco, petrignano],
                        ['auser', 'doganella', 'luco', 'petrignano']):
    features_volume = [column for column in df.columns if 'Volume' in column]
    volume_median_by_month = df.groupby('month')[features_volume].median()
    volume_median_by_month.plot(legend=True, title=name, ax=ax)
    ax.set_ylabel('Volume')


In [None]:
# Plot standardized volumes: z-score every Volume column, then show the median
# z-score per calendar month for each dataset.
fig, axes = plt.subplots(1, 4, figsize=(8*4, 5))

for ax, df, name in zip(axes.ravel(),
                        [auser, doganella, luco, petrignano],
                        ['auser', 'doganella', 'luco', 'petrignano']):
    features_volume = [column for column in df.columns if 'Volume' in column]
    temp = df[features_volume].copy()
    # Alternative input kept for reference (monthly mean of absolute values):
    #     temp = df[features_volume].resample('MS', ).mean().abs()
    volume_mean = temp[features_volume].mean()
    volume_std = temp[features_volume].std()
    temp[features_volume] = (temp[features_volume] - volume_mean) / volume_std
    temp['month'] = temp.index.month
    standardized_by_month = temp.groupby('month').median()
    standardized_by_month.plot(legend=True, title=name, ax=ax)
    ax.set_ylabel('Volume')
fig.suptitle('Standartized volume by months');

# Windows for ewm in Corr Volume
## Auser

In [None]:
# Auser: Volume columns are the candidate inputs, Depth columns the targets.
features_volume = [column for column in auser.columns if 'Volume' in column]
features_depth = [column for column in auser.columns if 'Depth' in column]
# Search exponential-smoothing windows (up to ewm_max) for every
# (target, volume) pair with the notebook's `plot_window_corr` helper, which
# plots the correlation curves and returns the selected windows.
res_windows = plot_window_corr(
    auser,
    features_depth,
    features_volume,
    ewm_max=2500,
)

In [None]:
# Turn the raw window-search results into a table (`res_windows`, indexed
# positionally below and by target/feature label in later cells) plus a
# display object (`style`) via the notebook's `format_res_windows` helper.
res_windows, style = format_res_windows(res_windows,features_depth, features_volume)
# Manual override: force the entry in the last row, third column to 0.
# NOTE(review): the reason for overriding this particular cell is not visible
# in this notebook section — confirm it is still the intended target/feature.
res_windows.iloc[-1, 2] = 0
# Last expression of the cell, so `style` is rendered as the cell output.
style

In [None]:
# Add an exponentially smoothed copy of every Volume feature for each depth
# target. The window chosen for the (target, feature) pair is used as the
# `com` argument of `ewm` and is appended to the new column's name.
for f_main, f_lag in product(features_depth, features_volume):
    window = res_windows.loc[f_main, f_lag]
    smoothed_volume = auser[f_lag].ewm(com=window, min_periods=65).mean()
    auser_df[f'{f_lag}_{window}'] = smoothed_volume

# Start the shared registry of volume windows: {depth target: {volume feature: window}}.
volume_windows = {
    target: {feature: res_windows.loc[target, feature] for feature in features_volume}
    for target in features_depth
}

In [None]:
# Call the notebook's `plot_corr_for_targets` helper (defined outside this
# cell; presumably it draws the correlations between the depth targets and the
# smoothed Volume columns of `auser_df` — confirm). The return value is bound
# to `_` so that it is not echoed as the cell output.
_ = plot_corr_for_targets(features_depth, features_volume, res_windows, auser_df)

In [None]:
# plot_dependencies_by_month(features_depth, features_volume, auser_df, volume_windows)

## Doganella

In [None]:
# Doganella: Volume columns are the candidate inputs, Depth columns the targets.
features_volume = [column for column in doganella.columns if 'Volume' in column]
features_depth = [column for column in doganella.columns if 'Depth' in column]
res_windows = plot_window_corr(
    doganella,
    features_depth,
    features_volume,
    ewm_max=2000,
)

res_windows, style = format_res_windows(res_windows, features_depth, features_volume)
# Entries equal to -1 are reset to 0 in place; `ewm` rejects a negative `com`.
# NOTE(review): -1 presumably marks "no window selected" — confirm.
res_windows[res_windows == -1] = 0
display(style)

# Add one smoothed Volume column per (depth target, volume feature) pair; the
# window is used as `com` and appended to the column name.
for f_main, f_lag in product(features_depth, features_volume):
    window = res_windows.loc[f_main, f_lag]
    smoothed_volume = doganella[f_lag].ewm(com=window, min_periods=65).mean()
    doganella_df[f'{f_lag}_{window}'] = smoothed_volume

# Save the volume window sizes chosen for each well (depth target).
doganella_volume_windows = {
    target: {feature: res_windows.loc[target, feature] for feature in features_volume}
    for target in features_depth
}
volume_windows.update(doganella_volume_windows)

_ = plot_corr_for_targets(features_depth, features_volume, res_windows, doganella_df)

In [None]:
# plot_dependencies_by_month(features_depth, features_volume, doganella_df, volume_windows)

## Luco

In [None]:
# Luco: Volume columns are the candidate inputs, Depth columns the targets.
features_volume = [column for column in luco.columns if 'Volume' in column]
features_depth = [column for column in luco.columns if 'Depth' in column]
res_windows = plot_window_corr(
    luco,
    features_depth,
    features_volume,
    ewm_max=1000,
)

res_windows, style = format_res_windows(res_windows, features_depth, features_volume)
# Entries equal to -1 are reset to 0 in place; `ewm` rejects a negative `com`.
# NOTE(review): -1 presumably marks "no window selected" — confirm.
res_windows[res_windows == -1] = 0
display(style)

# Add one smoothed Volume column per (depth target, volume feature) pair; the
# window is used as `com` and appended to the column name.
for f_main, f_lag in product(features_depth, features_volume):
    window = res_windows.loc[f_main, f_lag]
    smoothed_volume = luco[f_lag].ewm(com=window, min_periods=65).mean()
    luco_df[f'{f_lag}_{window}'] = smoothed_volume

# Save the volume windows under keys with 'Pozzo' rewritten to 'Pozzo_luco'.
# NOTE(review): presumably this keeps Luco's targets from colliding with
# same-named targets of other datasets in the shared dict — confirm.
luco_volume_windows = {
    target.replace('Pozzo', 'Pozzo_luco'): {feature: res_windows.loc[target, feature]
                                            for feature in features_volume}
    for target in features_depth
}
volume_windows.update(luco_volume_windows)

_ = plot_corr_for_targets(features_depth, features_volume, res_windows, luco_df)

In [None]:
# plot_dependencies_by_month(features_depth, features_volume, luco_df, volume_windows)

## Petrignano

In [None]:
# Petrignano: Volume columns are the candidate inputs, Depth columns the targets.
features_volume = [column for column in petrignano.columns if 'Volume' in column]
features_depth = [column for column in petrignano.columns if 'Depth' in column]
res_windows = plot_window_corr(
    petrignano,
    features_depth,
    features_volume,
    ewm_max=200,
)

res_windows, style = format_res_windows(res_windows, features_depth, features_volume)
# Entries equal to -1 are reset to 0 in place; `ewm` rejects a negative `com`.
# NOTE(review): -1 presumably marks "no window selected" — confirm.
res_windows[res_windows == -1] = 0
display(style)

# Add one smoothed Volume column per (depth target, volume feature) pair; the
# window is used as `com` and appended to the column name.
for f_main, f_lag in product(features_depth, features_volume):
    window = res_windows.loc[f_main, f_lag]
    smoothed_volume = petrignano[f_lag].ewm(com=window, min_periods=65).mean()
    petrignano_df[f'{f_lag}_{window}'] = smoothed_volume

# Save the volume window sizes chosen for each well (depth target).
petrignano_volume_windows = {
    target: {feature: res_windows.loc[target, feature] for feature in features_volume}
    for target in features_depth
}
volume_windows.update(petrignano_volume_windows)

_ = plot_corr_for_targets(features_depth, features_volume, res_windows, petrignano_df)

In [None]:
# plot_dependencies_by_month(features_depth, features_volume, petrignano_df, volume_windows)

In [None]:
# Persist the selected volume windows. The context manager guarantees the file
# is flushed and closed, even if dumping fails.
with open('volume_windows.pkl', 'wb') as pkl_file:
    pickle.dump(volume_windows, pkl_file)

# Common analysis of Temperature

In [None]:
# Plot the raw Temperature series of every dataset: one panel per column when
# a dataset has several Temperature columns, a single axes when it has one.
for df, name in zip([auser, doganella, luco, petrignano],
                    ['auser', 'doganella', 'luco', 'petrignano']):
    features_temperature = [f for f in df.columns if 'Temperature' in f]
    if not features_temperature:
        # Nothing to draw; plotting an empty column selection would raise.
        continue
    if len(features_temperature) > 1:
        # Let pandas build the subplot grid itself. Handing it one pre-made
        # Axes together with subplots=True only makes pandas clear that figure
        # and emit a warning. sharex=False keeps every panel's own x tick
        # labels, which is what the ax-based call produced.
        df[features_temperature].plot(subplots=True, layout=(20, 2), figsize=(20, 30),
                                      sharex=False, title=name)
    else:
        fig, ax = plt.subplots(figsize=(10, 5))
        df[features_temperature].plot(ax=ax, title=name)

In [None]:
# Plot every Temperature series and mark in red each reading that closes a
# pair of consecutive non-missing readings whose mean is exactly 0.
# NOTE(review): the mean == 0 test also matches pairs such as -1/+1, and the
# first reading of a run of zeros is never flagged — confirm this is intended.
temperature_sources = [(df, f)
                       for df in [auser, doganella, luco, petrignano]
                       for f in df.columns if 'Temperature' in f]
temperatures = [f for _, f in temperature_sources]

# Two panels per row; grow beyond the usual 6 rows instead of silently
# dropping columns when there are more than 12 Temperature series.
temperature_rows = max(6, -(-len(temperatures) // 2))
fig ,axes = plt.subplots(temperature_rows, 2, figsize=(20,20))

for ax, (df, f) in zip(axes.ravel(), temperature_sources):
    temp = df[f].copy().to_frame()
    # Build the flag positionally instead of via chained assignment
    # (temp['is0'][mask] = ...), which pandas does not guarantee to write
    # through and which is a no-op under Copy-on-Write.
    valid = temp[f].notna().to_numpy()
    is_zero_pair = np.zeros(len(temp), dtype=bool)
    is_zero_pair[valid] = (temp[f].dropna().rolling(2).mean() == 0).to_numpy()
    temp['is0'] = is_zero_pair
    temp[f].plot(ax=ax, title=f)
    ax.scatter(temp[temp.is0].index, temp[temp.is0][f], c='r')

In [None]:
# Blank out (set to NaN) every Temperature reading that closes a pair of
# consecutive non-missing readings whose mean is exactly 0. The dataset
# frames are modified in place.
# NOTE(review): the mean == 0 test also matches pairs such as -1/+1, and the
# first reading of a run of zeros is kept — confirm this is intended.
for df, name in zip([auser, doganella, luco, petrignano],
                    ['auser', 'doganella', 'luco', 'petrignano']):
    features_temp = [f for f in df.columns if 'Temperature' in f]
    for f in features_temp:
        temp = df[f].copy().to_frame()
        # Build the flag positionally rather than with chained assignment.
        valid = temp[f].notna().to_numpy()
        is_zero_pair = np.zeros(len(temp), dtype=bool)
        is_zero_pair[valid] = (temp[f].dropna().rolling(2).mean() == 0).to_numpy()
        temp['is0'] = is_zero_pair
        # A single .loc write updates the frame itself. The chained form
        # df[f][mask] = nan may write to a temporary copy and is a no-op
        # under pandas Copy-on-Write, leaving the data uncleaned.
        df.loc[is_zero_pair, f] = np.nan


In [None]:
import seaborn as sns

# One panel per dataset: box plots comparing the distributions of all of its
# Temperature columns.
fig, axes = plt.subplots(1, 4, figsize=(8*4, 5))
for ax, df, name in zip(axes.ravel(),
                        [auser, doganella, luco, petrignano],
                        ['auser', 'doganella', 'luco', 'petrignano']):
    features_temp = [column for column in df.columns if 'Temperature' in column]
    # unstack() stacks the selected columns into one long Series; after
    # reset_index() the values are in column 0 and the originating column
    # name is in 'level_0'.
    temperature_long = df[features_temp].unstack().reset_index()
    sns.boxplot(data=temperature_long, x=0, y='level_0', ax=ax)
    ax.set_title(name)

In [None]:
# Probability plot (scipy.stats.probplot, normal distribution by default) for
# every Temperature column of the four datasets, laid out on a 5-column grid.
# `ss` is scipy.stats, imported in the notebook's first cell.
#
# Each column is paired with the frame that owns it, so the cell neither
# depends on an `axes` array left behind by an earlier cell nor has to probe
# every frame and catch KeyError to find the column.
temperature_sources = [(df, f)
                       for df in [auser, doganella, luco, petrignano]
                       for f in df.columns if 'Temperature' in f]
temperature = [f for _, f in temperature_sources]

cols = 5
# Ceiling division; keep at least one row so plt.subplots does not fail when
# no Temperature column exists.
rows = max(1, -(-len(temperature) // cols))

fig, axes = plt.subplots(rows, cols, figsize=(cols*3, rows*3))
for ax, (df, v) in zip(axes.ravel(), temperature_sources):
    ss.probplot(df[v].dropna(), plot=ax)
    ax.set_title(f'{v.replace("Temperature_", "")}')

In [None]:
# Mean Temperature per calendar month for each dataset. Relies on a 'month'
# column already being present on every frame (it is not created here).
fig, axes = plt.subplots(1, 4, figsize=(8*4, 5))

for ax, df, name in zip(axes.ravel(),
                        [auser, doganella, luco, petrignano],
                        ['auser', 'doganella', 'luco', 'petrignano']):
    features_temperature = [column for column in df.columns if 'Temperature' in column]
    temperature_mean_by_month = df.groupby('month')[features_temperature].mean()
    temperature_mean_by_month.plot(legend=True, title=name, ax=ax, style='-o')
    ax.set_ylabel('Temperature')

# Windows for ewm in Corr Temperature
## Auser

In [None]:
# Auser: Temperature columns are the candidate inputs, Depth columns the targets.
features_temperature = [column for column in auser.columns if 'Temperature' in column]
features_depth = [column for column in auser.columns if 'Depth' in column]
res_windows = plot_window_corr(
    auser,
    features_depth,
    features_temperature,
    trend='negative',
    ewm_max=500,
)

res_windows, style = format_res_windows(res_windows, features_depth, features_temperature)
# Entries equal to -1 are reset to 0 in place; `ewm` rejects a negative `com`.
# NOTE(review): -1 presumably marks "no window selected" — confirm.
res_windows[res_windows == -1] = 0
display(style)

# Add one smoothed Temperature column per (depth target, temperature feature)
# pair; the window is used as `com` and appended to the column name.
for f_main, f_lag in product(features_depth, features_temperature):
    window = res_windows.loc[f_main, f_lag]
    smoothed_temperature = auser[f_lag].ewm(com=window, min_periods=65).mean()
    auser_df[f'{f_lag}_{window}'] = smoothed_temperature

# Start the shared registry of temperature windows:
# {depth target: {temperature feature: window}}.
temperature_windows = {
    target: {feature: res_windows.loc[target, feature] for feature in features_temperature}
    for target in features_depth
}

_ = plot_corr_for_targets(features_depth, features_temperature, res_windows, auser_df)

In [None]:
# plot_dependencies_by_month(features_depth, features_temperature, auser_df, temperature_windows)

## Doganella

In [None]:
# Doganella: Temperature columns are the candidate inputs, Depth columns the targets.
features_temperature = [column for column in doganella.columns if 'Temperature' in column]
features_depth = [column for column in doganella.columns if 'Depth' in column]
res_windows = plot_window_corr(
    doganella,
    features_depth,
    features_temperature,
    trend='negative',
    ewm_max=500,
)

res_windows, style = format_res_windows(res_windows, features_depth, features_temperature)
# Entries equal to -1 are reset to 0 in place; `ewm` rejects a negative `com`.
# NOTE(review): -1 presumably marks "no window selected" — confirm.
res_windows[res_windows == -1] = 0
display(style)

# Add one smoothed Temperature column per (depth target, temperature feature)
# pair; the window is used as `com` and appended to the column name.
for f_main, f_lag in product(features_depth, features_temperature):
    window = res_windows.loc[f_main, f_lag]
    smoothed_temperature = doganella[f_lag].ewm(com=window, min_periods=65).mean()
    doganella_df[f'{f_lag}_{window}'] = smoothed_temperature

# Save the temperature window sizes chosen for each well (depth target).
doganella_temperature_windows = {
    target: {feature: res_windows.loc[target, feature] for feature in features_temperature}
    for target in features_depth
}
temperature_windows.update(doganella_temperature_windows)

_ = plot_corr_for_targets(features_depth, features_temperature, res_windows, doganella_df)

In [None]:
# plot_dependencies_by_month(features_depth, features_temperature, doganella_df, temperature_windows)

## Luco

In [None]:
# Luco: Temperature columns are the candidate inputs, Depth columns the targets.
features_temperature = [column for column in luco.columns if 'Temperature' in column]
features_depth = [column for column in luco.columns if 'Depth' in column]
res_windows = plot_window_corr(
    luco,
    features_depth,
    features_temperature,
    trend='negative',
    ewm_max=500,
)

res_windows, style = format_res_windows(res_windows, features_depth, features_temperature)
# Entries equal to -1 are reset to 0 in place; `ewm` rejects a negative `com`.
# NOTE(review): -1 presumably marks "no window selected" — confirm.
res_windows[res_windows == -1] = 0
display(style)

# Add one smoothed Temperature column per (depth target, temperature feature)
# pair; the window is used as `com` and appended to the column name.
for f_main, f_lag in product(features_depth, features_temperature):
    window = res_windows.loc[f_main, f_lag]
    smoothed_temperature = luco[f_lag].ewm(com=window, min_periods=65).mean()
    luco_df[f'{f_lag}_{window}'] = smoothed_temperature

# Save the temperature windows under keys with 'Pozzo' rewritten to
# 'Pozzo_luco'.
# NOTE(review): presumably this keeps Luco's targets from colliding with
# same-named targets of other datasets in the shared dict — confirm.
luco_temperature_windows = {
    target.replace('Pozzo', 'Pozzo_luco'): {feature: res_windows.loc[target, feature]
                                            for feature in features_temperature}
    for target in features_depth
}
temperature_windows.update(luco_temperature_windows)

_ = plot_corr_for_targets(features_depth, features_temperature, res_windows, luco_df)

In [None]:
# plot_dependencies_by_month(features_depth, features_temperature, luco_df, temperature_windows)

## Petrignano

In [None]:
# Petrignano: Temperature columns are the candidate inputs, Depth columns the targets.
features_temperature = [column for column in petrignano.columns if 'Temperature' in column]
features_depth = [column for column in petrignano.columns if 'Depth' in column]
res_windows = plot_window_corr(
    petrignano,
    features_depth,
    features_temperature,
    trend='negative',
    ewm_max=500,
)

res_windows, style = format_res_windows(res_windows, features_depth, features_temperature)
# Entries equal to -1 are reset to 0 in place; `ewm` rejects a negative `com`.
# NOTE(review): -1 presumably marks "no window selected" — confirm.
res_windows[res_windows == -1] = 0
display(style)

# Add one smoothed Temperature column per (depth target, temperature feature)
# pair; the window is used as `com` and appended to the column name.
for f_main, f_lag in product(features_depth, features_temperature):
    window = res_windows.loc[f_main, f_lag]
    smoothed_temperature = petrignano[f_lag].ewm(com=window, min_periods=65).mean()
    petrignano_df[f'{f_lag}_{window}'] = smoothed_temperature

# Save the temperature window sizes chosen for each well (depth target).
petrignano_temperature_windows = {
    target: {feature: res_windows.loc[target, feature] for feature in features_temperature}
    for target in features_depth
}
temperature_windows.update(petrignano_temperature_windows)

_ = plot_corr_for_targets(features_depth, features_temperature, res_windows, petrignano_df)

In [None]:
# plot_dependencies_by_month(features_depth, features_temperature, petrignano_df, temperature_windows)

In [None]:
# Persist the selected temperature windows. The context manager guarantees the
# file is flushed and closed, even if dumping fails.
with open('temperature_windows.pkl', 'wb') as pkl_file:
    pickle.dump(temperature_windows, pkl_file)

# Common analysis of Hydrometry

In [None]:
# Plot the raw Hydrometry series of every dataset that has any, and print the
# dataset name after each plot. Datasets without Hydrometry columns are
# skipped.
for df, name in zip([auser, doganella, luco, petrignano],
                    ['auser', 'doganella', 'luco', 'petrignano']):
    features_hydrometry = [f for f in df.columns if 'Hydrometry' in f]
    if not features_hydrometry:
        continue
    if len(features_hydrometry) > 1:
        # Let pandas build the subplot grid itself. Handing it one pre-made
        # Axes together with subplots=True only makes pandas clear that figure
        # and emit a warning. sharex=False keeps every panel's own x tick
        # labels, which is what the ax-based call produced.
        df[features_hydrometry].plot(subplots=True, layout=(20, 2), figsize=(20, 30),
                                     sharex=False)
    else:
        fig, ax = plt.subplots(figsize=(10,5))
        df[features_hydrometry].plot(ax=ax)
    print(name)

In [None]:
# Replace zero Hydrometry readings in Petrignano with NaN so that they are
# handled as missing values. The column list is read from `petrignano` itself
# rather than from `df`, a loop variable leaked by an earlier cell that only
# happens to point at the Petrignano frame when cells run in order.
features_hydrometry = [f for f in petrignano.columns if 'Hydrometry' in f]
petrignano[features_hydrometry] = petrignano[features_hydrometry].replace(0, np.nan)

In [None]:
# One panel per dataset: box plots comparing the distributions of its
# Hydrometry columns. Panels of datasets without such columns stay empty.
fig, axes = plt.subplots(1, 4, figsize=(8*4, 5))
for ax, df, name in zip(axes.ravel(),
                        [auser, doganella, luco, petrignano],
                        ['auser', 'doganella', 'luco', 'petrignano']):
    features_hydro = [column for column in df.columns if 'Hydrometry' in column]
    if len(features_hydro) < 1:
        continue
    # unstack() stacks the selected columns into one long Series; after
    # reset_index() the values are in column 0 and the originating column
    # name is in 'level_0'.
    hydrometry_long = df[features_hydro].unstack().reset_index()
    sns.boxplot(data=hydrometry_long, x=0, y='level_0', ax=ax)
    ax.set_title(name)
    

In [None]:
# Probability plot (scipy.stats.probplot, normal distribution by default) for
# every Hydrometry column of the four datasets, laid out on a 5-column grid.
#
# Each column is paired with the frame that owns it, so the cell neither
# depends on an `axes` array left behind by an earlier cell nor has to probe
# every frame and catch KeyError to find the column.
hydrometry_sources = [(df, f)
                      for df in [auser, doganella, luco, petrignano]
                      for f in df.columns if 'Hydrometry' in f]
hydrometry = [f for _, f in hydrometry_sources]

cols = 5
# Ceiling division; keep at least one row so plt.subplots does not fail when
# no Hydrometry column exists.
rows = max(1, -(-len(hydrometry) // cols))

fig, axes = plt.subplots(rows, cols, figsize=(cols*3, rows*3))
for ax, (df, v) in zip(axes.ravel(), hydrometry_sources):
    ss.probplot(df[v].dropna(), plot=ax)
    ax.set_title(f'{v.replace("Hydrometry_", "")}')

In [None]:
# Median Hydrometry per calendar month for each dataset that has Hydrometry
# columns. Relies on a 'month' column already being present on every frame
# (it is not created here).
fig, axes = plt.subplots(1, 4, figsize=(8*4, 5))

for ax, df, name in zip(axes.ravel(),
                        [auser, doganella, luco, petrignano],
                        ['auser', 'doganella', 'luco', 'petrignano']):
    features_hydro = [column for column in df.columns if 'Hydrometry' in column]
    if len(features_hydro) < 1:
        continue
    hydrometry_median_by_month = df.groupby('month')[features_hydro].median()
    hydrometry_median_by_month.plot(legend=True, title=name, ax=ax, style='-o')
    ax.set_ylabel('Hydrometry')

# Windows for ewm in Corr Hydrometry
## Auser

In [None]:
# Auser: Hydrometry columns are the candidate inputs, Depth columns the targets.
features_hydrometry = [column for column in auser.columns if 'Hydrometry' in column]
features_depth = [column for column in auser.columns if 'Depth' in column]
res_windows = plot_window_corr(
    auser,
    features_depth,
    features_hydrometry,
    ewm_max=500,
)

res_windows, style = format_res_windows(res_windows, features_depth, features_hydrometry)
# Entries equal to -1 are reset to 0 in place; `ewm` rejects a negative `com`.
# NOTE(review): -1 presumably marks "no window selected" — confirm.
res_windows[res_windows == -1] = 0
# Manual override for the first depth target and the second hydrometry
# feature: rerun the search for that single pair with trend='negative' and a
# wider range (ewm_max=1500), then store the resulting 'lag' in the table.
temp = plot_window_corr(
    auser,
    [features_depth[0]],
    [features_hydrometry[1]],
    trend='negative',
    ewm_max=1500,
)
res_windows.iloc[0, 1] = temp[0]['lag']

display(style)

# Add one smoothed Hydrometry column per (depth target, hydrometry feature)
# pair; the window is used as `com` and appended to the column name.
for f_main, f_lag in product(features_depth, features_hydrometry):
    window = res_windows.loc[f_main, f_lag]
    smoothed_hydrometry = auser[f_lag].ewm(com=window, min_periods=65).mean()
    auser_df[f'{f_lag}_{window}'] = smoothed_hydrometry

# Start the shared registry of hydrometry windows:
# {depth target: {hydrometry feature: window}}.
hydrometry_windows = {
    target: {feature: res_windows.loc[target, feature] for feature in features_hydrometry}
    for target in features_depth
}

_ = plot_corr_for_targets(features_depth, features_hydrometry, res_windows, auser_df)

## Petrignano

In [None]:
# Petrignano: Hydrometry columns are the candidate inputs, Depth columns the targets.
features_hydrometry = [column for column in petrignano.columns if 'Hydrometry' in column]
features_depth = [column for column in petrignano.columns if 'Depth' in column]
res_windows = plot_window_corr(
    petrignano,
    features_depth,
    features_hydrometry,
    ewm_max=500,
)

res_windows, style = format_res_windows(res_windows, features_depth, features_hydrometry)
# Entries equal to -1 are reset to 0 in place; `ewm` rejects a negative `com`.
# NOTE(review): -1 presumably marks "no window selected" — confirm.
res_windows[res_windows == -1] = 0
display(style)

# Add one smoothed Hydrometry column per (depth target, hydrometry feature)
# pair; the window is used as `com` and appended to the column name.
for f_main, f_lag in product(features_depth, features_hydrometry):
    window = res_windows.loc[f_main, f_lag]
    smoothed_hydrometry = petrignano[f_lag].ewm(com=window, min_periods=65).mean()
    petrignano_df[f'{f_lag}_{window}'] = smoothed_hydrometry

# Save the hydrometry window sizes chosen for each well (depth target).
petrignano_hydrometry_windows = {
    target: {feature: res_windows.loc[target, feature] for feature in features_hydrometry}
    for target in features_depth
}
hydrometry_windows.update(petrignano_hydrometry_windows)

_ = plot_corr_for_targets(features_depth, features_hydrometry, res_windows, petrignano_df)

In [None]:
# Persist the selected hydrometry windows. The context manager guarantees the
# file is flushed and closed, even if dumping fails.
with open('hydro_windows.pkl', 'wb') as pkl_file:
    pickle.dump(hydrometry_windows, pkl_file)

# EWM windows for correlation with depth series that are not predicted

In [None]:
# Auser: split the Depth columns into the ones kept as targets
# (`features_depth`) and two that are used as input features instead
# (`features_depth2`). `remove` raises ValueError if a listed column is absent.
features_depth = [column for column in auser.columns if 'Depth' in column]
features_depth2 = ['Depth_to_Groundwater_PAG',  'Depth_to_Groundwater_DIEC']
for f in features_depth2:
    features_depth.remove(f)

res_windows = plot_window_corr(
    auser,
    features_depth,
    features_depth2,
    ewm_max=500,
)

res_windows, style = format_res_windows(res_windows, features_depth, features_depth2)
# Entries equal to -1 are reset to 0 in place; `ewm` rejects a negative `com`.
# NOTE(review): -1 presumably marks "no window selected" — confirm.
res_windows[res_windows == -1] = 0
display(style)

# Add one smoothed copy of each input depth series per (target, input) pair;
# the window is used as `com` and appended to the column name.
for f_main, f_lag in product(features_depth, features_depth2):
    window = res_windows.loc[f_main, f_lag]
    smoothed_depth = auser[f_lag].ewm(com=window, min_periods=65).mean()
    auser_df[f'{f_lag}_{window}'] = smoothed_depth

# Start the shared registry of depth windows:
# {depth target: {input depth feature: window}}.
depth_windows = {
    target: {feature: res_windows.loc[target, feature] for feature in features_depth2}
    for target in features_depth
}

_ = plot_corr_for_targets(features_depth, features_depth2, res_windows, auser_df)

In [None]:
# Luco: split the Depth columns into the ones kept as targets
# (`features_depth`) and three that are used as input features instead
# (`features_depth2`). `remove` raises ValueError if a listed column is absent.
features_depth = [column for column in luco.columns if 'Depth' in column]
features_depth2 = ['Depth_to_Groundwater_Pozzo_1', 'Depth_to_Groundwater_Pozzo_3',
                   'Depth_to_Groundwater_Pozzo_4']
for f in features_depth2:
    features_depth.remove(f)

res_windows = plot_window_corr(
    luco,
    features_depth,
    features_depth2,
    trend='negative',
    ewm_max=500,
)

res_windows, style = format_res_windows(res_windows, features_depth, features_depth2)
# Entries equal to -1 are reset to 0 in place; `ewm` rejects a negative `com`.
# NOTE(review): -1 presumably marks "no window selected" — confirm.
res_windows[res_windows == -1] = 0
display(style)

# Add one smoothed copy of each input depth series per (target, input) pair;
# the window is used as `com` and appended to the column name.
for f_main, f_lag in product(features_depth, features_depth2):
    window = res_windows.loc[f_main, f_lag]
    smoothed_depth = luco[f_lag].ewm(com=window, min_periods=65).mean()
    luco_df[f'{f_lag}_{window}'] = smoothed_depth

# Save the depth windows under keys with 'Pozzo' rewritten to 'Pozzo_luco'.
# NOTE(review): presumably this keeps Luco's targets from colliding with
# same-named targets of other datasets in the shared dict — confirm.
luco_depth_windows = {
    target.replace('Pozzo', 'Pozzo_luco'): {feature: res_windows.loc[target, feature]
                                            for feature in features_depth2}
    for target in features_depth
}
depth_windows.update(luco_depth_windows)

# Correlation plot currently disabled:
# _ = plot_corr_for_targets([features_depth], features_depth2, res_windows, luco_df)

In [None]:
# Persist the selected depth windows. The context manager guarantees the file
# is flushed and closed, even if dumping fails.
with open('depth_windows.pkl', 'wb') as pkl_file:
    pickle.dump(depth_windows, pkl_file)