In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import datetime

from collections import defaultdict

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import explained_variance_score, mean_squared_error, mean_absolute_error

from skopt import gp_minimize
from skopt.space import Real, Integer

from graphviz import Digraph
from IPython.display import SVG

import warnings

from lightgbm import LGBMRegressor

In [None]:
def fix_date(df):
    """Parse the day-first ``Date`` column of ``df`` and use it as the index.

    The raw ``Date`` values are ``dd/mm/YYYY`` strings, which pandas does not
    read in correctly by default, so the format is spelled out explicitly.
    ``df`` is modified in place (nothing is returned):

    - ``Date`` is converted to datetimes and also becomes the index
      (the column itself is kept),
    - a ``dayofyear`` column is added.
    """
    parsed_dates = pd.to_datetime(df['Date'], format="%d/%m/%Y")
    df['Date'] = parsed_dates
    df.set_index('Date', drop=False, inplace=True)
    # the index now holds the parsed dates, so read the day of year from it
    df['dayofyear'] = df.index.dayofyear

In [None]:
# The main rainfall effect model

def rainfall_effect(
    # data parameters
    rain_series, # pd series where index is date and value is amount of rain which fell that day
    start_date = None,
    end_date = None,
    # simulation parameters
    fraction_retained = 0.9, # fraction of water retained each day (vs. fraction that is carried away elsewhere - pooling, transpiration, etc.)
    first_day_flow = 0.004, # what fraction of the rain takes effect on the first day
    funnel_start_width = 0.0, # when 0, funnel is cone-shaped; when large, funnel is closer to cylinder-shaped.
    time_gap = 0, # integer(days) - how long does it take even the first water to reach the area of interest
):
    """Spread each day's rainfall over the following days with a "funnel" model.

    For every day from ``start_date`` to ``end_date`` (inclusive, one-day steps)
    the rain recorded for that day is released over the subsequent days. On each
    of those days the outflow is ``area * flow_speed * retained_remaining``,
    capped at the water still unclaimed; afterwards both running amounts are
    multiplied by ``fraction_retained`` and the funnel area grows by 2.

    Parameters
    ----------
    rain_series : pd.Series
        Rain amounts indexed by date. Dates missing from the index count as 0.
    start_date, end_date : optional
        Simulation window; default to the first / last valid index of
        ``rain_series``.
    fraction_retained : float
        Multiplier applied each day to the water that has not flowed out yet.
    first_day_flow : float
        Fraction of a day's rain that takes effect on the first effect day.
    funnel_start_width : float
        Extra width of the funnel base; the first-day area is
        ``funnel_start_width + 1``.
    time_gap : int
        Days between the rain falling and its first effect (cast with ``int``).

    Returns
    -------
    collections.defaultdict
        Maps each date to the total rain "taking effect" on that date. Every
        simulated input date is present, with an explicit 0 if nothing arrived.
        Empty when there is no window to simulate (e.g. an empty or all-NaN
        ``rain_series`` with no explicit dates).
    """
    # calculate default start and end date
    if start_date is None:
        start_date = rain_series.first_valid_index()
    if end_date is None:
        end_date = rain_series.last_valid_index()

    # total rain "taking effect" on a given day
    rain_effect = defaultdict(int)

    # An empty or all-NaN series has no valid index, which leaves the bounds as
    # None; comparing them below would raise TypeError, so return early.
    if start_date is None or end_date is None:
        return rain_effect

    # calculate flow speed per "funnel unit"
    first_day_area = funnel_start_width+1.0
    flow_speed = first_day_flow/first_day_area

    # loop-invariant time steps
    one_day = datetime.timedelta(1)
    lag = datetime.timedelta(int(time_gap))
    known_dates = rain_series.index

    # process rain coming in each day
    current_date = start_date
    while current_date <= end_date:
        rain = 0
        if current_date in known_dates:
            rain = rain_series[current_date]

        # start with explicit 0 for each input date
        rain_effect.setdefault(current_date, 0)

        # iterate through upcoming days and calculate effect of rain which reaches the body *on that day*
        # retained_remaining tracks the effect of retention/drainage only (it ignores actual outflow);
        # effectively, it is used to infer how much water is left which originated on a given "daily level" of the funnel.
        retained_remaining = rain
        total_water_remaining = rain # actual amount of rain remaining "un-claimed"
        current_area_factor = first_day_area
        rain_effect_date = current_date + lag
        # NaN rain fails both >= comparisons, so missing readings contribute nothing
        while retained_remaining >= 0.01 and total_water_remaining >= 0.01 and rain_effect_date <= end_date:
            water_out = current_area_factor*flow_speed*retained_remaining
            water_out = min(water_out, total_water_remaining)

            # update totals
            rain_effect[rain_effect_date] += water_out
            total_water_remaining -= water_out

            # update running state variables
            retained_remaining *= fraction_retained
            total_water_remaining *= fraction_retained
            current_area_factor += 2
            rain_effect_date += one_day

        current_date += one_day

    return rain_effect

In [None]:
def global_error_var(correct, pred):
    """Error metric: population variance (``ddof=0``) of the residuals ``correct - pred``."""
    residuals = correct - pred
    return residuals.var(ddof=0)

In [None]:
# Helper functions to be able to define more elaborate priors for gp_minimize than just "uniform" and "log-uniform"
 
def make_inverse_loguniform_prior(name, lower=None, upper=None):
    """Build a reversed log-uniform prior, so that bigger values are more likely.

    The search dimension is a log-uniform ``Real`` over
    ``[eps, upper - lower]``; the returned ``convert`` maps a sampled ``x`` to
    ``upper - x``.

    Parameters
    ----------
    name : str
        Name given to the skopt dimension.
    lower, upper : float, optional
        Bounds of the converted value; default to ``eps`` and ``1 - eps``
        (``eps`` is the float machine epsilon).

    Returns
    -------
    (convert, dimension)
        The conversion function and the skopt ``Real`` dimension.
    """
    eps = np.finfo(float).eps
    lower = 0 + eps if lower is None else lower
    upper = 1 - eps if upper is None else upper

    def convert(x):
        return upper-x

    dimension = Real(0 + eps, upper-lower, name=name, prior='log-uniform')
    return convert, dimension

def make_logit_prior(name, s = 1, m = 0, lower=None, upper=None):
    """Set up a dimension whose converted value follows a logistic distribution.

    The search dimension is a ``Real`` on a sub-interval of (0, 1); the returned
    conversion applies the scaled logit ``m + s*log(x/(1-x))`` to a sampled
    ``x``, giving a logistic distribution with location ``m`` and scale ``s``.

    Parameters
    ----------
    name : str
        Name given to the skopt dimension.
    s, m : float
        Scale and location of the logistic distribution.
    lower, upper : float, optional
        Bounds on the converted value. Each is nudged inward by the float
        machine epsilon and mapped through the logistic CDF to get the matching
        bound on ``x``; when omitted, ``x`` is bounded by ``eps`` / ``1 - eps``.

    Returns
    -------
    (convert_using_logit, dimension)
        The conversion function and the skopt ``Real`` dimension.
    """
    eps = np.finfo(float).eps

    def logistic_cdf(value):
        # inverse of convert_using_logit
        return 1/(1+np.exp((m-value)/s))

    # x should be between 0 and 1
    def convert_using_logit(x):
        return m+np.log(x/(1-x))*s

    lower_x = 0 + eps if lower is None else logistic_cdf(lower + eps)
    upper_x = 1 - eps if upper is None else logistic_cdf(upper - eps)

    dimension = Real(lower_x, upper_x, name=name)
    return convert_using_logit, dimension

In [None]:
def make_rain_func_and_dimensions(
    rain_series, 
    retained_prior = None,
    flow_prior = None,
    width_prior = None,
    lag_prior = None,
    verbose=True
):
    """Wrap the rain effect calculation so it can be plugged into gp_minimize.

    Each ``*_prior`` argument is a ``(conversion, dimension)`` pair for one
    ``rainfall_effect`` parameter, in the order fraction_retained,
    first_day_flow, funnel_start_width, time_gap. Defaults are generated for
    any prior left as ``None``.

    Returns
    -------
    (calc_rain_from_vector, dimensions, convert)
        - ``calc_rain_from_vector(x)``: converts the raw optimizer vector ``x``
          and returns the rain effect on ``rain_series`` as a ``pd.Series``
          (printing the converted inputs first when ``verbose``),
        - ``dimensions``: the list of four search dimensions,
        - ``convert(x)``: applies the per-parameter conversions to ``x``.
    """
    def identity(x):
        return x

    # Fill in a default (conversion, dimension) pair for every prior not supplied
    if retained_prior is None:
        retained_prior = make_logit_prior('fraction_retained', s=0.3, m=0.9, lower=0.0, upper=1.0)

    if flow_prior is None:
        # we set the lower bound to 0.0001 for practical reasons: lower values don't make much of a cumulative effect, 
        # but they do take a long time to compute because the effect of each day's rainfall is spread out over many more days.
        flow_prior = make_logit_prior('first_day_flow', s=0.05, m=0.01, lower=0.0001, upper=1.0)

    if width_prior is None:
        width_prior = make_logit_prior('funnel_start_width', s=40.0, m=180.0, lower=0.0)

    if lag_prior is None:
        lag_prior = (identity, Integer(0,10, name='time_gap'))

    retained_conv, retained_dim = retained_prior
    flow_conv, flow_dim = flow_prior
    width_conv, width_dim = width_prior
    lag_conv, lag_dim = lag_prior

    conversions = [retained_conv, flow_conv, width_conv, lag_conv]
    dimensions = [retained_dim, flow_dim, width_dim, lag_dim]

    def convert(x):
        # the i-th raw value goes through the i-th conversion
        return [conversions[position](raw_value) for position, raw_value in enumerate(x)]

    def calc_rain_from_vector(x):
        fraction_retained, first_day_flow, funnel_start_width, time_gap = convert(x)

        if verbose:
            print('inputs:', fraction_retained, first_day_flow, funnel_start_width, time_gap)

        effect_by_date = rainfall_effect(
            rain_series,
            fraction_retained = fraction_retained,
            first_day_flow = first_day_flow,
            funnel_start_width = funnel_start_width,
            time_gap=time_gap
        )
        return pd.Series(effect_by_date)

    return calc_rain_from_vector, dimensions, convert

In [None]:
# use Bayesian optimization with linear regression to fit ML model of rain effects, plus any set of linear-effect parameters.
def fit_rain_effects(
    # inputs
    rains_df, 
    ground_truth, 
    make_rain_func=make_rain_func_and_dimensions,
    additional_fields=None, # additional fields to throw into linear regression
    # options
    error_func = global_error_var,
    spinup=30, # currently unused
    verbose=True,
    nrandom=20,
    ntotal=100,
    x0=None, # optional input point(s) to try for gp_minimize; e.g. best overall results of previous runs
    random_state=None, # forwarded to gp_minimize; set to an int for reproducible runs
):
    """Fit rain effect parameters with gp_minimize and return a prediction function.

    For every column of ``rains_df`` a rain effect calculator and its search
    dimensions are created with ``make_rain_func``. Each optimizer step then:

    1. computes the rain effect of every rain column from its slice of the
       candidate vector,
    2. fits a ``LinearRegression`` from those effects (plus the columns of
       ``additional_fields``, if given) to ``ground_truth``, using only rows
       without nulls,
    3. returns ``error_func(ground_truth, prediction)`` as the value to minimize.

    The regression and row count of the lowest-error step are kept.

    Parameters
    ----------
    rains_df : pd.DataFrame
        One column per rainfall series.
    ground_truth : pd.Series
        Target variable.
    make_rain_func : callable
        Factory called as ``make_rain_func(series, verbose=verbose)``; must
        return ``(calc, dimensions, convert)``.
    additional_fields : pd.DataFrame, optional
        Extra linear-effect features for the regression.
    error_func : callable
        ``error_func(correct, pred)`` returning the error to minimize.
    spinup : int
        Not used by the current implementation.
    verbose : bool
        Print intermediate results for every optimizer step.
    nrandom, ntotal : int
        Passed to gp_minimize as ``n_random_starts`` and ``n_calls``.
    x0 : list, optional
        Initial point(s) gp_minimize should evaluate.
    random_state : int or None
        Passed to gp_minimize unchanged.

    Returns
    -------
    callable
        The prediction function built by ``generate_prediction_function``.
    """
    calcs = []
    rain_names = []
    converts = []
    all_dims = []
    dims_per_rain = 0
    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0
    for rain_name, rain_series in rains_df.items():
        calc, dims, convert = make_rain_func(rain_series, verbose=verbose)
        dims_per_rain = len(dims)
        calcs.append(calc)
        converts.append(convert)
        rain_names.append(rain_name)
        all_dims += dims

    additional_names = []
    if additional_fields is not None:
        additional_names = list(additional_fields.columns)

    # Build a new list here: extending an alias of rain_names in place would
    # also append the additional field names to rain_names itself.
    reg_fields = rain_names + additional_names

    optimal_error = None
    optimal_linreg = None
    optimal_n = None

    def calculate_error(x):
        prediction_frame = ground_truth.to_frame(name='ground_truth')

        for i, rain_calc in enumerate(calcs):
            if verbose:
                print(rain_names[i])
            rain_result = rain_calc(x[dims_per_rain*i:dims_per_rain*(i+1)])
            prediction_frame[rain_names[i]] = rain_result
            # dates with a target value but no computed rain effect count as zero effect
            missing_effect = (prediction_frame['ground_truth'].notnull()) & (prediction_frame[rain_names[i]].isnull())
            prediction_frame.loc[missing_effect, rain_names[i]] = 0

        if additional_fields is not None:
            prediction_frame[additional_names] = additional_fields

        without_nulls = prediction_frame.dropna().copy()

        reg = LinearRegression().fit(without_nulls[reg_fields], without_nulls['ground_truth'])
        if verbose:
            print('rescale parameters:', reg.coef_, reg.intercept_)

        without_nulls['pred'] = reg.predict(without_nulls[reg_fields])

        error = error_func(without_nulls['ground_truth'], without_nulls['pred'])
        if verbose:
            print('error value:', error)
            print()

        # remember the regression that goes with the best error seen so far
        nonlocal optimal_error, optimal_linreg, optimal_n
        if optimal_error is None or error < optimal_error:
            optimal_error = error
            optimal_linreg = reg
            optimal_n = len(without_nulls)

        return error

    res = gp_minimize(
        calculate_error,  # function to minimize
        all_dims,             # dimension configuration
        acq_func="gp_hedge",    # acquisition function (PI = optimize probability of reducing error; 'gp_hedge' - guess/vary)
        n_calls=ntotal,      # number of evaluations of f
        n_random_starts=nrandom, # first n calls are random (avoid local minima) 
        x0=x0, # input points to definitely try
        random_state=random_state,
    )  

    return generate_prediction_function(rain_names, additional_names, res, converts, optimal_linreg, optimal_n)

In [None]:
def get_bic(errvar, n, k):
    """Calculate BIC from error variance (making the gaussian assumption).

    Returns ``n*ln(errvar) + k*ln(n)`` for error variance ``errvar``, ``n``
    data points and ``k`` fitted parameters.
    """
    fit_term = n*np.log(errvar)
    complexity_penalty = k*np.log(n)
    return fit_term + complexity_penalty

In [None]:
def generate_prediction_function(rain_names, additional_field_names, fit_result, conversions, optimal_linreg, training_n, verbose=True):
    """Given the outputs of a rain effect model fit, build the matching predictor.

    ``fit_result.x`` is split into equal slices, one per entry of
    ``conversions``; each slice is converted into the four ``rainfall_effect``
    parameters of the corresponding rain series. With ``verbose`` the fitted
    parameters, regression coefficients, intercept, raw optimizer vector, error
    value and BIC are printed.

    Returns
    -------
    callable
        ``predict_from_rain(rain_fields, additional_fields=None)``: computes
        the rain effect for every name in ``rain_names`` from the columns of
        ``rain_fields``, appends the columns of ``additional_fields`` if given,
        drops rows containing nulls and returns ``optimal_linreg``'s prediction
        as a ``pd.Series`` indexed like the remaining rows.
    """
    # get converted parameters for rain effect calculations
    raw_vector = fit_result.x
    dims_per_rain = len(raw_vector)//len(conversions)
    all_rain_params = [
        convert(raw_vector[dims_per_rain*i:dims_per_rain*(i+1)])
        for i, convert in enumerate(conversions)
    ]

    if verbose:
        for name, params in zip(rain_names, all_rain_params):
            print(f'Parameters for {name}: {params}')
        print('Scaling:')
        for name, coef in zip(rain_names+additional_field_names, optimal_linreg.coef_):
            print(f'  {name}: {coef}')
        print(f'Translation parameter: {optimal_linreg.intercept_}')
        print(f'raw gp_minimize parameters: {fit_result.x}')
        print(f'error value: {fit_result.fun}')
        # parameter count = rain effect parameters + regression coefficients + intercept
        n_params = len(fit_result.x)+len(optimal_linreg.coef_)+1
        bic = get_bic(fit_result.fun, training_n, n_params)
        print(f'BIC (assuming error metric is error variance): {bic}')

    # function to generate prediction from trained parameters
    def predict_from_rain(rain_fields, additional_fields=None):
        pred_df = pd.DataFrame(index=rain_fields.index)

        for rain_name, rain_params in zip(rain_names, all_rain_params):
            fraction_retained, first_day_flow, funnel_start_width, time_gap = rain_params
            effect_by_date = rainfall_effect(
                rain_fields[rain_name],
                fraction_retained = fraction_retained,
                first_day_flow = first_day_flow,
                funnel_start_width = funnel_start_width,
                time_gap = time_gap,
            )
            pred_df[rain_name] = pd.Series(effect_by_date)

        if additional_fields is not None:
            pred_df[list(additional_fields.columns)] = additional_fields

        complete_rows = pred_df.dropna()
        return pd.Series(optimal_linreg.predict(complete_rows), index=complete_rows.index)

    return predict_from_rain


# Overview

My approach to this problem aims to balance physics-based methods with numerical (statistical) methods to arrive at a Machine Learning model which captures important physical characteristics of the system, but still takes a data-driven approach both to finding evidence for the effect of those physical properties and to actually forecasting and making predictions.

In my view, the model I describe here has several specific advantages:
- Checks and balances: reasoning about the physics of a system can validate the choice of a particular ML model, and the fit of the ML model can validate the physical assumptions.
- A physically-plausible model of the way rainfall affects a water body, with an emergent distribution that's supported by the data.
    - this model can produce a more nuanced and realistic result than simply generating an exponential-decay feature from the rainfall and using it in a black-box ML model. Additionally, the parameters of this model are tuned as part of the model fit, instead of having to tune the decay factor as part of hyperparameter optimization (or just eyeballing/hand-picking it).
    - it also allows for interpretation of the model parameters 
- No imputation of data needed, except in some cases it may be helpful to impute rain - otherwise, the method is robust to missing data.
- A way of quantifying uncertainty and simulating specific scenarios (e.g. a dry year)
- Never uses target variables in prediction
- In many cases, can make sensible predictions for up to a year or more
- A rigorous and mathematical method of parameter/model selection using the Bayesian Information Criterion


In [None]:
#  This submission aims to balance physics-based methods with numerical methods by establishing a physics-based relationship model between the fields and then using ML to find the best parameters for the model.

## The Model

Conceptually, my approach is built on one overall model of how the known features of a water body interact. This model is then  adapted for each type of desired output, as well as specific properties of each particular water body.

In [None]:
# Overview diagram of how the features interact in the general model
flow = Digraph(graph_attr={'ranksep':'1'})

# nodes: inputs first, then the two kinds of target variable
flow_nodes = [
    ('R', 'Rainfall', {'penwidth': '3'}),
    ('T', 'Temperature and/or seasonality', {'penwidth': '2'}),
    ('O', 'Latent factors and\n factors specific to water body', {'shape': 'rect', 'penwidth': '2'}),
    ('F', 'Flow rate\n(flowing water body)', {'shape': 'octagon'}),
    ('DD', 'Depth and/or \nChange in depth\n(standing water body)', {'shape': 'octagon'}),
]
for node_id, node_label, node_attrs in flow_nodes:
    flow.node(node_id, node_label, **node_attrs)

# every input feeds both targets
for source, target in [('R', 'F'), ('R', 'DD'), ('O', 'F'), ('O', 'DD'), ('T', 'F'), ('T', 'DD')]:
    flow.edge(source, target)

# circular dependency between the targets; constraint='false' so these edges don't affect node ranking
for source, target in [('F', 'DD'), ('DD', 'F')]:
    flow.edge(source, target, constraint='false')

flow

The figure above shows the general framework of how different features interact in the model. It is fairly straightforward, though there are a couple of notable points:
- The **Rainfall** tends to be the most important input feature, since it is the main way that water enters the system.
- The model primarily seeks to predict the **change in depth** of a stationary water body, rather than attempting to predict the depth directly. The depth itself can then be derived from the change in depth by a simple cumulative sum operation. This means that the model is always reasoning about how water travels over time, which simplifies it significantly.
- There tends to be a **circular dependency** between the flow rate and the depth (or change in depth). For example, a spring's flow rate depends partially on the amount of water currently present in its source aquifer; but the aquifer's water level also changes due to discharging some of the water through the spring. A lot of the time, it is beneficial to model these two directions separately.
- Temperature and season-based effects are often conflated, and temperature is used as a proxy for seasonality, without necessarily committing to a physical model of that particular effect. This is because temperature-based and season-based physical effects can be hard to reason about without having access to lots of different esoteric data (e.g. humidity, vegetation types and quality, human factors, and any number of other properties of the location where the water body is based). Indeed, it is often easy to find potential justification for a temperature-based effect, then "turn the graph upside down" and find an equally plausible explanation for the opposite effect.



## Rainfall Effect
Because rainfall tends to be the most important feature, one of the main building blocks of the overall model is a model of **the effect of rainfall on a particular water body**, specifically, how the effect of rain which fell on a particular day is distributed over time. In this model, the effect of a single day's rain on some subsequent day $t$ is distributed as 
$\frac{t}{e^t}$:

In [None]:
# Shape of the rainfall effect over time: t / e^t (axis labels hidden - only the shape matters)
x = np.arange(0,8,0.1)
y = np.divide(x, np.exp(x))
plt.plot(x,y)
plt.tick_params(axis='both', labelbottom=False, labelleft=False)
plt.show()

This means that the effect has a linear ramp-up and an exponential decay; this kind of ramp-up and decay can frequently be observed in the data when there's a large amount of rain in one day, followed by a dry period.

In practice, this is modeled as the rain coming down some watershed area into the affected water body, and the $\frac{t}{e^t}$ distribution is an **emergent property** of the model (not a hard-coded distribution).  This allows for the model to be based on four physically interpretable parameters, namely:
- fraction of remaining water retained each day (vs. water lost to transpiration, pooling, diversion, etc.)
- the fraction of total rainfall which affects the water body *on the very first day*
- the shape of the watershed "funnel" area, expressed as the "width" of the base of the watershed (compared to the rest of the watershed). Small "width" means that the watershed is more cone-shaped (rain from a wide area collects down to a single point), while large "width" means that the watershed is more cylinder-shaped (rain tends to go down in a single direction, ending in a wide range of specific locations; e.g. all along some length of a river).
- a "time lag" factor which makes it so that water takes some number of days to get from the base of the watershed to the actual affected water body (e.g. to seep down into an aquifer after falling on the ground)

The fraction of water lost each day creates the exponential decay effect, and the funnel/watershed model creates the linear ramp-up. The funnel shape affects how steep the exponential decay is: the effect is more drawn out with a wider base. The time lag, of course, delays the entire effect.

Machine learning techniques are used to find the best-fitting values for each of these four parameters, for any given water body and dependent variable.

## Workflow
The overall workflow of the model is the same for each water body:
1. Establish dependencies between features and fit model
    - In some cases, it is necessary to establish which dependencies matter based on the model fit. In those cases, the [Bayesian Information Criterion](https://en.wikipedia.org/wiki/Bayesian_information_criterion) (BIC) values of several candidate models are compared to select the most appropriate model.
2. Forecast dependent variable based on trends of the input variables
3. Optionally, estimate uncertainty of the forecast: what is the *range* of possibilities, and how widely can they vary?

I will use the first water body (the Petrignano aquifer) to illustrate this basic process in detail, and then explain variations and elaborations as they come up in other water bodies.

# Aquifers (and detailed model description)

## Petrignano

In [None]:
# Load the Petrignano aquifer data and index it by date
petrignano = pd.read_csv('../input/acea-water-prediction/Aquifer_Petrignano.csv')
fix_date(petrignano)

depth_fields = ['Depth_to_Groundwater_P24', 'Depth_to_Groundwater_P25']
temp_fields = ['Temperature_Bastia_Umbra', 'Temperature_Petrignano']

# Drop data that's set to exactly 0 (in cases where this doesn't make sense)
zero_is_missing = ['Volume_C10_Petrignano', 'Hydrometry_Fiume_Chiascio_Petrignano', *temp_fields]
for t in zero_is_missing:
    petrignano.loc[petrignano[t]==0, t] = np.nan

# Additional useful features: day-over-day change of the P24 depth
petrignano['delta_depth'] = petrignano['Depth_to_Groundwater_P24'].diff()

In [None]:
# Summary figure: one panel per group of fields, all sharing the date axis
fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(nrows=5, figsize=(15, 10), sharex=True)
fig.suptitle('Petrignano Data Overview', fontsize=16)

for d in depth_fields:
    petrignano[d].plot(ax=ax1)
ax1.set_title('Depths')

rain_120d = petrignano['Rainfall_Bastia_Umbra'].rolling(120).mean()
rain_120d.plot(ax=ax2)
ax2.set_title('Rainfall - rolling average over 120 days')

volume_30d = petrignano['Volume_C10_Petrignano'].rolling(30, center=True).mean()
volume_30d.plot(ax=ax3)
ax3.set_title('Volume (water withdrawn from well) - rolling average over 30 days')

for t in temp_fields:
    petrignano[t].plot(ax=ax4)
ax4.set_title('Temperatures')

petrignano['Hydrometry_Fiume_Chiascio_Petrignano'].plot(ax=ax5)
ax5.set_title('Hydrometry (river flow)')

plt.tight_layout()
plt.show()

### 1. Dependencies and Model

The graph below shows the modeled dependencies between features. Dotted lines represent dependencies that were not used in the model, even though they are probably present in real life. Each color represents a separately trained component model for predicting a particular feature from its inputs.


*Side note - why Volume Withdrawn is unused:* Conceptually, the volume of water withdrawn from a well both affects the groundwater depth (it depletes groundwater) and depends on it (the operators may be unwilling and/or unable to withdraw a lot of water if there is not much left). However, in practice, the volume of water withdrawn from well C10 was not used in the final model because it consistently made the model perform worse. Looking at the graphs above, we can see that the volume withdrawn correlates directly with the total depth in both of the other wells: when there is more water in the aquifer, more water tends to be withdrawn, and vice versa. If the water withdrawn had a large effect on groundwater depth, we would expect the opposite correlation: when more water is withdrawn, there is *less* water remaining. Since we don't see this correlation (and since the model doesn't benefit from the volume data), we can conjecture that the water is being withdrawn sustainably from the well so as to not deplete the aquifer, and/or that well C10 is quite far away from the wells with depth data. Indeed, [this paper](https://www.researchgate.net/publication/26812692_The_Sustainable_Pumping_Rate_Concept_Lessons_from_a_Case_Study_in_Central_Italy) suggests that the aquifer was exploited heavily around 1998-2004, but after that presumably efforts were made to use it more sustainably.

In [None]:
# Dependency diagram for Petrignano; vol_depth is a subgraph that keeps its nodes on the same rank
petr_graph = Digraph(graph_attr={'ranksep':'1'})
vol_depth = Digraph(graph_attr={'rank':'same'})

# nodes of the main graph
petr_graph.node('R', 'Rainfall')
petr_graph.node('T', 'Temperature')
petr_graph.node('H', 'River Hydrometry', color='blue')

# nodes of the same-rank subgraph
vol_depth.node('D', 'Change in Depth', shape='octagon', color='green')
vol_depth.node('V', 'Volume Withdrawn', style='dotted')

# coloured edges of the main graph
coloured_edges = [
    ('R', 'H', 'blue'),
    ('R', 'D', 'green'),
    ('T', 'D', 'green'),
    ('T', 'H', 'blue'),
    ('H', 'D', 'green'),
]
for source, target, edge_color in coloured_edges:
    petr_graph.edge(source, target, color=edge_color)

# dotted edges between depth and volume, in both directions
for source, target in [('D', 'V'), ('V', 'D')]:
    vol_depth.edge(source, target, style='dotted')

petr_graph.subgraph(vol_depth)
petr_graph

#### Rainfall + Temperature -> River Hydrometry

The Hydrometry data for the Chiascio river exhibits a clear exponential-decay pattern, with the peaks mapping to days with heavy rainfall. So the Rainfall Effect model makes sense here. 

For illustration purposes, the graph below shows a slice of the hydrometry and rainfall data alongside one semi-plausible, hand-picked set of parameters for a rainfall effect prediction. 

I chose the parameters based on observations about the hydrometry data:
- the rain starts taking effect immediately, so there is no time gap
- the rain is falling into a river, so the watershed's "funnel base" should be fairly wide
- the rainfall effect tends to ramp up very quickly and decay over about a month, which dictated the `fraction_retained` and `first_day_flow` parameters.


In [None]:
# Hand-picked (not fitted) rainfall effect parameters, for illustration only
manual_guess_params = dict(
    fraction_retained = 0.9, # fraction of water retained each day (vs. fraction that is carried away elsewhere - pooling, transpiration, etc.)
    first_day_flow = 0.002, # what fraction of the total rain takes effect on the first day
    funnel_start_width = 180.0, # when 0, funnel is cone-shaped; when large, funnel is closer to cylinder-shaped.
    time_gap = 0, # integer(days) - how long does it take even the first water to reach the area of interest
)
manual_effect_by_date = rainfall_effect(petrignano['Rainfall_Bastia_Umbra'], **manual_guess_params)
rain_prediction = pd.Series(manual_effect_by_date)

In [None]:
# Compare the manual rainfall effect guess with the hydrometry data on a slice of dates.
# Each series gets its own y axis because the three are in different units.
example_slice = petrignano.loc[datetime.date(2018,6,1):]
fig, ax1 = plt.subplots( figsize=(15, 7))
ax2 = ax1.twinx()
ax2.tick_params(axis='y',labelright=False)
ax3 = ax1.twinx()

h = ax1.plot(example_slice.index,example_slice['Hydrometry_Fiume_Chiascio_Petrignano'], color='green')
r = ax2.bar(example_slice.index, example_slice['Rainfall_Bastia_Umbra'])
p = ax3.plot(rain_prediction.index, rain_prediction, color='red', linestyle='dashed')

# Pass handles and labels both by keyword. Mixing a positional handle list with
# labels= makes matplotlib warn that input may be discarded, which previously
# had to be silenced. Axes.plot returns a list of lines, hence the [0].
fig.legend(
    handles=[h[0], r, p[0]],
    labels=['Hydrometry_Fiume_Chiascio_Petrignano', 'Rainfall_Bastia_Umbra', 'rainfall effect prediction (manual parameter guess)'],
)
plt.xlim(datetime.date(2018,6,1), datetime.date(2019,7,1))
plt.show()

We can see from the graph above that the general shape of the prediction matches the general shape of the hydrometry data pretty well, though there are places where the rainfall-based prediction produces spikes that are too large or too small compared to the actual effect. This illustrates that single-point rainfall data can sometimes lack the precision necessary to make extremely accurate predictions about the effect of rain on an entire area. However, in practice, the model tends to be pretty robust to the resulting noise, and the fit can be quite good anyway. 

**Note** that the rainfall effect prediction and the hydrometry data are shown on separate axes, because they are being measured in different units. Converting from millimeters of rain to the output units is **not** part of the Rainfall Effect model. Instead, the conversion parameters will be found using linear regression *after* a candidate Rainfall Effect output is calculated. Fitting parameters of a complex model is expensive, but scaling with linear regression is cheap. In fact, we can see that even matplotlib's default strategy for scaling each axis is enough to be able to compare the data sets without explicitly finding conversion factors.


Fitting the parameters of the Rainfall Effect model is achieved through [Bayesian Optimization](https://en.wikipedia.org/wiki/Bayesian_optimization). This is a technique which minimizes a potentially expensive-to-evaluate function with unknown properties (in our case, the error resulting from a given set of Rainfall Effect parameters). It does this by evaluating the function at strategic points, in order to gain the most information to refine the posterior belief about the function overall. 

One benefit of this technique, and the Bayesian approach to probability in general, is that it's possible (and in fact, necessary) to specify some prior beliefs about the input parameters. These priors will be used as a starting point for the model. Good priors will make the model converge better and faster. A bad set of priors can slow down the fitting process, but will not *prevent* the model from trying a particular value unless we explicitly assume that this value is impossible (e.g. negative rainfall doesn't make sense).

For this model, I wrote a wrapper around [skopt's gp_minimize](https://scikit-optimize.github.io/stable/modules/generated/skopt.gp_minimize.html) function in order to be able to specify bell-shaped priors: the mean of the distribution is then my best guess of what the parameter should be, and a bell "width" parameter controls how sure I am (narrower = more sure). These priors generally remain at one default setting, but their shape allows for more effective optimization. In particular, the default skopt behavior is to specify hard limits on the possible values of a parameter. But a bell-shaped function can theoretically extend to infinity (though there are in practice numeric limitations), while still keeping the region of *likely* priors manageable.

At each step of the optimization, given a set of parameters suggested by gp_minimize, the wrapper:
- computes the rain effect with those parameters
- uses linear regression to scale these rain effects, *plus a set of optional linear-effect features*, to the target variable
- computes the error metric and returns it back to gp_minimize

The default error metric used is **error variance**. Optimizing for error variance is useful in a number of ways:
- Error variance is needed for [BIC](https://en.wikipedia.org/wiki/Bayesian_information_criterion)-based model selection, so optimizing for it helps ensure the BIC accurately represents the model quality. 
- Error variance is useful for quantifying uncertainty on the training data set, which in turn can be used to estimate uncertainty on the forecast data.
- This metric is very closely related to the Mean Squared Error, so optimizing for it will also optimize the RMSE, which is one of the requested metrics in the data challenge.  

In [None]:
# Hold out the last `num_days_in_test` days of the data as the test set
num_days_in_test = 365
last_date = petrignano.index[-1]
train_cutoff = last_date - datetime.timedelta(days=num_days_in_test-1)
print(f'Splitting train and test data on: {train_cutoff}')
print(f'({num_days_in_test} days in test dataset)')

# NOTE(review): .loc slicing includes both end points, so the row dated
# train_cutoff is part of train_data as well as test_data - confirm that this
# one-row overlap is intended.
train_data = petrignano.loc[:train_cutoff].copy()
test_data = petrignano.loc[train_cutoff:]

# For fitting hydrometry, take only relevant columns and drop all rows with any nulls
# This makes the BIC comparisons more apples-to-apples, because then N (number of input data points) is the same across models.
hydrometry_model_fields = [
    'Rainfall_Bastia_Umbra',
    'Temperature_Bastia_Umbra',
    'Temperature_Petrignano',
    'Hydrometry_Fiume_Chiascio_Petrignano',
]
train_hydrometry_data = train_data[hydrometry_model_fields].dropna().copy()

In [None]:
# x0 gives hints about good input values directly to gp_minimize
# (these are from previous training rounds, usually generated with the same training data set).
# This is unlikely to cause train-test leaks even in cases where the underlying data sets were different,
# since the quality (error metric) of the hint is calculated using the current training data,
# so the hint is accepted or rejected based solely on how well the parameters fit the training data.
# If the suggested parameters do have a good fit with the training data, they would have been found eventually
# through the optimization process, given enough time.
# If they don't, then they will be rejected and better values will be found (also given enough time).
# Anyway, in order to properly refit the model to some other input (e.g. a different train/test split),
# you can get rid of the x0 parameter and change nrandom and ntotal to something large, e.g. 20 and 300.
# This may take a while, I suggest turning on the verbosity.
hydrometry_x0_hints = [
    [0.5526420950379467, 0.47164771912757525, 0.9189958623656255, 0],
    [0.5539522165888526, 0.4506610850117819, 0.9376069254850813, 0],
]

print('Fitting hydrometry from rainfall alone:')
pred_func = fit_rain_effects(
    rains_df=train_hydrometry_data[['Rainfall_Bastia_Umbra']],
    ground_truth=train_hydrometry_data['Hydrometry_Fiume_Chiascio_Petrignano'],
    # limit the number of iterations so that notebook runtime is reasonable
    # (this number of iterations is not actually practical without the x0 hints)
    nrandom=3,
    ntotal=10,
    verbose=False,  # switch this to True (default) to see the results for each iteration
    x0=hydrometry_x0_hints,
)

We can interpret the generated model parameters above as follows:

The model predicts how rain which fell on some watershed area, which can be accurately captured by the mm of rain measurement taken from an instrument at Bastia Umbra, flows down that watershed and into the Chiascio river.

- *0.9634054888302401*: about 96.3% of the remaining water is preserved each day. In other words, about 3.7% of the water that fell as rainfall gets dissipated or diverted each day as it travels toward the river.
- *0.0043234544645566145*: The first day after a rainfall, about 0.4% of the total water reaches the river
- *277.15125538997574*: The "base" of the watershed - the area from which the rain reaches the river in the first day - measures 277 of some abstract area unit; each subsequent "level" of the watershed - the area from which the rain takes `n` days to reach the river - has area `277+2*n` (*This abstract area unit can and does change with each run of the simulation; it only captures the relationships between the levels, and how the total watershed area is distributed among the levels*) 
- *0*: There is no time lag - the water from the rain starts reaching the river immediately.
- *Scaling factor 1.974471914324171*: conversion factor: if the total volume of water which falls on the entire watershed area to generate a 1 mm reading was instantly dumped into the river, it would increase the river level by about 1.97 m in that moment 
- *Translation parameter 2.0381578930345676*:  about 2 meters of the river level can be considered a "pre-existing baseline", in that they are not generated by the rainfall in this particular watershed area.

The graph below shows the generated model predictions (note that unlike my manual guess, the generated predictions *are* in fact plotted on the same axis as the ground truth data, since they have been rescaled as part of the model):

In [None]:
# Plot the fitted prediction against the hydrometry data. The prediction has
# already been rescaled by the model, so it shares ax1 with the ground truth;
# the rainfall bars keep their own (unlabelled) axis.
example_slice = petrignano.loc[datetime.date(2018,6,1):]
rain_prediction = pred_func(train_hydrometry_data[['Rainfall_Bastia_Umbra']])

fig, ax1 = plt.subplots( figsize=(15, 7))
ax2 = ax1.twinx()
ax2.tick_params(axis='y',labelright=False)

h = ax1.plot(example_slice.index,example_slice['Hydrometry_Fiume_Chiascio_Petrignano'], color='green')
r = ax2.bar(example_slice.index, example_slice['Rainfall_Bastia_Umbra'])
p = ax1.plot(rain_prediction.index, rain_prediction, color='red', linestyle='dashed')

# Pass handles and labels both by keyword so each label is tied to its own
# artist. With a positional handle list plus labels=, matplotlib warns that
# input may be discarded; the handles are then collected per axes (both ax1
# lines before the ax2 bars), which pairs the last two labels with the wrong
# artists. Axes.plot returns a list of lines, hence the [0].
fig.legend(
    handles=[h[0], r, p[0]],
    labels=['Hydrometry_Fiume_Chiascio_Petrignano', 'Rainfall_Bastia_Umbra', 'rainfall effect prediction (parameters fit with Bayesian optimization)'],
)
plt.xlim(datetime.date(2018,6,1), datetime.date(2019,7,1))
plt.show()

The generated parameters seem to produce a decent fit, though it's possible that the model underestimates how quickly the effect of the rain should decay; and the magnitude of the rain-related spikes is not always right.

Of course, the quality of the model fit doesn't immediately *guarantee* that the physically interpretable parameters are close to the physical reality. In fact, the same (or extremely similar) exponential-decay rainfall effect curves can often be generated with sets of significantly different parameters. So there can be several physical explanations to the same observed effect.

In particular, the **scaling factor**  - how millimeters of rain translate to meters of river level - can vary widely between candidate solutions with similar fit. That scaling factor implicitly defines the *area* of the watershed from which the rain in question enters the river - a millimeter of rain falling across one squared meter is a very different amount of water than a millimeter of rain falling across a squared kilometer area. But the watershed area is not only completely unknown, but also ambiguous (which rain can we count as "close enough" to the measurement point to be part of "the same" watershed area?..)

On the other hand, the **translation parameter** stays quite similar throughout the model fit process. (we can observe this if we turn on verbosity in the model so that the intermediate results are shown). So we can be pretty sure about the amount of water in the river that's not accounted for solely by the rainfall.

But temperature and/or seasonality may also have an effect on the river's water level. In fact, as we can see from the graph below, there seems to be some correlation between temperature and the prediction error of the rainfall-only model. (I am graphing just one of the available temperature variables, since they both look about the same.)

In this case, the correlation implies that when temperatures are lower, the prediction tends to be too low: the actual river level is higher than the level predicted from rain alone. As I mentioned in the overview, there could be any number of plausible but conflicting explanations for how temperature (and, by proxy, seasons) affect the various water bodies. For example, one explanation could be that when the ground is cold, it tends to absorb less water, and therefore less water is lost, and/or the water is less delayed as it comes down the watershed. Similarly, it could be that in warmer weather, there tend to be many more plants which absorb the water as it makes its way down, and so more water is lost to transpiration. Alternatively, perhaps the river simply has a different "baseline flow" in different seasons.

In [None]:
# Prediction error of the rainfall-only model (prediction minus observation), drawn
# together with temperature on a secondary y-axis to look for correlation.
error = pred_func(train_data[['Rainfall_Bastia_Umbra']]) - train_data['Hydrometry_Fiume_Chiascio_Petrignano'] 
fig, ax1 = plt.subplots( figsize=(15, 5))
ax2 = ax1.twinx()  # second y-axis for the temperature curve

e = ax1.plot(error.index, error)
# Plot the station that the legend below names; previously the Petrignano series was
# drawn while being labelled 'Temperature_Bastia_Umbra'.
t = ax2.plot(train_data.index,train_data['Temperature_Bastia_Umbra'], color='red')
# t = ax2.plot(train_data.index,train_data['Temperature_Petrignano'], color='orange')

with warnings.catch_warnings(): # "mixed positional and keyword arguments" warning, but matplotlib only allows this kind of usage
    warnings.simplefilter("ignore")
    fig.legend([e,t], labels=['Prediction error(rainfall only model)', 'Temperature_Bastia_Umbra'])

Since temperature-based variations are hard to explain conclusively, we can just use a simple model to try to take the effect into account without trying to model an explanation which can be arbitrarily far from reality. 

The simplest thing to do is to just incorporate the temperature data as an additional parameter in the linear regression portion of the model fit. Additionally, since most plausible explanations about seasonality effects involve reasoning about whether it *has been* warm or cold *recently* (and thus whether the ground has gotten cold, or the plants have sprouted, etc.), I use a **rolling average** of the temperature over the past 30 days. This also helps smooth out some of the noise, which can avoid overfitting.

30 days was chosen arbitrarily, by eyeballing some graphs. In principle, given unlimited time (for both development and training the model), one could also incorporate this parameter into the bayesian optimization flow and find the best fit. But the improvement would likely be marginal.

In [None]:
print('Fitting hydrometry from rainfall and one temperature feature:')

# Starting points handed to fit_rain_effects through x0.
onerain_x0 = [
    [0.5526420950379467, 0.47164771912757525, 0.9189958623656255, 0],
    [0.5539522165888526, 0.4506610850117819, 0.9376069254850813, 0],
]
# 30-row rolling mean of the single temperature column, supplied as an extra field.
onerain_extra_fields = train_hydrometry_data[['Temperature_Bastia_Umbra']].rolling(30).mean()

pred_func_onerain = fit_rain_effects(
    rains_df=train_hydrometry_data[['Rainfall_Bastia_Umbra']],
    ground_truth=train_hydrometry_data['Hydrometry_Fiume_Chiascio_Petrignano'],
    additional_fields=onerain_extra_fields,
    nrandom=3,
    ntotal=10,
    verbose=False,
    x0=onerain_x0,
)
# value noted from an earlier run (presumably the reported BIC): -10759.23167816104

As expected, the prediction is better when we incorporate temperature into the model! The error has gone down from about 0.074 to about 0.05. But whenever the fit of a model improves due to increasing the model's complexity, it's natural to ask: are we overfitting? 

This is a question which can be answered using the Bayesian information criterion (BIC). The BIC quantifies the tradeoff between complexity and fit. Specifically, it uses **the likelihood of seeing the data actually observed, if the model was correct** to quantify a model's fit, and the **number of parameters the model has** to quantify its complexity. (*Side note: since neural network-based and boosting-based ML methods use **tons** of parameters - hundreds or thousands - it can be harder to apply BIC in those cases. That is why cross validation tends to be a more popular method for model selection in those cases, though it has its own imprecisions and pitfalls. On the other hand, with BIC, we can reason about model selection and overfitting by **only** examining the training data.*)

There's a slight problem with trying to apply the BIC to just any model: Since the BIC reasons about the *likelihood* of seeing a particular outcome given the model, the underlying assumption is that the model can tell us the likelihood of something. In other words, the model implies a *probability distribution* of observing particular outcomes. But our model just predicts the outcome it thinks is most likely, and doesn't commit to any probability distribution which would tell us how likely it actually is. 

The fix for this is to make a reasonable assumption about what the probability distribution would be, given how the model prediction's errors  are distributed. Namely, if we assume that the errors are iid (independent and identically distributed) and have a normal distribution, the BIC can be calculated as:
$$
n\ln(v)+k\ln(n)
$$

where $v$ is the error variance of our model - the very metric we were optimizing for!

This is, in fact, the BIC that is output after fitting the model. Smaller BIC (i.e. negative BIC with larger absolute value) is better, so **comparing the BICs suggests that the model *with* temperature is better**: the BIC is smaller, so the gains in fit are  justified by the additional complexity.

(the graph below suggests that the normally-distributed error assumption is justified. The iid assumption is harder to justify with just graphs.)

In [None]:
# Residuals (prediction minus observation) of the rainfall + temperature model.
onerain_train_fit = pred_func_onerain(
    train_data[['Rainfall_Bastia_Umbra']],
    train_data[['Temperature_Bastia_Umbra']].rolling(30).mean(),
)
error = onerain_train_fit - train_data['Hydrometry_Fiume_Chiascio_Petrignano']

# Histogram of the residuals to eyeball the normality assumption.
error.hist(bins=30)
plt.title('error distribution(looks normal-ish)')
plt.show()

But wait, we actually have two temperature features available. Would it make sense to try to use both of them? Or would it be redundant, since the two temperatures match each other quite closely?

We can use the BIC to answer that, too:

In [None]:
print('Fitting hydrometry from rainfall and both temperature features:')

# Starting points handed to fit_rain_effects through x0.
twotemp_x0 = [
    [0.5526420950379467, 0.47164771912757525, 0.9189958623656255, 0],
    [0.5539522165888526, 0.4506610850117819, 0.9376069254850813, 0],
]
# 30-row rolling means of both temperature columns, supplied as extra fields.
twotemp_extra_fields = train_hydrometry_data[
    ['Temperature_Bastia_Umbra', 'Temperature_Petrignano']
].rolling(30).mean()

pred_func_twotemp = fit_rain_effects(
    rains_df=train_hydrometry_data[['Rainfall_Bastia_Umbra']],
    ground_truth=train_hydrometry_data['Hydrometry_Fiume_Chiascio_Petrignano'],
    additional_fields=twotemp_extra_fields,
    nrandom=3,
    ntotal=10,
    verbose=False,
    x0=twotemp_x0,
)

The fit does improve very slightly, but the BIC suggests that the improvement is not justified by the increased complexity. So, we will use `Rainfall_Bastia_Umbra` and `Temperature_Bastia_Umbra` as inputs to predict `Hydrometry_Fiume_Chiascio_Petrignano`.

(*Why `Temperature_Bastia_Umbra` and not `Temperature_Petrignano`? It turns out it doesn't matter much which one to use. The resulting error and BIC are actually identical, but I omitted the code verifying this for brevity and clarity.*)

The predictions of the final river flow model are shown below. In principle, it may be possible to add further parameters, for example, to try to account for the occasional really large spikes by adding a second copy of the rain data (physically, this would represent a primary and secondary effect from the same rain falling on the same area: some rain rolls down the ground surface almost immediately, while other rain seeps into the ground and then rolls down more slowly). I tried this out, but it did not improve the BIC. Regardless, the fit of this model will work for our purposes.

In [None]:
# Plot observed hydrometry, rainfall and the rainfall + temperature model prediction.
example_slice = petrignano.loc[datetime.date(2018,6,1):]  # rows from 2018-06-01 onward
# pred_func_onerain was fitted with the 30-day rolling mean of Temperature_Bastia_Umbra,
# so it must be evaluated on that same feature (not on Temperature_Petrignano).
rain_prediction = pred_func_onerain(train_data[['Rainfall_Bastia_Umbra']], train_data[['Temperature_Bastia_Umbra']].rolling(30).mean()) 

fig, ax1 = plt.subplots( figsize=(15, 7))
ax2 = ax1.twinx()  # second y-axis sharing ax1's x-axis; carries the rainfall bars
ax2.tick_params(axis='y',labelright=False)  # hide the right-hand tick labels
# ax3 = ax1.twinx()

# h: observed hydrometry (green), r: rainfall bars, p: model prediction (dashed red)
h = ax1.plot(example_slice.index,example_slice['Hydrometry_Fiume_Chiascio_Petrignano'], color='green')
r = ax2.bar(example_slice.index, example_slice['Rainfall_Bastia_Umbra'])
p = ax1.plot(rain_prediction.index, rain_prediction, color='red', linestyle='dashed')

with warnings.catch_warnings(): # "mixed positional and keyword arguments" warning, but matplotlib only allows this kind of usage
    warnings.simplefilter("ignore")
    fig.legend([h,r,p], labels=['Hydrometry_Fiume_Chiascio_Petrignano', 'Rainfall_Bastia_Umbra', 'rainfall effect prediction (parameters fit with Bayesian optimization)'])
plt.xlim(datetime.date(2018,6,1), datetime.date(2019,7,1))  # zoom to 2018-06-01 .. 2019-07-01
plt.show()

#### River Hydrometry + Rainfall + Temperature -> (change in) Depth to Groundwater

Separately from the hydrometry model, we will train models to predict the daily change in depth-to-groundwater based on the river flow, rainfall, and temperature.

Conceptually, we will try to model that:
- depth *increases* in proportion with the river flow, because the river brings in water (this phenomenon is confirmed in [this paper](https://www.researchgate.net/publication/26812692_The_Sustainable_Pumping_Rate_Concept_Lessons_from_a_Case_Study_in_Central_Italy)) 
- depth also *increases* when rain falls (with the rain taking effect as in the Rainfall Effect model)
- depth *decreases* at a constant rate, as water seeps out, leaves with the river on the other end of the aquifer, evaporates, etc.
- depth may be affected by temperature and/or seasonality, e.g. because the characteristics of earth in the aquifer change with temperature.

Below is the fit of the standard rain-effect model with all the parameters included.

In [None]:
# Columns needed by the change-in-depth models.
depth_model_columns = [
    'Rainfall_Bastia_Umbra',
    'Temperature_Bastia_Umbra', 'Temperature_Petrignano',
    'Hydrometry_Fiume_Chiascio_Petrignano',
    'Depth_to_Groundwater_P24', 'Depth_to_Groundwater_P25',
]
train_depth_data = train_data.loc[:, depth_model_columns].copy()

# delta depth: each row's depth minus the previous row's depth
for d in depth_fields:
    delta_column = 'delta_' + d
    train_depth_data[delta_column] = train_depth_data[d].diff()

# 30-row rolling mean of every temperature column
for t in temp_fields:
    rolling_column = t + '_rolling'
    train_depth_data[rolling_column] = train_depth_data[t].rolling(30).mean()

# drop every row that still contains a NaN in any column
train_depth_data = train_depth_data.dropna()

In [None]:
print('Fitting change-in-depth from rainfall, hydrometry, and temperature:')

# Starting points handed to fit_rain_effects through x0.
depth_x0 = [
    [0.0474258731775668, 0.6367973783355014, 0.9079965422001715, 0],
    [0.0474258731775668, 0.9991795384988705, 0.17941634117347088, 0],
]
# Extra fields: both rolling temperatures plus the river hydrometry.
depth_extra_fields = train_depth_data[[
    'Temperature_Bastia_Umbra_rolling',
    'Temperature_Petrignano_rolling',
    'Hydrometry_Fiume_Chiascio_Petrignano',
]]

pred_depth_func = fit_rain_effects(
    rains_df=train_depth_data[['Rainfall_Bastia_Umbra']],
    ground_truth=train_depth_data['delta_Depth_to_Groundwater_P24'],
    additional_fields=depth_extra_fields,
    nrandom=3,
    ntotal=10,
    verbose=False,
    x0=depth_x0,
)

Interestingly, the model prefers to minimize the amount of rain water retained from day-to-day as the water is on its way down some watershed into the aquifer (the first input parameter for the Rainfall_Bastia_Umbra effect model). This essentially means that when rain falls, **all rainwater either enters the aquifer or dissipates elsewhere on the very first day**. In other words, all rainfall primarily takes effect on the very same day as it falls, proportional to the amount of rain that fell. But that's just a linear effect which we could model directly with linear regression, instead of using the complex but more general rainfall effect model:

In [None]:
print(f'Fitting delta-depth using linear regression(all inputs)')

# Predictors: rainfall, both rolling temperatures and river hydrometry.
linreg_X = train_depth_data.loc[:, [
    'Rainfall_Bastia_Umbra',
    'Temperature_Bastia_Umbra_rolling', 'Temperature_Petrignano_rolling',
    'Hydrometry_Fiume_Chiascio_Petrignano',
]]
linreg_y = train_depth_data['delta_Depth_to_Groundwater_P24']

linreg_delta_depth = LinearRegression()
linreg_delta_depth.fit(linreg_X, linreg_y)

# In-sample predictions, residual variance, and BIC with 4 parameters.
linreg_pred = pd.Series(linreg_delta_depth.predict(linreg_X), index=train_depth_data.index)
linreg_residuals = linreg_pred - linreg_y
linreg_error_var = linreg_residuals.var()
linreg_bic = get_bic(linreg_error_var, len(train_depth_data), 4)
print(f'error variance: {linreg_error_var}')
print(f'BIC: {linreg_bic}')

We can see that with linear regression, the error variance is a tiny bit worse, but the BIC is a bit better. This is because the linear model is less complex: it only fits one additional parameter to incorporate the effect of rain, while the Rainfall Effect model needs 4 additional parameters.

Removing one of the temperature variables again makes the error metric slightly worse, but the BIC slightly better:

In [None]:
print(f'Fitting delta-depth using linear regression (just one temperature)')

# Predictors: rainfall, one rolling temperature and river hydrometry.
linreg_X = train_depth_data.loc[:, [
    'Rainfall_Bastia_Umbra',
    'Temperature_Bastia_Umbra_rolling',
    'Hydrometry_Fiume_Chiascio_Petrignano',
]]
linreg_y = train_depth_data['delta_Depth_to_Groundwater_P24']

linreg_delta_depth = LinearRegression()
linreg_delta_depth.fit(linreg_X, linreg_y)

# In-sample predictions, residual variance, and BIC with 3 parameters.
linreg_pred = pd.Series(linreg_delta_depth.predict(linreg_X), index=train_depth_data.index)
linreg_residuals = linreg_pred - linreg_y
linreg_error_var = linreg_residuals.var()
linreg_bic = get_bic(linreg_error_var, len(train_depth_data), 3)
print(f'error variance: {linreg_error_var}')
print(f'BIC: {linreg_bic}')

Plain linear regression is also MUCH faster to fit, so it's a lot easier to work with. Actually, with such a fast-to-fit model, we can revisit the use of average temperature over 30 days by trying a series of feasible values for the temperature average:

In [None]:
# Sweep the temperature rolling-window width (1..59 rows) and record the in-sample
# error variance of the one-temperature linear regression for each width.
rolling_widths = range(1,60)
errvars = []
for temp_rolling in rolling_widths:
    # rebuild the modelling frame from scratch for this window width
    train_depth_data = train_data.loc[:, [
        'Rainfall_Bastia_Umbra',
        'Temperature_Bastia_Umbra', 'Temperature_Petrignano',
        'Hydrometry_Fiume_Chiascio_Petrignano',
        'Depth_to_Groundwater_P24', 'Depth_to_Groundwater_P25',
    ]].copy()

    # delta depth: each row's depth minus the previous row's depth
    for d in depth_fields:
        train_depth_data['delta_' + d] = train_depth_data[d].diff()

    for t in temp_fields:
        train_depth_data[t + '_rolling'] = train_depth_data[t].rolling(temp_rolling).mean()

    train_depth_data = train_depth_data.dropna()

    linreg_X = train_depth_data.loc[:, [
        'Rainfall_Bastia_Umbra',
        'Temperature_Bastia_Umbra_rolling',
        'Hydrometry_Fiume_Chiascio_Petrignano',
    ]]
    linreg_y = train_depth_data['delta_Depth_to_Groundwater_P24']

    linreg_delta_depth = LinearRegression().fit(linreg_X, linreg_y)

    linreg_pred = pd.Series(linreg_delta_depth.predict(linreg_X), index=train_depth_data.index)
    linreg_error_var = (linreg_pred - linreg_y).var()
    errvars.append(linreg_error_var)

plt.plot(rolling_widths, errvars)
plt.xlabel('average temperature over n days')
plt.ylabel('error variance')
plt.show()

It appears that at least for the aquifer depth, using near-instant temperature is actually a better idea than the 30-day average. So, let's adjust our model a final time. Technically, this means that we have now added one more parameter to the model for BIC calculation purposes, since we got our new rolling average width from fitting the model to the data.

(I will keep the 30 day rolling average for the river flow prediction, since the mechanism of the effect of temperature could well be completely different for that model)

In [None]:
print('delta_Depth_to_Groundwater_P24')
print('fitting new version of linear regression with a smaller rolling temperature span')

# Rebuild the modelling frame, this time with a 2-row rolling temperature mean.
train_depth_data = train_data.loc[:, [
    'Rainfall_Bastia_Umbra',
    'Temperature_Bastia_Umbra', 'Temperature_Petrignano',
    'Hydrometry_Fiume_Chiascio_Petrignano',
    'Depth_to_Groundwater_P24', 'Depth_to_Groundwater_P25',
]].copy()

# delta depth: each row's depth minus the previous row's depth
for d in depth_fields:
    train_depth_data['delta_' + d] = train_depth_data[d].diff()

for t in temp_fields:
    train_depth_data[t + '_rolling'] = train_depth_data[t].rolling(2).mean()

train_depth_data = train_depth_data.dropna()

linreg_X = train_depth_data.loc[:, [
    'Rainfall_Bastia_Umbra',
    'Temperature_Bastia_Umbra_rolling',
    'Hydrometry_Fiume_Chiascio_Petrignano',
]]
linreg_y = train_depth_data['delta_Depth_to_Groundwater_P24']

linreg_delta_depth_p24 = LinearRegression()
linreg_delta_depth_p24.fit(linreg_X, linreg_y)

# In-sample predictions, residual variance, and BIC with 4 parameters.
linreg_pred = pd.Series(
    linreg_delta_depth_p24.predict(linreg_X),
    index=train_depth_data.index,
    name='delta depth prediction',
)
linreg_error_var = (linreg_pred - linreg_y).var()
linreg_bic = get_bic(linreg_error_var, len(train_depth_data), 4)
print(f'error variance: {linreg_error_var}')
print(f'BIC: {linreg_bic}')

In [None]:
# Compare 30-row rolling means of the predicted and the actual delta-depth series
# (the title labels this as well P24). Both pandas .plot() calls pass no `ax`,
# so they rely on the figure created by plt.subplots just before.
plt.subplots(figsize = (15,5))
linreg_pred.rolling(30).mean().plot()
train_depth_data['delta_Depth_to_Groundwater_P24'].rolling(30).mean().plot()
plt.legend()
plt.title('real vs. predicted delta_Depth_to_Groundwater_P24 (rolling mean, 30 days)')
plt.show()

In [None]:
print('Linear regression for delta_Depth_to_Groundwater_P25')

# Same predictors as for well P24: rainfall, one rolling temperature, river hydrometry.
linreg_X = train_depth_data[[
    'Rainfall_Bastia_Umbra',
    'Temperature_Bastia_Umbra_rolling',
    'Hydrometry_Fiume_Chiascio_Petrignano']]
p25_target = train_depth_data['delta_Depth_to_Groundwater_P25']

linreg_delta_depth_p25 = LinearRegression().fit(linreg_X, p25_target)

linreg_pred = pd.Series(linreg_delta_depth_p25.predict(linreg_X), index=train_depth_data.index, name='delta depth prediction')
# Residuals must be taken against the P25 target this model was fitted to
# (they were previously computed against the P24 column, misreporting variance and BIC).
linreg_error_var = (linreg_pred-p25_target).var()
linreg_bic = get_bic(linreg_error_var, len(train_depth_data), 4)
print(f'error variance: {linreg_error_var}')
print(f'BIC: {linreg_bic}')

# Compare 30-row rolling means of predicted and actual delta-depth for well P25.
plt.subplots(figsize = (15,5))
linreg_pred.rolling(30).mean().plot()
train_depth_data['delta_Depth_to_Groundwater_P25'].rolling(30).mean().plot()
plt.legend()
plt.title('real vs. predicted delta_Depth_to_Groundwater_P25 (rolling mean, 30 days)')
plt.show()

Since the depth of the two wells is extremely similar, I have not gone through a separate model selection process for the second well, though I did of course refit the model to the actual data.

The graphs above show the predicted and actual delta-depth for the two wells. I graphed the rolling average over 30 days, because otherwise the data is too noisy to read.

For both wells, we can see that the real data has larger fluctuations, but the model still captures the trend pretty well. Some of these fluctuations can actually be captured by incorporating the data about volume pumped from well C10. However, that would mean that we have to forecast this volume data as part of the forecasting, and I did not want to attempt that kind of logic just for a small potential improvement in the fit. This could be an area for further research.

### 2. Forecasting

Now we have models that explain how change-in-depth can be predicted from rainfall and temperature data (via the intermediate prediction of river flow). But we need to forecast the depth of aquifer **without knowing what the rainfall and temperature will be in the future**.

To do this, I use a very simple model of each input variable: for each day of the year, I aggregate the weekly averages of what the rainfall and temperature was like on that day. This gives a general expectation of what the rain and temperature might be like on that day. I then use these expected values as input to the model to generate a forecast for any number of days in the year.

In [None]:
# Weekly-smoothed inputs: centered 7-row rolling means of rainfall and temperature.
train_data['week_Rainfall_Bastia_Umbra'] = train_data['Rainfall_Bastia_Umbra'].rolling(7, center=True).mean()
train_data['week_Temperature_Bastia_Umbra'] = train_data['Temperature_Bastia_Umbra'].rolling(7, center=True).mean()

# Per-day-of-year mean and variance of the smoothed series; list position d holds
# the statistics for day-of-year d+1.
expectation_data = {
    'week_Rainfall_Bastia_Umbra':[],
    'week_Temperature_Bastia_Umbra':[],
}
var_data = {
    'week_Rainfall_Bastia_Umbra':[],
    'week_Temperature_Bastia_Umbra':[],
}
for d in range(365):
    # Day-of-year d+1 together with the three days on either side, wrapping around the
    # year boundary. The upper bound is d+4 so the window is symmetric (arange excludes
    # its stop value); '+1' converts back to the 1-based 'dayofyear' values.
    nearby_days = np.arange(d-3, d+4)%365+1
    # The row selection does not depend on the field, so do it once per day.
    near_data = train_data[train_data['dayofyear'].isin(nearby_days)]
    for field in expectation_data.keys():
        expectation_data[field].append(near_data[field].mean())
        var_data[field].append(near_data[field].var())


fig, axes = plt.subplots(2, figsize=(10,10))
for field, ax in zip(expectation_data.keys(), axes):
    ax.plot(range(365), expectation_data[field])
    ax.set_title(f'Expected {field} on given day of year')
plt.tight_layout()
plt.show()

In order to make accurate predictions into the future, we combine known rain/temp data from the past with the expected values from the future into one series.

In particular, this allows us to incorporate the effect of recent rains into the predictions for the immediate future, instead of starting with the assumption that it hasn't rained before the prediction period.

One interesting nuance is that even though the expected rainfall and temperature will be the same on a given day of year, regardless of the year itself, the model may well evolve in a way that predicts different values on those days, since the output of the model depends on the cumulative past, not just the current day's input.

In [None]:
# Attach the expected rainfall for each row's day of year. The lookup frame is indexed
# 1..365, so a 'dayofyear' of 366 finds no match in the join.
petrignano_with_exp = petrignano.join(pd.DataFrame(expectation_data['week_Rainfall_Bastia_Umbra'], index=range(1, 366), columns=['rain_for_pred']), on='dayofyear')
# Keep the expected value from train_cutoff onward; before that, substitute the measured rainfall.
petrignano_with_exp['rain_for_pred'] = petrignano_with_exp['rain_for_pred'].where(petrignano.index >= train_cutoff, petrignano['Rainfall_Bastia_Umbra'])

# Same construction for temperature.
petrignano_with_exp = petrignano_with_exp.join(pd.DataFrame(expectation_data['week_Temperature_Bastia_Umbra'], index=range(1, 366), columns=['temp_for_pred']), on='dayofyear')
petrignano_with_exp['temp_for_pred'] = petrignano_with_exp['temp_for_pred'].where(petrignano.index >= train_cutoff, petrignano['Temperature_Bastia_Umbra'])

# Each panel draws the part up to train_cutoff and the part from train_cutoff as two separate lines.
fig, (ax1,ax2)  = plt.subplots(2, figsize=(15,6), sharex=True)
ax1.set_title('actual(training data set) and expected(test data set) rainfall')
petrignano_with_exp.loc[:train_cutoff,'rain_for_pred'].plot(ax=ax1)
petrignano_with_exp.loc[train_cutoff:,'rain_for_pred'].plot(ax=ax1)
ax2.set_title('actual(training data set) and expected(test data set) temperature')
petrignano_with_exp.loc[:train_cutoff,'temp_for_pred'].plot(ax=ax2)
petrignano_with_exp.loc[train_cutoff:,'temp_for_pred'].plot(ax=ax2)
plt.xlim(datetime.date(2010,1,1), None)  # show from 2010-01-01 onward; right limit left automatic
plt.show()

In [None]:
# Rename the combined input columns to the names used when pred_func_onerain was fitted.
river_prediction_input = petrignano_with_exp[['rain_for_pred', 'temp_for_pred']].rename(columns={
    'rain_for_pred':'Rainfall_Bastia_Umbra',
    'temp_for_pred':'Temperature_Bastia_Umbra'
})
# Hydrometry prediction from rainfall plus the 30-row rolling mean of temperature.
river_prediction = pred_func_onerain(river_prediction_input[['Rainfall_Bastia_Umbra']], river_prediction_input[['Temperature_Bastia_Umbra']].rolling(30).mean())

# Store the prediction so the depth models can use it as their hydrometry input.
petrignano_with_exp['hydrometry_for_pred'] = river_prediction

# Observed hydrometry against the prediction, split at train_cutoff into two labelled lines.
plt.subplots(figsize=(15,5))
petrignano['Hydrometry_Fiume_Chiascio_Petrignano'].plot()
river_prediction[:train_cutoff].plot(label='predicted hydrometry (training input)')
river_prediction[train_cutoff:].plot(label='predicted hydrometry (test input - expected values)')
plt.legend()
plt.title('Intermediate step: Hydrometry forecasting')
plt.xlim(datetime.date(2018,7,1), None)  # show from 2018-07-01 onward
plt.show()

We can see that the river flow prediction matches reality fairly reasonably, even though we used the expected values of rain and temperature as input parameters.

In [None]:
# Assemble the regressors' inputs in the order rainfall, temperature, hydrometry.
# NOTE(review): the regressors were fitted on 'Temperature_Bastia_Umbra_rolling' (a 2-row
# rolling mean), while the temperature column here is not rolled and carries a different
# name - confirm this is intended.
depth_prediction_input = petrignano_with_exp[['rain_for_pred', 'temp_for_pred','hydrometry_for_pred']].rename(columns={
    'rain_for_pred': 'Rainfall_Bastia_Umbra',
    'temp_for_pred': 'Temperature_Bastia_Umbra',
    'hydrometry_for_pred': 'Hydrometry_Fiume_Chiascio_Petrignano',
})
# reset the hydrometry input to use actual training data instead of predictions based on training data (TODO: maybe not?.. maybe variance)
#depth_prediction_input['Hydrometry_Fiume_Chiascio_Petrignano'] = depth_prediction_input['Hydrometry_Fiume_Chiascio_Petrignano'].where(depth_prediction_input.index >= train_cutoff,petrignano['Hydrometry_Fiume_Chiascio_Petrignano'] )

# predict both delta-depths
# (rows containing NaN are dropped before predicting; the result is indexed by the surviving rows)
delta_depth_p24 = pd.Series(linreg_delta_depth_p24.predict(depth_prediction_input.dropna()), index=depth_prediction_input.dropna().index)
delta_depth_p25 = pd.Series(linreg_delta_depth_p25.predict(depth_prediction_input.dropna()), index=depth_prediction_input.dropna().index)

# get depth predictions from last known depth in the training data and sequence of delta-depth predictions
# (the starting depth is taken from the last row of train_data; predicted deltas are accumulated from train_cutoff on)
last_train_depth24 = train_data.iloc[-1]['Depth_to_Groundwater_P24']
depth_pred_p24 = delta_depth_p24[train_cutoff:].cumsum()+last_train_depth24
last_train_depth25 = train_data.iloc[-1]['Depth_to_Groundwater_P25']
depth_pred_p25 = delta_depth_p25[train_cutoff:].cumsum()+last_train_depth25

# One panel per well: measured depth from 2018-07-01 onward plus the forecast.
fig, (ax1, ax2) = plt.subplots(2,figsize=(15,5), sharex=True)
petrignano.loc[datetime.date(2018,7,1):,'Depth_to_Groundwater_P24'].plot(ax=ax1)
depth_pred_p24.plot(ax=ax1, label='prediction')
ax1.set_title('Depth_to_Groundwater_P24')
ax1.legend()
petrignano.loc[datetime.date(2018,7,1):,'Depth_to_Groundwater_P25'].plot(ax=ax2)
depth_pred_p25.plot(ax=ax2, label='prediction')
ax2.set_title('Depth_to_Groundwater_P25')
ax2.legend()
plt.xlim(datetime.date(2018,7,1), None)
plt.show()

We can see that the predictions follow the general trend of the ground truth quite closely, and don't deviate even after a year. The model cannot predict fluctuations in depth that happen on a small timescale. Based on our model of how depth depends on the input fields, we know that these fluctuations are very likely the result of very recent inputs into the system, such as how much it rained last week, or how much water was withdrawn in the immediate past. Based on that, I would not expect to be able to predict such fluctuations accurately in the long term regardless of the underlying model.

The requested error metrics(on a daily timescale) are shown below.

In [None]:
# Daily error metrics: line up measured and predicted depths from train_cutoff onward
# and keep only the rows where every value is present.
error_calc_df = petrignano.loc[train_cutoff:, ['Depth_to_Groundwater_P24', 'Depth_to_Groundwater_P25']].copy()
error_calc_df = error_calc_df.assign(pred_24=depth_pred_p24, pred_25=depth_pred_p25)
error_calc_df = error_calc_df.dropna()

actual_24, forecast_24 = error_calc_df['Depth_to_Groundwater_P24'], error_calc_df['pred_24']
actual_25, forecast_25 = error_calc_df['Depth_to_Groundwater_P25'], error_calc_df['pred_25']

# squared=False makes mean_squared_error return the root of the MSE
mae_24 = mean_absolute_error(actual_24, forecast_24)
rmse_24 = mean_squared_error(actual_24, forecast_24, squared=False)
mae_25 = mean_absolute_error(actual_25, forecast_25)
rmse_25 = mean_squared_error(actual_25, forecast_25, squared=False)

print('Error metrics for predictions (daily)\n')
print('Depth_to_Groundwater_P24:')
print(f'  MAE:{mae_24}')
print(f'  RMSE:{rmse_24}')
print('Depth_to_Groundwater_P25:')
print(f'  MAE:{mae_25}')
print(f'  RMSE:{rmse_25}')


In [None]:
# Monthly error metrics: average measured and predicted depths per calendar month first.
monthly_pred = error_calc_df.resample('M').mean()

monthly_actual_24, monthly_forecast_24 = monthly_pred['Depth_to_Groundwater_P24'], monthly_pred['pred_24']
monthly_actual_25, monthly_forecast_25 = monthly_pred['Depth_to_Groundwater_P25'], monthly_pred['pred_25']

# squared=False makes mean_squared_error return the root of the MSE
mae_24 = mean_absolute_error(monthly_actual_24, monthly_forecast_24)
rmse_24 = mean_squared_error(monthly_actual_24, monthly_forecast_24, squared=False)
mae_25 = mean_absolute_error(monthly_actual_25, monthly_forecast_25)
rmse_25 = mean_squared_error(monthly_actual_25, monthly_forecast_25, squared=False)

print('Error metrics for predictions (averaged monthly)\n')
print('Depth_to_Groundwater_P24:')
print(f'  MAE:{mae_24}')
print(f'  RMSE:{rmse_24}')
print('Depth_to_Groundwater_P25:')
print(f'  MAE:{mae_25}')
print(f'  RMSE:{rmse_25}')

### 3. Quantifying uncertainty


The model forecasts a single prediction based on the historic (smoothed) averages of rain and temperature values. But it's very unlikely that this exact prediction will come true, or indeed that the temperature and rain values in the future will match the historic averages exactly. 

We can predict a *distribution* of possible outcomes based on two observed distributions (for simplicity, I'm assuming all distributions will be iid normal):
- The distribution of the smoothed rain and temperature values which we averaged to generate the expected rain and temperature predictions. This is the uncertainty created by unknown future rain and temperature parameters.
- The distribution of errors in the final delta-depth prediction over *known training values*. This is the uncertainty created by an inexact model fit to exact inputs


We can use these values in a Monte Carlo approach to estimating uncertainty:
1. Generate many different predictions based on randomized temperature and rain inputs - the temperature and rain will be drawn from random distributions based on the expected mean and variance.
2. Compute the  mean and variance of the resulting predictions
3. Factor in the variance of the model over the training data - since we are assuming normal distributions, we can simply add variances together.

In [None]:
def generate_randomized_prediction():
    """Return one Monte Carlo sample of the forecast depth for well P25.

    Future rain and temperature inputs (from ``train_cutoff`` onwards) are replaced
    with draws from per-day-of-year normal distributions, pushed through the river
    hydrometry model and the delta-depth regression, and accumulated from the last
    training depth.

    Reads notebook globals: ``expectation_data``, ``var_data``,
    ``depth_prediction_input``, ``petrignano_with_exp``, ``train_cutoff``,
    ``pred_func_onerain``, ``linreg_delta_depth_p25`` and ``last_train_depth25``.

    Returns:
        pd.Series of predicted depth, indexed by date, starting at ``train_cutoff``.
    """
    def generate_randomized_input(row):
        """Draw a random [rain, temperature] pair for the day of year in ``row``."""
        dayofyear = int(row['dayofyear'])
        dayofyear = min(dayofyear, 365) # pretend leap days don't exist
        day_index = dayofyear-1  # the expectation/variance lists are 0-based

        # np.random.normal takes a standard deviation, so each stored variance is
        # square-rooted, and each quantity uses its OWN variance (rain previously
        # used the temperature variance; temperature used the raw variance).
        random_rain = np.random.normal(
            expectation_data['week_Rainfall_Bastia_Umbra'][day_index],
            np.sqrt(var_data['week_Rainfall_Bastia_Umbra'][day_index])
        )
        random_rain = max(0, random_rain) # no negative rain!
        random_temp = np.random.normal(
            expectation_data['week_Temperature_Bastia_Umbra'][day_index],
            np.sqrt(var_data['week_Temperature_Bastia_Umbra'][day_index])
        )
        return [random_rain,random_temp]

    randomized_prediction_input = depth_prediction_input.copy()
    randomized_prediction_input['dayofyear'] = petrignano_with_exp['dayofyear']

    # Only the forecast period is randomized; observed training inputs stay as they are.
    randomized_vals = randomized_prediction_input.loc[train_cutoff:].apply(generate_randomized_input, axis=1, result_type='expand')
    randomized_prediction_input.loc[train_cutoff:, 'Rainfall_Bastia_Umbra'] = randomized_vals[0]
    randomized_prediction_input.loc[train_cutoff:, 'Temperature_Bastia_Umbra'] = randomized_vals[1]

    # First layer: river hydrometry from rain and 30-day mean temperature.
    river_prediction = pred_func_onerain(randomized_prediction_input[['Rainfall_Bastia_Umbra']], randomized_prediction_input[['Temperature_Bastia_Umbra']].rolling(30).mean())

    randomized_prediction_input['Hydrometry_Fiume_Chiascio_Petrignano'] = river_prediction
    randomized_prediction_input = randomized_prediction_input.drop('dayofyear', axis=1).dropna()

    # Second layer: daily change in depth, then accumulate from the last known depth.
    randomized_delta_depth_p25 = pd.Series(linreg_delta_depth_p25.predict(randomized_prediction_input), index=randomized_prediction_input.index)

    randomized_depth_pred_p25 = randomized_delta_depth_p25[train_cutoff:].cumsum()+last_train_depth25
    return randomized_depth_pred_p25

In [None]:

# Draw a single randomized forecast and overlay it on the observed P25 depth.
randomized_depth_pred_p25 = generate_randomized_prediction()
fig, ax = plt.subplots(figsize=(15,2.5))
observed_p25 = petrignano.loc[datetime.date(2018,7,1):,'Depth_to_Groundwater_P25']
observed_p25.plot(ax=ax)
randomized_depth_pred_p25.plot(ax=ax)
plt.show()

The figure above shows a sample randomized prediction.

The figure below shows the mean and two-standard-deviation-wide span of the predicted distribution.
It was generated using 50 simulation rounds, which is probably not enough in practice to get consistent results; but a realistic simulation is a bit too time-consuming to run in a notebook that's trying to cover a lot of ground.

In [None]:
# Variance of the delta-depth model's error over the training period.
training_variance = (delta_depth_p25[:train_cutoff]-train_depth_data['delta_Depth_to_Groundwater_P25']).var()

n_simulations = 50 # change 50 to run a different number of simulations
random_predictions = [generate_randomized_prediction() for _ in range(n_simulations)]

# One row per simulation, one column per forecast date.
simulated = pd.DataFrame(random_predictions)
random_mean = simulated.mean()
random_variance = simulated.var()
# Input uncertainty and model-fit uncertainty are combined by adding variances.
random_2std = np.sqrt(random_variance+training_variance)*2

fig, ax = plt.subplots(figsize=(15,2.5))
petrignano.loc[datetime.date(2018,7,1):,'Depth_to_Groundwater_P25'].plot(ax=ax)
random_mean.plot(ax=ax)
ax.fill_between(random_mean.index, random_mean-random_2std, random_mean+random_2std, alpha=0.5)
ax.set_title('mean expected depth prediction with two-standard-deviation error area(well P25)')
plt.show()

This general technique for predicting uncertainty can be adapted for more nuanced analyses, as necessary.

For example, what would happen if we had a dry year, where the rain tends to be one standard deviation below the average?

## Auser

### Data Overview

In [None]:
auser = pd.read_csv('../input/acea-water-prediction/Aquifer_Auser.csv')
fix_date(auser)

# limit to specific period with consistently good data
# (.copy() so the clean-up below edits an independent frame, not a slice of the raw data)
auser = auser.loc[datetime.date(2010,1,1): datetime.date(2020,1,1)].copy()

rain_fields = ['Rainfall_Gallicano', 'Rainfall_Pontetetto',
       'Rainfall_Monte_Serra', 'Rainfall_Orentano', 'Rainfall_Borgo_a_Mozzano',
       'Rainfall_Piaggione', 'Rainfall_Calavorno', 'Rainfall_Croce_Arcana',
       'Rainfall_Tereglio_Coreglia_Antelminelli',
       'Rainfall_Fabbriche_di_Vallico']
depth_fields = [ 'Depth_to_Groundwater_LT2',
       'Depth_to_Groundwater_SAL', 'Depth_to_Groundwater_PAG',
       'Depth_to_Groundwater_CoS', 'Depth_to_Groundwater_DIEC']
temp_fields = ['Temperature_Orentano', 'Temperature_Monte_Serra',
       'Temperature_Ponte_a_Moriano', 'Temperature_Lucca_Orto_Botanico']
volume_fields = ['Volume_POL', 'Volume_CC1', 'Volume_CC2', 'Volume_CSA', 'Volume_CSAL']
hydro_fields = ['Hydrometry_Monte_S_Quirico', 'Hydrometry_Piaggione']

# get rid of bad data: exactly 0 where it doesn't make sense
zero_means_missing = temp_fields + volume_fields
auser[zero_means_missing] = auser[zero_means_missing].mask(auser[zero_means_missing] == 0)

# resample volume as monthly since the data are clearly at monthly granularity
auser_resampled = auser[volume_fields].resample('M').mean()

In [None]:
# Five stacked overview panels sharing the date axis.
fig, axes = plt.subplots(5, figsize=(15,10), sharex=True)

# Rain is accumulated (rolling sum) so wet and dry periods stand out.
axes[0].set_title('Rainfall (rolling 120 days)')
for r in rain_fields:
    auser[r].rolling(120).sum().plot(ax=axes[0])
axes[0].legend()

axes[1].set_title('Depth')
for d in depth_fields:
    auser[d].plot(ax=axes[1])
axes[1].legend()

# Temperature is smoothed with a rolling MEAN (a rolling sum of temperatures has no
# physical meaning); this matches the temp_30 feature used by the models.
axes[2].set_title('Temperature (rolling 30-day mean)')
for t in temp_fields:
    auser[t].rolling(30).mean().plot(ax=axes[2])
axes[2].legend()


axes[3].set_title('Volume withdrawn')
for v in volume_fields:
    auser_resampled[v].plot(ax=axes[3])
axes[3].legend()

axes[4].set_title('Hydrometry (river flow?)')
for h in hydro_fields:
    auser[h].plot(ax=axes[4])
axes[4].legend()

plt.tight_layout()
plt.show()

In [None]:
# Train on everything up to and including the cutoff day; test on the days after it.
train_cutoff = datetime.date(2018,1,1)
first_test_day = train_cutoff + datetime.timedelta(days=1)
auser_train = auser.loc[:train_cutoff].copy()
auser_test = auser.loc[first_test_day:]

### 1. Dependencies and Model

I used the same overall model as Petrignano (shown below)

In [None]:
# Dependency diagram: rain and temperature drive river hydrometry, and all three
# drive the change in depth; volume withdrawn is drawn as a dotted two-way link.
petr_graph = Digraph(graph_attr={'ranksep':'1'})
vol_depth = Digraph(graph_attr={'rank':'same'})  # sub-graph holding depth and volume

for node_id, node_label, node_attrs in [
    ('R', 'Rainfall', {}),
    ('T', 'Temperature', {}),
    ('H', 'River Hydrometry', {'color': 'blue'}),
]:
    petr_graph.node(node_id, node_label, **node_attrs)

vol_depth.node('D', 'Change in Depth', shape='octagon', color='green')
vol_depth.node('V', 'Volume Withdrawn', style='dotted')

for tail, head, edge_color in [
    ('R', 'H', 'blue'),
    ('R', 'D', 'green'),
    ('T', 'D', 'green'),
    ('T', 'H', 'blue'),
    ('H', 'D', 'green'),
]:
    petr_graph.edge(tail, head, color=edge_color)

for tail, head in [('D', 'V'), ('V', 'D')]:
    vol_depth.edge(tail, head, style='dotted')

petr_graph.subgraph(vol_depth)
petr_graph

#### Modeling hydrometry

This model is quite similar to the Petrignano one.

For each hydrometry field, I tried using each rain field as the basis for the hydrometry model. `Rainfall_Fabbriche_di_Vallico` seems to be the best fit for both hydrometry fields.

For temperature inputs, I added several versions of rolling averages to the model at once, to capture different aspects of seasonality. I then tried (offline, not in this notebook) removing some of the rolling averages to see if the BIC improved, but it did not, so it seems that all of these levels are important.


In [None]:
# Rolling-mean temperature features at three time scales.
# NOTE: 'temp_120' is computed over a 90-day window; the column name is kept
# as-is because later cells look the column up by that name.
lucca_temp = auser_train['Temperature_Lucca_Orto_Botanico']
for column, window in (('temp_30', 30), ('temp_120', 90), ('temp_180', 180)):
    auser_train[column] = lucca_temp.rolling(window).mean()

In [None]:
# find single best rain fit for predicting each hydrometry field
river_preds = {}
for river in hydro_fields:
    min_error = None
    min_rain = None
    for rain in rain_fields:
        pred_func=fit_rain_effects(
            rains_df = auser_train[[rain]], 
            ground_truth = auser_train[river], 
            additional_fields = auser_train[['Temperature_Lucca_Orto_Botanico', 'temp_30', 'temp_120', 'temp_180']],
            nrandom=3,
            ntotal=20,
            verbose=False, 
            x0=[
                [0.47802002829947315, 0.5011833749157925, 0.2943005663489582, 0],
                [0.4427317029133336, 0.5388455770005512, 0.9999999999999998, 0],
                [0.4535401381253068, 0.4836752048445926, 0.9999999999999998, 0],
                [0.48557892850101986, 0.5230183486520695, 0.4361481698207067, 0],
                [0.4919269314582723, 0.47413855100416424, 0.48034420622498836, 0],
                [0.5032878715390603, 0.4506610850117819, 0.9999999999999998, 0],
                [0.4427317029133336, 0.5388455770005512, 0.9999999999999998, 0],
                [0.48137685236410005, 0.4506610850117819, 0.9999999999999998, 0],
                [0.519754609419978, 0.4506610850117819, 0.8823067663440267, 0],
                [0.46288084149002906, 0.5332854256618504, 0.9999999999999998, 0],
                [0.456639047570966, 0.4506610850117819, 0.836729302031876, 0],
                [0.426807932210006, 0.4506610850117819, 0.9999999999999998, 0],
                [0.37089547099277526, 0.4506610850117819, 0.8705303183695998, 0],
            ]
        )
        pred = pred_func(auser_train[[rain]],  auser_train[['Temperature_Lucca_Orto_Botanico', 'temp_30', 'temp_120', 'temp_180']])
        err = (pred-auser_train[river]).var(ddof=0)
        if min_error is None or err < min_error:
            min_error = err
            min_rain = rain
            river_preds[river] = pred_func
        print()

    print(f'For {river}, the single best rain field is {min_rain} (error variance {min_error})')
    print()

In [None]:
# Compare each fitted river model with the observed hydrometry over the training period.
fig, (ax1,ax2) = plt.subplots(2,figsize=(15,5))
fig.suptitle('Hydrometry: actual vs. predicted', fontsize=16)

best_rain = ['Rainfall_Fabbriche_di_Vallico']
temp_columns = ['Temperature_Lucca_Orto_Botanico', 'temp_30', 'temp_120', 'temp_180']

# top panel: Monte S. Quirico
pred = river_preds['Hydrometry_Monte_S_Quirico'](auser_train[best_rain], auser_train[temp_columns])
auser_train['Hydrometry_Monte_S_Quirico'].plot(ax=ax1)
pred.plot(label='predicted', ax=ax1)
ax1.legend()

# bottom panel: Piaggione
pred2 = river_preds['Hydrometry_Piaggione'](auser_train[best_rain], auser_train[temp_columns])
auser_train['Hydrometry_Piaggione'].plot(ax=ax2)
pred2.plot(label='predicted', ax=ax2)
ax2.legend()

plt.show()

#### Modeling change in depth

Here, again, I used the same approach as for the Petrignano aquifer and used the BIC to guide my selection of parameters. Adding and removing various rain and temperature inputs did not have a large effect on the BIC of the output, so I picked a relatively simple option with relatively high BIC across the board.

Interestingly, even though this is a two-aquifer system where the artesian aquifer's depth should ostensibly depend on the depth of the other aquifer, fitting all depths directly to the hydrometry data works really well; in fact, the fit is best for the well in the artesian aquifer. This could mean that the rivers affect the artesian aquifer directly, or that the effect of the parameters on the other aquifer gets "passed through" to the artesian aquifer nearly instantly (or some combination of these explanations).

In [None]:
# The regression target for each well is the day-over-day change in depth.
delta_columns = ['delta_'+d for d in depth_fields]
for d, delta_column in zip(depth_fields, delta_columns):
    auser_train[delta_column] = auser_train[d].diff()

# Rows are kept only when every candidate input and every target is present.
candidate_inputs = [
    'dayofyear', 'Rainfall_Fabbriche_di_Vallico', 'Hydrometry_Monte_S_Quirico', 'Hydrometry_Piaggione',
    'Temperature_Lucca_Orto_Botanico', 'temp_30', 'temp_120', 'temp_180',
]
deltadepth_train = auser_train[candidate_inputs + delta_columns].dropna().copy()

regression_inputs = [
    'Rainfall_Fabbriche_di_Vallico', 'Hydrometry_Monte_S_Quirico', 'Hydrometry_Piaggione',
    'Temperature_Lucca_Orto_Botanico', 'temp_30',
]
linreg_x = deltadepth_train[regression_inputs]

print('inputs:', list(linreg_x.columns))

n_observations = len(deltadepth_train)
n_parameters = linreg_x.shape[1]+1  # one coefficient per input, plus one

delta_depth_fits = {}  # depth field -> fitted LinearRegression
for d in depth_fields:
    target = deltadepth_train['delta_'+d]
    dd_fit = LinearRegression().fit(linreg_x, target)
    pred = pd.Series(dd_fit.predict(linreg_x), index=deltadepth_train.index)

    errvar = (pred-target).var(ddof=0)
    bic = get_bic(errvar, n_observations, n_parameters)
    print(d, errvar, bic)
    delta_depth_fits[d] = dd_fit


### 2. Forecasting

In [None]:
def get_expected_inputs(input_data, dayofyear, half_window=3):
    """Estimate per-day-of-year mean and variance of smoothed inputs.

    Every column of ``input_data`` is smoothed with a centered 7-day rolling mean.
    For each day of the year 1..365, all smoothed values whose day of year lies
    within ``half_window`` days of it (wrapping around the year end) are pooled, and
    their mean and variance are recorded.

    Args:
        input_data: DataFrame of daily values (one column per field).
        dayofyear: day-of-year values (1-based) aligned with ``input_data``'s rows.
        half_window: days on each side of the target day to pool (default 3,
            i.e. a symmetric 7-day window).

    Returns:
        (expectations, variances): two dicts mapping each column name to a list of
        365 values, where position ``i`` corresponds to day of year ``i + 1``.
        Day 366 of leap years is never pooled.
    """
    week_rolling_mean = input_data.rolling(7, center=True).mean()
    expectations = defaultdict(list)
    variances = defaultdict(list)
    for d in range(365):
        # d is 0-based; "% 365 + 1" wraps around the year and converts to 1-based days.
        # The upper bound is d+half_window+1 so the window is symmetric
        # (np.arange excludes its stop value).
        nearby_days = np.arange(d-half_window, d+half_window+1)%365+1
        near_data = week_rolling_mean[dayofyear.isin(nearby_days)]
        for field in week_rolling_mean.columns:
            expectations[field].append(near_data[field].mean())
            variances[field].append(near_data[field].var())
    return expectations, variances

In [None]:
def gen_inputs_with_expectations(input_data, days_to_predict, expected_means):
    """Extend observed inputs with forecast days filled from day-of-year expectations.

    Args:
        input_data: DataFrame of observed daily inputs with a DatetimeIndex.
        days_to_predict: dates to append as (initially empty) forecast rows.
        expected_means: mapping from each column of ``input_data`` to a list of 365
            expected values, where position ``i`` is day of year ``i + 1``.

    Returns:
        DataFrame holding the observed rows followed by the forecast rows, plus a
        ``dayofyear`` column and one ``<field>_expected`` column per input field.
        Any missing value in an input field (forecast rows and gaps in the
        observations alike) is replaced by that day's expected value.
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
    expected_data = pd.concat([input_data, pd.DataFrame(index=days_to_predict)])
    expected_data['dayofyear'] = expected_data.index.dayofyear

    # Expectations only exist for days 1..365, so day 366 (31 Dec of a leap year)
    # reuses day 365 instead of leaving a NaN that would poison rolling features.
    lookup_day = expected_data['dayofyear'].clip(upper=365)

    for field in input_data.columns:
        expected_column = field+'_expected'
        expected_by_day = pd.Series(expected_means[field], index=range(1, 366))
        expected_data[expected_column] = lookup_day.map(expected_by_day)
        expected_data[field] = expected_data[field].fillna(expected_data[expected_column])
    return expected_data


In [None]:
# Build forecast inputs: observed training values followed by day-of-year expectations.
forecast_columns = ['Rainfall_Fabbriche_di_Vallico', 'Temperature_Lucca_Orto_Botanico']

exp_means, exp_vars = get_expected_inputs(auser_train[forecast_columns], auser_train['dayofyear'])
exp_inputs = gen_inputs_with_expectations(auser_train[forecast_columns], auser_test.index, exp_means)

# Rolling temperature features ('temp_120' is a 90-day window despite its name).
for column, window in (('temp_30', 30), ('temp_120', 90), ('temp_180', 180)):
    exp_inputs[column] = exp_inputs['Temperature_Lucca_Orto_Botanico'].rolling(window).mean()

In [None]:
# Predict each river's hydrometry from the expected inputs and plot it with the
# observations: modelled training period, modelled forecast period, then actual values.
fig, axes = plt.subplots(len(hydro_fields), figsize=(15,7), sharex=True)

rain_columns = ['Rainfall_Fabbriche_di_Vallico']
temp_columns = ['Temperature_Lucca_Orto_Botanico', 'temp_30', 'temp_120', 'temp_180']
plot_start = datetime.date(2018,2,1)

for h,ax in zip(hydro_fields, axes):
    river_model = river_preds[h]
    exp_inputs[h] = river_model(exp_inputs[rain_columns], exp_inputs[temp_columns])
    modelled = exp_inputs[h]
    modelled.loc[:train_cutoff].plot(ax=ax)
    modelled.loc[train_cutoff:].plot(ax=ax)
    auser[h].plot(ax=ax)
    ax.set_title(h)
    ax.set_xlim(plot_start, None)
    

In [None]:
# Forecast each well: predict daily depth changes, then accumulate them from the
# depth recorded on the last training row.
regression_inputs = [
    'Rainfall_Fabbriche_di_Vallico', 'Hydrometry_Monte_S_Quirico', 'Hydrometry_Piaggione',
    'Temperature_Lucca_Orto_Botanico', 'temp_30',
]
linreg_x = exp_inputs[regression_inputs].dropna()

fig, axes = plt.subplots(len(depth_fields), figsize=(15,10), sharex=True)

first_forecast_day = train_cutoff+datetime.timedelta(1)
predictions = {}  # depth field -> forecast depth series
for d,ax in zip(depth_fields, axes):
    dd_fit = delta_depth_fits[d]
    pred_delta = pd.Series(dd_fit.predict(linreg_x), index=linreg_x.index)[first_forecast_day:]
    d_c = auser_train[d].iloc[-1]
    pred = d_c + pred_delta.cumsum()
    predictions[d] = pred
    ax.plot(pred)
    auser[d].plot(ax=ax)
    ax.set_title(d)
    ax.set_xlim(train_cutoff, None)

In [None]:

# Daily error metrics per well, over the whole forecast and over its first year.
# RMSE is sqrt(MSE): the `squared=False` keyword is deprecated in scikit-learn 1.4
# and removed in 1.6.
one_year_end = datetime.date(2019,1,1)
for d in depth_fields:
    print(d)
    pred_df = auser[[d]].copy()
    pred_df['pred'] = predictions[d]
    # keep only the days that have both an observation and a prediction
    pred_df = pred_df.dropna()
    first_year = pred_df.loc[:one_year_end]
    print('  MAE(daily, all years):', mean_absolute_error(pred_df[d],pred_df['pred']))
    print('  MAE(daily, one year):', mean_absolute_error(first_year[d],first_year['pred']))
    print('  RMSE(daily, all years):', np.sqrt(mean_squared_error(pred_df[d],pred_df['pred'])))
    print('  RMSE(daily, one year):', np.sqrt(mean_squared_error(first_year[d],first_year['pred'])))


We can see that the predictions start to deviate from reality after one year, but the general trend still tends to be in the right direction.



## Doganella and Luco

Omitted due to time constraints. Should be similar in concept, but without rivers (so only one layer of prediction).

In [None]:
# Single-layer dependency diagram (no river): rain and temperature drive the change
# in depth; volume withdrawn is drawn as a dotted two-way link.
petr_graph = Digraph(graph_attr={'ranksep':'1'})
vol_depth = Digraph(graph_attr={'rank':'same'})  # sub-graph holding depth and volume

for node_id, node_label in [('R', 'Rainfall'), ('T', 'Temperature')]:
    petr_graph.node(node_id, node_label)

vol_depth.node('D', 'Change in Depth', shape='octagon', color='green')
vol_depth.node('V', 'Volume Withdrawn', style='dotted')

for source_node in ('R', 'T'):
    petr_graph.edge(source_node, 'D', color='green')

for tail, head in [('D', 'V'), ('V', 'D')]:
    vol_depth.edge(tail, head, style='dotted')

petr_graph.subgraph(vol_depth)
petr_graph

# Rivers
## Arno

In [None]:
# Dependency diagram for the Arno: rain and temperature feed river hydrometry directly.
arno_graph = Digraph(graph_attr={'ranksep':'1'})

for node_id, node_label in [('R', 'Rainfall'), ('T', 'Temperature')]:
    arno_graph.node(node_id, node_label)
arno_graph.node('D', 'River Hydrometry', shape='octagon', color='green')

for source_node in ('R', 'T'):
    arno_graph.edge(source_node, 'D', color='green')

arno_graph

### Data Overview

In [None]:
arno = pd.read_csv('../input/acea-water-prediction/River_Arno.csv')
fix_date(arno)

# fix bad data: zeroed-out Hydrometry
hydrometry = arno['Hydrometry_Nave_di_Rosano']
arno['Hydrometry_Nave_di_Rosano'] = hydrometry.mask(hydrometry == 0)

# rain near lake (same fields as in lake data; probably can use output flow instead)
lake_rain = [
    'Rainfall_Le_Croci',
    'Rainfall_Cavallina',
    'Rainfall_S_Agata',
    'Rainfall_Mangona',
    'Rainfall_S_Piero',
    'Rainfall_Vernio',
]

# rain along Arno river itself
river_rain = [
    'Rainfall_Stia',
    'Rainfall_Consuma',
    'Rainfall_Incisa', # closest
    'Rainfall_Montevarchi',
    'Rainfall_S_Savino',
    'Rainfall_Laterina',
    'Rainfall_Bibbiena',
    'Rainfall_Camaldoli',
]

# Rolling-mean temperature features ('temp_120' is a 90-day window despite its name).
firenze_temp = arno['Temperature_Firenze']
for column, window in (('temp_30', 30), ('temp_120', 90), ('temp_180', 180)):
    arno[column] = firenze_temp.rolling(window).mean()

rain_fields = lake_rain+river_rain

# Drop the trailing rows that come after the last valid Rainfall_Incisa reading.
last_incisa_day = arno['Rainfall_Incisa'].last_valid_index()
arno = arno.loc[:last_incisa_day].copy()

arno.columns

In [None]:
# Overview: two rain panels (120-day rolling totals), temperature, and hydrometry.
fig, axes = plt.subplots(4, figsize=(15,10))

rain_panels = (
    (axes[0], 'Rainfall (Lake Bilancino area)', lake_rain),
    (axes[1], 'Rainfall (along river)', river_rain),
)
for panel, panel_title, panel_fields in rain_panels:
    panel.set_title(panel_title)
    for r in panel_fields:
        arno[r].rolling(120).sum().plot(ax=panel)

single_series_panels = (
    (axes[2], 'Temperature', 'Temperature_Firenze'),
    (axes[3], 'Hydrometry', 'Hydrometry_Nave_di_Rosano'),
)
for panel, panel_title, column in single_series_panels:
    panel.set_title(panel_title)
    arno[column].plot(ax=panel)

for ax in axes:
    ax.legend()

This dataset has a lot of rain fields, but many of them actually are closer to the Bilancino lake and the Sieve river, and thus would only affect the Arno river through the outflow from Bilancino. I explored how the Bilancino flow output correlates to the Arno river (offline), and it seems the answer is "not well". This doesn't mean that the lake doesn't affect the Arno; just that the magnitude of the effect can't be derived from knowing what happens to the lake (for example, the effect could be mostly constant compared to the effect of rain near the river).

For this reason, I'll only be using the "river rainfall" fields in my model.

### 1. Dependencies and model

This is a single-layer model where the rainfall from several areas is combined with temperature data to predict the river's hydrometry metric.

The additional interesting feature of a long river is that using several rain fields, with different parameters, may be appropriate here. Unfortunately, there are only about 3.5 years of data (out of over 20 years of data total) which actually contain all the important rain fields. To estimate whether it makes more sense to limit the dataset to 3.5 years or to only use one rain field, I first looked at how the fit is affected by limiting the data.

In [None]:
def _fit_incisa_only(frame):
    """Fit the one-rain-field (Rainfall_Incisa) hydrometry model on ``frame``.

    The same settings and the same x0 points are used for every call, so fits on
    different date ranges can be compared.
    """
    return fit_rain_effects(
        rains_df=frame[['Rainfall_Incisa']],
        ground_truth=frame['Hydrometry_Nave_di_Rosano'],
        additional_fields=frame[['Temperature_Firenze','temp_30', 'temp_120', 'temp_180']],
        nrandom=3,
        ntotal=20,
        verbose=False,
        x0=[
            [0.4302223831313535, 0.4506610850117819, 0.7521851546976578, 0],
            [0.4313650849509147, 0.5012229150866508, 0.9788660353174939, 0],
            [0.3496183131045193, 0.5784179384080085, 0.874943754380722, 0],
        ],
    )


print('Rainfall_Incisa, all available data:')
pred_func = _fit_incisa_only(arno)

print()
print('Rainfall_Incisa, up to 2007-7-6:')

arno_slice = arno.loc[:datetime.date(2007,7,6)]
pred_func = _fit_incisa_only(arno_slice)

The fact that the error is significantly smaller with less data does suggest that there's overfitting.

And although our simplified BIC formula with Gaussian assumptions does not completely apply, because it also assumes equal input size for an apples-to-apples comparison, the BIC is also a *lot* worse for the smaller data set (which is to be expected).

Imputing the rain data would have been possible, but would lose the benefit of having several independent measurements of rain.

Given that, I decided to work with just one rain field - the one near-river field which has the most data available, Rainfall_Incisa.

I did, however, fit **two copies** of this rain field to two rain effect models: one came out to represent a more immediate effect, while the other one had a slower and delayed effect. One feasible explanation is that the quicker effect is from water running down the surface, while the slower effect is from water that seeped into the ground before eventually propagating into the river.

In [None]:
# Train on everything up to and including the cutoff day; test on the days after it.
train_cutoff = datetime.date(2014,1,1)
first_test_day = train_cutoff + datetime.timedelta(days=1)
arno_train = arno.loc[:train_cutoff].copy()
arno_test = arno.loc[first_test_day:]

In [None]:
# Duplicate the rain column so that two separate rain-effect parameter sets are
# fitted to the same measurements (the x0 row holds 4 values per rain column).
arno_train['rain_copy'] = arno_train['Rainfall_Incisa']

arno_fit_settings = dict(
    rains_df=arno_train[['Rainfall_Incisa', 'rain_copy']],
    ground_truth=arno_train['Hydrometry_Nave_di_Rosano'],
    additional_fields=arno_train[['Temperature_Firenze','temp_30', 'temp_120', 'temp_180']],
    nrandom=3,
    ntotal=10,
    verbose=False,
    x0=[
        [0.33084597196099713, 0.7646045780132397, 0.5477330014465549, 0, 0.534142242067082, 0.5492149158336986, 0.9893596672770165, 5]
    ],
)
arno_pred_func = fit_rain_effects(**arno_fit_settings)

### 2. Forecasting

In [None]:
# Build forecast inputs: observed training values followed by day-of-year expectations.
forecast_columns = ['Rainfall_Incisa', 'Temperature_Firenze']

exp_means, exp_vars = get_expected_inputs(arno_train[forecast_columns], arno_train['dayofyear'])
exp_inputs = gen_inputs_with_expectations(arno_train[forecast_columns], arno_test.index, exp_means)

# Recreate the derived model inputs on the extended frame
# ('temp_120' is a 90-day window despite its name).
exp_inputs['rain_copy'] = exp_inputs['Rainfall_Incisa']
for column, window in (('temp_30', 30), ('temp_120', 90), ('temp_180', 180)):
    exp_inputs[column] = exp_inputs['Temperature_Firenze'].rolling(window).mean()

In [None]:
# Predict hydrometry over the full exp_inputs frame (training rows and forecast rows).
rain_inputs = exp_inputs[['Rainfall_Incisa', 'rain_copy']]
temp_inputs = exp_inputs[['Temperature_Firenze','temp_30', 'temp_120', 'temp_180']]
pred_flow = arno_pred_func(rain_inputs, temp_inputs)

In [None]:
# Predicted vs. observed hydrometry, zoomed to the forecast period.
fig, ax = plt.subplots(figsize=(15,5))
ax.plot(pred_flow, label='predicted')
ax.plot(arno['Hydrometry_Nave_di_Rosano'], label='Hydrometry_Nave_di_Rosano')
ax.legend()
ax.set_xlim(datetime.date(2014,1,1),exp_inputs['Rainfall_Incisa'].last_valid_index())
plt.show()

In [None]:
# Daily error metrics for the Arno forecast, over all forecast days and over the
# first year only. RMSE is sqrt(MSE): the `squared=False` keyword is deprecated in
# scikit-learn 1.4 and removed in 1.6.
pred_df = arno[['Hydrometry_Nave_di_Rosano']].copy()
pred_df['pred'] = pred_flow
# keep only the days that have both an observation and a prediction
pred_df = pred_df.dropna()
d='Hydrometry_Nave_di_Rosano'
first_year = pred_df.loc[:datetime.date(2015,1,1)]
print('  MAE(daily, two years):', mean_absolute_error(pred_df[d],pred_df['pred']))
print('  MAE(daily, one year):', mean_absolute_error(first_year[d],first_year['pred']))
print('  RMSE(daily, two years):', np.sqrt(mean_squared_error(pred_df[d],pred_df['pred'])))
print('  RMSE(daily, one year):', np.sqrt(mean_squared_error(first_year[d],first_year['pred'])))

Interestingly, the error metrics are only slightly worse across both years than for the first year, even though on the graph the first year's trend seems to match the actual flow much more closely.

# Lakes
## Bilancino


*Note*: The implementation below is buggy, and therefore does not produce great results. [This correction](https://www.kaggle.com/yanamal/lake-bilancino-prediction#Bilancino) demonstrates that the general approach is still viable, and works well when implemented correctly. Unfortunately, it took me a couple of days after the deadline to fix all the bugs.

(and then I never heard back despite repeatedly asking whether it's appropriate to insert notes like this into subsequent versions of this notebook, so now I'm just guessing)

### Data Overview

In [None]:
bilancino=pd.read_csv('../input/acea-water-prediction/Lake_Bilancino.csv')
fix_date(bilancino)

rain_fields = ['Rainfall_S_Piero', 'Rainfall_Mangona', 'Rainfall_S_Agata', 'Rainfall_Cavallina', 'Rainfall_Le_Croci']

# Two-day rolling mean of the outflow (Flow_Rate): the average of the current and the
# previous day's values. NOTE(review): the name says "yesterday", but today's flow is
# included in the average.
bilancino['flow_mean_yesterday'] = bilancino['Flow_Rate'].rolling(2).mean()

# Day-over-day change in lake level.
bilancino['delta_level'] = bilancino['Lake_Level'].diff()

# Rolling-mean temperature features ('temp_120' is a 90-day window despite its name).
le_croci_temp = bilancino['Temperature_Le_Croci']
for column, window in (('temp_30', 30), ('temp_120', 90), ('temp_180', 180)):
    bilancino[column] = le_croci_temp.rolling(window).mean()

bilancino.columns

In [None]:
# Overview panels: 120-day rolling rain totals, temperature, lake level, outflow.
fig, axes = plt.subplots(4, figsize=(15,10))
rain_ax, temp_ax, level_ax, flow_ax = axes

for r in rain_fields:
    bilancino[r].rolling(120).sum().plot(ax=rain_ax)

for column, panel in (
    ('Temperature_Le_Croci', temp_ax),
    ('Lake_Level', level_ax),
    ('Flow_Rate', flow_ax),
):
    bilancino[column].plot(ax=panel)

for ax in axes:
    ax.legend()

In [None]:
# Train on everything up to and including the cutoff day; test on the days after it.
train_cutoff = datetime.date(2016,1,1)
first_test_day = train_cutoff + datetime.timedelta(days=1)
b_train = bilancino.loc[:train_cutoff].copy()
b_test = bilancino.loc[first_test_day:]

### 1. Dependencies and model

In [None]:
# Dependency diagram for the lake: level and dam outflow depend on each other.
lake_graph = Digraph(graph_attr={'ranksep':'1'})

for node_id, node_label, node_attrs in [
    ('R', 'Rainfall', {}),
    ('T', 'Temperature', {}),
    ('F', 'Flow out of dam', {'shape': 'octagon', 'color': 'blue'}),
    ('L', 'Lake Level', {'shape': 'octagon', 'color': 'green'}),
]:
    lake_graph.node(node_id, node_label, **node_attrs)

for tail, head, edge_color in [
    ('R', 'L', 'green'),
    ('T', 'L', 'green'),
    ('F', 'L', 'green'),
    ('L', 'F', 'blue'),
]:
    lake_graph.edge(tail, head, color=edge_color)

lake_graph

The Bilancino lake is an interesting case, because its behavior depends on a man-made and human-operated structure - the dam that created the lake.

The level of water in the lake depends on the flow out of the dam, but the flow out of the dam also depends on the lake level - specifically, the lake level determines the amount of pressure created, and therefore the strength of the flow.

According to the challenge description, water is let out of the dam quickly at certain times, and allowed to collect at other times.

Further, the data indicates that the flow spikes drastically when the lake level goes over a certain point - this seems to be the dam's spillway being activated. According to [this site](http://cmcgruppo.com/cmc/en/project/bilancino-dam/), the spillway has an automatic flap gate. This means that the gate opens wider when pressure increases, which makes the interaction even more complicated.

#### Modeling change in lake level

In my previous experimentation, I found that two of the rainfall fields contain most of the information: `Rainfall_Mangona` and `Rainfall_Cavallina`. This makes sense: `Rainfall_Cavallina` is the closest location to the lake itself, and `Rainfall_Mangona` is located over the Sieve river, before the river flows into the lake. Therefore, it captures information about water which enters the lake via the river.

The new component here is how flow rate affects lake level: the amount of water that leaves the lake via the dam (or more precisely, *the amount that left yesterday*) is proportional to the subsequent reduction in lake level. So unlike the temperature parameters, **the parameters of flow in the linear regression actually have a direct physical meaning**. Specifically, the scaling coefficient tells us how to convert between flow rate and (change in) lake level.

In [None]:
# Fit the change in lake level from two rain fields, with the outflow and the
# temperature features as additional inputs.
lake_rain_inputs = b_train[['Rainfall_Mangona', 'Rainfall_Cavallina']]
lake_extra_inputs = b_train[['flow_mean_yesterday', 'Temperature_Le_Croci','temp_30', 'temp_120', 'temp_180']]

b_pred_func = fit_rain_effects(
    rains_df=lake_rain_inputs,
    ground_truth=b_train['delta_level'],
    additional_fields=lake_extra_inputs,
    nrandom=3,
    ntotal=10,
    verbose=False,
    x0=[
        [0.15233691562216556, 0.8854084902188695, 0.9999999999999998, 1, 0.13666937295698817, 0.9979837155391087, 0.14202766104040032, 0],
        [0.0474258731775668, 0.9999999974825013, 0.570952205261587, 1, 0.3850443032078931, 0.45805440829961064, 0.545042159616547, 0],
    ],
)

#### Modeling flow out of lake

As I mentioned above, the flow out of the lake is a complex process which depends on several factors, including human behavior.

I tentatively separated it into three types of flow:
- "normal" flow
- "drain" flow, when the water is being drained from the lake through the dam's intake process
- "spillway" flow, when the lake level is high enough that the spillway is active.

My manual guess about when each happens is shown below:
(lake level is on one axis, in dashed lines, and flow rate is on another, in a solid line)

In [None]:
# label specific types of flow

# days of the year between which "intake" (from the lake into the river) seems to happen
start_intake = 180
end_intake = pd.Timestamp(2008, 11, 1).dayofyear

in_intake_season = bilancino['dayofyear'].between(start_intake, end_intake)
bilancino.loc[in_intake_season, 'flow_type'] = 'intake'

# spillway is active: assigned second, so it overrides 'intake' on days where both apply
bilancino.loc[bilancino['Lake_Level'] > 251.5, 'flow_type'] = 'spillway'

# Lake level (dashed, left axis) and flow rate (solid, right axis), coloured by flow type.
fig, ax1 = plt.subplots(figsize=(15, 5))
ax2 = ax1.twinx()

# (flow_type value, colour, legend label used on the flow-rate axis)
overlays = (
    ('intake', 'blue', 'draining'),
    ('spillway', 'red', 'spillway'),
)

bilancino['Lake_Level'].plot(ax=ax1, color='orange', linestyle='dashed')
for flow_type, overlay_color, _ in overlays:
    bilancino['Lake_Level'].where(bilancino['flow_type']==flow_type).plot(ax=ax1, color=overlay_color, linestyle='dashed')

bilancino['Flow_Rate'].plot(ax=ax2, color='orange', label='normal flow')
for flow_type, overlay_color, overlay_label in overlays:
    bilancino['Flow_Rate'].where(bilancino['flow_type']==flow_type).plot(ax=ax2, color=overlay_color, label=overlay_label)

plt.legend()
plt.show()


However, I found it very hard to fit parameters to expected flow rates in each case. There seems to be a lot of non-linearity in the relationships; additionally, the relationship potentially changes in the last two years - the scatterplot below shows the relationship between flow rate and lake level; we can clearly see two greenish-yellow lines that do not conform to the pattern. These represent spillway flow after 2018 - it appears that the spillway started activating earlier. Again, human behavior makes things less predictable.

For this reason, I chose to cut off the training dataset at 2016, and focus on predicting 2017. We can also see what happens to the subsequent years, and whether this shift changes things.

In [None]:
# Flow rate against lake level, coloured by calendar year.
fig, ax = plt.subplots()
observation_year = bilancino['Date'].dt.year
ax.scatter(bilancino['Lake_Level'], bilancino['Flow_Rate'], marker='x', c=observation_year)
plt.show()

Because this is a complex and piecewise relationship, decision trees, and specifically boosted forests (aka LGBM) are a good fit.

I generated some derived features for the LGBM which capture my beliefs about the important pieces of this puzzle:
- my best guess as to how the flow should behave when the spillway is active (I used a similar analysis to finding the optimal temperature averaging window for Petrignano's delta-level)
- my best guess about when draining typically starts, ends, and how it ramps up (just from eyeballing the data)
- stats about the last 60 days of lake level - because I suspect there is some inertia in the system.

In [None]:
# --- Feature engineering -------------------------------------------------

flow_lake_conversion = 0.007661467849044978 # from lake level analysis above

# 'effect': the lake level plus the running total of yesterday's mean flow,
# scaled by flow_lake_conversion.
lgbm_input = (
    bilancino['Lake_Level']
    + (bilancino['flow_mean_yesterday']*flow_lake_conversion).cumsum()
).to_frame('effect')

# Summary statistics of 'effect' over the trailing 60 rows; the first 59 rows
# are NaN because the window is not yet full.
effect_window = lgbm_input['effect'].rolling(60)
lgbm_input['rolling_min'] = effect_window.min()
lgbm_input['rolling_max'] = effect_window.max()
lgbm_input['rolling_mean'] = effect_window.mean()

# Seasonal draining ramp: 0 before inlet_start, rising linearly to 1 at
# rampup_end, held at 1 until inlet_end, and 0 for the rest of the year.
inlet_start = 160 # Nth day in year
rampup_end = 220
inlet_end = 360 # stop "inlet" drain
day_of_year = bilancino['dayofyear']
inlet_rampup = ((day_of_year-inlet_start)/(rampup_end-inlet_start)).clip(lower=0, upper=1)
inlet_rampup[day_of_year>inlet_end] = 0

lgbm_input['inlet_rampup'] = inlet_rampup

# --- Train / test split --------------------------------------------------

# Nothing after lgbm_test_cutoff is used for fitting or evaluation here.
lgbm_test_cutoff = datetime.date(2017, 1, 1)
lgbm_input = lgbm_input.loc[:lgbm_test_cutoff].copy()
lgbm_flow_rate = bilancino.loc[:lgbm_test_cutoff, 'Flow_Rate']

X_train = lgbm_input.loc[:train_cutoff]
y_train = lgbm_flow_rate[:train_cutoff]
# Label-based slices include both endpoints, so the train_cutoff row would
# otherwise appear in both sets; remove the training rows from the test set.
X_test = lgbm_input.loc[train_cutoff:].drop(index=X_train.index, errors='ignore')
y_test = lgbm_flow_rate[train_cutoff:].drop(index=y_train.index, errors='ignore')

# --- Fit and inspect -----------------------------------------------------

reg = LGBMRegressor().fit(X_train, y_train)

fig, ax1 = plt.subplots(figsize=(15, 5))

bilancino['Flow_Rate'].plot(ax=ax1, label='observed')
ax1.plot(X_train.index,reg.predict(X_train), label='train fit')
ax1.plot(X_test.index,reg.predict(X_test), label='test prediction')
ax1.legend()
plt.xlim(None, lgbm_test_cutoff)
plt.show()

In [None]:
# Wrap the raw predictions in date-indexed Series so that subtracting the
# observed flow aligns row by row on the index.
train_pred, test_pred = (
    pd.Series(reg.predict(features), index=features.index)
    for features in (X_train, X_test)
)

# Variance of the residuals; rows without a prediction become NaN after
# alignment and are skipped by var().
observed_flow = bilancino['Flow_Rate']
train_error_var = (train_pred-observed_flow).var()
test_error_var = (test_pred-observed_flow).var()

print('train error variance:',train_error_var,'\ntest error variance:', test_error_var)

### 2. Forecasting

Forecasting is tricky because of the feedback loop between flow and lake level.

Below, I am forecasting slightly custom values to account for the oddities:

1. The *cumulative lake level* which would exist if the dam did not have any drain - to do this, I am doing a custom prediction using all the parameters generated by the rain effect model, **but putting yesterday's flow on the other side of the equation**. I am comparing that to the actual Lake_Level **plus flow, also scaled according to the parameter from the trained model**.

2. I am using this cumulative level as input for the LGBM to predict flow out - actually, that's how the LGBM was trained.

3. I am using the predicted flow and cumulative level to try to get the actual level back.

The result deviates from reality quickly. The LGBM may be partially to blame - while all of the other models were written explicitly to respect the physics of the situation, the LGBM is unaware of them. Having more information about the dam may allow for a model that avoids such black-box systems.





In [None]:
# Input columns for which expected values are generated.
forecast_columns = ['Rainfall_Mangona', 'Rainfall_Cavallina', 'Temperature_Le_Croci']

# Expected inputs are derived from the training frame and its day-of-year
# column, then extended over the test index (helpers are defined elsewhere in
# this notebook; presumably per-day-of-year expectations - TODO confirm).
exp_means, exp_vars = get_expected_inputs(b_train[forecast_columns], b_train['dayofyear'])
exp_inputs = gen_inputs_with_expectations(b_train[forecast_columns], b_test.index, exp_means)

# Rolling temperature means used as extra regressors.
# NOTE(review): 'temp_120' is averaged over 90 rows, not 120 - confirm this
# matches the window used when the scaling parameters were fitted.
for column_name, window_days in (('temp_30', 30), ('temp_120', 90), ('temp_180', 180)):
    exp_inputs[column_name] = exp_inputs['Temperature_Le_Croci'].rolling(window_days).mean()

# From train_cutoff onwards the flow term is zeroed out.
exp_inputs.loc[train_cutoff:,'flow_mean_yesterday'] = 0

In [None]:
'''
Parameters for Rainfall_Mangona: [2.220446049250313e-16, 1.0000000011740589, 191.42948720854787, 1]
Parameters for Rainfall_Cavallina: [0.7595424520378065, 0.0015911180194193453, 187.22633570423756, 0]
Scaling:
  Rainfall_Mangona: 0.005862430953008586
  Rainfall_Cavallina: 1.4547492793031174
  flow_mean_yesterday: -0.0077987173442005675
  Temperature_Le_Croci: 0.0039183372980673425
  temp_30: -0.0015752030857040245
  temp_120: -0.008107915323914416
  temp_180: 0.0048819421510388206
Translation parameter: -0.010936677349591556
raw gp_minimize parameters: [0.0474258731775668, 0.9999999974825013, 0.570952205261587, 1, 0.3850443032078931, 0.45805440829961064, 0.545042159616547, 0]
error value: 0.004762807981376231
BIC (assuming error metric is error variance): -22344.942068760232
'''
# The fitted parameters quoted above are applied by hand so that
# flow_mean_yesterday (and its factor) can be moved to the other side of the
# equation.

# rainfall_effect settings per rain gauge, copied from the parameter dump.
# NOTE(review): the dump lists first_day_flow for Mangona as 1.0000000011740589;
# it is passed as 1.0 here.
mangona_params = dict(
    fraction_retained = 2.220446049250313e-16,
    first_day_flow = 1.0,
    funnel_start_width = 191.42948720854787,
    time_gap = 1,
)
cavallina_params = dict(
    fraction_retained = 0.7595424520378065,
    first_day_flow = 0.0015911180194193453,
    funnel_start_width = 187.22633570423756,
    time_gap = 0,
)

Rainfall_Mangona_prediction = pd.Series(
    rainfall_effect(exp_inputs['Rainfall_Mangona'], **mangona_params)
)

Rainfall_Cavallina_prediction = pd.Series(
    rainfall_effect(exp_inputs['Rainfall_Cavallina'], **cavallina_params)
)

In [None]:
# Daily change in the cumulative level: each input multiplied by its fitted
# scaling factor. The flow_mean_yesterday term is deliberately left out.
weighted_terms = [
    0.005862430953008586*Rainfall_Mangona_prediction,
    1.4547492793031174*Rainfall_Cavallina_prediction,
    0.0039183372980673425*exp_inputs['Temperature_Le_Croci'],
    -0.0015752030857040245*exp_inputs['temp_30'],
    -0.008107915323914416*exp_inputs['temp_120'],
    0.0048819421510388206*exp_inputs['temp_180'],
]

# Add the terms left to right, then apply the translation parameter last.
pred_cum_delta = sum(weighted_terms[1:], weighted_terms[0]) + -0.010936677349591556

cumulative level

In [None]:
# Starting point: the Lake_Level in the last row of the training frame.
last_train_level = b_train.iloc[-1]['Lake_Level']

# Accumulate the predicted daily deltas from train_cutoff onwards on top of it.
forecast_deltas = pred_cum_delta[train_cutoff:]
pred_cum_level = forecast_deltas.cumsum()+last_train_level

# Forecast cumulative level next to the training lake level.
pred_cum_level.plot()
b_train['Lake_Level'].plot()
plt.xlim(datetime.date(2015, 1, 1), lgbm_test_cutoff)

flow prediction

In [None]:
# Replace the observed 'effect' feature with the forecast cumulative level and
# rebuild the rolling statistics from it. The rolling window restarts on the
# test rows, so the first 59 rolling values are NaN.
# NOTE(review): the 'effect' the model was trained on accumulates flow from the
# start of bilancino, while pred_cum_level starts from the last training
# Lake_Level - confirm both are on the same baseline.
X_test = X_test.copy()
X_test['effect'] = pred_cum_level
forecast_window = X_test['effect'].rolling(60)
X_test['rolling_min'] = forecast_window.min()
X_test['rolling_max'] = forecast_window.max()
X_test['rolling_mean'] = forecast_window.mean()

# Predict once and reuse the result for both the plot and later cells.
flow_pred = pd.Series(reg.predict(X_test), index=X_test.index,name='cumlev' )

fig, ax1 = plt.subplots(figsize=(15, 5))

bilancino['Flow_Rate'].plot(ax=ax1, label='observed')
ax1.plot(X_train.index,reg.predict(X_train), label='train fit')
ax1.plot(flow_pred.index,flow_pred, label='forecast')
ax1.legend()
plt.xlim(datetime.date(2015, 1, 1), lgbm_test_cutoff)
plt.show()

In [None]:
# Magnitude of the fitted flow_mean_yesterday scaling factor.
flow_level_factor = 0.0077987173442005675

# Collect the forecast cumulative level and the forecast flow in one frame
# (flow is aligned on the cumulative-level index).
pred = pred_cum_level.to_frame(name='cumlev').assign(flow=flow_pred)

# Recover the lake level by subtracting the running total of scaled flow.
cumulative_outflow = (pred['flow']*flow_level_factor).cumsum()
pred['lev'] = pred['cumlev']-cumulative_outflow
pred['lev'].plot()

bilancino['Lake_Level'].plot()
plt.xlim(datetime.date(2015, 1, 1), lgbm_test_cutoff)

# Streams
Omitted due to time constraints (but would work in a very similar way to rivers, or in the case of Amiata, to a river flowing from a built-in aquifer - calculate the aquifer depth and use linear regression on depths to get flow, because flow is based on water pressure in the aquifer).