In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import datetime

from collections import defaultdict

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import explained_variance_score, mean_squared_error, mean_absolute_error

from skopt import gp_minimize
from skopt.space import Real, Integer

from graphviz import Digraph
from IPython.display import SVG

import warnings

from lightgbm import LGBMRegressor

In [None]:
def fix_date(df):
    """Parse the ``Date`` column of ``df`` and use it as the index, in place.

    The column is read as day/month/year text (pandas does not read it in
    correctly by default). ``Date`` stays available as a regular column, and
    a ``dayofyear`` column is derived from it. Nothing is returned; ``df``
    itself is modified.
    """
    parsed_dates = pd.to_datetime(df['Date'], format="%d/%m/%Y")
    df['Date'] = parsed_dates
    # drop=False keeps 'Date' as a column in addition to the index
    df.set_index('Date', drop=False, inplace=True)
    day_numbers = df['Date'].dt.dayofyear
    df['dayofyear'] = day_numbers

In [None]:
# The main rainfall effect model

def rainfall_effect(
    # data parameters
    rain_series, # pd series where index is date and value is amount of rain which fell that day
    start_date = None,
    end_date = None,
    # simulation parameters
    fraction_retained = 0.9, # fraction of water retained each day (vs. fraction that is carried away elsewhere - pooling, transpiration, etc.)
    first_day_flow = 0.004, # what fraction of the rain takes effect on the first day
    funnel_start_width = 0.0, # when 0, funnel is cone-shaped; when large, funnel is closer to cylinder-shaped.
    time_gap = 0, # integer(days) - how long does it take even the first water to reach the area of interest
):
    """Spread each day's rainfall over the following days and total the effect per day.

    For every date from ``start_date`` to ``end_date`` (inclusive) the rain that
    fell that day is released over subsequent days: each day an amount
    proportional to the retained water and to a "funnel area" factor (which
    starts at ``funnel_start_width + 1`` and grows by 2 per day) takes effect,
    and what is left is multiplied by ``fraction_retained``. Release for a given
    day's rain stops once less than 0.01 remains or ``end_date`` is passed.

    Parameters
    ----------
    rain_series : pd.Series indexed by date, values are rainfall amounts.
    start_date, end_date : optional bounds; default to the first / last valid
        index of ``rain_series``.
    fraction_retained, first_day_flow, funnel_start_width, time_gap :
        simulation parameters, see the inline comments in the signature.

    Returns
    -------
    defaultdict(int) mapping date -> total water taking effect on that date.
    Every simulated input date gets an explicit entry (0 if nothing arrives).
    Empty if the series has no valid entries and no bounds were supplied.
    """
    # calculate default start and end date
    if start_date is None:
        start_date = rain_series.first_valid_index()
    if end_date is None:
        end_date = rain_series.last_valid_index()

    # total rain "taking effect" on a given day
    rain_effect = defaultdict(int)

    # An empty / all-NaN series has no valid index, so there is nothing to
    # simulate (and comparing None dates below would raise TypeError).
    if start_date is None or end_date is None:
        return rain_effect

    # calculate flow speed per "funnel unit"
    first_day_area = funnel_start_width+1.0
    flow_speed = first_day_flow/first_day_area

    # loop-invariant time steps
    one_day = datetime.timedelta(1)
    arrival_delay = datetime.timedelta(int(time_gap))

    # process rain coming in each day
    current_date = start_date
    while current_date <= end_date:
        rain = 0
        if current_date in rain_series.index:
            rain = rain_series[current_date]

        # start with explicit 0 for each input date
        if current_date not in rain_effect:
            rain_effect[current_date] = 0

        # iterate through upcoming days and calculate effect of rain which reaches the body *on that day*
        retained_remaining = rain # this variable keeps track of the effect of retention/drainage (but ignores actual outflow)
        # effectively, it is used to infer how much water is left which originated on a given "daily level" of the funnel.
        total_water_remaining = rain # actual amount of rain remaining "un-claimed"
        current_area_factor = first_day_area
        rain_effect_date = current_date + arrival_delay
        # NaN rain fails both >= comparisons, so missing values contribute nothing
        while retained_remaining >= 0.01 and total_water_remaining >= 0.01 and rain_effect_date <= end_date:
            water_out = current_area_factor*flow_speed*retained_remaining
            # never release more water than is actually left
            water_out = min(water_out, total_water_remaining)

            # update totals
            rain_effect[rain_effect_date] += water_out
            total_water_remaining -= water_out

            # update running state variables
            retained_remaining *= fraction_retained
            total_water_remaining *= fraction_retained
            current_area_factor += 2
            rain_effect_date += one_day

        current_date += one_day

    return rain_effect

In [None]:
# error metric: global error variance
def global_error_var(correct, pred):
    """Return the population variance (ddof=0) of the residuals ``correct - pred``."""
    residuals = correct - pred
    return residuals.var(ddof=0)

In [None]:
def local_error_var(correct, pred):
    """Return the mean of the 10-row rolling population variance of ``correct - pred``.

    Windows with fewer than 10 rows yield NaN and are skipped by the mean.
    """
    residuals = correct - pred
    windowed_variance = residuals.rolling(10).var(ddof=0)
    return windowed_variance.mean()

In [None]:
# Helper functions to define more elaborate priors for gp_minimize than just "uniform" and "log-uniform"
 
# reverse a log-uniform prior so that bigger values are more likely
def make_inverse_loguniform_prior(name, lower=None, upper=None):
    """Build a (convert, dimension) pair for a reversed log-uniform prior.

    ``dimension`` is a log-uniform ``Real`` over (eps, upper - lower);
    ``convert`` maps a sampled value ``x`` to ``upper - x``. ``lower`` and
    ``upper`` default to eps and 1 - eps respectively.
    """
    tiny = np.finfo(float).eps

    if lower is None:
        lower = 0 + tiny
    if upper is None:
        upper = 1 - tiny

    dimension = Real(0 + tiny, upper-lower, name=name, prior='log-uniform')

    def convert(x):
        # small sampled values (the likely ones) land close to `upper`
        return upper-x

    return convert, dimension

# set up a dimension such that the resulting converted variable will have a logistic distribution
# with given s, mu, and lower/upper bounds.
def make_logit_prior(name, s = 1, m = 0, lower=None, upper=None):
    """Build a (convert, dimension) pair whose converted samples follow a logistic shape.

    ``dimension`` is a ``Real`` on a sub-interval of (0, 1); ``convert`` applies
    the logit, scaled by ``s`` and shifted by ``m``. When ``lower`` / ``upper``
    are given, the interval ends are the logistic function evaluated at those
    bounds (nudged inward by eps); otherwise they are eps and 1 - eps.
    """
    tiny = np.finfo(float).eps

    def to_unit_interval(value):
        # logistic function with location m and scale s
        return 1/(1+np.exp((m-value)/s))

    # x should be between 0 and 1
    def convert_using_logit(x):
        return m+np.log(x/(1-x))*s

    if lower is None:
        lower_x = 0 + tiny
    else:
        lower_x = to_unit_interval(lower + tiny)

    if upper is None:
        upper_x = 1 - tiny
    else:
        upper_x = to_unit_interval(upper - tiny)

    dimension = Real(lower_x, upper_x, name=name)

    return convert_using_logit, dimension

In [None]:
# wrap rain effect calculation into a format which can be plugged into gp_minimize, with custom priors.
def make_rain_func_and_dimensions(
    rain_series, 
    retained_prior = None,
    flow_prior = None,
    width_prior = None,
    lag_prior = None,
    verbose=True
):
    """Wrap ``rainfall_effect`` for one rain series as a function of a raw parameter vector.

    Each ``*_prior`` argument is an optional ``(convert, dimension)`` pair; a
    default pair is generated when it is None.

    Returns ``(calc_rain_from_vector, dimensions, convert)`` where
    ``dimensions`` is the list of four search dimensions (retained, flow,
    width, lag), ``convert`` maps a raw vector to model parameters, and
    ``calc_rain_from_vector`` converts a raw vector and returns the rain
    effect as a ``pd.Series``.
    """
    def identity(x):
        return x

    # Fill in default priors for anything the caller did not provide
    if retained_prior is None:
        retained_prior = make_logit_prior('fraction_retained', s=0.3, m=0.9, lower=0.0, upper=1.0)

    if flow_prior is None:
        # we set the lower bound to 0.0001 for practical reasons: lower values don't make much of a cumulative effect,
        # but they do take a long time to compute because the effect of each day's rainfall is spread out over many more days.
        flow_prior = make_logit_prior('first_day_flow', s=0.05, m=0.01, lower=0.0001, upper=1.0)

    if width_prior is None:
        width_prior = make_logit_prior('funnel_start_width', s=40.0, m=180.0, lower=0.0)

    if lag_prior is None:
        # the lag is searched directly as an integer number of days
        lag_prior = (identity, Integer(0,10, name='time_gap'))

    retained_conv, retained_dim = retained_prior
    flow_conv, flow_dim = flow_prior
    width_conv, width_dim = width_prior
    lag_conv, lag_dim = lag_prior

    conversions = [retained_conv, flow_conv, width_conv, lag_conv]
    dimensions = [retained_dim, flow_dim, width_dim, lag_dim]

    def convert(x):
        # apply the i-th conversion to the i-th raw value
        converted = []
        for position, raw_value in enumerate(x):
            converted.append(conversions[position](raw_value))
        return converted

    def calc_rain_from_vector(x):
        fraction_retained, first_day_flow, funnel_start_width, time_gap = convert(x)

        if verbose:
            print('inputs:', fraction_retained, first_day_flow, funnel_start_width, time_gap)

        effect_by_date = rainfall_effect(
            # data parameters
            rain_series,
            # simulation parameters
            fraction_retained = fraction_retained,
            first_day_flow = first_day_flow,
            funnel_start_width = funnel_start_width,
            time_gap=time_gap
        )
        return pd.Series(effect_by_date)

    return calc_rain_from_vector, dimensions, convert

In [None]:
# use Bayesian optimization with linear regression to fit ML model of rain effects, plus any set of linear-effect parameters.
def fit_rain_effects(
    # inputs
    rains_df, 
    ground_truth, 
    make_rain_func=make_rain_func_and_dimensions,
    additional_fields=None, # additional fields to throw into linear regression
    # options
    error_func = global_error_var,
    spinup=30, # currently unused; kept so existing callers keep working
    verbose=True,
    nrandom=20,
    ntotal=100,
    x0=None, # optional input point(s) to try for gp_minimize; e.g. best overall results of previous runs
):
    """Fit rain-effect parameters for every column of ``rains_df`` against ``ground_truth``.

    For each candidate parameter vector proposed by ``gp_minimize``, the rain
    effect of every rain column is computed, a ``LinearRegression`` is fitted
    on those effects plus ``additional_fields``, and ``error_func`` of the
    regression's predictions is the value being minimized.

    Parameters
    ----------
    rains_df : DataFrame, one rainfall series per column.
    ground_truth : Series to predict.
    make_rain_func : factory returning ``(calc, dimensions, convert)`` per rain series.
    additional_fields : optional DataFrame of extra linear regressors.
    error_func : callable ``(correct, pred) -> float``.
    nrandom, ntotal : number of random starts / total evaluations for ``gp_minimize``.
    x0 : optional starting point(s) for ``gp_minimize``.

    Returns
    -------
    The prediction function built by ``generate_prediction_function`` from the
    best parameters and the linear regression that achieved the lowest error.
    """
    calcs = []
    rain_names = []
    converts = []
    all_dims = []
    dims_per_rain = 0
    # DataFrame.items() is the supported spelling; iteritems() was removed in pandas 2.0
    for rain_name, rain_series in rains_df.items():
        calc, dims, convert = make_rain_func(rain_series, verbose=verbose)
        dims_per_rain = len(dims)
        calcs.append(calc)
        converts.append(convert)
        rain_names.append(rain_name)
        all_dims += dims

    additional_names = []
    if additional_fields is not None:
        additional_names = list(additional_fields.columns)

    # Build a NEW list: aliasing rain_names and extending it in place would
    # silently add the additional field names to rain_names as well.
    reg_fields = rain_names + additional_names

    optimal_error = None
    optimal_linreg = None
    optimal_n = None

    def calculate_error(x):
        prediction_frame = ground_truth.to_frame(name='ground_truth')

        for i, rain_calc in enumerate(calcs):
            if(verbose):
                print(rain_names[i])
            # each rain series owns a consecutive slice of the parameter vector
            rain_result = rain_calc(x[dims_per_rain*i:dims_per_rain*(i+1)])
            prediction_frame[rain_names[i]] = rain_result
            # where there is ground truth but no computed rain effect, use 0
            prediction_frame.loc[(prediction_frame['ground_truth'].notnull()) & (prediction_frame[rain_names[i]].isnull()), rain_names[i]] = 0

        if additional_fields is not None:
            prediction_frame[additional_names] = additional_fields

        without_nulls = prediction_frame.dropna().copy()

        reg = LinearRegression().fit(without_nulls[reg_fields], without_nulls['ground_truth'])
        if verbose:
            print('rescale parameters:', reg.coef_, reg.intercept_)

        without_nulls['pred'] = reg.predict(without_nulls[reg_fields])

        error = error_func(without_nulls['ground_truth'], without_nulls['pred'])
        if verbose:
            print('error value:', error)
            print()

        # remember the regression belonging to the best parameters seen so far
        nonlocal optimal_error, optimal_linreg, optimal_n
        if optimal_error is None or error < optimal_error:
            optimal_error = error
            optimal_linreg = reg
            optimal_n = len(without_nulls)

        return error

    res = gp_minimize(
        calculate_error,  # function to minimize
        all_dims,             # dimension configuration
        acq_func="gp_hedge",    # acquisition function (PI = optimize probability of reducing error; 'gp_hedge' - guess/vary)
        n_calls=ntotal,      # number of evaluations of f
        n_random_starts=nrandom, # first n calls are random (avoid local minima) 
        x0=x0, # input points to definitely try
    )  

    return generate_prediction_function(rain_names, additional_names, res, converts, optimal_linreg, optimal_n)

In [None]:
# Calculate BIC from error variance (making the gaussian assumption)
def get_bic(errvar, n, k):
    """Return ``n*log(errvar) + k*log(n)`` for ``n`` samples and ``k`` parameters."""
    fit_term = n*np.log(errvar)
    complexity_penalty = k*np.log(n)
    return fit_term + complexity_penalty

In [None]:
# given the outputs of a rain effect model fit, return a function which will generate the predictions based on that model.
def generate_prediction_function(rain_names, additional_field_names, fit_result, conversions, optimal_linreg, training_n, verbose=True):
    """Turn a finished fit into a ``predict_from_rain(rain_fields, additional_fields=None)`` function.

    ``fit_result.x`` is split into equal consecutive slices, one per entry of
    ``conversions``, and each slice is converted into rain-effect parameters.
    With ``verbose`` set, the parameters, regression coefficients, error value
    and BIC are printed. The returned function recomputes the rain effects for
    new rain data with those parameters and feeds them (plus any additional
    fields) to ``optimal_linreg``; rows with missing values are dropped.
    """
    # get converted parameters for rain effect calculations
    dims_per_rain = len(fit_result.x)//len(conversions)
    all_rain_params = [
        convert(fit_result.x[dims_per_rain*i:dims_per_rain*(i+1)])
        for i, convert in enumerate(conversions)
    ]

    if verbose:
        for name, params in zip(rain_names, all_rain_params):
            print(f'Parameters for {name}: {params}')
        print('Scaling:')
        scaled_names = rain_names+additional_field_names
        for name, coef in zip(scaled_names, optimal_linreg.coef_):
            print(f'  {name}: {coef}')
        print(f'Translation parameter: {optimal_linreg.intercept_}')
        print(f'raw gp_minimize parameters: {fit_result.x}')
        print(f'error value: {fit_result.fun}')
        print(f'BIC (assuming error metric is error variance): {get_bic(fit_result.fun, training_n, len(fit_result.x)+len(optimal_linreg.coef_)+1)}')

    # function to generate prediction from trained parameters
    def predict_from_rain(rain_fields, additional_fields=None):
        pred_df = pd.DataFrame(index=rain_fields.index)

        for rain_name, rain_params in zip(rain_names, all_rain_params):
            retained, flow, width, gap = rain_params
            effect_by_date = rainfall_effect(
                rain_fields[rain_name],
                fraction_retained = retained,
                first_day_flow = flow,
                funnel_start_width = width,
                time_gap = gap,
            )
            pred_df[rain_name] = pd.Series(effect_by_date)

        if additional_fields is not None:
            pred_df[list(additional_fields.columns)] = additional_fields

        # the regression cannot handle missing inputs
        pred_df.dropna(inplace=True)
        predicted_values = optimal_linreg.predict(pred_df)
        return pd.Series(predicted_values, index=pred_df.index)

    return predict_from_rain


In [None]:
def get_expected_inputs(input_data, dayofyear):
    """Estimate a typical value and variance of every input field per day of year.

    The data is first smoothed with a centred 7-row rolling mean. Then, for
    each day of the year, the smoothed values of all rows whose day-of-year
    lies within +/-3 days (wrapping around the year end, modulo 365) are
    pooled, and their mean and variance are recorded.

    Parameters
    ----------
    input_data : DataFrame of input fields.
    dayofyear : Series aligned with ``input_data`` giving each row's 1-based day of year.

    Returns
    -------
    (expectations, variances): two dicts mapping field name -> list of 366
    values; position ``i`` corresponds to day of year ``i + 1``. Day 366 is a
    copy of day 365.
    """
    week_rolling_mean = input_data.rolling(7, center=True).mean()
    expectations = defaultdict(list)
    variances = defaultdict(list)
    for d in range(365):
        # d is zero-based; the stop of d+4 is exclusive, so this covers the
        # symmetric window d-3 .. d+3, and the +1 converts to 1-based days.
        nearby_days = np.arange(d-3, d+4)%365+1
        near_data = week_rolling_mean[dayofyear.isin(nearby_days)]
        for field in week_rolling_mean.columns:
            expectations[field].append(near_data[field].mean())
            variances[field].append(near_data[field].var())

    for field in week_rolling_mean.columns:
        expectations[field].append(expectations[field][-1])# hack to do  *something* about leap years.
        variances[field].append(variances[field][-1])
    return expectations, variances

In [None]:
def gen_inputs_with_expectations(input_data, days_to_predict, expected_means):
    """Extend ``input_data`` with rows for ``days_to_predict`` filled with expected values.

    Parameters
    ----------
    input_data : DataFrame of observed inputs, indexed by date.
    days_to_predict : dates to append (e.g. a DatetimeIndex).
    expected_means : mapping field -> list of 366 expected values, where
        position ``i`` is the value for day of year ``i + 1``.

    Returns
    -------
    DataFrame containing the observed rows followed by the forecast rows. For
    every field of ``input_data``, forecast rows hold the expected value for
    their day of year. The result also carries a ``dayofyear`` column and one
    ``<field>_expected`` column per field.
    """
    # DataFrame.append was removed in pandas 2.0; concatenate explicit
    # all-NaN placeholder rows instead.
    future_rows = pd.DataFrame(index=days_to_predict, columns=input_data.columns, dtype=float)
    expected_data = pd.concat([input_data, future_rows])
    expected_data['dayofyear'] = expected_data.index.dayofyear

    # rows whose values must be replaced by the day-of-year expectation
    is_forecast_day = expected_data.index.isin(list(days_to_predict))

    for field in input_data.columns:
        exp_df = pd.DataFrame(expected_means[field], index=range(1, 367), columns=[field+'_expected'])
        expected_data = expected_data.join(exp_df, on='dayofyear')
        expected_data[field] = expected_data[field].where(~is_forecast_day, expected_data[field+'_expected'])
    return expected_data

## Hello

So I rushed my competition submission (made the last changes literally 3 minutes before the deadline...) so I kind of bungled the implementation of my lake prediction. But I wanted to prove to myself that my model was viable, and it seems like it is if I actually do the math right and combine the components correctly.

So here it is. If you're curious about what my general model is doing (the `fit_rain_effects` function), briefly: it's a model of how rainfall on a particular day takes effect over the subsequent days, which has four physically-meaningful parameters and the emergent property of generating an $\frac{x}{e^x}$ distribution. Plus some linear regression with other arbitrary parameters thrown in on top. All fit to the data using Bayesian optimization.

Oh yes, also, when forecasting, I use a sort of smoothed average of the rain and temperature one would expect to see on that day of year as inputs to the model.

If you want more detail, you'll have to read the code or wait until I figure out what the actual rules are about publishing submissions. Sorry. Hopefully the bits I do explain are still interesting too.

## Bilancino

### Data Overview

In [None]:
# Load the Lake Bilancino data and parse/index its dates (fix_date works in place).
bilancino=pd.read_csv('../input/acea-water-prediction/Lake_Bilancino.csv')
fix_date(bilancino)

# Rainfall columns, used for the overview plot below
rain_fields = ['Rainfall_S_Piero', 'Rainfall_Mangona', 'Rainfall_S_Agata', 'Rainfall_Cavallina', 'Rainfall_Le_Croci']

# Two-row rolling mean of Flow_Rate: each row's flow averaged with the previous row's flow
bilancino['flow_mean_yesterday'] = bilancino['Flow_Rate'].rolling(2).mean()

# Row-to-row change in lake level
bilancino['delta_level'] = bilancino['Lake_Level'].diff()


# Trailing rolling means of temperature.
# NOTE(review): 'temp_120' is computed over a 90-row window despite its name. The
# forecasting inputs further down are built the same way, and the hard-coded
# regression coefficients were presumably fitted to this definition - confirm
# before changing either the window or the name.
bilancino['temp_30'] = bilancino['Temperature_Le_Croci'].rolling(30).mean()
bilancino['temp_120'] = bilancino['Temperature_Le_Croci'].rolling(90).mean()
bilancino['temp_180'] = bilancino['Temperature_Le_Croci'].rolling(180).mean()


# Show the available columns (notebook cell output)
bilancino.columns

In [None]:
# Overview figure with four stacked panels: rainfall, temperature, lake level, flow rate
fig, axes = plt.subplots(4, figsize=(15,10))

# Panel 0: 120-row rolling sum of every rainfall column (smooths the spiky daily values)
for r in rain_fields:
    bilancino[r].rolling(120).sum().plot(ax=axes[0])
    
# Panel 1: temperature
bilancino['Temperature_Le_Croci'].plot(ax=axes[1])

# Panel 2: lake level
bilancino['Lake_Level'].plot(ax=axes[2])

# Panel 3: flow rate
bilancino['Flow_Rate'].plot(ax=axes[3])
    
# Add a legend to every panel
for ax in axes:
    ax.legend()

In [None]:
# Split by date: training data runs up to and including train_cutoff,
# test data starts the day after it.
train_cutoff = datetime.date(2016,1,1)
b_train = bilancino[:train_cutoff].copy()
b_test = bilancino[train_cutoff+datetime.timedelta(1):]

### 1. Dependencies and model

In [None]:
# Dependency diagram for the lake system.
lake_graph = Digraph(graph_attr={'ranksep':'1'})

# Nodes: the two weather inputs, plus the two quantities of interest drawn as coloured octagons
lake_graph.node('R', 'Rainfall')
lake_graph.node('T', 'Temperature')
lake_graph.node('F', 'Flow out of dam', shape='octagon', color='blue')
lake_graph.node('L', 'Lake Level', shape='octagon', color='green')

# Edges are coloured like the node they point to:
# green edges feed into lake level, the blue edge feeds into flow.
lake_graph.edge('R', 'L', color='green')
lake_graph.edge('T', 'L', color='green')
lake_graph.edge('F', 'L', color='green')
lake_graph.edge('L', 'F', color='blue')

# Render the graph as the notebook cell output
lake_graph

The Bilancino lake is an interesting case, because its behavior depends on a man-made and human-operated structure - the dam that created the lake.

The level of water in the lake depends on the flow out of the dam, but the flow out of the dam also depends on the lake level. Specifically, the lake level determines the amount of pressure created, and therefore the strength of the flow.

According to the challenge description, water is let out of the dam quickly at certain times, and allowed to collect at other times.

Further, the data indicates that the flow spikes drastically when the lake level goes over a certain point - This seems to be the dam's spillway being activated. According to [this site](http://cmcgruppo.com/cmc/en/project/bilancino-dam/), the spillway has an automatic flap gate. This means that the gate opens wider when pressure increases, which makes the interaction even more complicated.

#### Modeling change in lake level

In my previous experimentation, I found that two of the rainfall fields contain most of the information: `Rainfall_Mangona` and `Rainfall_Cavallina`. This makes sense: `Rainfall_Cavallina` is the closest location to the lake itself, and `Rainfall_Mangona` is located over the Sieve river, before the river flows into the lake. Therefore, `Rainfall_Mangona` captures information about water which enters the lake via the river.

The new component here is how flow rate affects lake level: the amount of water that leaves the lake via the dam (or more precisely, *the amount that left yesterday*) is proportional to the subsequent reduction in lake level. So unlike the temperature parameters, **the parameters of flow in the linear regression actually have a direct physical meaning**. Specifically, the scaling coefficient tells us how to convert between flow rate and (change in) lake level.

In [None]:
# Fit the rain-effect model for the daily change in lake level on the training period.
# Two rainfall series go through the rain-effect model; flow and the temperature
# features enter the linear regression directly.
# Only 10 evaluations are run here (3 of them random); x0 supplies two raw
# parameter vectors to evaluate - 4 values per rainfall series, 8 in total.
b_pred_func=fit_rain_effects(
    rains_df = b_train[['Rainfall_Mangona', 'Rainfall_Cavallina']], 
    ground_truth = b_train['delta_level'], 
    additional_fields = b_train[['flow_mean_yesterday', 'Temperature_Le_Croci','temp_30', 'temp_120', 'temp_180']],
    nrandom=3,
    ntotal=10,
    verbose=False, 
    x0=[
        [0.15233691562216556, 0.8854084902188695, 0.9999999999999998, 1, 0.13666937295698817, 0.9979837155391087, 0.14202766104040032, 0],
        [0.0474258731775668, 0.9999999974825013, 0.570952205261587, 1, 0.3850443032078931, 0.45805440829961064, 0.545042159616547, 0],
    ]
)

This model predicts the **daily change in lake level** based on **temperature, rainfall, and flow out of the lake**.

But when forecasting, we won't actually know the flow out of the lake - it's one of the variables we need to predict! So instead of using the returned prediction function with flow as input, we can just move the flow effect to the other side of the equation, and predict the **cumulative change in water level as a result of weather inputs, including the water that subsequently flows out of the dam**.

$$\Delta\text{Lake_Level} = C+\text{Mangona_effect}*S_\text{RM}+\text{Cavallina_effect}*S_\text{RC}+
(\sum_i{\text{Temp_var}_i*S_i}) - \text{Flow_Rate}*S_F
$$

$$\Delta\text{Lake_Level} +\text{Flow_Rate}*S_F =  C+\text{Mangona_effect}*S_\text{RM}+\text{Cavallina_effect}*S_\text{RC}+
(\sum_i{\text{Temp_var}_i*S_i})
$$

(For clarity about whether water is flowing out or in, I inverted the $S_F$ parameter from what it is in the linear regression - so it is around 0.007 instead of -0.007)

In [None]:
# Conversion factor between flow rate and change in lake level
# (sign-inverted flow coefficient of the fitted linear regression)
flow_lake_conversion = 0.007661467849044978

# predict cumulative level delta + flow effect based on model of lake level
# (using parameters trained in the model above)
def predict_cum_delta(input_df):
    """Predict the weather-driven change in lake level for every row of ``input_df``.

    ``input_df`` must provide the columns ``Rainfall_Mangona``,
    ``Rainfall_Cavallina``, ``Temperature_Le_Croci``, ``temp_30``,
    ``temp_120`` and ``temp_180``. The two rainfall series are run through
    ``rainfall_effect`` with hard-coded fitted parameters, then combined with
    the temperature features using hard-coded regression coefficients. The
    flow term is deliberately left out.

    Returns ``(pred_cum_delta, Rainfall_Mangona_prediction,
    Rainfall_Cavallina_prediction)``, all ``pd.Series``.
    """
    mangona_params = dict(
        fraction_retained = 2.220446049250313e-16,
        first_day_flow = 1.0,
        funnel_start_width = 191.42948720854787,
        time_gap = 1,
    )
    cavallina_params = dict(
        fraction_retained = 0.7595424520378065,
        first_day_flow = 0.0015911180194193453,
        funnel_start_width = 187.22633570423756,
        time_gap = 0,
    )

    Rainfall_Mangona_prediction = pd.Series(
        rainfall_effect(input_df['Rainfall_Mangona'], **mangona_params)
    )
    Rainfall_Cavallina_prediction = pd.Series(
        rainfall_effect(input_df['Rainfall_Cavallina'], **cavallina_params)
    )

    # linear combination: rain effects, temperature features, then the intercept
    pred_cum_delta = (
        0.005862430953008586*Rainfall_Mangona_prediction
        + 1.4547492793031174*Rainfall_Cavallina_prediction
        + 0.0039183372980673425*input_df['Temperature_Le_Croci']
        + -0.0015752030857040245*input_df['temp_30']
        + -0.008107915323914416*input_df['temp_120']
        + 0.0048819421510388206*input_df['temp_180']
        + -0.010936677349591556
    )
    return pred_cum_delta, Rainfall_Mangona_prediction, Rainfall_Cavallina_prediction


In [None]:
# Predicted cumulative delta (plus the two individual rain effects) for the full dataset, using observed inputs
cum_delta, rm, rc = predict_cum_delta(bilancino)

We can then compare predictions to reality by adding the true delta-level and flow rate parameter, with the flow rate scaled according to the model:

In [None]:
# Observed counterpart of the prediction: change in level plus the flow term scaled by flow_lake_conversion
true_cum_delta =(bilancino['delta_level']+bilancino['flow_mean_yesterday']*flow_lake_conversion)

In [None]:
# Compare observed and predicted cumulative delta, from 2010 onwards.
# The prediction is drawn in two pieces, split at train_cutoff (test part in red).
fig, ax1 = plt.subplots(figsize=(15, 3))
true_cum_delta.plot(label='True cumulative change')
cum_delta[:train_cutoff].plot(alpha=0.7, label='Predicted cumulative change(training data)')
cum_delta[train_cutoff:].plot(alpha=0.7, label='Predicted cumulative change(test data)', color='red')
plt.legend()
plt.xlim(datetime.date(2010,1,1))
plt.show()

And then we can calculate a **cumulative sum** of the true and predicted deltas to see what would happen if water kept coming in the lake, but would magically never leave via the dam:

In [None]:
# Running totals of the observed and predicted deltas, from 2004 onwards.
fig, ax1 = plt.subplots(figsize=(15, 3))

# Start the observed series at the first date that has a prediction
true_cum_delta[cum_delta.first_valid_index():].cumsum().plot(label='True cumulative level')
cum_delta.cumsum().plot(label='Predicted cumulative level')
plt.legend()
plt.xlim(datetime.date(2004,1,1))
plt.title('Cumulative lake level predicted based on actual rain/temp data')
plt.show()

#### Modeling flow out of lake

As I mentioned above, the flow out of the lake is a complex process which depends on several factors, including human behavior.

I tentatively separated it into three types of flow:
- "normal" flow
- "drain" flow, when the water is being drained from the lake through the dam's intake process
- "spillway" flow, when the lake level is high enough that the spillway is active.

My manual guess about when each happens is shown below:
(lake level is on one axis, in dashed lines, and flow rate is on another, in a solid line)

In [None]:
# label specific types of flow

# dates when "intake" (from the lake into the river) seems to happen
# start is a hand-picked day of year; end is the day of year of 1 November (taken in 2008)
start_intake = 180# pd.to_datetime(datetime.date(2008, 7, 1)).dayofyear
end_intake = pd.to_datetime(datetime.date(2008, 11, 1)).dayofyear

# rows inside the intake window are labelled 'intake'; this assignment creates the 'flow_type' column
bilancino.loc[(bilancino['dayofyear'] >= start_intake) & (bilancino['dayofyear'] <= end_intake), 'flow_type']='intake'

# spillway is active
# (assigned after 'intake', so a high lake level overrides the intake label)
# bilancino.loc[bilancino['Flow_Rate'] > 7.8, 'flow_type']='spillway'
bilancino.loc[bilancino['Lake_Level'] > 251.5, 'flow_type']='spillway'

# Lake level (dashed, left axis) and flow rate (solid, right axis).
# Each series is drawn once in orange for all rows, then drawn again in blue / red
# for the rows labelled 'intake' / 'spillway' (.where blanks out all other rows).
fig, ax1 = plt.subplots(figsize=(15, 5))
ax2 = ax1.twinx()
bilancino['Lake_Level'].plot(ax=ax1, color='orange', linestyle='dashed')
bilancino.where(bilancino['flow_type']=='intake')['Lake_Level'].plot(ax=ax1, color='blue', linestyle='dashed')
bilancino.where(bilancino['flow_type']=='spillway')['Lake_Level'].plot(ax=ax1, color='red', linestyle='dashed')
bilancino['Flow_Rate'].plot(ax=ax2, color='orange', label='normal flow')
bilancino.where(bilancino['flow_type']=='intake')['Flow_Rate'].plot(ax=ax2, color='blue', label='draining')
bilancino.where(bilancino['flow_type']=='spillway')['Flow_Rate'].plot(ax=ax2, color='red', label='spillway')
plt.legend()
plt.show()


However, I found it very hard to fit parameters to expected flow rates in each case. There seems to be a lot of non-linearity in the relationships; additionally, the relationship potentially changes in the last two years - the scatterplot below shows the relationship between flow rate and lake level; we can clearly see two greenish-yellow lines that do not conform to the pattern. These represent spillway flow after 2018 - it appears that the spillway started activating earlier. Again, human behavior makes things less predictable.

For this reason, I chose to cut off the training dataset at the start of 2016, and focus on predicting the year that follows (up to the start of 2017). We can also see what happens to the subsequent years, and whether this shift changes things.

In [None]:
# Flow rate against lake level, with points coloured by calendar year
plt.scatter(bilancino['Lake_Level'],bilancino['Flow_Rate'], marker='x',c=bilancino['Date'].dt.year)
plt.show()

Because this is a complex and piecewise relationship, decision trees, and specifically boosted forests (aka LGBM) are a good fit.

I generated some derived features for the LGBM which capture my beliefs about the important pieces of this puzzle:
- my best guess about when draining typically starts, ends, and how it ramps up (just from eyeballing the data)
- stats about the last 60 days of lake level - because I suspect there is some inertia in the system, possibly due to the flap gate.

In [None]:
# Build the LGBM feature table: effective lake level, its 60-row rolling
# statistics, and a hand-made seasonal "intake ramp-up" factor.
lgbm_input = bilancino[['Lake_Level']].copy() #, 'dayofyear'

# Effective level = lake level plus the level equivalent of the water that flowed out
# (flow_lake_conversion comes from the lake level analysis above)
lgbm_input['effective_level'] = bilancino['Lake_Level']+bilancino['flow_mean_yesterday']*flow_lake_conversion
lgbm_input.drop('Lake_Level',axis=1, inplace=True)
lgbm_input['rolling_min'] = lgbm_input['effective_level'].rolling(60).min()
lgbm_input['rolling_max'] = lgbm_input['effective_level'].rolling(60).max()
lgbm_input['rolling_mean'] = lgbm_input['effective_level'].rolling(60).mean()

# Seasonal ramp-up factor: 0 outside [intake_start, intake_end], rising linearly
# from 0 to 1 between intake_start and rampup_end, then 1 until intake_end.
intake_start = 160 # Nth day in year
rampup_end = 220
intake_end = 360 # end_intake  # stop "intake" drain
intake_rampup = bilancino['dayofyear']
intake_rampup = (intake_rampup-intake_start)/(rampup_end-intake_start)
intake_rampup[(bilancino['dayofyear']<intake_start) | (bilancino['dayofyear']>intake_end)] = 0
intake_rampup[(bilancino['dayofyear']>rampup_end) & (bilancino['dayofyear']<=intake_end)] = 1

lgbm_input['intake_rampup'] = intake_rampup

# Only data up to lgbm_test_cutoff is used for fitting and evaluating the LGBM
lgbm_test_cutoff = datetime.date(2017, 1, 1)
lgbm_input = lgbm_input.loc[:lgbm_test_cutoff].copy()
lgbm_flow_rate = bilancino.loc[:lgbm_test_cutoff, 'Flow_Rate']


# Label-based slices include both end points, so the test slice starts the day
# AFTER train_cutoff; otherwise the cutoff row would be in both sets.
# This matches how b_test is defined.
test_start = train_cutoff+datetime.timedelta(1)
X_train = lgbm_input.loc[:train_cutoff]
y_train = lgbm_flow_rate[:train_cutoff]
X_test = lgbm_input.loc[test_start:]
y_test = lgbm_flow_rate[test_start:]

reg = LGBMRegressor().fit(X_train, y_train)

# Observed flow rate with the LGBM predictions for the training and test periods
fig, ax1 = plt.subplots(figsize=(15, 5))

bilancino['Flow_Rate'].plot(ax=ax1)
ax1.plot(X_train.index,reg.predict(X_train), label='LGBM prediction(training data)')
ax1.plot(X_test.index,reg.predict(X_test), label='LGBM prediction(test data)')
plt.xlim(None, lgbm_test_cutoff)
plt.legend()
plt.show()

In [None]:
# LGBM predictions as date-indexed series, so they align with the observed flow rate
train_pred = pd.Series(reg.predict(X_train), index=X_train.index)
test_pred = pd.Series(reg.predict(X_test), index=X_test.index)

# Error variance per split; the subtraction aligns on the index, and dates
# without a prediction become NaN, which .var() skips
print('train error variance:',(train_pred-bilancino['Flow_Rate']).var(),'\ntest error variance:', (test_pred-bilancino['Flow_Rate']).var())

### 2. Forecasting

Forecasting is tricky because of the feedback loop between flow and lake level.

For this reason, I am using a custom procedure to integrate the cumulative-delta predictions and the LGBM's flow predictions:
1. predict **cumulative delta-level** for all dates
2. For each date in the dataset (in order), repeat the following:
    - take the previous date's lake level (real for day 1, predicted for the rest of the time period)
    - add the delta-level generated in step 1
    - use this "effective lake level" as input to the LGBM (it was actually trained on these values)
    - Take the flow predicted by the LGBM, and calculate the reduction in lake level (by multiplying by the conversion factor known from the rain-effect model); subtract the result from the "effective lake level" to get the actual lake level

In [None]:
# Typical rain/temperature per day of year, estimated from the training period only
exp_means, exp_vars = get_expected_inputs(
    b_train[['Rainfall_Mangona', 'Rainfall_Cavallina', 'Temperature_Le_Croci']], b_train['dayofyear'])
# Training inputs followed by the test dates, where the test dates carry the expected values
exp_inputs = gen_inputs_with_expectations(
    b_train[['Rainfall_Mangona', 'Rainfall_Cavallina', 'Temperature_Le_Croci']],
    b_test.index,
    exp_means
)

# Rolling temperature features, built with the same windows as for the observed data
# (note: 'temp_120' uses a 90-row window there as well)
exp_inputs['temp_30'] = exp_inputs['Temperature_Le_Croci'].rolling(30).mean()
exp_inputs['temp_120'] = exp_inputs['Temperature_Le_Croci'].rolling(90).mean()
exp_inputs['temp_180'] = exp_inputs['Temperature_Le_Croci'].rolling(180).mean()

# Creates a 'flow_mean_yesterday' column that is 0 from train_cutoff on (NaN before);
# predict_cum_delta does not read this column
exp_inputs.loc[train_cutoff:,'flow_mean_yesterday'] = 0

In [None]:
# Predicted cumulative delta using observed inputs for training dates and expected inputs for test dates
pred_cum_delta, rm, rc=predict_cum_delta(exp_inputs)

In [None]:
# Running totals again, now with the forecast part driven by expected rain/temperature.
# The predicted running total is drawn in two pieces, split at train_cutoff.
fig, ax1 = plt.subplots(figsize=(15, 3))

true_cum_delta[cum_delta.first_valid_index():].cumsum().plot(label='True cumulative level')
pred_cum_delta.cumsum()[:train_cutoff].plot(label='Predicted cumulative level(train data)')
pred_cum_delta.cumsum()[train_cutoff:].plot(label='Predicted cumulative level(expected rain/temp)')
plt.legend()
plt.xlim(datetime.date(2004,1,1))
plt.title('Cumulative lake level predicted based on expected rain/temp')
plt.show()

In [None]:
# Recursive forecast: each day's predicted lake level feeds the next day's prediction.
current_lake_level = b_train.iloc[-1]['Lake_Level'] # start with last lake level in the training dataset

# The LGBM was trained on rolling statistics of the *effective* level (lake level
# plus scaled flow) over a 60-row window that includes the current row. Seed the
# history with the last 59 effective levels of the training data, so that after
# appending today's effective level the window holds the same kind of values.
train_effective_level = b_train['Lake_Level']+b_train['flow_mean_yesterday']*flow_lake_conversion
level_history = list(train_effective_level.iloc[-59:])

flow_predictions = {}
level_predictions = {}

for date in b_test.index:
    # calculate LGBM inputs
    effective_lake_level = current_lake_level+pred_cum_delta[date]
    level_history.append(effective_lake_level)
    recent_levels = level_history[-60:]
    min_60 = min(recent_levels)
    max_60 = max(recent_levels)
    mean_60 = np.mean(recent_levels)

    # Same piecewise definition as the 'intake_rampup' training feature:
    # 0 outside [intake_start, intake_end], linear ramp up to rampup_end, then 1.
    day = date.dayofyear
    if day < intake_start or day > intake_end:
        intake_rampup_factor = 0
    elif day > rampup_end:
        intake_rampup_factor = 1
    else:
        intake_rampup_factor = (day-intake_start)/(rampup_end-intake_start)
    
    # predict (feature order matches the training columns)
    pred_flow = reg.predict([[
        effective_lake_level,
        min_60,
        max_60,
        mean_60,
        intake_rampup_factor]])[0]
    # remove the level equivalent of the predicted outflow to get the actual level
    pred_level = effective_lake_level -(pred_flow*flow_lake_conversion)
    
    level_predictions[date] = pred_level
    flow_predictions[date] = pred_flow
    
    # update state
    current_lake_level = pred_level
    


In [None]:
# Observed lake level over the test period against the recursive forecast
fig, ax1 = plt.subplots(figsize=(15, 3))

b_test['Lake_Level'].plot()
pd.Series(level_predictions).plot(label='Predicted lake level')
plt.legend()
plt.title('Lake Level predictions')
plt.show()

In [None]:
# Forecast accuracy for lake level: over the whole test period, and over the
# part of it up to 2017-01-01.
level_forecast = pd.Series(level_predictions)
first_year_end = datetime.date(2017,1,1)

print('Lake Level:')
print('MAE(all years):', mean_absolute_error(b_test['Lake_Level'], level_forecast))
print('MAE(one year):', mean_absolute_error(b_test.loc[:first_year_end,'Lake_Level'], level_forecast.loc[:first_year_end]))
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and no longer accepted by recent scikit-learn.
print('RMSE(all years):', np.sqrt(mean_squared_error(b_test['Lake_Level'], level_forecast)))
print('RMSE(one year):', np.sqrt(mean_squared_error(b_test.loc[:first_year_end,'Lake_Level'], level_forecast.loc[:first_year_end])))

In [None]:
# Observed vs. simulated dam flow rate over the test period.
fig, ax1 = plt.subplots(figsize=(15, 3))

b_test['Flow_Rate'].plot(ax=ax1)
# `label=` is what names the line in the legend. Passing the text as `legend=`
# only switched the legend on and left the prediction line without its name.
pd.Series(flow_predictions).plot(ax=ax1, label='Predicted dam flow rate')

plt.title('Flow Rate predictions')
plt.legend()
plt.show()

In [None]:
# Error metrics for the simulated dam flow rate: over the whole test period, and
# over the slice of the test period ending on 2017-01-01.
_one_year_end = datetime.date(2017, 1, 1)
_actual_flow = b_test['Flow_Rate']
_predicted_flow = pd.Series(flow_predictions)

print('Flow Rate:')
print('MAE(all years):', mean_absolute_error(_actual_flow, _predicted_flow))
print('MAE(one year):', mean_absolute_error(_actual_flow.loc[:_one_year_end], _predicted_flow.loc[:_one_year_end]))
# RMSE is taken as sqrt(MSE) explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated and later removed from scikit-learn.
print('RMSE(all years):', np.sqrt(mean_squared_error(_actual_flow, _predicted_flow)))
print('RMSE(one year):', np.sqrt(mean_squared_error(_actual_flow.loc[:_one_year_end], _predicted_flow.loc[:_one_year_end])))

We can see that the lake level predictions capture the general pattern relatively well, but are a lot smoother than reality, especially right around the rainy winter season. This is because of my choice of input for the forecasting part of the model: historical data averaged over a week and then averaged again. Because the lake + dam system is sensitive to spikes in the rain, and because rain data tends to be so spiky, taking the average doesn't capture the behavior as well as it does for other water bodies.

In fact, we can see that the predicted flow never goes into the "spillway" behavior - precisely because spillways are *designed* in large part for mitigating sudden spikes in rain.

We also see that the LGBM tends to produce much more gradual declines in flow than in reality, whereas in the real world, the flow almost seems to switch between several different levels.

Having more information about the dam and its operation might help make a more precise model. For example, just having the dates and ramp-up procedure for water intake might help isolate that effect, and analyze the other effects in more detail.

### Using real rain/temperature
In order to see the effect of using averaged expected inputs, I re-ran the prediction with the **actual** rain and temperature values. 

In [None]:
# Rebuild `pred_cum_delta` by passing the `bilancino` frame itself to predict_cum_delta.
# This overwrites the earlier `pred_cum_delta`, so the forecast loop below picks up the new values.
# NOTE(review): `rm` and `rc` are not used in the cells that follow here; presumably
# auxiliary outputs of predict_cum_delta - confirm against its definition.
pred_cum_delta, rm, rc = predict_cum_delta(bilancino)

In [None]:
# Roll the lake + dam model forward one day at a time over the test period,
# using the freshly recomputed `pred_cum_delta`. Each day's predicted level
# becomes the starting level for the next day.
current_lake_level = b_train['Lake_Level'].iloc[-1]  # start with last lake level in the training dataset
level_history = list(b_train['Lake_Level'].iloc[-60:])  # seed so the 60-day min/max/mean are available from day one

flow_predictions = {}
level_predictions = {}

for date in b_test.index:
    # --- build the LGBM input row ---
    effective_lake_level = current_lake_level + pred_cum_delta[date]

    _window = level_history[-60:]
    min_60, max_60, mean_60 = min(_window), max(_window), np.mean(_window)

    # NOTE(review): once dayofyear reaches intake_end the `elif` branch still applies,
    # so the factor keeps growing past 1 (assuming intake_start < rampup_end < intake_end).
    # Confirm this matches how the feature was built for training before changing it.
    _doy = date.dayofyear
    if rampup_end < _doy < intake_end:
        intake_rampup_factor = 1
    elif intake_start < _doy:
        intake_rampup_factor = (_doy - intake_start) / (rampup_end - intake_start)
    else:
        intake_rampup_factor = 0

    # --- predict flow, then derive the resulting lake level ---
    _row = [effective_lake_level, min_60, max_60, mean_60, intake_rampup_factor]
    pred_flow = reg.predict([_row])[0]
    pred_level = effective_lake_level - (pred_flow * flow_lake_conversion)

    flow_predictions[date] = pred_flow
    level_predictions[date] = pred_level

    # --- carry state into the next day ---
    current_lake_level = pred_level
    level_history.append(pred_level)
    


In [None]:
# Observed vs. simulated lake level over the test period, drawn on one shared axis.
fig, ax1 = plt.subplots(figsize=(15, 3))

b_test['Lake_Level'].plot(ax=ax1)
pd.Series(level_predictions).plot(ax=ax1, label='Predicted lake level')
ax1.legend()
plt.show()

In [None]:
# Observed flow, the observed flow on rows tagged 'spillway' (in red), and the
# simulated flow, all on one axis.
fig, ax1 = plt.subplots(figsize=(15, 5))

b_test['Flow_Rate'].plot(ax=ax1)

# Rows whose flow_type is not 'spillway' are blanked to NaN, so only the
# spillway stretches are drawn by this line.
(
    bilancino
    .where(bilancino['flow_type'] == 'spillway')
    .loc[train_cutoff:, 'Flow_Rate']
    .plot(ax=ax1, color='red', label='Flow Rate (spillway)')
)

pd.Series(flow_predictions).plot(ax=ax1, alpha=0.8, label='Predicted flow rate')
ax1.legend()
plt.show()

The shape of the predictions is much closer to reality now, and the flow predictions even produce plausible spillway spikes in mostly the right places. 

There are still a couple of interesting discrepancies, mostly with the flow spillway spikes. In the graph above, I highlighted in red the parts of flow which should correspond with the spillway being active (the same as the red parts on the flow + lake level graph at the beginning of the "modeling flow" section). These are sections where the lake level is above a certain level, above which the spillway seems to activate according to the training data - that is, the vertical part of the hockey stick on the flow vs. level scatter plot.

The LGBM's decision on when to spike the flow matches my lake level-based prediction pretty well; in fact, there are a couple of places where my red highlighting and the LGBM agree, even though the LGBM did not "know" about my heuristic, but the data disagrees:

- Most prominently, right around the start of 2018 and also the end of 2019, there are spikes in real data which neither the LGBM nor my Lake Level based heuristic anticipated. These correspond to the outliers I pointed out on the scatter plot: In these years, the spillway seems to be behaving differently than in the training data.
- There are also a couple of small places where the opposite happens: the spillway ought to be active according to my heuristic, and the LGBM predicts a relatively high flow, but in reality the flow does not spike. The most notable one is around early 2017, where the LGBM created a very narrow spike exactly where my heuristic highlighted a very tiny red spot.

But overall, having realistic rain predictions allows the model to match the real behavior much more closely. For this particular water body, it might make sense to come up with a different way of capturing expected rain which doesn't rely on smoothing; for example, use the known means and variances of rain on a given day to actually generate random rain data. Though that would still not capture patterns where rainy days are often clumped together, producing an extended spike of inflowing water. And of course, any such approximation wouldn't be able to predict *exactly when* the spikes would happen without making actual weather predictions.