In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sn
import matplotlib.pyplot as plt

from fastai.tabular.all import *


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its full path.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# About

Let's try rolling averages as features to improve the lake level prediction.

# Baseline
Build a baseline with the original features.

In [None]:
TARGETS = ['Lake_Level', 'Flow_Rate']

lake_data = pd.read_csv('../input/acea-water-prediction/Lake_Bilancino.csv')
# add_datepart mutates lake_data in place; drop=False keeps the original 'Date' column.
add_datepart(lake_data, 'Date', drop = False)
lake_data['Elapsed'] = lake_data['Elapsed'].astype('int64')

# Every non-target column is a feature. Build the list with an order-preserving
# de-duplication instead of set arithmetic: iterating a set of strings gives a
# different order on every kernel restart (hash randomization), which would make
# the feature order - and therefore the run - non-reproducible.
candidate_columns = dict.fromkeys(lake_data.columns.tolist() + ['Month','Week','Day','Dayofyear'])
FEATURES = [col for col in candidate_columns if col not in TARGETS]

Predict Lake_Level with a plain tabular neural net.

Take the last 370 entries of the time series as the validation set.

In [None]:
# Hold out the final 370 rows as the validation set; everything else is training data.
valid_idx = lake_data.index[-370:]
splits = IndexSplitter(valid_idx)(range_of(lake_data))

# All features except the raw 'Date' column are passed as continuous inputs.
tab_data = TabularPandas(
    lake_data,
    procs=[Categorify, FillMissing, Normalize],
    cont_names=list(set(FEATURES) - {'Date'}),
    y_names=TARGETS[0],
    splits=splits,
)
dls = tab_data.dataloaders(bs=128)

# Default loss is used; the commented-out alternative was loss_func=F.l1_loss.
learn = tabular_learner(
    dls,
    layers=[24, 8, 24],
    metrics=[rmse, mae],
    y_range=[242, 254],
)

learn.fit_one_cycle(10, 1e-2)

Plot the predictions for the validation set vs. the ground truth.

In [None]:
# Compare the model's validation-set predictions with the held-out target values
# (the last 370 rows, matching the validation split).
valid_preds = learn.get_preds(ds_idx=1)[0].numpy()
valid_truth = lake_data[TARGETS[0]].values[-370:]

fig, ax = plt.subplots()
ax.plot(valid_preds, label='prediction')
ax.plot(valid_truth, label='ground truth')
ax.set(title='Baseline: validation set', xlabel='validation sample', ylabel=TARGETS[0])
ax.legend()  # without a legend the two curves cannot be told apart
plt.show()

# Generate rolling features

In [None]:
TARGETS = ['Lake_Level', 'Flow_Rate']

lake_data = pd.read_csv('../input/acea-water-prediction/Lake_Bilancino.csv')

# Columns to smooth: everything except the targets and the date, kept in CSV column
# order. (Set arithmetic would give a hash-randomized order that changes on every
# kernel restart and reshuffles the rolling columns and the correlation heatmap.)
excluded = set(TARGETS) | {'Date'}
FEATURES = [col for col in lake_data.columns if col not in excluded]

# Plain rolling mean over a 20-row window.
# NOTE(review): the name/suffix says "720" but the window is 20 rows - confirm which
# one was intended before relying on the column names.
roll720 = lake_data[FEATURES].rolling(20).mean()
roll720.columns = [c + 'roll720' for c in FEATURES]

# Triangular-weighted rolling mean over a 360-row window.
roll360 = lake_data[FEATURES].rolling(360, win_type='triang').mean()
roll360.columns = [c + 'roll360' for c in FEATURES]

# dropna() removes every row with a missing value, including the leading rows for
# which the rolling windows are not yet full.
lake_data = pd.concat([roll720, roll360, lake_data], axis = 1).dropna().reset_index(drop = True)

# Columns before the date-part expansion; despite the name this still contains the
# raw 'Date' column (and the targets).
FEATURES_WO_DATE = lake_data.columns.tolist()

# add_datepart mutates lake_data in place; drop=False keeps the original 'Date' column.
add_datepart(lake_data, 'Date', drop = False)
lake_data['Elapsed'] = lake_data['Elapsed'].astype('int64')

# Rolling + original columns plus four calendar features, minus the targets,
# de-duplicated while preserving order.
candidate_columns = dict.fromkeys(FEATURES_WO_DATE + ['Month','Week','Day','Dayofyear'])
FEATURES = [col for col in candidate_columns if col not in TARGETS]

lake_data.head()

Compare the rolling features with the original features in a correlation matrix.

In [None]:
# FEATURES still contains the raw 'Date' column, which is not numeric. Older pandas
# silently dropped such columns in DataFrame.corr(); pandas >= 2.0 no longer does so
# by default and fails instead. Excluding it explicitly gives the same matrix on
# every pandas version.
corr_columns = [col for col in FEATURES + TARGETS if col != 'Date']
corr_lake = lake_data[corr_columns].corr()

fig, ax = plt.subplots(figsize=(15,12))
sn.heatmap(corr_lake, annot=True, ax = ax)
ax.set_title('Correlation of rolling features, original features and targets')
plt.show()

=> roll360 features have a correlation between 0.4 and 0.5. That's not very strong but far higher than the original features (approx +/-0.3).

Predict Lake_Level with a plain tabular neural net.

Once again, take the last 370 entries of the time series as the validation set.

In [None]:
# Hold out the final 370 rows as the validation set; everything else is training data.
valid_idx = lake_data.index[-370:]
splits = IndexSplitter(valid_idx)(range_of(lake_data))

# All features except the raw 'Date' column are passed as continuous inputs.
tab_data = TabularPandas(
    lake_data,
    procs=[Categorify, FillMissing, Normalize],
    cont_names=list(set(FEATURES) - {'Date'}),
    y_names=TARGETS[0],
    splits=splits,
)
dls = tab_data.dataloaders(bs=128)

# Default loss is used; the commented-out alternative was loss_func=F.l1_loss.
learn = tabular_learner(
    dls,
    layers=[24, 8, 24],
    metrics=[rmse, mae],
    y_range=[242, 254],
)

learn.fit_one_cycle(10, 1e-2)

Plot the predictions for the validation set vs. the ground truth.

In [None]:
# Compare the model's validation-set predictions with the held-out target values
# (the last 370 rows, matching the validation split).
valid_preds = learn.get_preds(ds_idx=1)[0].numpy()
valid_truth = lake_data[TARGETS[0]].values[-370:]

fig, ax = plt.subplots()
ax.plot(valid_preds, label='prediction')
ax.plot(valid_truth, label='ground truth')
ax.set(title='With rolling features: validation set', xlabel='validation sample', ylabel=TARGETS[0])
ax.legend()  # without a legend the two curves cannot be told apart
plt.show()

1. There is a slight improvement. 