In [None]:
import fastai

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import torch
from sklearn.metrics import mean_squared_error as sk_mse

from fastai.learner import Learner
from fastai.metrics import rmse
from fastai.tabular.model import get_emb_sz
from fastai.tabular.all import *

from dies.data import tp_from_df, get_y_ranges, split_by_year, create_consistent_number_of_sampler_per_day, TimeseriesTransform
from dies.utils_pytorch import xavier_init_uniform
from dies.autoencoder import Autoencoder
from dies.losses import CnnMSELoss
from dies.data import *
from dies.embedding import EmbeddingModule
from dies.data import split_by_year

# Reload modules automatically before each cell runs, so edits to the local
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Plot styling applied to every figure created below.
sns.set_context('poster')
sns.set_style('whitegrid')
mpl.rcParams["legend.loc"] = 'upper right'

In [None]:
# Open the store read-only: the default mode ('a') opens the file for writing
# and silently creates an empty file when the path is wrong.
with pd.HDFStore('./data/GEFCOM_Z1.h5', mode='r') as store:
    df = store['powerdata']

In [None]:
# Peek at the first two rows of the freshly loaded frame.
df.head(2)

Fill missing values with the column mean, then drop any columns that still contain NA values

In [None]:
# Impute missing values with the per-column mean. `numeric_only=True` limits the
# mean to numeric columns; without it pandas >= 2.0 raises a TypeError as soon as
# the frame holds a non-numeric column (e.g. the not-yet-parsed TimeUTC).
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Remove every column that still holds at least one missing value.
df = df.dropna(axis='columns')

In [None]:
# Inspect the first two rows again after the NA handling.
df.head(2)

Create proper timestamp

In [None]:
# Parse the timestamp column as timezone-aware UTC datetimes and use it as index.
# `infer_datetime_format` is omitted: it is deprecated since pandas 2.0 and does
# not affect the parsed values.
df['TimeUTC'] = pd.to_datetime(df['TimeUTC'], utc=True)
df = df.set_index('TimeUTC')

In [None]:
# Express the generated power as a fraction of the maximum power; the helper
# column is not needed afterwards. `astype(float)` is the vectorised cast.
df['PowerGeneration'] = (
    df['PowerGeneration'].astype(float) / df['MaxPowerGeneration'].astype(float)
)
df = df.drop(columns='MaxPowerGeneration')

Create features for timeseries

In [None]:
# Derive the calendar features from the datetime index.
df = df.assign(
    DayOfYear=df.index.dayofyear,
    Hour=df.index.hour,
)

In [None]:
# List the available columns before picking the model features.
df.columns

In [None]:
# Continuous weather features. They are passed as both x and y columns, i.e.
# the model is trained to reconstruct its own input.
cols = [
    'WindDirectionMeridional100m',
    'WindDirectionMeridional10m',
    'WindDirectionZonal100m',
    'WindDirectionZonal10m',
    'WindSpeed100m',
    'WindSpeed10m',
]

# Calendar features passed as categorical columns.
cat_cols = ['DayOfYear', 'Hour']

# NOTE(review): presumably 2013 is the held-out year — confirm in dies.data.split_by_year.
df_train, df_test = split_by_year(df, year="2013")

tp = tp_from_df(
    df_train,
    y_columns=cols,
    x_columns=cols,
    cat_columns=cat_cols,
    standardize_X=True,
    valid_percent=0.2,
)

Verify that the correct columns were selected. When the `x` and `y` columns are the same, we append `_target` to tell them apart.

In [None]:
# Display the column names held by the tabular object.
tp.all_col_names

Create dataloader for fastai training

In [None]:
# Build the dataloaders from the tabular object, using batches of 128 samples.
dl = tp.dataloaders(bs=128)

Create AE Model

In [None]:
# Number of continuous columns; used as both the first and last layer width below.
input_size = tp.conts.shape[1]
# One size per categorical column. NOTE(review): presumably in cat_cols order,
# i.e. DayOfYear (up to 366 values + 1) and Hour (24 values + 1) — confirm
# against the category encoding used by tp_from_df.
sizes_cat = [367, 25]
# Layer widths: input -> 50 -> 10 -> input.
ann_structure = [input_size, 50, 10, input_size]

In [None]:
# Embedding module for the categorical inputs, configured with the sizes above
# and a dropout of 0.1.
embedding_module = EmbeddingModule(sizes_cat, embedding_dropout=0.1)

# NOTE(review): `embeding_position` (single "d") is left exactly as written; it
# presumably matches the keyword declared by dies.autoencoder.Autoencoder —
# confirm before renaming.
ann_model = Autoencoder(
    ann_structure=ann_structure,
    embedding_module=embedding_module,
    embeding_position="start",
    y_ranges=get_y_ranges(dl.train_ds.ys),
)

 Train it

In [None]:
# Wrap dataloaders and model in a fastai Learner optimised with plain MSE.
learn = Learner(dl, ann_model, loss_func=torch.nn.MSELoss())

In [None]:
# Train for 20 epochs with the one-cycle schedule.
learn.fit_one_cycle(20)

Evaluation. Note that through `decode` we can reverse the scaling of the input data. See `get_decode_ouput_from_pred` for details.

In [None]:
# Plot the losses recorded during training.
learn.recorder.plot_loss()

In [None]:
from fastai.tabular.all import *

In [None]:
def get_decode_ouput_from_pred(learn, df_test, column):
    df_test = df_test.copy()
    test_dl = learn.dls.test_dl(df_test)
    tmp = learn.get_preds(dl=test_dl, 
                          with_input=True, with_decoded=True)
    inp = tmp[0]
    pred = tmp[1]
    
    for idx in range(len(learn.dls.train_ds.cont_names)):
        inp[1][:,idx] = pred[:, idx]
    
    inp = learn.dls.decode(inp)
    decoded_output = inp.decode()
    decoded_output = decoded_output.items
    decoded_output.index=df_test.index
    
    return decoded_output

In [None]:
# Decoded predictions for the held-out frame, indexed like df_test.
decoded_output = get_decode_ouput_from_pred(learn, df_test, "WindSpeed100m")

In [None]:
plt.figure(figsize=(12,8))
# x-axis: values from the test frame (target); y-axis: decoded model output
# (prediction). The argument order now matches the axis labels below.
plt.scatter(df_test.WindSpeed100m,
            decoded_output.WindSpeed100m)

# Wind speed is a velocity, so the unit is m/s (m/s² would be an acceleration).
plt.xlabel('Wind speed [m/s] Target')
plt.ylabel('Wind speed [m/s] Prediction')

In [None]:
plt.figure(figsize=(12,8))
# Overlay the test values and the decoded model output over time.
plt.plot(df_test.WindSpeed100m, label='Target')
plt.plot(decoded_output.WindSpeed100m, label='Prediction')
plt.xlabel('Date')
# Wind speed is a velocity, so the unit is m/s (m/s² would be an acceleration).
plt.ylabel('Wind speed [m/s]')
plt.legend()

In [None]:
# Take the square root explicitly: the `squared=False` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6, while MSE ** 0.5 works on every version.
print(f'Test RMSE for WindSpeed: {sk_mse(df_test.WindSpeed100m, decoded_output.WindSpeed100m) ** 0.5}')

The large error might be due to the encoding and decoding on the test data.