In [None]:
# Show which fastai version this notebook runs against.
import fastai
fastai.__version__

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import torch
from sklearn.metrics import mean_squared_error as sk_mse

from fastai.learner import Learner
from fastai.metrics import rmse
from fastai.learner import *
from fastai.tabular.all import *
from dies.data import tp_from_df_from_dtypes, get_y_ranges, split_by_year, split_by_n_weeks #ds_from_df_from_dtypes,  scale_datasets, create_databunch
from dies.mlp import MultiLayerPerceptron
from dies.embedding import Embedding
from dies.utils_pytorch import dev_to_np, xavier_init_uniform
from fastai.tabular.model import *
from dies.embedding import *
from fastai.tabular.all import *

# Global plotting configuration: seaborn 'poster' context and 'whitegrid' style,
# and matplotlib legends placed in the upper-left corner by default.
sns.set_context('poster')
sns.set_style('whitegrid')
mpl.rcParams["legend.loc"] = 'upper left'

In [None]:
# Load the 'powerdata' table from the HDF5 file.
# The store is opened read-only: HDFStore's default mode is append ('a'), which can
# create or modify the file, and this cell only needs to read from it.
with pd.HDFStore('./data/GEFCOM_Z1.h5', mode='r') as store:
    df = store['powerdata']

In [None]:
# Preview the first two rows of the loaded frame.
df.head(2)

Fill missing values with the column mean, then drop any columns that still contain NA values

In [None]:
# Impute missing values with the mean of their column.
# `numeric_only=True` restricts the mean to numeric columns; without it, recent pandas
# versions raise a TypeError when the frame contains non-numeric columns
# (presumably the still-unparsed `TimeUTC` column here — confirm).
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Remove every column that still has at least one missing value.
df = df.dropna(axis="columns")

In [None]:
# Preview the first two rows after the NA handling.
df.head(2)

Create proper timestamp

In [None]:
# Parse the raw timestamps as timezone-aware UTC datetimes and use them as the index.
# `infer_datetime_format` is no longer passed: it is deprecated in pandas 2.x, where
# format inference is the default behaviour.
df['TimeUTC'] = pd.to_datetime(df['TimeUTC'], utc=True)
# Rebind instead of mutating in place so that the cell reads as a plain transformation.
df = df.set_index('TimeUTC')

Normalize output

In [None]:
# Express the generated power as a fraction of the maximum power generation,
# then discard the maximum-power column, which is no longer needed.
generated = df['PowerGeneration'].apply(float)
maximum = df['MaxPowerGeneration'].apply(float)
df['PowerGeneration'] = generated / maximum
df = df.drop(columns='MaxPowerGeneration')

Create features for timeseries

In [None]:
# Derive calendar features from the datetime index.
df = df.assign(
    DayOfYear=df.index.dayofyear,
    Hour=df.index.hour,
)

In [None]:
# Inspect the column dtypes of the prepared frame.
df.dtypes

In [None]:
# Time span covered by the data.
start_date, end_date = df.index.min(), df.index.max()
# A window starts every `every_n_weeks` weeks and lasts `for_n_weeks` weeks.
every_n_weeks, for_n_weeks = 4, 1

In [None]:
# Start (`sw`) and end (`ew`) boundaries of the recurring windows.
# Both ranges use the same step string; previously only one of the two applied
# `int(...)`, so a non-integer `every_n_weeks` would have produced mismatched frequencies.
window_step = f"{int(every_n_weeks * 7)}D"
# Window starts: every `every_n_weeks` weeks, snapped to midnight by `normalize=True`.
sw = pd.date_range(start_date, end_date, freq=window_step, normalize=True)
# Window ends: the same grid shifted by the window length of `for_n_weeks` weeks.
ew = pd.date_range(
    start_date + pd.DateOffset(days=int(for_n_weeks * 7)),
    end_date,
    freq=window_step,
    normalize=True,
)

In [None]:
# Start with an all-False mask, one entry per row of `df`.
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24.
mask = np.zeros(len(df.index), dtype=bool)

In [None]:
# Mark every row whose timestamp lies strictly inside one of the (start, end) windows.
# The former debug print of the complete mask on every iteration was removed: it
# flooded the cell output with one boolean per row of `df` for each window.
for window_start, window_end in zip(sw, ew):
    inside_window = (df.index > window_start) & (df.index < window_end)
    mask = mask | inside_window

In [None]:
# Display the boolean mask accumulated in the loop over the windows.
mask

In [None]:
# Turn the boolean mask into index splits over all rows of `df`.
mask_splitter = MaskSplitter(mask)
splits = mask_splitter(range_of(df))

In [None]:
# Display the index splits produced by the MaskSplitter.
splits

In [None]:
# Preprocessing steps handed to TabularPandas.
procs = [FillMissing, Categorify]
# Continuous input features.
cont_names = [
    'WindDirectionMeridional100m',
    'WindDirectionMeridional10m',
    'WindDirectionZonal100m',
    'WindDirectionZonal10m',
    'WindSpeed100m',
    'WindSpeed10m',
]
# Categorical input features (derived from the datetime index).
cat_names = ['DayOfYear', 'Hour']
# Regression target.
y_names = ['PowerGeneration']

In [None]:
# Wrap the frame in a TabularPandas object using the configuration defined above.
to = TabularPandas(
    df,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=y_names,
    splits=splits,
)

In [None]:
# Display the validation subset of the TabularPandas object.
to.valid

In [None]:
# Number of rows in the frame.
len(df)

In [None]:
# Count the rows per calendar day, then flag days that have some data but
# fewer than 24 rows. Note that this rebinds `mask` to a per-day boolean Series.
rows_per_day = df.resample("D").apply(len).PowerGeneration
mask = (rows_per_day > 0) & (rows_per_day < 24)

In [None]:
# Remove all rows that belong to an incomplete day.
# `mask` is indexed by day (it comes from a daily resample) and is True for days to drop.
# This replaces the former per-day loop, which
#   * used positional `mask[i]` on a datetime-indexed Series (deprecated in pandas 2.x),
#   * set `new_day.hours = 0`, which never changed the timestamp, and
#   * re-filtered the complete frame once for every flagged day.
incomplete_days = mask.index[mask.to_numpy()]
print(incomplete_days)
# A row is dropped when its timestamp, truncated to midnight, is one of the flagged days,
# i.e. when it lies in [day, day + 1 day) — the same interval the loop removed.
df = df[~df.index.normalize().isin(incomplete_days)]

In [None]:
# Display the categorical columns of the TabularPandas object.
# NOTE(review): `to` was created before the incomplete days were removed from `df`,
# so it does not reflect that filtering — confirm this is intended.
to.cats

In [None]:
# Reshape the categorical values into blocks of 24 consecutive rows each,
# giving a tensor of shape (blocks, 24, number of categorical columns).
n_cat_columns = to.cats.shape[1]
cats_by_block = to.cats.values.reshape(-1, 24, n_cat_columns)
cats = tensor(cats_by_block)
cats[0]

In [None]:
# Swap the last two axes so that the categorical columns come before the 24 rows.
cats = cats.transpose(1, 2)
cats[4]

In [None]:
# Shape of the reshaped categorical tensor.
cats.shape

In [None]:
# Run the decode step of the processing pipeline on `to`
# (presumably reverts the encodings applied by the procs — confirm).
to.procs.decode(to)

In [None]:
# Targets of the training subset as a NumPy array.
np.array(to.train.ys)

In [None]:
# Split the frame into a training and a test frame with the dies helper
# (presumably the year "2013" marks the boundary of the test data — confirm against dies.data).
df_train, df_test = split_by_year(df, "2013")

In [None]:
# Options for the train/validation split and the input scaling.
split_options = dict(
    valid_percent=None,
    do_split_by_n_weeks=True,
    every_n_weeks=5,
    for_n_weeks=1,
    standardize_X=True,
)
# Build the tabular dataset from the training frame with "PowerGeneration" as target.
tp = tp_from_df_from_dtypes(df_train, "PowerGeneration", **split_options)

Assure correct columns were selected

In [None]:
# Continuous, categorical and target column names that were selected.
tp.cont_names, tp.cat_names,  tp.y_names

We can use `y_ranges` to restrict the model output to the range of the target values. Here it doesn't make much sense, as the output is already normalized; it is included mainly to demonstrate the functionality.

In [None]:
# Derive the output range(s) from the targets and display them.
y_ranges = get_y_ranges(tp.ys)
y_ranges

Create data loaders

In [None]:
# Build the data loaders with a batch size of 256.
dls = tp.dataloaders(bs=256)

Create MLP Model

In [None]:
# Number of continuous input features (second dimension of the continuous data).
train_conts = dls.train_ds.conts
input_size = train_conts.shape[1]

One can get suggestions from fastai for the embedding sizes. The first dimension is the maximum categorical value, the second is the embedding size.

In [None]:
def get_model():
    """Create a fresh MultiLayerPerceptron for the power forecast.

    Reads the notebook-level variables `input_size` (width of the first layer)
    and `y_ranges` (its first entry is passed as `y_range`).

    Returns:
        A new, untrained `MultiLayerPerceptron` instance.
    """
    # Presumably the cardinalities of DayOfYear (366 + 1) and Hour (24 + 1) — confirm.
    embedding_sizes = get_emb_sz_list([367, 25])
    # Layer widths from the continuous inputs down to a single output.
    layer_sizes = [input_size, 12, 6, 3, 1]
    return MultiLayerPerceptron(
        layer_sizes,
        embedding_sizes,
        y_range=y_ranges[0],
        ps=0.1,
    )

In [None]:
# Instantiate the model defined by get_model().
ann_model = get_model()

Train it

In [None]:
# you can add custom callbacks here, if required
cbs = []

# Learner with MSE loss and RMSE as reported metric.
# `cbs` is now actually passed to the Learner; before, the list was defined after
# the Learner had been created and was never used.
learn = Learner(
    dls,
    ann_model,
    loss_func=torch.nn.MSELoss(),
    metrics=[rmse],
    cbs=cbs,
)

In [None]:
# Print a summary of the learner.
learn.summary()

In [None]:
# Run the learning-rate finder before training.
learn.lr_find()

In [None]:
# cosine annealing for learning rate
# Train for 20 epochs with the one-cycle schedule and a maximum learning rate of 5e-2.
learn.fit_one_cycle(20, lr_max=5e-2)

In [None]:
# Plot the losses recorded during training.
learn.recorder.plot_loss()

In [None]:
# Run the learning-rate finder again, now on the already trained model.
learn.lr_find()

In [None]:
# Continue training for 30 epochs with a learning rate of 1e-4.
learn.fit(30, lr=1e-4)

In [None]:
# ds_idx=1, refers to valid dataset
preds, targets = learn.get_preds(ds_idx=1)
# Root of the mean squared error between the flattened targets and predictions.
y_true = to_np(targets).ravel()
y_pred = to_np(preds).ravel()
valid_rmse = sk_mse(y_true, y_pred) ** 0.5
print(f'Valid nRMSE: {valid_rmse}')

In [None]:
# Create a test data loader for the held-out frame from the learner's data loaders.
dl = learn.dls.test_dl(df_test)

In [None]:
# Show one batch of the test data loader.
dl.show_batch()

In [None]:
# Predict on the test data loader; the second return value is not needed here.
preds, _ = learn.get_preds(dl=dl)
# Convert the predictions to a NumPy array with the dies helper.
preds = dev_to_np(preds)

In [None]:
# Scatter targets and predictions over the 100 m wind speed.
# The x-axis unit was corrected from m/s² (an acceleration) to m/s (a speed).
# NOTE(review): the values are read from the test data loader, i.e. after its
# preprocessing; if inputs are standardized there, the axis is not in m/s — confirm.
fig, ax = plt.subplots(figsize=(12, 8))
wind_speed = dl.loc[:, "WindSpeed100m"]
ax.scatter(wind_speed, dl.loc[:, "PowerGeneration"], label='Target')
ax.scatter(wind_speed, preds, label='Prediction')
ax.set_xlabel('Wind speed [m/s]')
ax.set_ylabel('Normalized Power')
ax.legend();

In [None]:
# Root of the mean squared error on the test frame.
test_rmse = sk_mse(df_test.PowerGeneration, preds) ** 0.5
print(f'Test nRMSE: {test_rmse}')

Now let's create a validation dataset where 20% of the training data is held out for validation (`valid_percent=0.2`), instead of every n-th week.

In [None]:
# Rebuild the tabular dataset, this time holding out 20% of the data for validation.
tp = tp_from_df_from_dtypes(
    df_train,
    "PowerGeneration",
    valid_percent=0.2,
    standardize_X=True,
)

In [None]:
# Build the data loaders with a batch size of 16.
dls = tp.dataloaders(bs=16)

In [None]:
# Train a fresh model on the new data loaders: one epoch with `fit`,
# followed by two epochs with the one-cycle schedule.
fresh_model = get_model()
learn = Learner(dls, fresh_model, loss_func=torch.nn.MSELoss(), metrics=[rmse])
learn.fit(1)
learn.fit_one_cycle(2)

In [None]:
# Rebuild the test data loader from the current learner's data loaders. Before, the
# loader created for the previous learner was reused, so the test data went through
# the preprocessing of the old data loaders instead of the ones this model was trained on.
dl = learn.dls.test_dl(df_test)
preds, _ = learn.get_preds(dl=dl)
# Convert the predictions to a NumPy array with the dies helper.
preds = dev_to_np(preds)
print(f'Test nRMSE: {sk_mse(df_test.PowerGeneration, preds)**0.5}')

We also can use the default MLP model of fastai.

In [None]:
# Create fastai's default tabular learner on the current data loaders, reporting RMSE.
learn = tabular_learner(dls, metrics=rmse)

In [None]:
# List the layers of the model and optionally freeze them.
# `requires_grad_` is a method of torch modules and has to be called. The former
# assignment `l.requires_grad_ = False` froze nothing and replaced the method with a
# bool on each layer. The default below keeps all layers trainable, which matches
# what the old code effectively did.
freeze_layers = False  # set to True to exclude these layers from training
for idx, layer in enumerate(learn.model.layers):
    print(idx, layer)
    if freeze_layers:
        layer.requires_grad_(False)

In [None]:
# Print a summary of the fastai tabular learner.
learn.summary()

In [None]:
# Run the learning-rate finder for the fastai tabular learner.
learn.lr_find()

In [None]:
# Train for 5 epochs with the one-cycle schedule and a maximum learning rate of 5e-3.
learn.fit_one_cycle(5, lr_max=5e-3)

In [None]:
# Evaluate the fastai tabular learner on the test frame.
dl = learn.dls.test_dl(df_test)
preds, _ = learn.get_preds(dl=dl)
preds = dev_to_np(preds)
# Root of the mean squared error between the test targets and the predictions.
test_rmse = sk_mse(df_test.PowerGeneration, preds) ** 0.5
print(f'Test nRMSE: {test_rmse}')