In [1]:
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame

from merlion.evaluate.forecast import ForecastMetric
from merlion.models.defaults import DefaultForecasterConfig, DefaultForecaster
from merlion.models.ensemble.combine import ModelSelector
from merlion.models.factory import ModelFactory
from merlion.utils import TimeSeries

from data.starting_kit.ts_split import GroupedTimeSeriesSplit

In [None]:
# Load the raw training table; `df` keeps every column, including 'pseudo_id'.
df = pd.read_csv('../data/starting_kit/train.csv')
# `df_` is a separate frame without the identifier column, used for feature preparation
df_ = df.drop('pseudo_id', axis=1)
# Show the last rows of the raw table
df.tail()

In [3]:

# Rebuild from the raw frame `df` (not from `df_`) so this cell can be re-run safely:
# after the transpose below, the columns of `df_` are no longer timestamp strings.
hourly_use = df.drop(columns='pseudo_id')
# Parse the timestamp column labels in one vectorised call
hourly_use.columns = pd.to_datetime(hourly_use.columns, format="%Y-%m-%d %H:%M:%S")
# Put time on the rows and sum the readings within each hour
hourly_use = hourly_use.T.resample("H").sum()
# Set dates for development phase
new_date_range = pd.date_range(start="2017-01-01", end="2019-09-05", freq="H")
# Add test dates in the data frame; hours without data get a dummy value of 0.
# The result is already indexed by the DatetimeIndex built above.
df_ = hourly_use.reindex(new_date_range, fill_value=0)

In [None]:
# Show the last five rows of df_
df_.tail(n=5)

In [5]:
# Splitter used by the training loop to produce (train_ind, test_ind) pairs.
# NOTE(review): train_window / test_window are presumably counted in units of
# `freq` (hours here) — confirm against GroupedTimeSeriesSplit.
tscv = GroupedTimeSeriesSplit(
    train_window=912,
    test_window=168,
    train_gap=0,
    freq="H",
)

In [6]:
### Settings for the models used in the ensemble
# Collector list; it is not referenced by the other cells visible in this notebook.
models = []
# Forecast horizon handed to every model (168 = 7 * 24)
max_forecast_steps = 168
# One target series per column of df_
max_target_seq_index = len(df_.columns)

In [None]:
# For every dwelling (one column of df_): build a two-model ensemble, then for each
# split train it on an expanding window and write the forecast for the test window
# back into df_.
for target_seq_index in range(max_target_seq_index):
    print("Dwelling:", target_seq_index)
    kwargs = dict(max_forecast_steps=max_forecast_steps, target_seq_index=target_seq_index)

    model1 = ModelFactory.create("DefaultForecaster", **kwargs)
    model2 = ModelFactory.create("Arima", **kwargs)

    # This ModelSelector combiner picks the best model based on sMAPE
    model3 = ModelFactory.create("ForecasterEnsemble", models=[model1, model2],
                                 combiner=ModelSelector(metric=ForecastMetric.sMAPE))

    for split, (train_ind, test_ind) in enumerate(tscv.split(df_, y=df_, dates=df_.index)):
        print("** Split:", split)

        ### Prepare data for training
        # Expanding window: every row from the start of df_ up to and including
        # the last training row of this split.
        X_train = TimeSeries.from_pd(df_.iloc[:train_ind[-1] + 1].copy())

        ### Prepare data for testing
        # Only the time stamps of the test window are needed to request the forecast.
        X_test = TimeSeries.from_pd(df_.iloc[test_ind].copy())
        target = X_test.univariates[X_test.names[target_seq_index]].to_ts()

        model3.train(X_train)
        forecast, _ = model3.forecast(target.time_stamps)

        # Flatten to 1-D using the forecast's own length instead of assuming it
        # is exactly max_forecast_steps rows long.
        forecast = forecast.to_pd().values.reshape(-1)

        # NOTE(review): df_ is modified in place. Later splits and dwellings rebuild
        # X_train from df_, so they see these forecasts wherever a test window lies
        # inside a later training window — confirm this is intended.
        df_.iloc[test_ind, target_seq_index] = forecast

In [8]:
# Create the output folder first so the export cannot fail on a missing directory
# after the long training run.
out_path = Path('./out/hours.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
# index=False leaves the hourly timestamps (the index of df_) out of the file
df_.to_csv(out_path, index=False)

In [None]:
# Plot one series over a two-month window of 2017.
# The start must be earlier than the end: with a start after the end,
# pd.date_range returns an empty index and there is nothing to plot.
new_date_range = pd.date_range(start="2017-08-01", end="2017-10-01", freq="H")
df_year_17 = df_.reindex(new_date_range)
# Column labelled 0 (presumably the first dwelling — confirm the column labels)
df_year_17.loc[:, 0].plot(figsize=(12, 6))