In [1]:
from merlion.utils import TimeSeries
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
from data.starting_kit.ts_split import GroupedTimeSeriesSplit
from merlion.models.defaults import DefaultForecasterConfig, DefaultForecaster
from datetime import datetime
from merlion.evaluate.forecast import ForecastMetric
from merlion.models.factory import ModelFactory
from merlion.models.ensemble.combine import ModelSelector
from merlion.evaluate.forecast import ForecastMetric

In [2]:
df = pd.read_csv('../data/starting_kit/train.csv')
# Drop the identifier column so that only the timestamp-named columns remain
df_ = df.drop(columns='pseudo_id')
# Parse every column label into a timestamp in one vectorized call
df_.columns = pd.to_datetime(df_.columns, format="%Y-%m-%d %H:%M:%S")
# Transpose so timestamps become the row index, then sum the energy-use
# readings that fall within each hour (one column per original row of the CSV)
df_ = df_.T.resample("H").sum()
# Hourly date range covering the development phase
new_date_range = pd.date_range(start="2017-01-01", end="2019-03-31", freq="H")
# Extend the frame to the full range; timestamps that are absent from the
# resampled data are filled with 0 as placeholder values for the test set
df_ = df_.reindex(new_date_range, fill_value=0)

In [3]:
# Preview the first five hourly rows of the prepared frame
df_.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
2017-01-01 00:00:00,85.008,4.572,23.6665,104.853,55.043,21.67,8.517,50.1745,74.895,22.242,...,0.368,11.43,9.319,1.745,5.848,1.232,3.041,6.9575,2.0065,6.693
2017-01-01 01:00:00,71.3175,4.533,24.5725,90.228,53.691,20.802,9.665,47.1655,63.5045,21.32,...,0.68,11.414,9.95,1.807,8.823,0.669,2.868,5.615,1.41,5.698
2017-01-01 02:00:00,74.411,5.788,24.725,94.137,50.388,20.309,10.97,51.7245,62.577,23.942,...,1.4,14.907,8.828,1.65,8.535,0.645,2.892,6.6505,1.5015,5.953
2017-01-01 03:00:00,72.9065,5.674,24.897,96.276,54.27,19.099,9.228,49.304,64.9425,21.885,...,0.803,12.92,8.203,1.362,5.721,1.418,3.19,7.4195,1.25,6.856
2017-01-01 04:00:00,32.8025,2.528,11.556,42.68,23.174,8.557,3.721,23.951,26.9285,10.102,...,0.205,6.317,3.524,0.869,2.667,0.303,1.803,3.766,0.7975,1.632


In [4]:
# Splitter used by the training loop to produce (train, test) index pairs.
# NOTE(review): window sizes are presumably counted in `freq` units, i.e.
# 912 hours of training and 168 hours of testing -- confirm against ts_split.
tscv = GroupedTimeSeriesSplit(
    train_window=912,
    test_window=168,
    train_gap=0,
    freq="H",
)

In [5]:
### Settings shared by the ensemble training loop
# Number of leading columns (dwellings) to forecast; use df_.shape[1] for all
max_target_seq_index = 1
# Forecast horizon in steps; equals the test_window given to the splitter
max_forecast_steps = 168
# Empty placeholder list; none of the visible cells append to it
models = list()

In [None]:
# For each dwelling (column), build an ensemble forecaster and, for every
# split produced by `tscv`, train on the data up to the end of the training
# window and overwrite the test window of that column with the forecast.
for target_seq_index in range(max_target_seq_index):
    print("Dwelling:", target_seq_index)
    kwargs = dict(max_forecast_steps=max_forecast_steps, target_seq_index=target_seq_index)

    # Candidate forecasters for this dwelling
    model1 = ModelFactory.create("DefaultForecaster", **kwargs)
    model2 = ModelFactory.create("Arima", **kwargs)

    # This ModelSelector combiner picks the best model based on sMAPE
    model3 = ModelFactory.create("ForecasterEnsemble", models=[model1, model2],
                                 combiner=ModelSelector(metric=ForecastMetric.sMAPE))

    # NOTE(review): the same ensemble instance is retrained on every split;
    # confirm that Merlion's train() discards state from the previous fit.
    for split, (train_ind, test_ind) in enumerate(tscv.split(df_, y=df_, dates=df_.index)):
        print("** Split:", split)

        ### Prepare data for training
        # Expanding window: every row from the start of the frame up to and
        # including the last training index of this split.
        # NOTE(review): forecasts written into df_ by earlier splits fall
        # inside this range and are therefore used as training data -- confirm
        # this is intended.
        X_train = TimeSeries.from_pd(df_.iloc[:train_ind[-1] + 1].copy())

        ### Prepare data for testing
        X_test = TimeSeries.from_pd(df_.iloc[test_ind].copy())
        # Only the timestamps of the target column are needed for forecasting
        target = X_test.univariates[X_test.names[target_seq_index]].to_ts()

        model3.train(X_train)
        forecast, stderr = model3.forecast(target.time_stamps)

        # Flatten to 1-D without hard-coding the horizon, so a test window
        # shorter than max_forecast_steps does not fail on the reshape.
        forecast = forecast.to_pd().to_numpy().ravel()
        df_.iloc[test_ind, target_seq_index] = forecast

Dwelling: 0
** Split: 0


In [None]:
# Write the frame (including any forecasts filled in above) to disk.
# NOTE(review): index=False omits the hourly timestamp index from the file --
# confirm the consumer of this CSV does not need it.
df_.to_csv(path_or_buf='out-hours.csv', index=False)

In [None]:
# Hourly timestamps from 2017-01-01 00:00 through 2017-07-01 00:00
new_date_range = pd.date_range(start="2017-01-01", end="2017-07-01", freq="H")
# Keep only the rows that fall on those timestamps
df_year_17 = df_.reindex(index=new_date_range)
# Line plot of the column labelled 47 over that period
df_year_17[47].plot(figsize=(12, 6))