# DataLoader:


- The DataLoader should be able to handle both real-valued and categorical data.
- pytorch-forecasting only supports batching by sequence length; it lacks batching by time interval (e.g. "get all time steps within the last 60 minutes").

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
import pandas as pd
from pandas import DataFrame, Series, Timestamp, Timedelta, DatetimeIndex
from pandas.tseries.offsets import DateOffset

# Keep displayed DataFrames short: reprs longer than 5 rows are truncated.
pd.set_option('display.max_rows', 5)

In [3]:
from tsdm.datasets import Electricity

# Load the Electricity dataset shipped with tsdm.
# NOTE(review): the cells below use X.index, X.loc and .resample, so X is
# presumably a DataFrame with a sorted DatetimeIndex — TODO confirm.
X = Electricity.dataset
X

In [4]:
from tsdm.util import time2int, make_dense_triplets

# Run X through tsdm's make_dense_triplets, then turn the resulting index
# into ordinary columns.
df = make_dense_triplets(X)
df = df.reset_index()
df

In [5]:
# Candidate split points; every candidate must be an actual timestamp of X.
# Only the last one (7 days before the final timestamp) is used below.
split_dates = [Timestamp("2014-09-01"), Timestamp("2014-03-31"), X.index[-1]-DateOffset(days=7)]
assert Series(split_dates).isin(X.index).all()
split = split_dates[-1]

# Label slicing with .loc includes BOTH endpoints, so `X.loc[:split]` and
# `X.loc[split:]` would both contain the row at `split`. Keep the boundary row
# in the training set and start the test set strictly after it.
X_TRAIN = X.loc[:split].copy()
X_TEST  = X.loc[X.index > split].copy()
assert X_TRAIN.index.intersection(X_TEST.index).empty, "train/test overlap"
X_TRAIN

# Pre-processing

## Option 1: Aggregation via sum / mean

In [6]:
# Hourly aggregation, shown once with sum and once with mean.
# A notebook cell only renders its last expression, so the first result is
# displayed explicitly instead of being computed and thrown away.
# The lowercase 'h' alias is used because uppercase 'H' is deprecated in
# recent pandas releases.
display(X_TRAIN.resample('1h').sum())
X_TEST.resample('1h').mean()

## Option 2: Normalization

In [7]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Re-create the split from the raw data so this cell can be re-run safely
# (the frames are overwritten with their scaled values below).
# .loc label slicing includes both endpoints; the test set therefore starts
# strictly after `split` so the boundary row is not in both sets.
X_TRAIN = X.loc[:split].copy()
X_TEST  = X.loc[X.index > split].copy()
assert X_TRAIN.index.intersection(X_TEST.index).empty, "train/test overlap"

# Fit the scaler on the training data only and reuse its statistics for the
# test data; scaled test values can therefore fall outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(X_TRAIN)
X_TRAIN.loc[:] = scaler.transform(X_TRAIN)
X_TEST.loc[:] = scaler.transform(X_TEST)
X_TRAIN

In [8]:
# Draw one random (observation window, forecasting window) pair from X.
h_train = np.timedelta64(7, 'D')   # length of the observation window
h_pred = np.timedelta64(24, 'h')   # length of the forecasting window

# Admissible start times: the whole span h_train + h_pred must fit in X.index.
lower = X.index[0]
upper = X.index[-1] - h_train - h_pred
mask = (X.index >= lower) & (X.index <= upper)
t_range = X.index[mask]
assert len(t_range) > 0, "X is shorter than h_train + h_pred"
BATCHSIZE = 3  # not used in this cell

# Seeded generator so that Restart & Run All reproduces the same window.
# Indexing t_range directly keeps `ini` a pandas Timestamp.
SEED = 0
rng = np.random.default_rng(SEED)
ini = t_range[rng.integers(len(t_range))]
mid = ini + h_train
end = ini + h_train + h_pred

# Half-open intervals [ini, mid) and [mid, end): the two windows never share
# a time step. Rows are selected with the boolean masks directly, which avoids
# a second label lookup (and row duplication if timestamps repeat).
# NOTE(review): windows are drawn from the raw X, not from the normalized
# X_TRAIN / X_TEST frames — confirm this is intended.
train_mask = (X.index >= ini) & (X.index < mid)
train_time = X.index[train_mask]
train_data = X.loc[train_mask]
valid_mask = (X.index >= mid) & (X.index < end)
valid_time = X.index[valid_mask]
valid_data = X.loc[valid_mask]