# DataLoader:


- Should be able to handle both real-valued and categorical data.
- pytorch-forecasting only batches by sequence length, so it lacks batching by time interval (e.g. "get all time steps within the last 60 minutes").

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import warnings

import numpy as np
import pandas as pd
import torch
from pandas import DataFrame, Series, Timestamp
from pandas.tseries.offsets import DateOffset
from torch import Tensor

pd.set_option("display.max_rows", 5)

In [None]:
from tsdm.datasets import Electricity

# Bind the Electricity dataset shipped with tsdm to `X` and display it.
X = Electricity.dataset
X

In [None]:
from tsdm.utils.converters import time2float, time2int

# Show the index as converted by tsdm's time2float and time2int helpers, side by side.
time2float(X.index), time2int(X.index)

In [None]:
from tsdm.utils import is_quasiregular, is_regular, regularity_coefficient, time_gcd

# Print tsdm's regularity diagnostics for the time index.
# The trailing `=` inside each f-string echoes the expression next to its value.
print(f"{is_regular(X.index)=}")
print(f"{is_quasiregular(X.index)=}")
print(f"{regularity_coefficient(X.index)=}")
print(f"{time_gcd(X.index)=}")

In [None]:
# Column-role registries: each one lists the column names that play that role.
# They start out empty. The previous `([],)` defaults were 1-tuples wrapping a
# list (a trailing comma left over from a function signature), which
# contradicted the `list[str]` annotation.
static_categoricals: list[str] = []
static_reals: list[str] = []
time_varying_known_categoricals: list[str] = []
time_varying_known_reals: list[str] = []
time_varying_unknown_categoricals: list[str] = []
time_varying_unknown_reals: list[str] = []

In [None]:
# Toy data: random integers in [-5, 5) and a random boolean mask of matching shape.
x = np.random.randint(-5, 5, size=(3, 3, 4, 5))
mask = np.random.choice([False, True], size=x.shape)

In [None]:
# Flatten to a nullable-integer series, blank out every entry whose mask value
# is False, then treat the remaining values as categories.
ds = (
    Series(x.flatten())
    .astype(pd.Int64Dtype())
    .where(mask.flatten())
    .astype("category")
)
ds

In [None]:
# One-hot encode the categorical series into sparse float columns.
pd.get_dummies(ds, sparse=True, dtype=float)

In [None]:
# Display pandas' missing-value scalar.
pd.NA

In [None]:
# Declaration only: annotates `metadata` as a mapping from names to tensors.
# No value is assigned here.
metadata: dict[str, Tensor]

In [None]:
from tsdm.utils.converters import make_dense_triplets, time2int

# Convert X with tsdm's make_dense_triplets and move the resulting index into
# ordinary columns, then display the frame.
# NOTE(review): presumably one (time, variable, value) row per observation — confirm against tsdm.
df = make_dense_triplets(X).reset_index()
df

In [None]:
# Candidate split points. Only the last entry is used below.
# NOTE(review): the first two dates are not in chronological order — confirm
# whether that is intended before using them.
split_dates = [
    Timestamp("2014-09-01"),
    Timestamp("2014-03-31"),
    X.index[-1] - DateOffset(days=7),
]
# Every split point must be an actual time stamp of the dataset.
assert Series(split_dates).isin(X.index).all()
split = split_dates[-1]

# Label-based slicing with .loc is inclusive on both ends, so `X.loc[split:]`
# would put the row at `split` into the test set as well as the training set.
# The test set therefore starts strictly after the split point.
X_TRAIN = X.loc[:split].copy()
X_TEST = X.loc[X.index > split].copy()
X_TRAIN

# Pre-processing

## Option 1: Aggregation via sum / mean

In [None]:
# Hourly aggregation, shown with both reducers: sum on the training part, mean
# on the test part. Neither result is assigned, and in a notebook only the last
# expression of a cell is displayed, so the first line's result is discarded.
X_TRAIN.resample("1H").sum()
X_TEST.resample("1H").mean()

## Option 2: Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Re-create the split from the raw data so that scaling starts from unscaled values.
# .loc slicing is inclusive on both ends; the test set starts strictly after
# `split` so the boundary row is not shared with the training set.
X_TRAIN = X.loc[:split].copy()
X_TEST = X.loc[X.index > split].copy()

# Fit the scaler on the training part only and apply the same transform to both
# parts, so that no statistics of the test set are used.
scaler = MinMaxScaler()
scaler.fit(X_TRAIN)
# Assigning through .loc[:] writes the scaled values in place and keeps the
# index and column labels.
X_TRAIN.loc[:] = scaler.transform(X_TRAIN)
X_TEST.loc[:] = scaler.transform(X_TEST)
X_TRAIN

In [None]:
# Sample one (observed window, forecast window) pair by time interval.
# Window lengths: 7 days of observed data followed by 24 hours to predict.
h_train = np.timedelta64(7, "D")
h_pred = np.timedelta64(24, "h")

# Admissible window starts: early enough that both windows fit before the
# last time stamp.
lower = X.index[0]
upper = X.index[-1] - h_train - h_pred
# NOTE: this rebinds the notebook-level name `mask`.
mask = (X.index >= lower) & (X.index <= upper)
t_range = X.index[mask]
# Not used in this cell.
BATCHSIZE = 3

# Draw a random start, then derive the boundary between the two windows and the end.
ini = np.random.choice(t_range)
mid = ini + h_train
end = ini + h_train + h_pred
# Both windows are half-open: [ini, mid) is observed, [mid, end) is predicted.
train_mask = (X.index >= ini) & (X.index < mid)
train_time = X.index[train_mask]
train_data = X.loc[train_time]
valid_mask = (X.index >= mid) & (X.index < end)
valid_time = X.index[valid_mask]
valid_data = X.loc[valid_time]

In [None]:
# NOTE: abandoned stub. A class header without a body is a SyntaxError, and
# `Dataset` is not imported yet at this point. The complete
# ContinuousTimeSeriesDataset is defined in the following cell.

In [None]:
from typing import Union

from numpy import timedelta64
from pandas import DataFrame
from torch.utils.data import DataLoader, Dataset


class ContinuousTimeSeriesDataset(Dataset):
    """Map-style dataset serving one ``(time, values)`` pair per time stamp.

    The time index and the values of ``df`` are converted to tensors once, at
    construction. Indexing returns the matching entries of both. The two
    horizons are stored but not yet used to form windows.
    """

    def __init__(
        self,
        df: DataFrame,
        forecast_horizon: Union[timedelta64, int],
        observed_horizon: Union[timedelta64, int],
        dtype=torch.float32,
        device=torch.device("cpu"),
        pack_irregular_timeseries: bool = False,
    ):
        """Validate ``df`` and convert it to tensors.

        Parameters
        ----------
        df
            Frame with a ``datetime64`` index. It is sorted by index if it is
            not already sorted.
        forecast_horizon
            Stored as ``self.h_pre``.
        observed_horizon
            Stored as ``self.h_obs``.
        dtype
            dtype of the time and value tensors.
        device
            Device on which the time and value tensors are created.
        pack_irregular_timeseries
            Accepted for interface compatibility; currently unused.

        Raises
        ------
        AssertionError
            If the index of ``df`` does not have a ``datetime64`` dtype.
        """
        # Validate first: the diagnostics below assume a datetime index.
        assert np.issubdtype(
            df.index.dtype, np.datetime64
        ), "This doesn't look like a time series"

        # Previously an assert rejected unsorted input, which made this sort
        # branch unreachable. Warn and sort instead.
        if not df.index.is_monotonic_increasing:
            warnings.warn("Index not sorted! Sorting...")
            df = df.sort_index(ascending=True)

        self.κ = regularity_coefficient(df.index)
        self.gcd = time_gcd(df.index)
        self.is_regular = is_regular(df.index)

        print(f"Time Series is regular :  {self.is_regular}")
        print(f"Regularity Coefficient :  {self.κ}")
        print(f"Greatest Common Divisor:  {self.gcd}")

        self.h_obs = observed_horizon
        self.h_pre = forecast_horizon
        self.time = df.index
        # `device` used to be ignored; the default keeps the tensors on the CPU.
        self.T = torch.tensor(time2float(df.index), dtype=dtype, device=device)
        self.X = torch.tensor(df.values, dtype=dtype, device=device)

        # TODO: restrict sampling to [index[0] + observed_horizon,
        # index[-1] - forecast_horizon], check that this range is non-empty,
        # and sample uniformly from the interval.

    def __len__(self):
        """Return the number of time stamps."""
        return len(self.T)

    def __getitem__(self, idx):
        """Return the ``(time, values)`` pair selected by ``idx``."""
        return self.T[idx], self.X[idx]

In [None]:
# Build the dataset from X. The horizons are given as plain integers here.
DS = ContinuousTimeSeriesDataset(X, forecast_horizon=5, observed_horizon=5)

In [None]:
# Serve DS in batches of 50 items (no shuffling requested).
train_loader = DataLoader(DS, batch_size=50)

In [None]:
# Fetch and display the first batch.
next(iter(train_loader))

In [None]:
class FixedStepTimeSeriesDataset(Dataset):
    """Dataset over a regularly sampled time series.

    Items are restricted to time stamps that have a full observed horizon
    behind them and a full forecast horizon ahead of them. Indexing returns
    the ``(time, values)`` pair of the ``idx``-th such time stamp.
    """

    def __init__(
        self,
        df: DataFrame,
        forecast_horizon: timedelta64,
        observed_horizon: timedelta64,
        dtype=torch.float32,
        device=torch.device("cpu"),
    ):
        """Validate ``df``, convert it to tensors and select the valid time stamps.

        Parameters
        ----------
        df
            Frame with a ``datetime64`` index and a constant step between
            consecutive time stamps. It is sorted by index if needed.
        forecast_horizon
            Time span that must be available after an item's time stamp.
        observed_horizon
            Time span that must be available before an item's time stamp.
        dtype
            dtype of the time and value tensors.
        device
            Device on which the time and value tensors are created.

        Raises
        ------
        AssertionError
            If the index is not ``datetime64``, has fewer than two entries,
            is irregularly spaced, or is shorter than the two horizons
            combined.
        """
        assert np.issubdtype(
            df.index.dtype, np.datetime64
        ), "This doesn't look like a time series"
        # An assert used to reject unsorted input, which made this branch
        # unreachable. Warn and sort instead.
        if not df.index.is_monotonic_increasing:
            warnings.warn("Index not sorted! Sorting...")
            df = df.sort_index(ascending=True)
        assert len(df.index) >= 2, "Need at least two time stamps!"

        # A fixed-step series has one and the same gap between all neighbours.
        ΔT = np.diff(df.index)
        Δt = ΔT[0]
        assert np.all(ΔT == Δt), "Time Series irregular!"

        # Horizons expressed as whole numbers of steps (rounded down).
        self.forecast_steps = int(forecast_horizon // Δt)
        self.observed_steps = int(observed_horizon // Δt)

        self.h_obs = observed_horizon
        self.h_pre = forecast_horizon
        # Series with the default integer index, so labels equal row positions.
        self.time = pd.Series(df.index)
        self.T = torch.tensor(time2float(df.index), dtype=dtype, device=device)
        self.X = torch.tensor(df.values, dtype=dtype, device=device)

        self.lower = df.index[0] + observed_horizon
        self.upper = df.index[-1] - forecast_horizon

        assert self.lower <= self.upper, "The horizon is larger than the time range!"

        # Time stamps that may serve as items; the labels of this Series are
        # row positions into self.T and self.X.
        in_range = (self.time >= self.lower) & (self.time <= self.upper)
        self.time_range = self.time[in_range]

    def __len__(self):
        """Return the number of valid time stamps."""
        return len(self.time_range)

    def __getitem__(self, idx):
        """Return the ``(time, values)`` pair of the ``idx``-th valid time stamp."""
        # Translate the position within the valid range into a row position.
        idx = self.time_range.index[idx]
        return self.T[idx], self.X[idx]
        

In [None]:
# Scratch state for the cells below: the index of `df` as a Series with the
# default integer index, and the values of `df` as an array.
# `pd` is used because the bare `pandas` name is not imported until a later cell.
# NOTE(review): this rebinds the notebook-level `X`, and the cells below only
# make sense when `df` carries a datetime index — confirm which frame is intended.
T = pd.Series(df.index)
X = df.values

In [None]:
# Horizons for the scratch cells below.
forecast_horizon = np.timedelta64(125, "h")
# NOTE(review): `observed_horizon` is read by a later cell but was never
# assigned in this notebook. 125 h is an assumption, chosen to equal the
# forecast horizon — confirm.
observed_horizon = np.timedelta64(125, "h")

In [None]:
# Label of the first time stamp that has a full observed horizon behind it and
# a full forecast horizon ahead of it.
# NOTE(review): `observed_horizon` must be defined before this cell runs.
T[(T >= T.iloc[0] + observed_horizon) & (T <= T.iloc[-1] - forecast_horizon)].index[0]

In [None]:
# Largest label whose time stamp still leaves room for a full forecast horizon.
T[T <= T.iloc[-1] - forecast_horizon].index.max()

In [None]:
# Display the index labels of T.
T.index

In [None]:
# Difference between each entry and the entry two rows earlier.
T.diff(2)

In [None]:
# Labels of all time stamps at least 250 h (two spans of 125 h) before the last one.
T[(T <= T.iloc[-1] - np.timedelta64(125, "h") - np.timedelta64(125, "h"))].index

In [None]:
import pandas

# Scratch check: wrap T in a Series and display it.
pandas.Series(T)

In [None]:
# Position of the first entry at least 125 h after the entry labelled 0.
# argmax on a boolean mask returns the position of the first True.
(T >= T[0] + np.timedelta64(125, "h")).argmax()

In [None]:
# Look up the entry of T labelled 500.
T[500]

In [None]:
# Position of the first entry more than 125 h before the last time stamp.
# `T.iloc[-1]` is used because `T[-1]` on an integer-indexed Series is a label
# lookup and raises KeyError.
# NOTE(review): `[::]` is a no-op slice and argmax returns the FIRST True. If
# the last qualifying position was intended, the mask must be reversed — confirm.
(T < T.iloc[-1] - np.timedelta64(125, "h"))[::].argmax()

In [None]:
# Number of entries more than 125 h before the last time stamp.
# `T.iloc[-1]` is used because `T[-1]` on an integer-indexed Series is a label
# lookup and raises KeyError.
(T < T.iloc[-1] - np.timedelta64(125, "h")).sum()

In [None]:
# Look up the entry of T labelled 139755.
T[139755]

In [None]:
from torch.utils.data import DataLoader, Dataset

In [None]:
class TimeseriesDataset(torch.utils.data.Dataset):
    """Dataset that serves windows selected by time interval, not by step count.

    Each item belongs to one anchor time stamp ``t`` and consists of two
    blocks of rows: the observed block with times in ``[t - h_obs, t)`` and
    the forecast block with times in ``[t, t + h_pre)``. Anchors are all time
    stamps in ``[index[0] + h_obs, index[-1] - h_pre]``.

    On an irregularly sampled series the blocks differ in length between
    items, so batching them needs a custom ``collate_fn``.
    """

    def __init__(
        self,
        df: DataFrame,
        forecast_horizon: timedelta64,
        observed_horizon: timedelta64,
        dtype=torch.float32,
    ):
        """Validate ``df``, convert it to tensors and select the anchors.

        Parameters
        ----------
        df
            Frame with a ``datetime64`` index. It is sorted by index (with a
            warning) if it is not already sorted.
        forecast_horizon
            Time span of the forecast block.
        observed_horizon
            Time span of the observed block.
        dtype
            dtype of the time and value tensors. The original body read
            ``dtype`` without defining it.

        Raises
        ------
        AssertionError
            If the index of ``df`` does not have a ``datetime64`` dtype.
        ValueError
            If ``df`` is empty or shorter than the two horizons combined.
        """
        assert np.issubdtype(
            df.index.dtype, np.datetime64
        ), "This doesn't look like a time series"
        if len(df.index) == 0:
            raise ValueError("The time series is empty!")

        if not df.index.is_monotonic_increasing:
            warnings.warn("Index not sorted! Sorting...")
            df = df.sort_index(ascending=True)

        self.time = df.index
        self.T = torch.tensor(time2float(df.index), dtype=dtype)
        self.X = torch.tensor(df.values, dtype=dtype)

        self.h_obs = observed_horizon
        self.h_pre = forecast_horizon

        self.lower = df.index[0] + observed_horizon
        self.upper = df.index[-1] - forecast_horizon
        if self.lower > self.upper:
            raise ValueError("The horizon is larger than the time range!")

        # Time stamps with a full observed span behind them and a full
        # forecast span ahead of them.
        self.anchors = self.time[(self.time >= self.lower) & (self.time <= self.upper)]

    def __len__(self):
        """Return the number of anchor time stamps."""
        return len(self.anchors)

    def __getitem__(self, index):
        """Return ``(observed_block, forecast_block)`` for the ``index``-th anchor."""
        now = self.anchors[index]
        # The index is sorted, so each window is a contiguous row range and
        # its bounds can be found by binary search.
        start = self.time.searchsorted(now - self.h_obs, side="left")
        split = self.time.searchsorted(now, side="left")
        stop = self.time.searchsorted(now + self.h_pre, side="left")
        return self.X[start:split], self.X[split:stop]