In [10]:
import warnings
import importlib

warnings.filterwarnings("ignore")

from pathlib import Path
from typing import List, Optional, Union

import numpy as np
import pandas as pd

from tsururu.dataset import IndexSlicer, Pipeline, TSDataset
from tsururu.model_training.validator import HoldOutValidator
from tsururu.models.torch_based.dlinear import DLinear_NN
from tsururu.strategies import MIMOStrategy
from tsururu.transformers import (LagTransformer, SequentialTransformer,
                                  TargetGenerator, UnionTransformer)

from tsururu.model_training.trainer import DLTrainer

In [11]:
def get_results(
    cv: int,
    regime: str,
    y_true: Optional[List[np.ndarray]] = None,
    y_pred: Optional[List[np.ndarray]] = None,
    ids: Optional[List[Union[float, str]]] = None,
) -> pd.DataFrame:
    """Collect per-fold targets, predictions and ids into one wide DataFrame.

    For every fold ``k`` in ``1..cv`` the result gets the columns ``y_true_k``
    and ``y_pred_k`` and, unless ``regime == "local"``, also ``id_k``.

    Args:
        cv: Number of folds; every provided sequence is indexed ``0..cv-1``.
        regime: When equal to ``"local"`` the ``id_k`` columns are omitted.
        y_true: Per-fold true values (arrays or scalars), or None.
        y_pred: Per-fold predicted values (arrays or scalars), or None.
        ids: Per-fold ids (arrays or scalars such as floats/strings), or None.

    Returns:
        DataFrame with the columns described above. Array entries are
        flattened to 1-D; scalar entries are broadcast by pandas over the rows
        of the frame; an argument left as None yields a one-element ``[None]``
        column.

    Raises:
        TypeError: If a fold entry is neither a numpy array nor a scalar.
    """

    def _get_fold_value(value, idx: int):
        """Return the column content for fold ``idx`` of ``value``."""
        if value is None:
            return [None]
        fold_value = value[idx]
        if isinstance(fold_value, np.ndarray):
            return fold_value.reshape(-1)
        # Accept every scalar (float, int, str, numpy scalar), not only the
        # built-in float: ids are annotated as possibly being strings.
        if np.isscalar(fold_value):
            return fold_value
        raise TypeError(f"Unexpected value type. Value: {value}")

    df_res_dict = {}

    for idx_fold in range(cv):
        for name, value in [("y_true", y_true), ("y_pred", y_pred)]:
            df_res_dict[f"{name}_{idx_fold+1}"] = _get_fold_value(value, idx_fold)
        if regime != "local":
            df_res_dict[f"id_{idx_fold+1}"] = _get_fold_value(ids, idx_fold)

    # pandas refuses to build a frame from scalars only unless an index is
    # given, so fall back to a single-row index in that case.
    only_scalars = bool(df_res_dict) and all(
        np.isscalar(column) for column in df_res_dict.values()
    )
    return pd.DataFrame(df_res_dict, index=[0] if only_scalars else None)

In [12]:
def expand_val_with_train(train_data, val_data, id_column, date_column, history):
    """Prepend the most recent training rows to the validation frame.

    The rows of ``train_data`` that fall on its last ``history`` distinct
    dates are concatenated with ``val_data``. The cut-off is derived from the
    sorted distinct dates rather than from a row position, so the result does
    not depend on how the rows of ``train_data`` are ordered.

    Args:
        train_data: DataFrame with the training rows.
        val_data: DataFrame with the validation rows.
        id_column: Name of the series-id column (used for sorting).
        date_column: Name of the date column.
        history: Number of distinct training dates to carry over. If it
            exceeds the number of available dates, all training rows are
            used; if it is not positive, no training rows are added.

    Returns:
        New DataFrame sorted by ``[id_column, date_column]`` with a fresh
        0..n-1 index.
    """
    train_dates = train_data[date_column].dropna().drop_duplicates().sort_values()
    n_dates = min(history, len(train_dates))

    if n_dates > 0:
        first_kept_date = train_dates.iloc[-n_dates]
        train_tail = train_data[train_data[date_column] >= first_kept_date]
        val_data_expanded = pd.concat((train_tail, val_data))
    else:
        val_data_expanded = val_data

    return val_data_expanded.sort_values([id_column, date_column]).reset_index(drop=True)


def expand_test_with_val_and_train(
    train_data, val_data, test_data, id_column, date_column, history
):
    """Prepend ``history`` dates of context to the test frame.

    Context is taken from the last distinct dates of ``val_data`` first; when
    the validation frame holds fewer than ``history`` distinct dates, the
    shortfall is taken from the last distinct dates of ``train_data``. If both
    together are still too short, everything available is used.

    Cut-offs are derived from sorted distinct dates, so the result does not
    depend on the row order of the inputs, and no placeholder value is ever
    compared against the date column.

    Args:
        train_data: DataFrame with the training rows.
        val_data: DataFrame with the validation rows.
        test_data: DataFrame with the test rows.
        id_column: Name of the series-id column (used for sorting).
        date_column: Name of the date column.
        history: Number of distinct context dates to put before the test rows.

    Returns:
        New DataFrame sorted by ``[id_column, date_column]`` with a fresh
        0..n-1 index.
    """

    def _last_dates(frame, n_dates):
        """Return (rows on the last ``n_dates`` distinct dates, dates taken)."""
        if n_dates <= 0:
            return frame.iloc[0:0], 0
        dates = frame[date_column].dropna().drop_duplicates().sort_values()
        n_taken = min(n_dates, len(dates))
        if n_taken == 0:
            return frame.iloc[0:0], 0
        return frame[frame[date_column] >= dates.iloc[-n_taken]], n_taken

    val_tail, n_val_dates = _last_dates(val_data, history)
    parts = [val_tail, test_data]

    # Validation alone is too short: borrow the missing dates from train.
    n_missing = history - n_val_dates
    if n_missing > 0:
        train_tail, _ = _last_dates(train_data, n_missing)
        parts.insert(0, train_tail)

    test_data_expanded = pd.concat(parts)
    return test_data_expanded.sort_values([id_column, date_column]).reset_index(drop=True)


def get_train_val_test_datasets(dataset_path, columns_params, train_size, test_size, history):
    """Read a CSV and split it by date into train / val / test ``TSDataset`` objects.

    For files named ``ETTh1.csv``, ``ETTh2.csv``, ``ETTm1.csv`` or
    ``ETTm2.csv`` two hard-coded timestamps are used as split points. For any
    other file the split dates are looked up positionally in the date column
    at ``int(n_unique_dates * train_size)`` and
    ``int(n_unique_dates * (1 - test_size))``.
    NOTE(review): the positional lookup presumably expects the leading rows of
    the file to hold one series in chronological order — confirm for new data.

    The validation frame is extended with trailing training rows, and the test
    frame with trailing rows of that extended validation frame (and of train
    if needed), both controlled by ``history``.

    Args:
        dataset_path: Path to the CSV file (``str`` or ``pathlib.Path``).
        columns_params: Column-role mapping; ``["date"]["columns"][0]`` and
            ``["id"]["columns"][0]`` are read here and the whole mapping is
            passed on to ``TSDataset``.
        train_size: Fraction of distinct dates that ends the training part.
        test_size: Fraction of distinct dates reserved for the test part.
        history: Amount of context forwarded to the two expansion helpers.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset)``.
    """
    data = pd.read_csv(dataset_path)

    date_column = columns_params["date"]["columns"][0]
    id_column = columns_params["id"]["columns"][0]

    # Path(...) makes plain string paths work as well; read_csv accepts both.
    file_name = Path(dataset_path).name
    if file_name in ("ETTh1.csv", "ETTh2.csv", "ETTm1.csv", "ETTm2.csv"):
        train_val_split_date = "2017-06-25 23:00:00"
        val_test_split_date = "2017-10-23 23:00:00"
    else:
        n_unique_dates = data[date_column].nunique()
        date_values = data[date_column].values
        train_val_split_date = date_values[int(n_unique_dates * train_size)]
        val_test_split_date = date_values[int(n_unique_dates * (1 - test_size))]

    dates = data[date_column]
    train_data = data[dates <= train_val_split_date]
    val_data = data[(dates > train_val_split_date) & (dates <= val_test_split_date)]
    test_data = data[dates > val_test_split_date]

    # The expanded validation frame (not the raw one) feeds the test expansion.
    val_data = expand_val_with_train(train_data, val_data, id_column, date_column, history)
    test_data_expanded = expand_test_with_val_and_train(
        train_data, val_data, test_data, id_column, date_column, history
    )

    # train, val and test TSDataset initialization
    train_dataset = TSDataset(data=train_data, columns_params=columns_params)
    val_dataset = TSDataset(data=val_data, columns_params=columns_params)
    test_dataset = TSDataset(data=test_data_expanded, columns_params=columns_params)

    return train_dataset, val_dataset, test_dataset

## Initialize TSDataset, Pipeline, Model, Validator, Strategy

### TSDataset

In [13]:
# Split configuration: share of distinct dates that ends the training part,
# share reserved for the test part, and the context length carried across
# the split boundaries.
TRAIN_SIZE = 0.7
TEST_SIZE = 0.2
history = 7

df_path = Path("datasets/global/simulated_data_to_check.csv")

# Role of each column in the CSV, in the layout expected by TSDataset.
columns_params = {
    "target": {"columns": ["value"], "type": "continious"},
    "date": {"columns": ["date"], "type": "datetime"},
    "id": {"columns": ["id"], "type": "categorical"},
}

train_dataset, val_dataset, test_dataset = get_train_val_test_datasets(
    dataset_path=df_path,
    columns_params=columns_params,
    train_size=TRAIN_SIZE,
    test_size=TEST_SIZE,
    history=history,
)

### Pipeline

In [14]:
# Two leaf transformers: lag features (lags=7, the same number as `history`)
# and a target generator.
# NOTE(review): presumably `lags` is meant to stay equal to `history` — confirm.
lag = LagTransformer(lags=7)
target_generator = TargetGenerator()

# Both leaves are combined in one union, which is applied to the "value" column
# through a sequential transformer and wrapped in a top-level union.
union_1 = UnionTransformer(transformers_list=[lag, target_generator])
seq_1 = SequentialTransformer(transformers_list=[union_1], input_features=["value"])
union = UnionTransformer(transformers_list=[seq_1])

# The pipeline is built in multivariate mode.
pipeline = Pipeline(union, multivariate=True)

### Trainer

In [15]:
# Configure the model parameters
model = DLinear_NN
model_params = {"moving_avg": 7, "individual": False, "enc_in": None}

validation = HoldOutValidator
validation_params = {"validation_data": val_dataset}

trainer_params = {
    "device": "cpu",
    "num_workers": 0,
    "stop_by_metric": True,
}

trainer = DLTrainer(
    model, 
    model_params, 
    validation, 
    validation_params, 
    **trainer_params
)

### Strategy

In [16]:
# Forecasting setup passed to the strategy below.
horizon = 7
# Not referenced by the strategy cell that follows.
model_horizon = 7
# Re-assigned with the same value that was used when the datasets were built;
# keep the two in sync, since the val/test frames were extended with this
# much context.
history = 7
step = 1

In [17]:
# Combine the feature pipeline and the trainer into a MIMO strategy using the
# horizon / history / step constants defined in the previous cell.
strategy = MIMOStrategy(
    pipeline=pipeline,
    trainer=trainer,
    horizon=horizon,
    history=history,
    step=step,
)

In [18]:
# Train on the training dataset. The recorded output shows that fit() returns
# a tuple of a float and the strategy object itself.
# NOTE(review): the float is presumably the fit duration — confirm.
strategy.fit(train_dataset)

Epoch 1/10, Loss: 272.3933
Validation, Loss: 29.0868, Metric: -29.0868
Validation, Loss: 29.0868, Metric: -29.0868
Model saved to checkpoints/fold_0/model_0.pth
Epoch 2/10, Loss: 32.3303
Validation, Loss: 13.5448, Metric: -13.5448
Validation, Loss: 13.5448, Metric: -13.5448
Model saved to checkpoints/fold_0/model_1.pth
Epoch 3/10, Loss: 12.8174
Validation, Loss: 9.9666, Metric: -9.9666
Validation, Loss: 9.9666, Metric: -9.9666
Model saved to checkpoints/fold_0/model_2.pth
Epoch 4/10, Loss: 10.3944
Validation, Loss: 9.1675, Metric: -9.1675
Validation, Loss: 9.1675, Metric: -9.1675
Model saved to checkpoints/fold_0/model_3.pth
Epoch 5/10, Loss: 10.1024
Validation, Loss: 9.0681, Metric: -9.0681
Validation, Loss: 9.0681, Metric: -9.0681
Model saved to checkpoints/fold_0/model_4.pth
Epoch 6/10, Loss: 10.0179
Validation, Loss: 8.9633, Metric: -8.9633
Validation, Loss: 8.9633, Metric: -8.9633
Removing worst model snapshot: checkpoints/fold_0/model_0.pth
Model saved to checkpoints/fold_0/model

(8.068553924560547, <tsururu.strategies.mimo.MIMOStrategy at 0x30066bfd0>)