In [1]:
import warnings
warnings.filterwarnings("ignore")

from typing import Optional, List, Union
from numpy.typing import NDArray

import numpy as np
import pandas as pd

from tsururu.strategies import StrategiesFactory
from tsururu.dataset import TSDataset

In [2]:
def get_results(
    cv: int,
    regime: str,
    y_true: Optional[List[NDArray[np.floating]]] = None,
    y_pred: Optional[List[NDArray[np.floating]]] = None,
    ids: Optional[List[Union[float, str]]] = None,
) -> pd.DataFrame:
    """Collect per-fold back-test outputs into a single wide DataFrame.

    For every fold ``i`` in ``1..cv`` the result contains the columns
    ``y_true_i`` and ``y_pred_i`` and, unless ``regime == "local"``,
    ``id_i``.

    Args:
        cv: number of folds to read from each of the input lists.
        regime: when equal to ``"local"`` the id columns are omitted.
        y_true: per-fold true values, or None to fill the columns with None.
        y_pred: per-fold predictions, or None to fill the columns with None.
        ids: per-fold ids, or None to fill the columns with None
            (ignored when ``regime == "local"``).

    Returns:
        DataFrame with one column per (name, fold) pair. Arrays are flattened
        to 1-D; scalars and missing inputs are broadcast to the row count.

    Raises:
        TypeError: if a fold element is neither a scalar nor a numpy array.
    """

    def _get_fold_value(
        value: Optional[List[Union[float, str, NDArray[np.floating]]]], idx: int
    ) -> Optional[Union[float, str, NDArray[np.floating]]]:
        """Return the column content for fold ``idx`` (None if no input)."""
        if value is None:
            # A scalar None is broadcast by pandas to whatever length the
            # other columns have; a one-element list would clash with them.
            return None
        fold_value = value[idx]
        if isinstance(fold_value, np.ndarray):
            return fold_value.reshape(-1)
        # np.isscalar covers Python numbers, strings and numpy scalar types,
        # not only the builtin float.
        if np.isscalar(fold_value):
            return fold_value
        raise TypeError(
            f"Unexpected value type for fold {idx}: "
            f"{type(fold_value).__name__}. Value: {fold_value!r}"
        )

    df_res_dict = {}

    for idx_fold in range(cv):
        fold_number = idx_fold + 1
        for name, value in (("y_true", y_true), ("y_pred", y_pred)):
            df_res_dict[f"{name}_{fold_number}"] = _get_fold_value(value, idx_fold)
        if regime != "local":
            df_res_dict[f"id_{fold_number}"] = _get_fold_value(ids, idx_fold)

    # pandas cannot infer a length from scalars alone, so give it a single-row
    # index when no column is an array (and there is at least one column).
    has_array = any(isinstance(col, np.ndarray) for col in df_res_dict.values())
    index = pd.RangeIndex(1) if df_res_dict and not has_array else None

    return pd.DataFrame(df_res_dict, index=index)

## Pipeline setup

There are four main objects in total:
1) `TSDataset`.
2) `Strategy`
3) `Model`: it is a part of Strategy
4) `Transformers`.

Special attention should be paid to the `Transformer` class: the elements of the pipeline that are responsible for transforming the values of a series and generating features. 

There are several types of transformers: 
- `SeriesToSeries` (takes train and test as the original series (id, datetime, features) and outputs train and test in the same structure).
- `SeriesToFeatures` (takes train and test as the original series (id, datetime, features) and outputs tables with features).
- `FeaturesToFeatures` (takes tabular data in pd.DataFrame format, applies transformations and outputs a pd.DataFrame).

Below is a list of available Transformers: 
- `StandardScalerTransformer` - SeriesToSeries.
- `LabelEncodingTransformer` - SeriesToSeries (encoder for categorical features).
- `OneHotEncodingTransformer` - SeriesToSeries (encoder for categorical features).
- `LastKnownNormalizer` - FeaturesToFeatures (after building features we normalize all lags by the last known one: divide by it or subtract).
- `DifferenceNormalizer` - SeriesToSeries (subtract the previous value or divide by it).
- `TimeToNumGenerator`, `DateSeasonsGenerator` - SeriesToFeatures (generate seasonal features from dates).
- `LagTransformer` - SeriesToFeatures.

Currently, only one of [`StandardScalerTransformer`, `DifferenceNormalizer`, `LastKnownNormalizer`] can be used at a time; the first two must be placed before `LagTransformer` and the last one after it.

In [3]:
# For the global and multivariate regimes, all time series should be in one .csv file, distinguished by id.
# For the local regime, each time series should be in its own .csv file, also with an id column.
df_path = "datasets/global/simulated_data_to_check.csv"

# Configure the features settings: one entry per column role (target, date, id),
# each naming the source column, its type and the transformers applied to it.
columns_and_features_params = {
    "target": {
        "column": ["value"],
        # NOTE(review): spelled "continious"; presumably the exact string the
        # library expects - confirm before "fixing" the spelling.
        "type": "continious",
        "features": {
            "LagTransformer": {"lags": 30},
            # Listed after LagTransformer: this normalizer works on the built
            # lag features (normalizes all lags by the last known one).
            # NOTE(review): "ratio" presumably means division rather than
            # subtraction - confirm against the library.
            "LastKnownNormalizer": {
                "regime": "ratio",
                "transform_train": True,
                "transform_target": True
            } 
        },
        "drop_raw_feature": False,
    },
    "date": {
        "column": ["date"],
        "type": "datetime",
        "features": {
            "DateSeasonsGenerator": {
                # Use seasonality features from the date column as features with datetime lags
                # Possible values: [
                #    "y": year, "m": month, "d": day, 
                #    "wd": weekday, "doy": dayofyear,
                #    "hour": hour, "min": minute, "sec": second, 
                #    "ms": microsecond,  "ns": nanosecond
                # ]
                "seasonalities": ['doy', 'm', 'wd'], 
                # Use date from target point to make datetime features
                "from_target_date": True,
            },
            "LagTransformer": {"lags": 7}
        },
        "drop_raw_feature": True,
    },
    "id": {
        "column": ["id"],
        "type": "categorical",
        "features": {
            "OneHotEncodingTransformer": {
                # Possible values: one of ['first', 'if_binary', None] or
                # an array-like of shape (n_features, )
                "drop": "first",
            },
            "LagTransformer": {"lags": 1},
        },
        "drop_raw_feature": True,
    }
}

# Configure the model parameters
# NOTE(review): these look like CatBoost options (the model below is
# "CatBoostRegressor_CV"); "verbose": 500 matches the training log, which
# prints iterations 0, 500 and 999.
model_params = {
    "loss_function": "MultiRMSE",
    "early_stopping_rounds": 100,
    "verbose": 500,
}

# Configure the validation parameters
# (3 splits: the training log reports Fold 0, Fold 1 and Fold 2)
validation_params = {
    "type": 'KFold',
    "n_splits": 3,
}

# Configure the strategies parameters
strategy_params = {
    "is_multivariate": False, 
    # possible values: 
    # [
    #     'RecursiveStrategy', 
    #     'DirectStrategy', 
    #     'DirRecStrategy', 
    #     'MIMOStrategy', 
    #     'FlatWideMIMOStrategy', 
    # ]
    "strategy_name": "RecursiveStrategy",
    "strategy_params": {
        # NOTE(review): presumably the number of future points to forecast - confirm.
        "horizon": 7,
        "model_name": "CatBoostRegressor_CV",
        # The model and validation settings defined above are nested here.
        "model_params": model_params,
        "validation_params": validation_params,
        "k": 1,
    }
}

In [4]:
# Factory that is indexed with a strategy configuration dict to obtain a strategy.
strategies_factory = StrategiesFactory()

In [5]:
# Wrap the raw CSV together with the feature configuration.
# Creating the dataset prints the detected frequency of the series.
dataset = TSDataset(
    data=pd.read_csv(df_path),
    columns_and_features_params=columns_and_features_params,
    # NOTE(review): presumably the number of past points per sample; it equals
    # the 30 target lags configured above - confirm.
    history=30,
)

# Index the factory with the strategy configuration to get the strategy.
strategy = strategies_factory[strategy_params]

freq: Day; period: 1


## Backtest validation of pipeline

In [6]:
# Back-test with a single fold; ids, test and pred are passed to get_results in the next cell.
ids, test, pred, fit_time, forecast_time, num_iterations = strategy.back_test(dataset, cv=1)

freq: Day; period: 1
0:	learn: 0.0001731	test: 0.0001793	best: 0.0001793 (0)	total: 55.4ms	remaining: 55.3s


500:	learn: 0.0000015	test: 0.0000016	best: 0.0000016 (500)	total: 2.03s	remaining: 2.02s
999:	learn: 0.0000011	test: 0.0000012	best: 0.0000012 (999)	total: 3.59s	remaining: 0us

bestTest = 1.201404258e-06
bestIteration = 999

Fold 0:
MultiRMSE: 1.2014042581393327e-06
0:	learn: 0.0001756	test: 0.0001743	best: 0.0001743 (0)	total: 3.78ms	remaining: 3.78s
500:	learn: 0.0000015	test: 0.0000015	best: 0.0000015 (500)	total: 1.12s	remaining: 1.12s
999:	learn: 0.0000010	test: 0.0000011	best: 0.0000011 (999)	total: 2.17s	remaining: 0us

bestTest = 1.14810964e-06
bestIteration = 999

Fold 1:
MultiRMSE: 1.1481096401734672e-06
0:	learn: 0.0001768	test: 0.0001718	best: 0.0001718 (0)	total: 3.62ms	remaining: 3.61s
500:	learn: 0.0000015	test: 0.0000016	best: 0.0000016 (500)	total: 1.08s	remaining: 1.07s
999:	learn: 0.0000011	test: 0.0000011	best: 0.0000011 (999)	total: 2.43s	remaining: 0us

bestTest = 1.14563283e-06
bestIteration = 999

Fold 2:
MultiRMSE: 1.145632830147968e-06
Mean MultiRMSE: 0.0
St

In [7]:
# Show true values, predictions and ids of the single back-test fold side by side
# (cv matches the cv passed to back_test; a non-"local" regime adds the id column).
get_results(cv=1, regime="global", y_true=test, y_pred=pred, ids=ids)

Unnamed: 0,y_true_1,y_pred_1,id_1
0,1993.0,1993.004623,0
1,1994.0,1994.00973,0
2,1995.0,1995.015565,0
3,1996.0,1996.022392,0
4,1997.0,1997.030004,0
...,...,...,...
65,10995.0,10995.018659,9
66,10996.0,10996.027113,9
67,10997.0,10997.038627,9
68,10998.0,10998.051501,9


## Fit and predict interface of pipeline

In [8]:
# fit returns a pair; only its first element is kept (as fit_time).
fit_time, _ = strategy.fit(dataset)
# predict returns a pair; current_pred is the forecast table displayed in the next cell.
forecast_time, current_pred = strategy.predict(dataset)

0:	learn: 0.0001721	test: 0.0001802	best: 0.0001802 (0)	total: 6.54ms	remaining: 6.53s
500:	learn: 0.0000015	test: 0.0000017	best: 0.0000017 (500)	total: 1.3s	remaining: 1.29s
999:	learn: 0.0000010	test: 0.0000012	best: 0.0000012 (999)	total: 2.76s	remaining: 0us

bestTest = 1.221616744e-06
bestIteration = 999

Fold 0:
MultiRMSE: 1.2216167442345852e-06
0:	learn: 0.0001756	test: 0.0001732	best: 0.0001732 (0)	total: 4.43ms	remaining: 4.42s
500:	learn: 0.0000015	test: 0.0000016	best: 0.0000016 (500)	total: 1.57s	remaining: 1.56s
999:	learn: 0.0000010	test: 0.0000012	best: 0.0000012 (999)	total: 3.04s	remaining: 0us

bestTest = 1.154839242e-06
bestIteration = 999

Fold 1:
MultiRMSE: 1.1548392417120448e-06
0:	learn: 0.0001767	test: 0.0001710	best: 0.0001710 (0)	total: 3.77ms	remaining: 3.77s
500:	learn: 0.0000015	test: 0.0000014	best: 0.0000014 (500)	total: 1.14s	remaining: 1.14s
999:	learn: 0.0000010	test: 0.0000010	best: 0.0000010 (999)	total: 2.8s	remaining: 0us

bestTest = 1.049221253e-

In [9]:
# Display the forecast returned by strategy.predict (columns: id, date, value).
current_pred

Unnamed: 0,id,date,value
0,0,2022-09-27,2000.010015
1,0,2022-09-28,2001.019949
2,0,2022-09-29,2002.030814
3,0,2022-09-30,2003.042475
4,0,2022-10-01,2004.054736
...,...,...,...
65,9,2022-09-29,11002.024389
66,9,2022-09-30,11003.034698
67,9,2022-10-01,11004.048196
68,9,2022-10-02,11005.062914


## Working with raw time series' granularity

Time series come in different granularities, from hourly and daily time series to more complex ones such as the end of each quarter.

If the series do not contain segments that are too short (shorter than history + horizon), `tsururu` will try to infer the series granularity on its own. We currently support the following types:

- Yearly (and YearlyEnd)
- Quarterly (and QuarterlyEnd)
- Monthly (and MonthlyEnd)
- Weekly
- Daily
- Hourly
- Minutely
- Secondly
- Microsecondly

There is also support for compound granularities (10 days, 15 minutes, 32 seconds, etc.). The correctness of the selected granularity can be checked from the output printed after the `TSDataset` object has been created.

However, there are tricky situations (e.g. 28 days) where the granularity may be incorrectly guessed as monthly. Therefore, it is possible to set your own granularity using the `pd.DateOffset` class or related classes from `pandas.tseries.offsets`, which must be passed as the `delta` parameter to the `TSDataset` class. Then the time column will be processed according to the user's settings.

Consider a time series where consecutive points are exactly __28 days__ apart.

In [10]:
# Dataset whose consecutive points are 28 days apart.
df_path_2 = "datasets/global/simulated_data_to_check_28D.csv"

# Configure the features settings
# Only the column roles and types are given here (no "features" entries):
# this configuration is used solely to demonstrate frequency detection.
columns_and_features_params_2 = {
    "target": {
        "column": ["value"],
        "type": "continious",
    },
    "date": {
        "column": ["date"],
        "type": "datetime",
    },
    "id": {
        "column": ["id"],
        "type": "categorical",
    }
}

In [11]:
# No delta is passed, so the frequency is inferred automatically and printed below.
dataset_2 = TSDataset(
    data=pd.read_csv(df_path_2),
    columns_and_features_params=columns_and_features_params_2,
    history=30,
)

freq: Month; period: 1.0


We see that the frequency of the series is incorrectly defined as monthly. Let's try to pass the `delta` parameter.

In [12]:
# Same dataset, but with an explicit 28-day step passed as delta
# instead of relying on automatic frequency detection.
dataset_2 = TSDataset(
    data=pd.read_csv(df_path_2),
    columns_and_features_params=columns_and_features_params_2,
    history=30,
    delta=pd.DateOffset(days=28),
)

Custom OffSet: <DateOffset: days=28>


Now it's all detected correctly.