In [1]:
import warnings
warnings.filterwarnings("ignore")

from typing import Optional, List, Union
from numpy.typing import NDArray

import numpy as np
import pandas as pd

from tsururu.strategies import StrategiesFactory
from tsururu.dataset import TSDataset

In [2]:
def get_results(
    cv: int,
    regime: str,
    y_true: Optional[List[NDArray[np.floating]]] = None,
    y_pred: Optional[List[NDArray[np.floating]]] = None,
    ids: Optional[List[Union[float, str]]] = None,
) -> pd.DataFrame:
    """Collect per-fold back-test outputs into one wide DataFrame.

    For every fold ``i`` (1-based) the frame gets the columns ``y_true_i`` and
    ``y_pred_i`` and, unless ``regime == "local"``, also ``id_i``.

    Args:
        cv: number of folds to read from each input sequence.
        regime: any value other than ``"local"`` adds the ``id_i`` columns.
        y_true: per-fold ground truth; each item is an array (flattened to
            1-D) or a scalar. ``None`` produces a column filled with ``None``.
        y_pred: per-fold predictions, same conventions as ``y_true``.
        ids: per-fold series identifiers, same conventions as ``y_true``.

    Returns:
        DataFrame with one column per (name, fold). Scalar and missing
        entries are repeated to the length of the longest array column
        (one row when there are no array columns).

    Raises:
        TypeError: if a per-fold item is neither an array nor a scalar.
    """

    def _get_fold_value(
        value: Optional[List[Union[float, str, NDArray[np.floating]]]], idx: int
    ) -> Optional[Union[float, str, NDArray[np.floating]]]:
        """Return fold ``idx`` of ``value``: a flat array, a scalar, or None."""
        if value is None:
            return None
        fold_value = value[idx]
        if isinstance(fold_value, np.ndarray):
            return fold_value.reshape(-1)
        # np.isscalar also accepts int, str and NumPy scalar types, which the
        # previous `isinstance(..., float)` check rejected with a TypeError.
        if np.isscalar(fold_value):
            return fold_value
        raise TypeError(f"Unexpected value type. Value: {value}")

    fold_columns = {}

    for idx_fold in range(cv):
        for name, value in [("y_true", y_true), ("y_pred", y_pred)]:
            fold_columns[f"{name}_{idx_fold+1}"] = _get_fold_value(value, idx_fold)
        if regime != "local":
            fold_columns[f"id_{idx_fold+1}"] = _get_fold_value(ids, idx_fold)

    # Broadcast scalars / missing values explicitly: pandas refuses a dict made
    # only of scalars, and a length-1 placeholder would clash with longer arrays.
    n_rows = max(
        (len(col) for col in fold_columns.values() if isinstance(col, np.ndarray)),
        default=1,
    )
    df_res_dict = {
        name: col if isinstance(col, np.ndarray) else [col] * n_rows
        for name, col in fold_columns.items()
    }

    return pd.DataFrame(df_res_dict)

## Pipeline setup

There are four main objects in total:
1) `TSDataset`.
2) `Strategy`
3) `Model`: it is a part of Strategy
4) `Transformers`.

Special attention should be paid to the `Transformer` class: the elements of the pipeline that are responsible for transforming the values of a series and generating features. 

There are several types of transformers: 
- `SeriesToSeries` (takes as input train and test in the form of the original series (id, datetime, features) and outputs train and test in the same structure).
- `SeriesToFeatures` (takes as input train and test in the form of the original series (id, datetime, features) and outputs tables with features).
- `FeaturesToFeatures` (takes as input tabular data in `pd.DataFrame` format, applies transformations and outputs a `pd.DataFrame`).

Below is a list of available Transformers: 
- `StandardScalerTransformer` - SeriesToSeries.
- `LabelEncodingTransformer` - SeriesToSeries (encoder for categorical features).
- `OneHotEncodingTransformer` - SeriesToSeries (encoder for categorical features).
- `LastKnownNormalizer` - FeaturesToFeatures (after building features we normalize all lags by the last known one: divide by it or subtract).
- `DifferenceNormalizer` - SeriesToSeries (subtract the previous value or divide by it).
- `TimeToNumGenerator`, `DateSeasonsGenerator` - SeriesToFeatures (generate seasonal features from dates).
- `LagTransformer` - SeriesToFeatures.

Currently, only one of [`StandardScalerTransformer`, `DifferenceNormalizer`, `LastKnownNormalizer`] can be used at a time; the first two must come before `LagTransformer` and the last one after it.

In [3]:
# Pipeline configuration: data location, per-column feature settings, model,
# validation and strategy parameters used by the cells below.

# For the global and multivariate regimes all time series should be in one .csv file, distinguished by id.
# For the local regime each time series should be in its own .csv file, also with an id column.
df_path = "datasets/global/ettm1.csv"
# NOTE(review): `df` is not used by the later cells shown here, which re-read the CSV from `df_path`.
df = pd.read_csv(df_path, sep=',')

# Configure the features settings
# One entry per column role ("target", "date", "id"); the "features" dict lists
# the transformers to apply to that column.
columns_and_features_params = {
    "target": {
        "column": ["value"],
        "type": "continious",
        "features": {
            # LastKnownNormalizer is listed after LagTransformer, as the notes above require.
            "LagTransformer": {"lags": 30},
            "LastKnownNormalizer": {
                "regime": "ratio",
                "transform_train": False,
                "transform_target": False
            } 
        },
        "drop_raw_feature": False,
    },
    "date": {
        "column": ["date"],
        "type": "datetime",
        "features": {
            "DateSeasonsGenerator": {
                # Use seasonality features from the date column as features with datetime lags
                # Possible values: [
                #    "y": year, "m": month, "d": day,
                #    "wd": weekday, "doy": dayofyear,
                #    "hour": hour, "min": minute, "sec": second,
                #    "ms": microsecond,  "ns": nanosecond
                # ]
                "seasonalities": ['doy', 'm', 'wd'], 
                # Use date from target point to make datetime features
                "from_target_date": True,
            },
            "LagTransformer": {"lags": 7}
        },
        "drop_raw_feature": True,
    },
    "id": {
        "column": ["id"],
        "type": "categorical",
        "features": {
            "OneHotEncodingTransformer": {
                # possible values: one from ['first', 'if_binary', None] or
                # array-list of shape (n_features, )
                "drop": "first",
            },
            "LagTransformer": {"lags": 1},
        },
        "drop_raw_feature": True,
    }
}

# Configure the model parameters
model_params = {
    "loss_function": "MultiRMSE",
    "early_stopping_rounds": 100,
    "verbose": 500,
}

# Configure the validation parameters
# This dict is shared by every strategy configured later in the notebook.
validation_params = {
    "type": 'KFold',
    "n_splits": 3,
}

# Configure the strategies parameters
strategy_params = {
    "is_multivariate": False, 
    # possible values:
    # [
    #     'RecursiveStrategy',
    #     'DirectStrategy',
    #     'DirRecStrategy',
    #     'MIMOStrategy',
    #     'FlatWideMIMOStrategy',
    # ]
    "strategy_name": "RecursiveStrategy",
    "strategy_params": {
        "horizon": 7,
        "model_name": "CatBoostRegressor_CV",
        "model_params": model_params,
        "validation_params": validation_params,
        "k": 1,
    }
}

In [4]:
# Factory that is indexed with a strategy parameter dict to obtain a strategy object.
strategies_factory = StrategiesFactory()

In [5]:
# Build the dataset from the raw CSV and the per-column feature settings.
# `history` is presumably the number of past points used to build features — TODO confirm.
dataset = TSDataset(
    data=pd.read_csv(df_path),
    columns_and_features_params=columns_and_features_params,
    history=30,
)

# Look up the strategy described by `strategy_params`.
strategy = strategies_factory[strategy_params]

freq: less then Day (Hour, Min, Sec, etc); period: 900.0 seconds


## Backtest validation of pipeline

In [6]:
# Back-test the pipeline with a single back-test fold (cv=1).
# NOTE(review): the "Fold N" lines in the output presumably come from the model's
# internal KFold (n_splits=3 in validation_params), not from `cv` — confirm.
ids, test, pred, fit_time, forecast_time, num_iterations = strategy.back_test(dataset, cv=1)

freq: less then Day (Hour, Min, Sec, etc); period: 900.0 seconds
0:	learn: 6.3540778	test: 6.3595012	best: 6.3595012 (0)	total: 169ms	remaining: 2m 48s
500:	learn: 0.8374041	test: 0.8391914	best: 0.8391914 (500)	total: 5.34s	remaining: 5.32s
999:	learn: 0.8061986	test: 0.8299307	best: 0.8299307 (999)	total: 10.2s	remaining: 0us

bestTest = 0.8299306676
bestIteration = 999

Fold 0:
MultiRMSE: 0.829930667603048
0:	learn: 6.3561601	test: 6.3556903	best: 6.3556903 (0)	total: 20.5ms	remaining: 20.5s
500:	learn: 0.8322678	test: 0.8527048	best: 0.8527048 (500)	total: 4.96s	remaining: 4.94s
999:	learn: 0.8020330	test: 0.8415058	best: 0.8415058 (999)	total: 9.93s	remaining: 0us

bestTest = 0.8415057703
bestIteration = 999

Fold 1:
MultiRMSE: 0.8415057703301744
0:	learn: 6.3573741	test: 6.3524574	best: 6.3524574 (0)	total: 13.3ms	remaining: 13.3s
500:	learn: 0.8303641	test: 0.8563275	best: 0.8563275 (500)	total: 4.95s	remaining: 4.93s
999:	learn: 0.8003772	test: 0.8454431	best: 0.8454116 (997)	t

In [7]:
# Tabulate the back-test fold: ground truth, prediction and series id columns.
get_results(cv=1, regime="global", y_true=test, y_pred=pred, ids=ids)

Unnamed: 0,y_true_1,y_pred_1,id_1
0,8.908,9.349326,0
1,8.841,9.502037,0
2,9.31,9.772129,0
3,10.114,10.088851,0
4,10.784,10.379277,0
5,11.655,10.704881,0
6,12.994,10.872077,0
7,3.684,3.70107,1
8,3.416,3.692558,1
9,3.55,3.664558,1


## Fit and predict interface of pipeline

In [8]:
# Fit on the dataset, then forecast; each call returns its elapsed time first
# (going by the variable names — TODO confirm against the tsururu API).
fit_time, _ = strategy.fit(dataset)
forecast_time, current_pred = strategy.predict(dataset)

0:	learn: 6.3652935	test: 6.3360319	best: 6.3360319 (0)	total: 13.4ms	remaining: 13.3s
500:	learn: 0.8334411	test: 0.8480205	best: 0.8480205 (500)	total: 5.01s	remaining: 4.99s
999:	learn: 0.8025484	test: 0.8381997	best: 0.8381935 (998)	total: 9.82s	remaining: 0us

bestTest = 0.8381935466
bestIteration = 998

Shrink model to first 999 iterations.
Fold 0:
MultiRMSE: 0.838193546560607
0:	learn: 6.3463572	test: 6.3745481	best: 6.3745481 (0)	total: 13.8ms	remaining: 13.8s
500:	learn: 0.8353135	test: 0.8427162	best: 0.8427162 (500)	total: 4.88s	remaining: 4.87s
999:	learn: 0.8051036	test: 0.8329462	best: 0.8329462 (999)	total: 9.83s	remaining: 0us

bestTest = 0.8329461951
bestIteration = 999

Fold 1:
MultiRMSE: 0.8329461951218387
0:	learn: 6.3551717	test: 6.3562002	best: 6.3562002 (0)	total: 13.3ms	remaining: 13.3s
500:	learn: 0.8305221	test: 0.8561758	best: 0.8561758 (500)	total: 4.91s	remaining: 4.88s
999:	learn: 0.8003589	test: 0.8460295	best: 0.8460295 (999)	total: 9.67s	remaining: 0us


In [9]:
# Display the forecast returned by strategy.predict.
current_pred

Unnamed: 0,id,date,value
0,0,2018-06-26 20:00:00,12.936556
1,0,2018-06-26 20:15:00,13.293942
2,0,2018-06-26 20:30:00,13.675423
3,0,2018-06-26 20:45:00,13.947283
4,0,2018-06-26 21:00:00,14.086564
5,0,2018-06-26 21:15:00,14.306469
6,0,2018-06-26 21:30:00,14.414712
7,1,2018-06-26 20:00:00,3.712436
8,1,2018-06-26 20:15:00,3.729355
9,1,2018-06-26 20:30:00,3.702139


## Working with raw time series' granularity

Time series come in different granularities, from hourly and daily time series to more complex ones such as the end of each quarter.

If the series do not contain segments that are too short (shorter than history + horizon), `tsururu` will try to infer the series granularity on its own. We currently support the following types:

- Yearly (and YearlyEnd)
- Quarterly (and QuarterlyEnd)
- Monthly (and MonthlyEnd)
- Weekly
- Daily
- Hourly
- Minutely
- Secondly
- Microsecondly

There is also support for compound granularities (10 days, 15 minutes, 32 seconds, etc.). The detected granularity can be checked in the output printed when the `TSDataset` object is created.

However, there are tricky situations (e.g. 28 days) where the granularity may be incorrectly guessed as monthly. Therefore, it is possible to set your own granularity using the `pd.DateOffset` class or related classes from `pandas.tseries.offsets`, passed as the `delta` parameter to the `TSDataset` class. The time column will then be processed according to the user's settings.

Consider a time series where consecutive points are exactly __28 days apart__.

In [10]:
# Simulated dataset used to demonstrate granularity detection.
df_path_2 = "datasets/global/simulated_data_to_check_28D.csv"

# Configure the features settings
# Only column roles are declared here; no "features" transformers are configured.
columns_and_features_params_2 = {
    "target": {
        "column": ["value"],
        "type": "continious",
    },
    "date": {
        "column": ["date"],
        "type": "datetime",
    },
    "id": {
        "column": ["id"],
        "type": "categorical",
    }
}

In [11]:
# No `delta` is passed, so the frequency is inferred automatically (see the printed output).
dataset_2 = TSDataset(
    data=pd.read_csv(df_path_2),
    columns_and_features_params=columns_and_features_params_2,
    history=30,
)

freq: Month; period: 1.0


We see that the frequency of the series is incorrectly defined as monthly. Let's try to pass the `delta` parameter.

In [12]:
# Rebuild the same dataset with an explicit 28-day step instead of the inferred frequency.
# This rebinds `dataset_2`, replacing the auto-detected version created earlier.
dataset_2 = TSDataset(
    data=pd.read_csv(df_path_2),
    columns_and_features_params=columns_and_features_params_2,
    history=30,
    delta=pd.DateOffset(days=28),
)

Custom OffSet: <DateOffset: days=28>


Now it's all detected correctly.

In [13]:
# Random-forest variant of the pipeline: model parameters, strategy parameters
# and the strategy object built from them.
model_params_randforest = {
    'n_estimators': 50,           # Add more if needed
    'criterion': 'squared_error',
    'max_depth': None,
    'verbose': 0,
    'n_jobs': -1,
}

strategy_params_rf = {
    "is_multivariate": False, 
    # possible values:
    # [
    #     'RecursiveStrategy',
    #     'DirectStrategy',
    #     'DirRecStrategy',
    #     'MIMOStrategy',
    #     'FlatWideMIMOStrategy',
    # ]
    "strategy_name": "RecursiveStrategy",
    "strategy_params": {
        # 96 steps of 900 s each (the period printed for this dataset) span 24 hours.
        "horizon": 96,
        "model_name": "RandomForest_CV",
        "model_params": model_params_randforest,
        # Reuses the KFold settings defined with the CatBoost configuration.
        "validation_params": validation_params,
        "k": 1,
    }
}

strategy_rf = strategies_factory[strategy_params_rf]

In [14]:
%%time
# Back-test the random-forest strategy on the same `dataset` as the CatBoost run.
# The cell magic above must stay on the first line of the cell.
ids_rf, test_rf, pred_rf, fit_time_rf, forecast_time_rf, num_iterations_rf = strategy_rf.back_test(dataset, cv=1)

freq: less then Day (Hour, Min, Sec, etc); period: 900.0 seconds
Fold 0: Score: 0.9842271794799862
Fold 1: Score: 0.9837466960561283
Fold 2: Score: 0.9836520340612356
Mean Score: 0.9839
Std: 0.0003
CPU times: total: 1h 21min 15s
Wall time: 4min 37s


In [15]:
# Collect the random-forest back-test into one frame, then show the rows of series id 0.
rf_preds = get_results(
    y_true=test_rf,
    y_pred=pred_rf,
    ids=ids_rf,
    regime="global",
    cv=1,
)
rf_preds.loc[rf_preds["id_1"] == 0]

Unnamed: 0,y_true_1,y_pred_1,id_1
0,12.994,12.363153,0
1,13.061,12.462653,0
2,12.726,12.55026,0
3,12.659,12.642293,0
4,12.458,12.630727,0
...,...,...,...
91,9.310,10.447507,0
92,10.114,10.444733,0
93,10.784,10.454027,0
94,11.655,10.428547,0


In [17]:
def smape(a, f):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 * 2 * |f - a| / (|a| + |f|)`` per point, sums the terms
    and divides by ``len(a)``.

    Args:
        a: actual values (array-like).
        f: forecast values (array-like, broadcastable against ``a``).

    Returns:
        The sMAPE as a NumPy float. Points where both the actual and the
        forecast value are zero contribute 0 instead of NaN.

    Raises:
        ValueError: if ``a`` is empty.
    """
    actual = np.asarray(a, dtype=float)
    forecast = np.asarray(f, dtype=float)

    n_points = len(actual)
    if n_points == 0:
        raise ValueError("smape is undefined for empty input")

    abs_error = 2 * np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast)
    # scale == 0 only when both values are 0, i.e. a perfect forecast: treat
    # that term as 0 rather than letting 0/0 turn the whole metric into NaN.
    ratio = np.divide(
        abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0
    )
    return 1 / n_points * np.sum(ratio * 100)

# sMAPE of the random-forest forecast for series id 0, rounded to two decimals.
smape_rf = round(
    smape(
        rf_preds.loc[rf_preds["id_1"] == 0, "y_true_1"].to_numpy(),
        rf_preds.loc[rf_preds["id_1"] == 0, "y_pred_1"].to_numpy(),
    ),
    2,
)
smape_rf

87.8

In [18]:
# Column roles for the statistical models below: same structure as
# columns_and_features_params_2, with no "features" transformers configured.
stat_models_cols_features = {
    "target": {
        "column": ["value"],
        "type": "continious",
    },
    "date": {
        "column": ["date"],
        "type": "datetime",
    },
    "id": {
        "column": ["id"],
        "type": "categorical",
    }
}

In [19]:
# ETS model settings.
# NOTE(review): ['A', 'N', 'N'] presumably means additive error, no trend, no
# seasonality — confirm against tsururu's ETS_Model.
ets_params = {
    'model': ['A', 'N', 'N'],
    'season_length': 1,
    'damped': None,
    'phi': None
}

# Strategy settings for the ETS model; same horizon (96) as the random-forest run.
ets_strategy_params = {
    "is_multivariate": False, 
    "strategy_name": "StatModels",
    "strategy_params": {
        "horizon": 96,
        "model_name": "ETS_Model",
        "model_params": ets_params,
        "validation_params": validation_params,
        "k": 1,
    }
}

In [20]:
# NOTE(review): a second factory instance; the earlier `strategies_factory` could
# presumably be reused — confirm that StrategiesFactory holds no state.
strategies_factory_ets = StrategiesFactory()

In [21]:
# Same CSV as the earlier `dataset`, but with the transformer-free column
# settings and history=100 instead of 30.
dataset_ets = TSDataset(
    data=pd.read_csv(df_path),
    columns_and_features_params=stat_models_cols_features,
    history=100,
)

strategy_ets = strategies_factory_ets[ets_strategy_params]

freq: less then Day (Hour, Min, Sec, etc); period: 900.0 seconds


In [22]:
# Back-test the ETS strategy.
# NOTE: this rebinds ids/test/pred/fit_time/forecast_time/num_iterations, so the
# CatBoost back-test results stored under the same names are no longer available.
ids, test, pred, fit_time, forecast_time, num_iterations = strategy_ets.back_test(dataset_ets, cv=1)

freq: less then Day (Hour, Min, Sec, etc); period: 900.0 seconds


In [23]:
# Tabulate the ETS back-test; `test`, `pred` and `ids` here come from the most
# recently executed back_test cell.
get_results(cv=1, regime="global", y_true=test, y_pred=pred, ids=ids)

Unnamed: 0,y_true_1,y_pred_1,id_1
0,12.994,10.060014,0
1,13.061,10.060014,0
2,12.726,10.060014,0
3,12.659,10.060014,0
4,12.458,10.060014,0
...,...,...,...
667,9.567,10.060014,6
668,9.567,10.060014,6
669,9.426,10.060014,6
670,9.426,10.060014,6


In [24]:
# AutoARIMA search settings.
# NOTE(review): 'seasonal' is True but 'season_length' is 1; the dataset output
# reports a 900-second period, so a daily cycle would be 96 points — confirm
# that leaving the seasonal period at 1 is intended.
autoarima_params = {
    'd': None,
    'D': None,
    'max_p': 5,
    'max_q': 5,
    'max_P': 2,
    'max_Q': 2,
    'max_order': 5,
    'max_d': 2,
    'max_D': 1,
    'start_p': 2,
    'start_q': 2,
    'start_P': 1,
    'start_Q': 1,
    'stationary': False,
    'seasonal': True,
    'season_length': 1,
}

# Strategy settings for AutoARIMA; same structure as the ETS strategy settings.
autoarima_strategy_params = {
    "is_multivariate": False, 
    "strategy_name": "StatModels",
    "strategy_params": {
        "horizon": 96,
        "model_name": "AutoARIMA_Model",
        "model_params": autoarima_params,
        "validation_params": validation_params,
        "k": 1,
    }
}

In [25]:
# Factory, dataset and strategy for AutoARIMA. The dataset is built with the
# same arguments as `dataset_ets` (same CSV, column settings and history).
strategies_factory_arima = StrategiesFactory()

dataset_arima = TSDataset(
    data=pd.read_csv(df_path),
    columns_and_features_params=stat_models_cols_features,
    history=100,
)

strategy_autoarima = strategies_factory_arima[autoarima_strategy_params]

freq: less then Day (Hour, Min, Sec, etc); period: 900.0 seconds


In [26]:
# Back-test the AutoARIMA strategy.
# NOTE: rebinds ids/test/pred/fit_time/forecast_time/num_iterations again,
# replacing the results of the previous back-test.
ids, test, pred, fit_time, forecast_time, num_iterations = strategy_autoarima.back_test(dataset_arima, cv=1)

freq: less then Day (Hour, Min, Sec, etc); period: 900.0 seconds


In [27]:
# Tabulate the AutoARIMA back-test; `test`, `pred` and `ids` here come from the
# most recently executed back_test cell.
get_results(cv=1, regime="global", y_true=test, y_pred=pred, ids=ids)

Unnamed: 0,y_true_1,y_pred_1,id_1
0,12.994,10.056074,0
1,13.061,10.063438,0
2,12.726,10.067070,0
3,12.659,10.068252,0
4,12.458,10.068557,0
...,...,...,...
667,9.567,10.068626,6
668,9.567,10.068626,6
669,9.426,10.068626,6
670,9.426,10.068626,6


In [28]:
# AutoTheta model settings: only the season length is configured.
theta_params = {
    'season_length': 1, 
}

# Strategy settings for AutoTheta; same structure as the ETS and AutoARIMA settings.
theta_strategy_params = {
    "is_multivariate": False, 
    "strategy_name": "StatModels",
    "strategy_params": {
        "horizon": 96,
        "model_name": "AutoTheta_Model",
        "model_params": theta_params,
        "validation_params": validation_params,
        "k": 1,
    }
}

In [29]:
# Factory, dataset and strategy for AutoTheta. The dataset is built with the
# same arguments as `dataset_ets` and `dataset_arima`.
strategies_factory_theta = StrategiesFactory()

dataset_theta = TSDataset(
    data=pd.read_csv(df_path),
    columns_and_features_params=stat_models_cols_features,
    history=100,
)

strategy_theta = strategies_factory_theta[theta_strategy_params]

freq: less then Day (Hour, Min, Sec, etc); period: 900.0 seconds


In [30]:
# Back-test the AutoTheta strategy.
# NOTE: rebinds ids/test/pred/fit_time/forecast_time/num_iterations again,
# replacing the results of the previous back-test.
ids, test, pred, fit_time, forecast_time, num_iterations = strategy_theta.back_test(dataset_theta, cv=1)

freq: less then Day (Hour, Min, Sec, etc); period: 900.0 seconds
Model 1 has been fitted!
Model 2 has been fitted!
Model 3 has been fitted!
Model 4 has been fitted!
Model 5 has been fitted!
Model 6 has been fitted!
Model 7 has been fitted!


In [31]:
# Tabulate the AutoTheta back-test; `test`, `pred` and `ids` here come from the
# most recently executed back_test cell.
get_results(cv=1, regime="global", y_true=test, y_pred=pred, ids=ids)

Unnamed: 0,y_true_1,y_pred_1,id_1
0,12.994,6.324453,0
1,13.061,6.324190,0
2,12.726,6.323927,0
3,12.659,6.323664,0
4,12.458,6.323400,0
...,...,...,...
667,9.567,6.300528,6
668,9.567,6.300265,6
669,9.426,6.300002,6
670,9.426,6.299739,6
