In [1]:
%load_ext autoreload
%autoreload 2
# Development setup: make the parent of the working directory importable
# ==============================================================================
import sys
from pathlib import Path

# The parent of the current working directory is placed at position 1 of
# sys.path, i.e. right after the first entry of the module search path.
path = str(Path.cwd().parent)
sys.path.insert(1, path)
print(path)

import numpy as np
import pandas as pd
import skforecast

print(skforecast.__version__)

c:\Users\jaesc2\GitHub\skforecast
0.18.0


In [2]:
import re
import pytest
import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset
from skforecast.model_selection._split import TimeSeriesFold
from skforecast.plot.plot import backtesting_gif_creator

In [3]:
# Toy daily series and overlapping folds
# ==============================================================================
# 100 consecutive integers indexed by day, starting on 2022-01-01.
y = pd.Series(
    data=np.arange(100),
    index=pd.date_range(start='2022-01-01', periods=100, freq='D'),
)

# fold_stride (5) is smaller than steps (11), so the validation windows of
# consecutive folds overlap. The last fold is allowed to be incomplete.
cv = TimeSeriesFold(
    steps=11,
    initial_train_size=70,
    fold_stride=5,
    window_size=3,
    differentiation=None,
    refit=True,
    fixed_train_size=True,
    gap=0,
    skip_folds=None,
    allow_incomplete_fold=True,
    return_all_indexes=False,
    verbose=True,
)

# With return_all_indexes=False each fold is described by [start, end] pairs.
folds = cv.split(X=y)
folds

Information of folds
--------------------
Number of observations used for initial training: 70
Number of observations used for backtesting: 30
    Number of folds: 6
    Number skipped folds: 0 
    Number of steps per fold: 11
    Number of steps to exclude between folds (fold stride): 5
    Number of steps to exclude between last observed data (last window) and predictions (gap): 0
    Last fold only includes 5 observations.

Fold: 0
    Training:   2022-01-01 00:00:00 -- 2022-03-11 00:00:00  (n=70)
    Validation: 2022-03-12 00:00:00 -- 2022-03-22 00:00:00  (n=11)
Fold: 1
    Training:   2022-01-06 00:00:00 -- 2022-03-16 00:00:00  (n=70)
    Validation: 2022-03-17 00:00:00 -- 2022-03-27 00:00:00  (n=11)
Fold: 2
    Training:   2022-01-11 00:00:00 -- 2022-03-21 00:00:00  (n=70)
    Validation: 2022-03-22 00:00:00 -- 2022-04-01 00:00:00  (n=11)
Fold: 3
    Training:   2022-01-16 00:00:00 -- 2022-03-26 00:00:00  (n=70)
    Validation: 2022-03-27 00:00:00 -- 2022-04-06 00:00:00  (n=11)


[[[0, 70], [67, 70], [70, 81], [70, 81], True],
 [[5, 75], [72, 75], [75, 86], [75, 86], True],
 [[10, 80], [77, 80], [80, 91], [80, 91], True],
 [[15, 85], [82, 85], [85, 96], [85, 96], True],
 [[20, 90], [87, 90], [90, 100], [90, 100], True],
 [[25, 95], [92, 95], [95, 100], [95, 100], True]]

In [4]:
# Animated GIF of the folds defined by `cv`
# ==============================================================================
# The call returns the location of the generated file, which is displayed as the
# cell output.
backtesting_gif_creator(
    data             = y,
    cv               = cv,
    plot_last_window = False,
    filename         = "backtesting.gif",
    fps              = 1,
)

Information of folds
--------------------
Number of observations used for initial training: 70
Number of observations used for backtesting: 30
    Number of folds: 6
    Number skipped folds: 0 
    Number of steps per fold: 11
    Number of steps to exclude between folds (fold stride): 5
    Number of steps to exclude between last observed data (last window) and predictions (gap): 0
    Last fold only includes 5 observations.

Fold: 0
    Training:   2022-01-01 00:00:00 -- 2022-03-11 00:00:00  (n=70)
    Validation: 2022-03-12 00:00:00 -- 2022-03-22 00:00:00  (n=11)
Fold: 1
    Training:   2022-01-06 00:00:00 -- 2022-03-16 00:00:00  (n=70)
    Validation: 2022-03-17 00:00:00 -- 2022-03-27 00:00:00  (n=11)
Fold: 2
    Training:   2022-01-11 00:00:00 -- 2022-03-21 00:00:00  (n=70)
    Validation: 2022-03-22 00:00:00 -- 2022-04-01 00:00:00  (n=11)
Fold: 3
    Training:   2022-01-16 00:00:00 -- 2022-03-26 00:00:00  (n=70)
    Validation: 2022-03-27 00:00:00 -- 2022-04-06 00:00:00  (n=11)


'c:\\Users\\jaesc2\\GitHub\\skforecast\\dev\\backtesting.gif'

In [5]:
# Libraries
# ==============================================================================
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from skforecast.datasets import fetch_dataset
from skforecast.preprocessing import RollingFeatures
from skforecast.recursive import ForecasterRecursive
from skforecast.model_selection import TimeSeriesFold
from skforecast.model_selection import backtesting_forecaster
from skforecast.plot import plot_prediction_intervals, set_dark_theme

In [6]:
# Data download
# ==============================================================================
# raw=True together with kwargs_read_csv: the two columns are named 'y' and
# 'datetime' and the first line of the file is treated as the header.
data = fetch_dataset(
    name            = "h2o",
    raw             = True,
    kwargs_read_csv = {"names": ["y", "datetime"], "header": 0},
)

h2o
---
Monthly expenditure ($AUD) on corticosteroid drugs that the Australian health
system had between 1991 and 2008.
Hyndman R (2023). fpp3: Data for Forecasting: Principles and Practice(3rd
Edition). http://pkg.robjhyndman.com/fpp3package/,https://github.com/robjhyndman
/fpp3package, http://OTexts.com/fpp3.
Shape of the dataset: (204, 2)


In [7]:
# Data preprocessing
# ==============================================================================
# `data` is rebound to the processed frame and the 'datetime' column becomes the
# index, so this cell needs the raw frame from the download cell every time it
# is executed.
data = (
    data
    .assign(datetime=pd.to_datetime(data['datetime'], format='%Y-%m-%d'))
    .set_index('datetime')
    .asfreq('MS')          # month-start frequency
    .loc[:, ['y']]         # keep the target as a single-column DataFrame
    .sort_index()
)
data.head(3)

Unnamed: 0_level_0,y
datetime,Unnamed: 1_level_1
1991-07-01,0.429795
1991-08-01,0.400906
1991-09-01,0.432159


In [9]:
# Create TimeSeriesFold
# ==============================================================================
# Label-based slicing with .loc includes the end label, so every row up to
# `end_train` counts towards the initial training size.
end_train = '2002-01-01 23:59:00'
cv = TimeSeriesFold(
    steps=10,
    initial_train_size=data.loc[:end_train].shape[0],
    window_size=10,
    refit=True,
    fixed_train_size=False,
    gap=0,
    allow_incomplete_fold=True,
    verbose=False,
)

In [16]:
# Backtesting forecaster
# ==============================================================================
# Recursive forecaster: 15 lags plus a rolling mean over a window of 10.
forecaster = ForecasterRecursive(
    regressor=LGBMRegressor(random_state=123, verbose=-1),
    lags=15,
    window_features=RollingFeatures(stats=['mean'], window_sizes=[10]),
)

# `cv` is rebuilt here without an explicit window_size; `end_train` must already
# be defined by the cell that first created the folds.
cv = TimeSeriesFold(
    steps=10,
    initial_train_size=data.loc[:end_train].shape[0],
    refit=True,
    fixed_train_size=False,
    gap=0,
    allow_incomplete_fold=True,
)

# Returns the metric value(s) and the per-fold predictions.
metric, predictions = backtesting_forecaster(
    forecaster=forecaster,
    y=data['y'],
    cv=cv,
    metric='mean_squared_error',
    n_jobs='auto',
    verbose=False,
    show_progress=True,
)
predictions

  0%|          | 0/8 [00:00<?, ?it/s]

Unnamed: 0,fold,pred
2002-02-01,0,0.570082
2002-03-01,0,0.731883
2002-04-01,0,0.697942
2002-05-01,0,0.752874
2002-06-01,0,0.730697
...,...,...
2008-02-01,7,0.608612
2008-03-01,7,0.705114
2008-04-01,7,0.560901
2008-05-01,7,0.746321
