In [20]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
path = str(Path.cwd().parent)
print(path)
sys.path.insert(1, path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
c:\Users\jaesc2\GitHub\skforecast


In [21]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
from scipy.stats import norm

from skforecast.datasets import fetch_dataset
from skforecast.preprocessing import series_long_to_dict
from skforecast.preprocessing import exog_long_to_dict
from skforecast.preprocessing import RollingFeatures
from skforecast.recursive import ForecasterRecursiveMultiSeries
from skforecast.model_selection import TimeSeriesFold
from skforecast.model_selection import backtesting_forecaster_multiseries
from skforecast.model_selection import grid_search_forecaster_multiseries
from skforecast.model_selection import bayesian_search_forecaster_multiseries

In [22]:
# Load time series of multiple lengths and exogenous variables
# ==============================================================================
# Each CSV is read and its `timestamp` column is converted to datetime in the
# same expression.
series = pd.read_csv(
    'https://raw.githubusercontent.com/skforecast/skforecast-datasets/main/data/demo_multi_series.csv'
).assign(timestamp=lambda df: pd.to_datetime(df['timestamp']))

exog = pd.read_csv(
    'https://raw.githubusercontent.com/skforecast/skforecast-datasets/main/data/demo_multi_series_exog.csv'
).assign(timestamp=lambda df: pd.to_datetime(df['timestamp']))

# Preview the first rows of both frames, separated by a blank line.
display(series.head())
print("")
display(exog.head())

Unnamed: 0,series_id,timestamp,value
0,id_1000,2016-01-01,1012.500694
1,id_1000,2016-01-02,1158.500099
2,id_1000,2016-01-03,983.000099
3,id_1000,2016-01-04,1675.750496
4,id_1000,2016-01-05,1586.250694





Unnamed: 0,series_id,timestamp,sin_day_of_week,cos_day_of_week,air_temperature,wind_speed
0,id_1000,2016-01-01,-0.433884,-0.900969,6.416639,4.040115
1,id_1000,2016-01-02,-0.974928,-0.222521,6.366474,4.530395
2,id_1000,2016-01-03,-0.781831,0.62349,6.555272,3.273064
3,id_1000,2016-01-04,0.0,1.0,6.704778,4.865404
4,id_1000,2016-01-05,0.781831,0.62349,2.392998,5.228913


In [23]:
# Transform series and exog to dictionaries
# ==============================================================================
# Both helpers receive the long-format frame, the id column, the column to use
# as index and the frequency ('D'). The resulting dictionaries are accessed by
# series id (e.g. 'id_1000') further down in the notebook.
series_dict = series_long_to_dict(
    data=series, series_id='series_id', index='timestamp', values='value', freq='D'
)
exog_dict = exog_long_to_dict(
    data=exog, series_id='series_id', index='timestamp', freq='D'
)



In [24]:
# Drop some exogenous variables for series 'id_1000' and 'id_1003'
# ==============================================================================
# `errors='ignore'` makes the cell safe to re-run: once the columns are gone, a
# second execution is a no-op instead of raising a KeyError.
exog_dict['id_1000'] = exog_dict['id_1000'].drop(
    columns=['air_temperature', 'wind_speed'], errors='ignore'
)
exog_dict['id_1003'] = exog_dict['id_1003'].drop(
    columns=['cos_day_of_week'], errors='ignore'
)

In [25]:
# Partition data in train and test
# ==============================================================================
# Label-based slicing around a single cut-off. NOTE(review): with the daily
# index requested above no timestamp should fall exactly on 23:59, so the train
# and test slices should not share rows — confirm.
end_train = '2016-07-31 23:59:00'

series_dict_train = {name: data.loc[:end_train] for name, data in series_dict.items()}
exog_dict_train = {name: data.loc[:end_train] for name, data in exog_dict.items()}
series_dict_test = {name: data.loc[end_train:] for name, data in series_dict.items()}
exog_dict_test = {name: data.loc[end_train:] for name, data in exog_dict.items()}

In [26]:
# Fit forecaster
# ==============================================================================
# LightGBM regressor wrapped in a multi-series recursive forecaster that uses
# 14 lags plus rolling means over windows of 7 and 14 observations.
regressor = LGBMRegressor(random_state=123, verbose=-1, max_depth=5)
forecaster = ForecasterRecursiveMultiSeries(
    regressor=regressor,
    lags=14,
    window_features=RollingFeatures(stats=['mean', 'mean'], window_sizes=[7, 14]),
    encoding="ordinal",
    dropna_from_series=False,
)

# Train on the training partition only; the bare name on the last line shows
# the fitted forecaster's representation as the cell output.
forecaster.fit(series=series_dict_train, exog=exog_dict_train, suppress_warnings=True)
forecaster

In [27]:
# Backtesting
# ==============================================================================
# A new forecaster is built with the same configuration as the fitted one and
# handed to the backtesting routine.
forecaster = ForecasterRecursiveMultiSeries(
    regressor=regressor,
    lags=14,
    window_features=RollingFeatures(stats=['mean', 'mean'], window_sizes=[7, 14]),
    encoding="ordinal",
    dropna_from_series=False,
)

# Folds of 24 steps, no refit; the initial training size is the length of the
# training partition of series 'id_1000'.
cv = TimeSeriesFold(
    steps=24,
    initial_train_size=len(series_dict_train["id_1000"]),
    refit=False,
    allow_incomplete_fold=True,
)

# `interval=[10]` is passed as a one-element list; the recorded output shows a
# `<level>_p_10` column next to each point prediction.
metrics_levels, backtest_predictions = backtesting_forecaster_multiseries(
    forecaster=forecaster,
    series=series_dict,
    exog=exog_dict,
    cv=cv,
    levels=None,
    metric="mean_absolute_error",
    add_aggregated_metric=True,
    n_jobs="auto",
    verbose=False,
    interval=[10],
    n_boot=25,
    show_progress=True,
    suppress_warnings=True,
)

display(metrics_levels)
print("")
display(backtest_predictions)

  0%|          | 0/7 [00:00<?, ?it/s]

                id_1000  id_1000_p_10      id_1001  id_1001_p_10      id_1003  \
2016-08-01  1453.312971   1233.604392  2849.347882   2462.458516  2706.851726   
2016-08-02  1440.763196   1370.812959  2947.579536   2399.179675  2310.075968   
2016-08-03  1410.151437   1345.454183  2875.847691   2338.693454  1997.329410   
2016-08-04  1348.787299   1300.971925  3160.533645   2064.158062  1923.897012   
2016-08-05  1301.504387   1237.881633  2920.424937   1833.697833  1940.149954   
...                 ...           ...          ...           ...          ...   
2016-12-27  1667.998267   1602.508411  1108.052845    746.768274  2121.157763   
2016-12-28  1579.306861   1438.337836  1111.236661    680.248698  2050.252915   
2016-12-29  1487.230722   1395.944903  1113.581933    522.776632  2063.309008   
2016-12-30  1481.331642   1412.174225  1132.535774    675.424952  2089.261345   
2016-12-31  1393.128313   1330.295752  1106.034061    487.497038  2064.475030   

            id_1003_p_10   

Unnamed: 0,levels,mean_absolute_error
0,id_1000,167.502214
1,id_1001,1103.313887
2,id_1002,
3,id_1003,280.492603
4,id_1004,711.078359
5,average,565.596766
6,weighted_average,572.944127
7,pooling,572.944127





Unnamed: 0,id_1000,id_1000_p_10,id_1001,id_1001_p_10,id_1003,id_1003_p_10,id_1004,id_1004_p_10
2016-08-01,1453.312971,1233.604392,2849.347882,2462.458516,2706.851726,2535.517829,7496.555367,6241.491815
2016-08-02,1440.763196,1370.812959,2947.579536,2399.179675,2310.075968,2068.815084,8685.425990,7819.773586
2016-08-03,1410.151437,1345.454183,2875.847691,2338.693454,1997.329410,1758.755284,8961.631705,8464.439000
2016-08-04,1348.787299,1300.971925,3160.533645,2064.158062,1923.897012,1675.615404,8764.338331,8497.250633
2016-08-05,1301.504387,1237.881633,2920.424937,1833.697833,1940.149954,1681.037077,8694.134833,8387.322709
...,...,...,...,...,...,...,...,...
2016-12-27,1667.998267,1602.508411,1108.052845,746.768274,2121.157763,1768.368500,,
2016-12-28,1579.306861,1438.337836,1111.236661,680.248698,2050.252915,1590.971921,,
2016-12-29,1487.230722,1395.944903,1113.581933,522.776632,2063.309008,1662.584208,,
2016-12-30,1481.331642,1412.174225,1132.535774,675.424952,2089.261345,1732.781191,,


In [28]:
# Probe: `None` has no `_pdf` attribute (recorded output: False).
# NOTE(review): presumably checks how to tell a scipy.stats distribution apart
# from the other values accepted by `interval` — confirm.
hasattr(None, "_pdf")

False

In [29]:
# Probe: a plain string has no `_pdf` attribute (recorded output: False).
hasattr('boot', "_pdf")

False

In [30]:
# Probe: a list such as the `interval=[10]` used in the backtest above has no
# `_pdf` attribute (recorded output: False).
hasattr([10], "_pdf")

False

In [31]:
from skforecast.exceptions import IgnoredArgumentWarning
from skforecast.recursive import ForecasterRecursive
from skforecast.recursive import ForecasterRecursiveMultiSeries
from skforecast.direct import ForecasterDirectMultiVariate
from skforecast.model_selection import backtesting_forecaster_multiseries
from skforecast.model_selection._split import TimeSeriesFold
from skforecast.preprocessing import RollingFeatures
from sklearn.linear_model import Ridge

# Fixtures
from skforecast.model_selection.tests.fixtures_model_selection_multiseries import series
from skforecast.model_selection.tests.fixtures_model_selection_multiseries import custom_metric

In [32]:
import joblib

# Load the multi-series test fixtures
# ==============================================================================
# The fixture location is derived from `path` (the repository root computed in
# the first cell) instead of a machine-specific absolute path, so the cell runs
# on any checkout of the repository.
fixtures_dir = Path(path) / "skforecast" / "model_selection" / "tests"

# NOTE: joblib files are pickle-based; only load fixtures that ship with the
# repository. These assignments rebind `series_dict` and `exog_dict`, replacing
# the dictionaries built from the CSV files earlier in the notebook.
series_dict = joblib.load(fixtures_dir / "fixture_sample_multi_series.joblib")
exog_dict = joblib.load(fixtures_dir / "fixture_sample_multi_series_exog.joblib")

In [33]:
# Backtesting with a scipy distribution passed as `interval`
# ==============================================================================
forecaster = ForecasterRecursiveMultiSeries(
    regressor=LGBMRegressor(n_estimators=30, random_state=123, verbose=-1, max_depth=4),
    lags=[1, 7, 14],
    encoding='ordinal',
    dropna_from_series=False,
    transformer_series=None,
    transformer_exog=StandardScaler(),
)

# NOTE(review): `initial_train_size` is taken from `series_dict_train`, which
# was built before `series_dict` was reloaded from the joblib fixture — confirm
# both sources have the same length for 'id_1000'.
cv = TimeSeriesFold(
    initial_train_size=len(series_dict_train['id_1000']),
    steps=24,
    refit=False,
)

# Two metrics are requested; `interval=norm` passes the scipy.stats normal
# distribution (the recorded output shows `_loc` / `_scale` columns per level).
metrics, predictions = backtesting_forecaster_multiseries(
    forecaster=forecaster,
    series=series_dict,
    exog=exog_dict,
    cv=cv,
    metric=['mean_absolute_error', 'mean_absolute_scaled_error'],
    interval=norm,
    n_boot=25,
    n_jobs='auto',
    verbose=False,
    show_progress=True,
    suppress_warnings=True,
)

  0%|          | 0/7 [00:00<?, ?it/s]

                id_1000  id_1000_loc  id_1000_scale      id_1001  id_1001_loc  \
2016-08-01  1559.691828  1446.935820     171.663434  2934.363292  2738.992298   
2016-08-02  1572.804477  1481.355117      75.017015  3503.747502  2676.919097   
2016-08-03  1537.674947  1453.751766      84.729475  3354.275203  2732.565818   
2016-08-04  1480.694267  1429.975721      98.598613  3537.138899  3099.751220   
2016-08-05  1472.610905  1395.489597      89.077810  3200.844944  2749.721048   
...                 ...          ...            ...          ...          ...   
2016-12-27  1804.827483  1725.149850      86.500613  1345.266413  1165.241384   
2016-12-28  1726.629295  1631.354421      83.605712  1433.532343  1229.613638   
2016-12-29  1622.679349  1544.978673     103.637588  1687.538967  1564.188437   
2016-12-30  1658.431719  1539.075183     140.316399  1797.084928  1545.816578   
2016-12-31  1465.929905  1337.797616     144.814849  1817.367605  1500.298352   

            id_1001_scale  

In [34]:
# Dump the metrics table as a nested {column: {row: value}} dictionary.
metrics.to_dict(orient='dict')

{'levels': {0: 'id_1000',
  1: 'id_1001',
  2: 'id_1002',
  3: 'id_1003',
  4: 'id_1004',
  5: 'average',
  6: 'weighted_average',
  7: 'pooling'},
 'mean_absolute_error': {0: 177.94640447766702,
  1: 1451.3480109896332,
  2: nan,
  3: 277.78113362955673,
  4: 993.6769068120083,
  5: 725.1881139772163,
  6: 724.9604804988818,
  7: 724.960480498882},
 'mean_absolute_scaled_error': {0: 0.8178593233613526,
  1: 4.1364664709651064,
  2: nan,
  3: 1.1323827428361022,
  4: 0.8271748048818786,
  5: 1.72847083551111,
  6: 2.0965105153721213,
  7: 1.760615501057647}}

In [35]:
# First ten rows of the backtest predictions as a raw NumPy array.
predictions.iloc[:10].to_numpy()

array([[1559.69182787, 1446.93581986,  171.66343358, 2934.36329187,
        2738.99229845,  476.13018745, 3392.60955028, 3285.49512256,
         254.33164031, 7097.05447923, 6901.15599599,  930.8659121 ],
       [1572.80447653, 1481.35511731,   75.01701475, 3503.74750241,
        2676.91909653,  595.80407354, 3118.04939083, 2945.42032254,
         434.03240802, 8301.53364485, 8004.38824145,  905.03703795],
       [1537.67494683, 1453.75176554,   84.7294755 , 3354.2752034 ,
        2732.56581839,  678.25751156, 3118.04939083, 2813.94368549,
         637.92012714, 8466.83628992, 8267.68914986,  792.49132562],
       [1480.69426693, 1429.97572138,   98.59861312, 3537.13889916,
        3099.75121966,  655.2848398 , 2687.48648381, 2302.71838396,
         585.67463419, 8652.97166708, 8620.93678941,  943.37266194],
       [1472.61090534, 1395.4895971 ,   89.0778098 , 3200.84494385,
        2749.72104765,  721.7836647 , 1835.99400007, 1938.17003599,
         444.13125476, 8613.37020561, 8767.7

In [36]:
# Column labels of the predictions frame; the recorded output lists, for each
# level, the point prediction followed by `<level>_loc` and `<level>_scale`.
predictions.columns

Index(['id_1000', 'id_1000_loc', 'id_1000_scale', 'id_1001', 'id_1001_loc',
       'id_1001_scale', 'id_1003', 'id_1003_loc', 'id_1003_scale', 'id_1004',
       'id_1004_loc', 'id_1004_scale'],
      dtype='object')

In [37]:
from skforecast.direct import ForecasterDirect
from sklearn.linear_model import LinearRegression

# Check ForecasterDirect.predict with `last_window` given as a DataFrame
# ==============================================================================
forecaster = ForecasterDirect(LinearRegression(), lags=3, steps=5)
forecaster.fit(y=pd.Series(np.arange(50)))

# `last_window` is built once, directly as a single-column DataFrame. The
# earlier Series-only assignment was overwritten before use and has been
# removed.
last_window = pd.Series(
    data=[47, 48, 49],
    index=pd.RangeIndex(start=47, stop=50, step=1),
    name='y',
).to_frame()
results = forecaster.predict(steps=[1, 2, 3, 4], last_window=last_window)

# Expected predictions for the four requested steps: the linear trend
# continued, indexed right after the last window.
expected = pd.Series(
    data=np.array([50., 51., 52., 53.]),
    index=pd.RangeIndex(start=50, stop=54, step=1),
    name='pred',
)

pd.testing.assert_series_equal(results, expected)

In [38]:
# Display the `last_window` object passed to `predict` in the previous cell
# (a single-column DataFrame named 'y' in the recorded output).
last_window

Unnamed: 0,y
47,47
48,48
49,49


In [44]:
# Backtesting ForecasterDirectMultiVariate with a scipy distribution interval
# ==============================================================================
# Here `series` is the fixture imported from the skforecast test package in the
# imports cell above, not the CSV frame loaded at the top of the notebook.
forecaster = ForecasterDirectMultiVariate(
    regressor=Ridge(random_state=123),
    level='l1',
    lags={'l1': 2, 'l2': [1, 3]},
    steps=8,
    transformer_series=None,
)

# The last 20 observations are held out; folds of 5 steps with a gap of 3.
cv = TimeSeriesFold(
    initial_train_size=len(series) - 20,
    steps=5,
    gap=3,
    refit=False,
    fixed_train_size=False,
)

# Column 'l1' renamed to 'exog_1' is reused as the exogenous variable.
metrics_levels, backtest_predictions = backtesting_forecaster_multiseries(
    forecaster=forecaster,
    series=series,
    cv=cv,
    levels='l1',
    metric='mean_absolute_error',
    add_aggregated_metric=False,
    exog=series['l1'].rename('exog_1'),
    interval=norm,
    n_boot=150,
    random_state=123,
    use_in_sample_residuals=True,
    verbose=False,
)

  0%|          | 0/4 [00:00<?, ?it/s]

In [45]:
# Metric table returned by the backtest above (a single row for level 'l1' in
# the recorded output).
metrics_levels

Unnamed: 0,levels,mean_absolute_error
0,l1,0.117919


In [46]:
# Backtest predictions as a raw NumPy array.
# NOTE(review): presumably printed to copy the values into a test — confirm.
backtest_predictions.to_numpy()

array([[0.55880533, 0.55863664, 0.11294946],
       [0.46285725, 0.46532544, 0.11662733],
       [0.35358667, 0.34633493, 0.10823561],
       [0.44404948, 0.43289758, 0.10734709],
       [0.64659616, 0.65154034, 0.10756972],
       [0.70306475, 0.70289606, 0.11294946],
       [0.48677757, 0.48924576, 0.11662733],
       [0.49848981, 0.49123807, 0.10823561],
       [0.31544893, 0.30429703, 0.10734709],
       [0.4450306 , 0.44997478, 0.10756972],
       [0.50164877, 0.50148008, 0.11294946],
       [0.62883248, 0.63130067, 0.11662733],
       [0.33387601, 0.32662427, 0.10823561],
       [0.45961408, 0.44846217, 0.10734709],
       [0.63726975, 0.64221393, 0.10756972],
       [0.54013414, 0.53996545, 0.11294946],
       [0.52550978, 0.52797797, 0.11662733]])

In [48]:
# Column labels of the backtest predictions; the recorded output shows
# 'l1', 'l1_loc' and 'l1_scale'.
backtest_predictions.columns

Index(['l1', 'l1_loc', 'l1_scale'], dtype='object')