In [24]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
path = str(Path.cwd().parent)
print(path)
sys.path.insert(1, path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
c:\Users\Joaquín Amat\Documents\GitHub\skforecast


In [25]:
import re
import pytest
import numpy as np
import pandas as pd
from sklearn.exceptions import NotFittedError
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor

from skforecast.exceptions import IgnoredArgumentWarning
from skforecast.exceptions import UnknownLevelWarning
from skforecast.preprocessing import RollingFeatures
from skforecast.recursive import ForecasterRecursiveMultiSeries

In [32]:
# Fixture: two numeric series of 50 observations each, stored as columns
# '1' and '2' of a DataFrame with the default integer index.
series = pd.DataFrame({
    '1': np.array([
        0.69646919, 0.28613933, 0.22685145, 0.55131477, 0.71946897,
        0.42310646, 0.9807642 , 0.68482974, 0.4809319 , 0.39211752,
        0.34317802, 0.72904971, 0.43857224, 0.0596779 , 0.39804426,
        0.73799541, 0.18249173, 0.17545176, 0.53155137, 0.53182759,
        0.63440096, 0.84943179, 0.72445532, 0.61102351, 0.72244338,
        0.32295891, 0.36178866, 0.22826323, 0.29371405, 0.63097612,
        0.09210494, 0.43370117, 0.43086276, 0.4936851 , 0.42583029,
        0.31226122, 0.42635131, 0.89338916, 0.94416002, 0.50183668,
        0.62395295, 0.1156184 , 0.31728548, 0.41482621, 0.86630916,
        0.25045537, 0.48303426, 0.98555979, 0.51948512, 0.61289453,
    ]),
    '2': np.array([
        0.12062867, 0.8263408 , 0.60306013, 0.54506801, 0.34276383,
        0.30412079, 0.41702221, 0.68130077, 0.87545684, 0.51042234,
        0.66931378, 0.58593655, 0.6249035 , 0.67468905, 0.84234244,
        0.08319499, 0.76368284, 0.24366637, 0.19422296, 0.57245696,
        0.09571252, 0.88532683, 0.62724897, 0.72341636, 0.01612921,
        0.59443188, 0.55678519, 0.15895964, 0.15307052, 0.69552953,
        0.31876643, 0.6919703 , 0.55438325, 0.38895057, 0.92513249,
        0.84167   , 0.35739757, 0.04359146, 0.30476807, 0.39818568,
        0.70495883, 0.99535848, 0.35591487, 0.76254781, 0.59317692,
        0.6917018 , 0.15112745, 0.39887629, 0.2408559 , 0.34345601,
    ]),
})

def expected_df_to_long_format(
    df: "pd.DataFrame | dict[str, pd.DataFrame]", method: str = "predict"
) -> pd.DataFrame:
    """
    Convert expected predictions stored per level (wide format) to long format.

    Parameters
    ----------
    df : pandas DataFrame, dict
        Expected values in wide format. Its index must be unnamed, because
        the conversion relies on `reset_index()` producing an `'index'` column.

        - `method='predict'`: DataFrame with one column per level.
        - `method='bootstrapping'`: dict mapping each level name to a
          DataFrame of bootstrapped predictions.
        - `method='interval'`: DataFrame with columns `<level>`,
          `<level>_lower_bound` and `<level>_upper_bound`.
    method : str, default 'predict'
        Layout of `df`. One of 'predict', 'bootstrapping' or 'interval'.

    Returns
    -------
    pandas DataFrame
        Long-format frame sorted by index and level, with a `level` column
        followed by the value columns: `pred` for 'predict'; the original
        bootstrapping columns for 'bootstrapping'; `pred`, `lower_bound`
        and `upper_bound` for 'interval'.

    Raises
    ------
    ValueError
        If `method` is not one of the supported values.
    """

    valid_methods = ("predict", "bootstrapping", "interval")
    if method not in valid_methods:
        # Previously an unknown method returned the input unchanged, which
        # hides typos in the calling test.
        raise ValueError(
            f"`method` must be one of {valid_methods}. Got {method!r}."
        )

    if method == "predict":
        df = (
            df.melt(var_name="level", value_name="pred", ignore_index=False)
            .reset_index()
            .sort_values(by=["index", "level"])
            .set_index("index")
            .rename_axis(None, axis=0)
        )
    elif method == "bootstrapping":
        # Stack the per-level frames, tagging each row with its level name.
        df = (
            pd.concat([value.assign(level=key) for key, value in df.items()])
            .reset_index()
            .sort_values(by=["index", "level"])
            .set_index("index")
            .rename_axis(None, axis=0)
        )
        # Put `level` first, keeping the remaining columns in their order.
        df = df[
            ["level"] + [col for col in df.columns if col not in ["level", "index"]]
        ]
        # Drop the frequency of a DatetimeIndex if one is set.
        if isinstance(df.index, pd.DatetimeIndex) and df.index.freq is not None:
            df.index.freq = None
    else:  # method == "interval"
        df = df.melt(var_name="level", value_name="pred", ignore_index=False).reset_index()
        # Strip the bound suffix only at the END of the column name, matching
        # the anchored pattern used below to detect the bound type. An
        # unanchored pattern would also alter level names that merely contain
        # '_lower_bound' / '_upper_bound' somewhere in the middle.
        df['level_aux'] = df['level'].str.replace(
            r'(?:_lower_bound|_upper_bound)$', '', regex=True
        )
        # Columns without a bound suffix hold the point prediction.
        df['bound_type'] = df['level'].str.extract(r'(lower_bound|upper_bound)$', expand=False).fillna('pred')

        df = (
            df.pivot_table(index=["index", "level_aux"], columns="bound_type", values="pred")
            .reset_index()
            .sort_values(by=["index", "level_aux"])
            .set_index("index")
            .rename_axis(None, axis=0)
            .rename_axis(None, axis=1)
            .rename(columns={"level_aux": "level"})
            [['level', 'pred', 'lower_bound', 'upper_bound']]
        )

    return df

In [33]:
# Fit on `series` (levels '1' and '2'), then build a last window that also
# carries a level '3' (a copy of level '1') that is absent from `series`.
forecaster = ForecasterRecursiveMultiSeries(
    LGBMRegressor(verbose=-1), lags=3, encoding='ordinal'
)
forecaster.fit(series=series)
last_window = pd.DataFrame(forecaster.last_window_)
last_window['3'] = last_window['1']

# Two distinct UnknownLevelWarning messages are checked, each against its own
# `predict_bootstrapping` call made with identical arguments.
unknown_level_messages = (
    (
        "`levels` {'3'} were not included in training. "
        "Unknown levels are encoded as NaN, which may cause the "
        "prediction to fail if the regressor does not accept NaN values."
    ),
    (
        "`levels` {'3'} are not present in `forecaster.in_sample_residuals_`, "
        "most likely because they were not present in the training data. "
        "A random sample of the residuals from other levels will be used. "
        "This can lead to inaccurate intervals for the unknown levels."
    ),
)
for raw_message in unknown_level_messages:
    warn_msg = re.escape(raw_message)
    with pytest.warns(UnknownLevelWarning, match = warn_msg):
        results = forecaster.predict_bootstrapping(
                        steps                   = 1,
                        levels                  = ['1', '2', '3'],
                        last_window             = last_window,
                        n_boot                  = 4,
                        use_in_sample_residuals = True
                    )

# Expected bootstrapped predictions: one single-row frame per level.
boot_columns = [f"pred_boot_{i}" for i in range(4)]
expected_values = {
    '1': [[0.60487599, 0.22612695, 0.07322577, 0.45340699]],
    '2': [[0.69343437, 0.67766497, 0.69444735, 0.64415095]],
    '3': [[0.26291992, 0.45547833, 0.22612695, 0.51589959]],
}
expected = {
    level: pd.DataFrame(
               data    = np.array(values),
               columns = boot_columns,
               index   = pd.RangeIndex(50, 51)
           )
    for level, values in expected_values.items()
}
expected = expected_df_to_long_format(expected, method='bootstrapping')

pd.testing.assert_frame_equal(results, expected)

dict_keys(['1', '2', '_unknown_level', '3'])
dict_keys(['1', '2', '_unknown_level'])
dict_keys(['1', '2', '_unknown_level', '3'])
dict_keys(['1', '2', '_unknown_level'])


