In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Put the parent of the working directory on the import path (the saved output
# of this cell shows it resolving to the repository checkout).
path = str(Path.cwd().parent)
print(path)
# Insert only when slot 1 does not already hold this path, so re-running the
# cell does not keep stacking duplicate entries onto sys.path. Import priority
# is the same as with an unconditional insert.
if sys.path[1:2] != [path]:
    sys.path.insert(1, path)

c:\Users\Joaquín Amat\Documents\GitHub\skforecast


In [2]:
import re
import pytest
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from skforecast.recursive import ForecasterRecursiveMultiSeries

In [3]:
def custom_weights(index):  # pragma: no cover
    """
    Weight function that zeroes out a fixed date window.

    Parameters
    ----------
    index : datetime-like index
        Dates to weight; compared element-wise against date strings
        (presumably a pandas DatetimeIndex).

    Returns
    -------
    numpy ndarray
        0 for dates from '2022-01-08' to '2022-01-10' (both inclusive),
        1 for every other date.
    """
    on_or_after_start = index >= "2022-01-08"
    on_or_before_end = index <= "2022-01-10"

    return np.where(on_or_after_start & on_or_before_end, 0, 1)


def custom_weights_2(index):  # pragma: no cover
    """
    Weight function that down-weights a fixed date window.

    Parameters
    ----------
    index : datetime-like index
        Dates to weight; compared element-wise against date strings
        (presumably a pandas DatetimeIndex).

    Returns
    -------
    numpy ndarray
        2 for dates from '2022-01-11' to '2022-01-13' (both inclusive),
        3 for every other date.
    """
    inside_window = (index >= "2022-01-11") & (index <= "2022-01-13")

    return np.where(inside_window, 2, 3)


def custom_weights_nan(index):  # pragma: no cover
    """
    Weight function that yields missing values inside a fixed date window.

    Parameters
    ----------
    index : datetime-like index
        Dates to weight; compared element-wise against date strings
        (presumably a pandas DatetimeIndex).

    Returns
    -------
    numpy ndarray
        np.nan for dates from '2022-01-08' to '2022-01-10' (both inclusive),
        1 for every other date.
    """
    window_start, window_end = "2022-01-08", "2022-01-10"
    inside_window = (index >= window_start) & (index <= window_end)

    return np.where(inside_window, np.nan, 1)


def custom_weights_negative(index):  # pragma: no cover
    """
    Weight function that yields negative weights inside a fixed date window.

    Parameters
    ----------
    index : datetime-like index
        Dates to weight; compared element-wise against date strings
        (presumably a pandas DatetimeIndex).

    Returns
    -------
    numpy ndarray
        -1 for dates from '2022-01-08' to '2022-01-10' (both inclusive),
        1 for every other date.
    """
    not_before_start = index >= "2022-01-08"
    not_after_end = index <= "2022-01-10"
    inside_window = not_before_start & not_after_end

    return np.where(inside_window, -1, 1)


# Two daily float series covering 2022-01-04 .. 2022-01-13, written column by column.
series = pd.DataFrame(
    {
        "series_1": [
            0.12362923, 0.65138268, 0.58142898, 0.72969992, 0.97790567,
            0.56924731, 0.85369084, 0.75425194, 0.08167939, 0.00249297,
        ],
        "series_2": [
            0.51328688, 0.11599708, 0.72350895, 0.10305721, 0.20581485,
            0.41262027, 0.82107767, 0.0107816, 0.94951918, 0.55583355,
        ],
    },
    index=pd.DatetimeIndex(
        [
            "2022-01-04", "2022-01-05", "2022-01-06", "2022-01-07", "2022-01-08",
            "2022-01-09", "2022-01-10", "2022-01-11", "2022-01-12", "2022-01-13",
        ],
        dtype="datetime64[ns]",
        freq="D",
    ),
)

# Training matrix with three lag columns plus one indicator column per series.
X_train_onehot = pd.DataFrame(
    np.array(
        [
            # Rows flagged series_1 = 1.0
            [0.58142898, 0.65138268, 0.12362923, 1.0, 0.0],
            [0.72969992, 0.58142898, 0.65138268, 1.0, 0.0],
            [0.97790567, 0.72969992, 0.58142898, 1.0, 0.0],
            [0.56924731, 0.97790567, 0.72969992, 1.0, 0.0],
            [0.85369084, 0.56924731, 0.97790567, 1.0, 0.0],
            [0.75425194, 0.85369084, 0.56924731, 1.0, 0.0],
            [0.08167939, 0.75425194, 0.85369084, 1.0, 0.0],
            # Rows flagged series_2 = 1.0
            [0.72350895, 0.11599708, 0.51328688, 0.0, 1.0],
            [0.10305721, 0.72350895, 0.11599708, 0.0, 1.0],
            [0.20581485, 0.10305721, 0.72350895, 0.0, 1.0],
            [0.41262027, 0.20581485, 0.10305721, 0.0, 1.0],
            [0.82107767, 0.41262027, 0.20581485, 0.0, 1.0],
            [0.0107816, 0.82107767, 0.41262027, 0.0, 1.0],
            [0.94951918, 0.0107816, 0.82107767, 0.0, 1.0],
        ]
    ),
    index=pd.DatetimeIndex(
        # The same seven dates, once for each series block.
        [
            "2022-01-07", "2022-01-08", "2022-01-09", "2022-01-10",
            "2022-01-11", "2022-01-12", "2022-01-13",
        ] * 2,
        dtype="datetime64[ns]",
        freq=None,
    ),
    columns=["lag_1", "lag_2", "lag_3", "series_1", "series_2"],
)

# Training matrix with three lag columns plus a numeric level code
# (0.0 for the first seven rows, 1.0 for the last seven).
X_train_ordinal = pd.DataFrame(
    np.array(
        [
            # Level code 0.0
            [0.58142898, 0.65138268, 0.12362923, 0.0],
            [0.72969992, 0.58142898, 0.65138268, 0.0],
            [0.97790567, 0.72969992, 0.58142898, 0.0],
            [0.56924731, 0.97790567, 0.72969992, 0.0],
            [0.85369084, 0.56924731, 0.97790567, 0.0],
            [0.75425194, 0.85369084, 0.56924731, 0.0],
            [0.08167939, 0.75425194, 0.85369084, 0.0],
            # Level code 1.0
            [0.72350895, 0.11599708, 0.51328688, 1.0],
            [0.10305721, 0.72350895, 0.11599708, 1.0],
            [0.20581485, 0.10305721, 0.72350895, 1.0],
            [0.41262027, 0.20581485, 0.10305721, 1.0],
            [0.82107767, 0.41262027, 0.20581485, 1.0],
            [0.0107816, 0.82107767, 0.41262027, 1.0],
            [0.94951918, 0.0107816, 0.82107767, 1.0],
        ]
    ),
    index=pd.DatetimeIndex(
        # The same seven dates, once for each level block.
        [
            "2022-01-07", "2022-01-08", "2022-01-09", "2022-01-10",
            "2022-01-11", "2022-01-12", "2022-01-13",
        ] * 2,
        dtype="datetime64[ns]",
        freq=None,
    ),
    columns=["lag_1", "lag_2", "lag_3", "_level_skforecast"],
)

# Independent copy whose level column is categorical instead of float.
X_train_ordinal_category = X_train_ordinal.astype({"_level_skforecast": "category"})

# One-hot training matrix where the series_2 block is shorter:
# seven rows flagged series_1 and only four rows flagged series_2.
X_train_onehot_diferent_length = pd.DataFrame(
    np.array(
        [
            # Rows flagged series_1 = 1.0 (2022-01-07 .. 2022-01-13)
            [0.58142898, 0.65138268, 0.12362923, 1.0, 0.0],
            [0.72969992, 0.58142898, 0.65138268, 1.0, 0.0],
            [0.97790567, 0.72969992, 0.58142898, 1.0, 0.0],
            [0.56924731, 0.97790567, 0.72969992, 1.0, 0.0],
            [0.85369084, 0.56924731, 0.97790567, 1.0, 0.0],
            [0.75425194, 0.85369084, 0.56924731, 1.0, 0.0],
            [0.08167939, 0.75425194, 0.85369084, 1.0, 0.0],
            # Rows flagged series_2 = 1.0 (2022-01-10 .. 2022-01-13)
            [0.41262027, 0.20581485, 0.10305721, 0.0, 1.0],
            [0.82107767, 0.41262027, 0.20581485, 0.0, 1.0],
            [0.0107816, 0.82107767, 0.41262027, 0.0, 1.0],
            [0.94951918, 0.0107816, 0.82107767, 0.0, 1.0],
        ]
    ),
    index=pd.DatetimeIndex(
        [
            "2022-01-07", "2022-01-08", "2022-01-09", "2022-01-10",
            "2022-01-11", "2022-01-12", "2022-01-13",
        ]
        + ["2022-01-10", "2022-01-11", "2022-01-12", "2022-01-13"],
        dtype="datetime64[ns]",
        freq=None,
    ),
    columns=["lag_1", "lag_2", "lag_3", "series_1", "series_2"],
)

# Ordinal training matrix where the second level block is shorter:
# seven rows with level code 0.0 and only four rows with level code 1.0.
X_train_ordinal_diferent_length = pd.DataFrame(
    np.array(
        [
            # Level code 0.0 (2022-01-07 .. 2022-01-13)
            [0.58142898, 0.65138268, 0.12362923, 0.0],
            [0.72969992, 0.58142898, 0.65138268, 0.0],
            [0.97790567, 0.72969992, 0.58142898, 0.0],
            [0.56924731, 0.97790567, 0.72969992, 0.0],
            [0.85369084, 0.56924731, 0.97790567, 0.0],
            [0.75425194, 0.85369084, 0.56924731, 0.0],
            [0.08167939, 0.75425194, 0.85369084, 0.0],
            # Level code 1.0 (2022-01-10 .. 2022-01-13)
            [0.41262027, 0.20581485, 0.10305721, 1.0],
            [0.82107767, 0.41262027, 0.20581485, 1.0],
            [0.0107816, 0.82107767, 0.41262027, 1.0],
            [0.94951918, 0.0107816, 0.82107767, 1.0],
        ]
    ),
    index=pd.DatetimeIndex(
        [
            "2022-01-07", "2022-01-08", "2022-01-09", "2022-01-10",
            "2022-01-11", "2022-01-12", "2022-01-13",
        ]
        + ["2022-01-10", "2022-01-11", "2022-01-12", "2022-01-13"],
        dtype="datetime64[ns]",
        freq=None,
    ),
    columns=["lag_1", "lag_2", "lag_3", "_level_skforecast"],
)

# Independent copy whose level column is categorical instead of float.
X_train_ordinal_category_diferent_length = X_train_ordinal_diferent_length.astype(
    {"_level_skforecast": "category"}
)

In [4]:
@pytest.mark.parametrize(
    "weight_func, expected",
    [
        (
            # Only series_1 has a weight function; the expected values for the
            # four series_2 rows are all 1.
            {"series_1": custom_weights},
            np.array([1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]),
        ),
        (
            # Only series_2 has a weight function; the expected values for the
            # seven series_1 rows are all 1.
            {"series_2": custom_weights_2},
            np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 2.0, 2.0, 2.0]),
        ),
        (
            {"series_1": custom_weights, "series_2": custom_weights_2},
            np.array([1, 0, 0, 0, 1, 1, 1, 3, 2, 2, 2]),
        ),
    ],
    # Explicit ids: a callable would be applied to every argvalue and embed
    # function reprs (with memory addresses) and raw arrays in the test id,
    # which makes the ids unstable between runs and hard to select with -k.
    ids=[
        "weight_func: series_1",
        "weight_func: series_2",
        "weight_func: series_1 and series_2",
    ],
)
def test_create_sample_weights_output_using_weight_func_dict_different_series_lengths(
    weight_func, expected
):
    """
    Test `sample_weights` creation using `weight_func` with series of different lengths.

    `X_train_ordinal_diferent_length` holds seven rows with level code 0
    followed by four rows with level code 1, so each `expected` array has
    eleven entries in that order.

    Parameters
    ----------
    weight_func : dict
        Maps a series name to the function that produces its weights.
    expected : numpy ndarray
        Sample weights that `create_sample_weights` must return.
    """
    forecaster = ForecasterRecursiveMultiSeries(
        regressor=LinearRegression(),
        lags=3,
        encoding="ordinal",
        transformer_series=StandardScaler(),
        weight_func=weight_func,
    )
    # The forecaster is never fitted here, so the level -> code mapping is set
    # by hand to match the codes stored in the `_level_skforecast` column.
    forecaster.encoding_mapping_ = {"series_1": 0, "series_2": 1}
    results = forecaster.create_sample_weights(
        series_names_in_=["series_1", "series_2"],
        X_train=X_train_ordinal_diferent_length,
    )

    assert np.array_equal(results, expected)

In [5]:
# Two identical integer columns holding 0..9.
# NOTE(review): this rebinds `series`, shadowing the datetime-indexed fixture defined earlier.
series = pd.DataFrame(
    {name: pd.Series(np.arange(10)) for name in ('l1', 'l2')}
)

In [6]:
# Manual cross-check of `set_out_sample_residuals`: fit a forecaster, store
# out-of-sample residuals through the API, then rebuild the same residuals by
# hand using the forecaster's fitted transformers and differentiators.
#
# NOTE(review): the output saved with this cell is
# `ValueError: zero-dimensional arrays cannot be concatenated`; confirm which
# call raises it before relying on anything below.

# Two 20-day daily training series.
series_train = {
    'l1': pd.Series(
        np.array([
            -1.42382504, 1.26372846, -0.87066174, -0.25917323, -0.07534331,
            -0.74088465, -1.3677927, 0.6488928, 0.36105811, -1.95286306,
            2.34740965, 0.96849691, -0.75938718, 0.90219827, -0.46695317,
            -0.06068952, 0.78884434, -1.25666813, 0.57585751, 1.39897899,
        ]),
        index=pd.date_range(start='1-1-2018', periods=20, freq='D'),
    ),
    'l2': pd.Series(
        np.array([
            1.32229806, -0.29969852, 0.90291934, -1.62158273, -0.15818926,
            0.44948393, -1.34360107, -0.08168759, 1.72473993, 2.61815943,
            0.77736134, 0.8286332, -0.95898831, -1.20938829, -1.41229201,
            0.54154683, 0.7519394, -0.65876032, -1.22867499, 0.25755777,
        ]),
        index=pd.date_range(start='1-1-2018', periods=20, freq='D'),
    ),
}
# Five actual and five predicted values per level.
y_true = {
    'l1': np.array([0.31290292, -0.13081169, 1.26998312, -0.09296246, -0.06615089]),
    'l2': np.array([-1.10821447, 0.13595685, 1.34707776, 0.06114402, 0.0709146]),
}
y_pred = {
    'l1': np.array([0.43365454, 0.27748366, 0.53025239, 0.53672097, 0.61835001]),
    'l2': np.array([-0.79501746, 0.30003095, -1.60270159, 0.26679883, -1.26162378]),
}

forecaster = ForecasterRecursiveMultiSeries(
    regressor=LinearRegression(),
    lags=5,
    transformer_series=StandardScaler(),
    differentiation=1,
    encoding=None,
)
forecaster.fit(series=series_train)
forecaster.set_out_sample_residuals(y_true=y_true, y_pred=y_pred)

# Pool the raw actual values ('l2' first, then 'l1') under '_unknown_level'.
y_true['_unknown_level'] = np.concatenate([y_true['l2'], y_true['l1']])

# Scale with the fitted per-level transformers, in the order:
# y_true l1, y_true l2, y_pred l1, y_pred l2, then the pooled y_true.
# NOTE(review): a later inspection cell shows `transformer_series_` holding only
# '_unknown_level'; if that also holds for this forecaster, the 'l1'/'l2'
# lookups below would raise KeyError — confirm.
y_true.update({
    level: forecaster.transformer_series_[level].transform(y_true[level].reshape(-1, 1)).flatten()
    for level in ('l1', 'l2')
})
y_pred.update({
    level: forecaster.transformer_series_[level].transform(y_pred[level].reshape(-1, 1)).flatten()
    for level in ('l1', 'l2')
})
y_true['_unknown_level'] = (
    forecaster.transformer_series_['_unknown_level']
    .transform(y_true['_unknown_level'].reshape(-1, 1))
    .flatten()
)

# Differentiate in the same order and drop the leading `differentiation_max` entries.
y_true.update({
    level: forecaster.differentiator_[level].transform(y_true[level])[forecaster.differentiation_max:]
    for level in ('l1', 'l2')
})
y_pred.update({
    level: forecaster.differentiator_[level].transform(y_pred[level])[forecaster.differentiation_max:]
    for level in ('l1', 'l2')
})
y_true['_unknown_level'] = (
    forecaster.differentiator_['_unknown_level']
    .transform(y_true['_unknown_level'])[forecaster.differentiation_max:]
)

# Residual = actual - predicted. The pooled residuals pair the pooled actuals
# with the predictions concatenated in the same 'l2', 'l1' order.
# NOTE(review): the pooled actuals are sliced once as a single array, while the
# predictions are sliced per level and then concatenated; check that both sides
# end up with the same length.
residuals = {
    'l1': y_true['l1'] - y_pred['l1'],
    'l2': y_true['l2'] - y_pred['l2'],
    '_unknown_level': y_true['_unknown_level'] - np.concatenate([y_pred['l2'], y_pred['l1']]),
}



ValueError: zero-dimensional arrays cannot be concatenated

In [47]:
# Each manually rebuilt residual array must match the one stored by the forecaster.
for key, expected_residuals in residuals.items():
    print(f"key: {key}")
    np.testing.assert_array_almost_equal(
        expected_residuals, forecaster.out_sample_residuals_[key]
    )


key: l1
key: l2


In [41]:
# Pool the per-level residuals ('l2' first, then 'l1') and show them in ascending order.
np.sort(
    np.concatenate([y_true[level] - y_pred[level] for level in ('l2', 'l1')])
)

array([-0.6845009 , -0.62968343, -0.40829535, -0.31319701, -0.20565481,
       -0.1640741 , -0.12075162,  0.73973073,  1.33253838,  2.94977935])

In [42]:
# Element-wise residuals (actual minus predicted) for level 'l1'.
y_true['l1'] - y_pred['l1']

array([-0.12075162, -0.40829535,  0.73973073, -0.62968343, -0.6845009 ])

In [43]:
# Element-wise residuals (actual minus predicted) for level 'l2'.
y_true['l2'] - y_pred['l2']

array([-0.31319701, -0.1640741 ,  2.94977935, -0.20565481,  1.33253838])

In [44]:
# Show the manually built pooled residuals; in the saved output these are the
# 'l2' residuals followed by the 'l1' residuals.
residuals['_unknown_level']

array([-0.31319701, -0.1640741 ,  2.94977935, -0.20565481,  1.33253838,
       -0.12075162, -0.40829535,  0.73973073, -0.62968343, -0.6845009 ])

In [12]:
# Minimal reproduction: fit the forecaster, store out-of-sample residuals and
# display what was stored.
# NOTE(review): the output saved with this cell is
# `ValueError: zero-dimensional arrays cannot be concatenated` — confirm which call raises it.

# Two 20-day daily training series.
series_train = {
    "l1": pd.Series(
        np.array(
            [
                -1.42382504, 1.26372846, -0.87066174, -0.25917323, -0.07534331,
                -0.74088465, -1.3677927, 0.6488928, 0.36105811, -1.95286306,
                2.34740965, 0.96849691, -0.75938718, 0.90219827, -0.46695317,
                -0.06068952, 0.78884434, -1.25666813, 0.57585751, 1.39897899,
            ]
        ),
        index=pd.date_range(start="1-1-2018", periods=20, freq="D"),
    ),
    "l2": pd.Series(
        np.array(
            [
                1.32229806, -0.29969852, 0.90291934, -1.62158273, -0.15818926,
                0.44948393, -1.34360107, -0.08168759, 1.72473993, 2.61815943,
                0.77736134, 0.8286332, -0.95898831, -1.20938829, -1.41229201,
                0.54154683, 0.7519394, -0.65876032, -1.22867499, 0.25755777,
            ]
        ),
        index=pd.date_range(start="1-1-2018", periods=20, freq="D"),
    ),
}
# Five actual and five predicted values per level.
y_true = {
    "l1": np.array([0.31290292, -0.13081169, 1.26998312, -0.09296246, -0.06615089]),
    "l2": np.array([-1.10821447, 0.13595685, 1.34707776, 0.06114402, 0.0709146]),
}
y_pred = {
    "l1": np.array([0.43365454, 0.27748366, 0.53025239, 0.53672097, 0.61835001]),
    "l2": np.array([-0.79501746, 0.30003095, -1.60270159, 0.26679883, -1.26162378]),
}

forecaster = ForecasterRecursiveMultiSeries(
    regressor=LinearRegression(),
    lags=5,
    transformer_series=StandardScaler(),
    differentiation=1,
    encoding=None,
)
forecaster.fit(series=series_train)
forecaster.set_out_sample_residuals(y_true=y_true, y_pred=y_pred)

# Last expression of the cell: display the stored residuals.
forecaster.out_sample_residuals_



ValueError: zero-dimensional arrays cannot be concatenated

In [8]:
# Inspect the forecaster's `binner` attribute; the saved output is a dict with a
# single '_unknown_level' entry.
forecaster.binner

{'_unknown_level': <skforecast.preprocessing.preprocessing.QuantileBinner at 0x1aa35ae5790>}

In [9]:
# Inspect the fitted differentiators; the saved output has one entry each for
# 'l1', 'l2' and '_unknown_level'.
forecaster.differentiator_

{'l1': TimeSeriesDifferentiator(order=1, window_size=6),
 'l2': TimeSeriesDifferentiator(order=1, window_size=6),
 '_unknown_level': TimeSeriesDifferentiator(order=1, window_size=6)}

In [10]:
# Inspect the fitted series transformers; the saved output holds only an
# '_unknown_level' entry.
forecaster.transformer_series_

{'_unknown_level': StandardScaler()}

In [22]:
# Inspect the series names recorded by the forecaster.
forecaster.series_names_in_

['l1', 'l2']

In [11]:
# NOTE(review): binds the notebook-level name `self` to the forecaster — this
# looks like a leftover from running method-body code while debugging; confirm
# before keeping it.
self = forecaster
# Join the residual arrays of every stored level into one 1-D array.
np.concatenate(list(self.out_sample_residuals_.values()))

array([-0.25863534,  1.03260854, -1.23173923, -0.04930636,  0.33397462,
        0.13413074,  2.8008002 , -2.83820057,  1.38354995,  0.13413074,
        2.8008002 , -2.83820057,  1.38354995, -0.25863534,  1.03260854,
       -1.23173923, -0.04930636])