In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
path = str(Path.cwd().parent)
print(path)
sys.path.insert(1, path)

import numpy as np
import pandas as pd

c:\Users\jaesc2\GitHub\skforecast


In [2]:
# Libraries
# ==============================================================================
import pandas as pd
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

from skforecast.datasets import fetch_dataset
from skforecast.preprocessing import RollingFeatures
from skforecast.recursive import ForecasterRecursive, ForecasterRecursiveMultiSeries

import pytest
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from skforecast.preprocessing import RollingFeatures

In [3]:
# Download data
# ==============================================================================
data = fetch_dataset(
    name="h2o", raw=True, kwargs_read_csv={"names": ["y", "datetime"], "header": 0}
)

# Data preprocessing
# ==============================================================================
data['datetime'] = pd.to_datetime(data['datetime'], format='%Y-%m-%d')
data = data.set_index('datetime')
data = data.asfreq('MS')
data = data['y']
data = data.sort_index()

# Split train-test
# ==============================================================================
steps = 36
data_train = data[:-steps]
data_test  = data[-steps:]


h2o
---
Monthly expenditure ($AUD) on corticosteroid drugs that the Australian health
system had between 1991 and 2008.
Hyndman R (2023). fpp3: Data for Forecasting: Principles and Practice(3rd
Edition). http://pkg.robjhyndman.com/fpp3package/,https://github.com/robjhyndman
/fpp3package, http://OTexts.com/fpp3.
Shape of the dataset: (204, 2)


In [5]:
# Create and fit forecaster
# ==============================================================================
forecaster = ForecasterRecursive(
                 regressor       = LGBMRegressor(random_state=123, verbose=-1),
                 lags            = 15,
                 window_features = RollingFeatures(stats=['mean'], window_sizes=10)
             )

forecaster.fit(y=data_train)
forecaster.training_range_

DatetimeIndex(['1991-07-01', '2005-06-01'], dtype='datetime64[ns]', name='datetime', freq=None)

In [24]:
(
X_train,
y_train,
_,
_,
_,
X_train_features_names_out_,
*_
) = forecaster._create_train_X_y(y=data_train)

X_train_features_names_out_

['lag_1',
 'lag_2',
 'lag_3',
 'lag_4',
 'lag_5',
 'lag_6',
 'lag_7',
 'lag_8',
 'lag_9',
 'lag_10',
 'lag_11',
 'lag_12',
 'lag_13',
 'lag_14',
 'lag_15',
 'roll_mean_10']

In [27]:
(
X_train,
y_train,
*_,
a,
_
) = forecaster._create_train_X_y(y=data_train)

X_train_features_names_out_

['lag_1',
 'lag_2',
 'lag_3',
 'lag_4',
 'lag_5',
 'lag_6',
 'lag_7',
 'lag_8',
 'lag_9',
 'lag_10',
 'lag_11',
 'lag_12',
 'lag_13',
 'lag_14',
 'lag_15',
 'roll_mean_10']

In [9]:
from skforecast.utils import preprocess_y

expected_training_range = forecaster.training_range_
training_range = preprocess_y(y=data_train, return_values=False)[1][[0, -1]]

if not expected_training_range.equals(training_range):
    raise AssertionError

In [16]:
expected_training_range[0]

Timestamp('1991-07-01 00:00:00')

In [12]:
preprocess_y(y=data_train, return_values=False)[1][[0, -1]]

DatetimeIndex(['1991-07-01', '2005-06-01'], dtype='datetime64[ns]', name='datetime', freq=None)

In [21]:
predictions = np.array(
    [[1, 10, 100],
    [2, 20, 200],
    [3, 30, 300]], dtype=float
)

lower_bound = np.array(
    [[0.1, 1, 10],
    [0.2, 2, 20],
    [0.3, 3, 30]], dtype=float
)

upper_bound = np.array(
    [[1.1, 11, 101],
    [2.2, 22, 202],
    [3.3, 33, 303]], dtype=float
)

predictions = np.array([predictions, lower_bound, upper_bound], dtype=float).swapaxes(0, 2)
predictions

array([[[1.00e+00, 1.00e-01, 1.10e+00],
        [2.00e+00, 2.00e-01, 2.20e+00],
        [3.00e+00, 3.00e-01, 3.30e+00]],

       [[1.00e+01, 1.00e+00, 1.10e+01],
        [2.00e+01, 2.00e+00, 2.20e+01],
        [3.00e+01, 3.00e+00, 3.30e+01]],

       [[1.00e+02, 1.00e+01, 1.01e+02],
        [2.00e+02, 2.00e+01, 2.02e+02],
        [3.00e+02, 3.00e+01, 3.03e+02]]])

In [23]:
for i in range(3):
    print(predictions[i, :, :])

[[1.  0.1 1.1]
 [2.  0.2 2.2]
 [3.  0.3 3.3]]
[[10.  1. 11.]
 [20.  2. 22.]
 [30.  3. 33.]]
[[100.  10. 101.]
 [200.  20. 202.]
 [300.  30. 303.]]


In [10]:
rng = np.random.default_rng(12345)
series = pd.DataFrame(
    {"1": rng.normal(10, 1, 15_000), "2": rng.normal(10, 1, 15_000)},
    index=pd.date_range(start="2000-01-01", periods=15_000, freq="h"),
)

forecaster_1 = ForecasterRecursiveMultiSeries(
    LinearRegression(), lags=3, encoding='ordinal', binner_kwargs={"n_bins": 3}
)
forecaster_1.fit(series=series, store_in_sample_residuals=True)

In [11]:
forecaster_1.binner_intervals_

{'1': {0: (9.963096392366815, 10.005505419480953),
  1: (10.005505419480953, 10.015571584118915),
  2: (10.015571584118915, 10.058211712775805)},
 '2': {0: (9.940129952794797, 9.993475254986176),
  1: (9.993475254986176, 10.003497948644386),
  2: (10.003497948644386, 10.046188138170931)},
 '_unknown_level': {0: (9.940129952794797, 9.998756219121827),
  1: (9.998756219121827, 10.010351414116016),
  2: (10.010351414116016, 10.058211712775805)}}

In [19]:
rng = np.random.default_rng(1894)
series = pd.DataFrame({"1": rng.normal(10, 5, 20), "2": rng.normal(10, 5, 20)})

rolling = RollingFeatures(stats=["ratio_min_max", "median"], window_sizes=4)
forecaster = ForecasterRecursiveMultiSeries(
    LinearRegression(), lags=3, encoding=None, window_features=rolling, binner_kwargs={"n_bins": 3}
)
forecaster.fit(series=series, store_in_sample_residuals=True)

In [20]:
forecaster.in_sample_residuals_

{'_unknown_level': array([-2.05820541, -5.47429818, -7.31524702, -4.29526858,  0.66810725,
        -5.37727919,  1.94669051,  7.67911837, -0.63572989,  1.79063442,
        -0.48881591,  0.44689235,  4.0759107 , -6.74143431,  2.16887587,
         5.7738583 ,  1.76518089, -6.32949235, -1.56689305,  1.98201019,
         3.56844034,  3.14094834,  7.85637887, -4.32241071,  3.27047182,
        -2.01737457,  1.26390611,  6.83575019, -2.02500966, -1.25537261,
        -2.00844951, -2.32189357])}

In [21]:
forecaster.binner_intervals_

{'_unknown_level': {0: (6.81635531697439, 9.303028105789933),
  1: (9.303028105789933, 10.690051434777626),
  2: (10.690051434777626, 12.789680311772173)}}

In [22]:
rng = np.random.default_rng(1894)
series = pd.DataFrame({"1": rng.normal(10, 5, 20), "2": rng.normal(10, 5, 20)})

forecaster = ForecasterRecursiveMultiSeries(LinearRegression(), lags=3)
forecaster._probabilistic_mode = False
forecaster.fit(series=series, store_in_sample_residuals=False)

In [23]:
forecaster.in_sample_residuals_

{'1': None, '2': None, '_unknown_level': None}

In [24]:
forecaster.in_sample_residuals_by_bin_

{'1': None, '2': None, '_unknown_level': None}

In [25]:
forecaster.binner_intervals_

{}

In [26]:
forecaster = ForecasterRecursiveMultiSeries(
        regressor=LinearRegression(), lags=5, binner_kwargs={"n_bins": 3}
    )

series = pd.DataFrame(
    {
        "l1": pd.Series(np.arange(10)),
        "l2": pd.Series(np.arange(10)),
        "l3": pd.Series(np.arange(10)),
    }
)
series.index = pd.DatetimeIndex(
    [
        "2022-01-04",
        "2022-01-05",
        "2022-01-06",
        "2022-01-07",
        "2022-01-08",
        "2022-01-09",
        "2022-01-10",
        "2022-01-11",
        "2022-01-12",
        "2022-01-13",
    ],
    dtype="datetime64[ns]",
    freq="D",
)
forecaster.fit(series=series, store_in_sample_residuals=False)

rng = np.random.default_rng(12345)
y_pred = rng.normal(100, 15, 20)
y_true = rng.normal(100, 10, 20)
forecaster.in_sample_residuals_ = {"level_1": None}
forecaster.in_sample_residuals_by_bin_ = {"level_1": {}}
forecaster.binner_intervals_ = {"level_1": {}}

forecaster._binning_in_sample_residuals(
    level="level_1",
    y_pred=y_pred,
    y_true=y_true,
)

In [27]:
forecaster.in_sample_residuals_

{'level_1': None}

In [28]:
forecaster.in_sample_residuals_by_bin_

{'level_1': {}}

In [29]:
forecaster.binner_intervals_

{'level_1': {0: (70.70705405481715, 90.25638761254116),
  1: (90.25638761254116, 109.36821559391004),
  2: (109.36821559391004, 135.2111448156828)}}

In [37]:
forecaster = ForecasterRecursiveMultiSeries(
        regressor=LinearRegression(), lags=5, binner_kwargs={"n_bins": 3}
    )

series = pd.DataFrame(
    {
        "l1": pd.Series(np.arange(10)),
        "l2": pd.Series(np.arange(10)),
        "l3": pd.Series(np.arange(10)),
    }
)
series.index = pd.DatetimeIndex(
    [
        "2022-01-04",
        "2022-01-05",
        "2022-01-06",
        "2022-01-07",
        "2022-01-08",
        "2022-01-09",
        "2022-01-10",
        "2022-01-11",
        "2022-01-12",
        "2022-01-13",
    ],
    dtype="datetime64[ns]",
    freq="D",
)
forecaster.fit(series=series)
forecaster.in_sample_residuals_ = None
forecaster.in_sample_residuals_by_bin_ = None
forecaster.binner_intervals_ = {}

rng = np.random.default_rng(12345)
y_pred = rng.normal(100, 15, 20)
y_true = rng.normal(100, 10, 20)

forecaster._probabilistic_mode = "no_binned"
forecaster._binning_in_sample_residuals(
    level="level_1",
    y_pred=y_pred,
    y_true=y_true,
        store_in_sample_residuals=True
)

TypeError: 'NoneType' object does not support item assignment

In [36]:
forecaster.in_sample_residuals_

In [3]:
import re
import pytest
import numpy as np
import pandas as pd
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LinearRegression
from skforecast.recursive import ForecasterRecursiveMultiSeries

# Fixtures
from skforecast.recursive.tests.tests_forecaster_recursive_multiseries.fixtures_forecaster_recursive_multiseries import series, exog

In [8]:
forecaster = ForecasterRecursiveMultiSeries(LinearRegression(), lags=3)
forecaster.fit(series=series, exog=exog['exog_1'])

series_diff_index = series.copy()
series_diff_index.index = pd.RangeIndex(start=50, stop=100)
exog.index = pd.RangeIndex(start=50, stop=100)
series_diff_index_range = series_diff_index['1'].index[[0, -1]]
forecaster.set_in_sample_residuals(series=series_diff_index, exog=exog['exog_1'])

IndexError: The index range for series '1' does not match the range used during training. Please ensure the index is aligned with the training data.
    Expected : Index([0, 49], dtype='int64')
    Received : Index([50, 99], dtype='int64')

In [6]:
series_diff_index

Unnamed: 0,1,2
0,0.696469,0.120629
1,0.286139,0.826341
2,0.226851,0.60306
3,0.551315,0.545068
4,0.719469,0.342764
5,0.423106,0.304121
6,0.980764,0.417022
7,0.68483,0.681301
8,0.480932,0.875457
9,0.392118,0.510422


In [31]:
from sklearn.preprocessing import StandardScaler

# Crear un objeto StandardScaler
scaler = StandardScaler()

# Obtener la referencia de memoria antes de aplicar un método
id_before = id(scaler)

# Aplicar un método, por ejemplo fit()
scaler.fit([[1, 2], [3, 4]])

# Obtener la referencia de memoria después de aplicar el método
id_after = id(scaler)

# Comparar las referencias
if id_before == id_after:
    print("El objeto sigue siendo el mismo.")
else:
    print("El objeto ha cambiado.")


El objeto sigue siendo el mismo.


In [32]:
scaler.__dict__

{'with_mean': True,
 'with_std': True,
 'copy': True,
 'n_features_in_': 2,
 'n_samples_seen_': 2,
 'mean_': array([2., 3.]),
 'var_': array([1., 1.]),
 'scale_': array([1., 1.])}