In [8]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
path = str(Path.cwd().parent)
print(path)
sys.path.insert(1, path)

c:\Users\jaesc2\GitHub\skforecast


In [9]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor

from skforecast.datasets import fetch_dataset
from skforecast.preprocessing import RollingFeatures
from skforecast.recursive import ForecasterRecursiveMultiSeries
from skforecast.model_selection import TimeSeriesFold
from skforecast.model_selection import backtesting_forecaster_multiseries
from skforecast.model_selection import grid_search_forecaster_multiseries
from skforecast.model_selection import bayesian_search_forecaster_multiseries

In [10]:
# Download the "items_sales" dataset and preview its first rows
# ==============================================================================
data = fetch_dataset(name="items_sales")
data.head(n=5)

items_sales
-----------
Simulated time series for the sales of 3 different items.
Simulated data.
Shape of the dataset: (1097, 3)


Unnamed: 0_level_0,item_1,item_2,item_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-01,8.253175,21.047727,19.429739
2012-01-02,22.777826,26.578125,28.009863
2012-01-03,27.549099,31.751042,32.078922
2012-01-04,25.895533,24.567708,27.252276
2012-01-05,21.379238,18.191667,20.357737


In [14]:
# Split data into train-test
# ==============================================================================
# Label-based slicing with `.loc` includes both endpoints. The cut-off is set
# at 23:59 of the last training day, a timestamp that is not present in the
# daily index, so no row ends up in both partitions.
end_train = '2014-10-15 23:59:00'
data_train = data.loc[:end_train].copy()
data_test = data.loc[end_train:].copy()

# Report the date range and the number of rows of each partition.
for label, subset in (("Train dates :", data_train), ("Test dates  :", data_test)):
    print(
        f"{label} {subset.index.min()} --- {subset.index.max()}   "
        f"(n={len(subset)})"
    )

Train dates : 2012-01-01 00:00:00 --- 2014-10-15 00:00:00   (n=1019)
Test dates  : 2014-10-16 00:00:00 --- 2015-01-01 00:00:00   (n=78)


In [15]:
# Create and train ForecasterRecursiveMultiSeries
# ==============================================================================
# Building blocks are created first so the forecaster definition stays readable.
lgbm_regressor = LGBMRegressor(random_state=123, verbose=-1)
rolling_means = RollingFeatures(stats=['mean', 'mean'], window_sizes=[24, 48])

forecaster = ForecasterRecursiveMultiSeries(
    regressor=lgbm_regressor,
    lags=24,
    window_features=rolling_means,
    encoding='ordinal',
    transformer_series=None,
    transformer_exog=None,
    weight_func=None,
    series_weights=None,
    differentiation=None,
    dropna_from_series=False,
    fit_kwargs=None,
    forecaster_id=None,
)

# Fit on the training partition only; the bare name on the last line shows
# the forecaster's repr as the cell output.
forecaster.fit(series=data_train)
forecaster

In [17]:
# Backtesting multiple time series
# ==============================================================================
# Folds of 24 steps, starting right after the training partition. The
# forecaster is refitted on every fold with a fixed-size training window, and
# a final shorter fold is allowed.
cv = TimeSeriesFold(
    steps=24,
    initial_train_size=len(data_train),
    refit=True,
    fixed_train_size=True,
    allow_incomplete_fold=True,
)

# `interval=[10, 90]` with `n_boot=20` adds `<level>_lower_bound` and
# `<level>_upper_bound` columns to the predictions.
metrics_levels, backtest_predictions = backtesting_forecaster_multiseries(
    forecaster=forecaster,
    series=data,
    exog=None,
    cv=cv,
    levels=None,
    metric='mean_absolute_error',
    add_aggregated_metric=True,
    interval=[10, 90],
    n_boot=20,
    n_jobs='auto',
    verbose=False,
    show_progress=True,
    suppress_warnings=False,
)

# `metrics_levels` is computed but intentionally not displayed in this cell;
# only the first rows of the predictions are shown.
print("Backtest predictions")
backtest_predictions.head(4)

  0%|          | 0/4 [00:00<?, ?it/s]

Backtest metrics


Unnamed: 0,levels,mean_absolute_error
0,item_1,1.270665
1,item_2,3.044588
2,item_3,3.284487
3,average,2.533247
4,weighted_average,2.533247
5,pooling,2.533247



Backtest predictions


Unnamed: 0,item_1,item_2,item_3,item_1_lower_bound,item_1_upper_bound,item_2_lower_bound,item_2_upper_bound,item_3_lower_bound,item_3_upper_bound
2014-10-16,22.67452,13.485188,13.974291,21.838102,23.681646,11.983631,14.846301,12.009599,16.101338
2014-10-17,22.645121,14.346879,16.488286,21.429285,24.850045,12.781281,16.410407,14.615506,19.45891
2014-10-18,20.360174,13.733121,15.011027,19.393813,22.472069,11.609492,14.852504,12.787747,19.090429
2014-10-19,18.583785,14.211755,16.790619,17.870115,20.486787,12.382525,15.621007,14.93501,19.68976


In [18]:
# Show the predictions with columns in alphabetical order, so each series sits
# next to its own lower/upper bound columns.
backtest_predictions.sort_index(axis=1)

Unnamed: 0,item_1,item_1_lower_bound,item_1_upper_bound,item_2,item_2_lower_bound,item_2_upper_bound,item_3,item_3_lower_bound,item_3_upper_bound
2014-10-16,22.674520,21.838102,23.681646,13.485188,11.983631,14.846301,13.974291,12.009599,16.101338
2014-10-17,22.645121,21.429285,24.850045,14.346879,12.781281,16.410407,16.488286,14.615506,19.458910
2014-10-18,20.360174,19.393813,22.472069,13.733121,11.609492,14.852504,15.011027,12.787747,19.090429
2014-10-19,18.583785,17.870115,20.486787,14.211755,12.382525,15.621007,16.790619,14.935010,19.689760
2014-10-20,20.189672,19.008449,21.651636,13.600293,12.140310,15.508134,14.520306,12.952375,19.085890
...,...,...,...,...,...,...,...,...,...
2014-12-28,19.081028,18.299598,20.628435,19.727952,17.519069,21.502977,23.261802,20.440830,26.392094
2014-12-29,19.741199,18.801021,21.693539,20.135626,17.073642,21.565278,23.177077,21.029047,25.868686
2014-12-30,21.125134,19.704002,22.656234,19.995903,17.720189,21.620954,20.074344,18.010999,22.488854
2014-12-31,19.808572,18.959000,21.583298,17.940938,16.203238,20.154150,18.802474,17.017533,22.453136


In [24]:
from joblib import Parallel, delayed

# Function that will be run in parallel
def compute_task(x):
    """
    Build the demo payload returned by one parallel task.

    Parameters
    ----------
    x : int or any object supporting ``x * int``
        Task identifier. When it is a non-negative integer it also seeds the
        random generator, so the same `x` always yields the same DataFrame
        regardless of which worker process runs the task.

    Returns
    -------
    df : pandas DataFrame
        Frame with columns 'A' and 'B', each holding 5 random integers drawn
        from [0, 100).
    lst : list
        ``[x * i for i in range(5)]``.
    """
    # The original drew from the unseeded global `np.random` state, so the
    # frame was neither tied to `x` nor reproducible across runs. A local
    # generator seeded from `x` fixes both. Values that cannot be used as a
    # seed (negative numbers, non-integers) fall back to OS entropy, which
    # keeps every previously accepted input working.
    seed = int(x) if isinstance(x, (int, np.integer)) and x >= 0 else None
    rng = np.random.default_rng(seed)

    # Create a DataFrame with data based on x
    df = pd.DataFrame({
        'A': rng.integers(0, 100, size=5),
        'B': rng.integers(0, 100, size=5)
    })
    # Create a list based on x
    lst = [x * i for i in range(5)]
    return df, lst


# Number of parallel tasks
num_tasks = 4

# Run the tasks in parallel (n_jobs=-1 asks joblib for all available cores)
results = Parallel(n_jobs=-1)(
    delayed(compute_task)(task_id) for task_id in range(num_tasks)
)

# Each result is a (DataFrame, list) pair: split them into two lists
dataframes = [frame for frame, _ in results]
lists = [values for _, values in results]

# Stack all the DataFrames into a single one with a fresh index
combined_df = pd.concat(dataframes, ignore_index=True)

# Collect every distinct value that appears in any of the lists
unique_values = set().union(*lists)

# Print results
print("Combined DataFrame:")
# print(combined_df)
print("\nLists:")
print(lists)
print("\nUnique values from all lists:")
print(unique_values)

Combined DataFrame:

Lists:
[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4], [0, 2, 4, 6, 8], [0, 3, 6, 9, 12]]

Unique values from all lists:
{0, 1, 2, 3, 4, 6, 8, 9, 12}


In [42]:
# Download the "h2o" dataset in raw form and preview its first rows
# ==============================================================================
data_raw = fetch_dataset(name="h2o", raw=True)
data_raw.head(n=5)

h2o
---
Monthly expenditure ($AUD) on corticosteroid drugs that the Australian health
system had between 1991 and 2008.
Hyndman R (2023). fpp3: Data for Forecasting: Principles and Practice(3rd
Edition). http://pkg.robjhyndman.com/fpp3package/,https://github.com/robjhyndman
/fpp3package, http://OTexts.com/fpp3.
Shape of the dataset: (204, 2)


Unnamed: 0,x,fecha
0,0.429795,1991-07-01
1,0.400906,1991-08-01
2,0.432159,1991-09-01
3,0.492543,1991-10-01
4,0.502369,1991-11-01


In [45]:
# Parse the 'fecha' column as datetimes and use it as the index. `assign`
# returns a new frame, so `data_raw` is left untouched.
data = (
    data_raw
    .assign(fecha=lambda frame: pd.to_datetime(frame['fecha']))
    .set_index('fecha')
)
data.index

DatetimeIndex(['1991-07-01', '1991-08-01', '1991-09-01', '1991-10-01',
               '1991-11-01', '1991-12-01', '1992-01-01', '1992-02-01',
               '1992-03-01', '1992-04-01',
               ...
               '2007-09-01', '2007-10-01', '2007-11-01', '2007-12-01',
               '2008-01-01', '2008-02-01', '2008-03-01', '2008-04-01',
               '2008-05-01', '2008-06-01'],
              dtype='datetime64[ns]', name='fecha', length=204, freq=None)

In [46]:
# `resample` alone returns a lazy Resampler, not a DataFrame. Show it without
# rebinding `data`: the original assignment replaced the frame with the
# Resampler, which made this cell fail on a second run and left `data`
# unusable as a DataFrame.
data.resample('MS')

<pandas.core.resample.DatetimeIndexResampler object at 0x0000022457B718D0>

In [47]:
# Rebuild the series from the raw download: parse 'fecha', index by it and
# aggregate to month-start ('MS') frequency using the mean.
data = (
    data_raw
    .assign(fecha=lambda frame: pd.to_datetime(frame['fecha']))
    .set_index('fecha')
    .resample('MS')
    .mean()
)
data

Unnamed: 0_level_0,x
fecha,Unnamed: 1_level_1
1991-07-01,0.429795
1991-08-01,0.400906
1991-09-01,0.432159
1991-10-01,0.492543
1991-11-01,0.502369
...,...
2008-02-01,0.761822
2008-03-01,0.649435
2008-04-01,0.827887
2008-05-01,0.816255
