In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Parent directory of the current working directory; it is printed so the
# resolved location is visible in the cell output.
path = str(Path.cwd().parent)
print(path)
# Insert that directory at position 1 of the module search path.
# NOTE(review): presumably this makes the local skforecast checkout importable
# ahead of any installed copy - confirm. Re-running the cell inserts the same
# entry again.
sys.path.insert(1, path)

import numpy as np
import pandas as pd

/home/joaquin/Documents/GitHub/skforecast


In [2]:
from skforecast.utils import get_features_range, check_features_range

In [3]:

# Example data used in the notebook
# ==============================================================================
# A seeded generator makes the generated values identical on every run
# (Restart Kernel -> Run All reproduces the same frame).
rng = np.random.default_rng(123)
n_samples = 5000

# Five numeric features drawn uniformly from [0, 1) followed by five categorical
# features with levels "A", "B" and "C". All columns are collected first and the
# DataFrame is built in one call instead of inserting one column at a time.
numeric_features = {f"feature_{i}": rng.random(n_samples) for i in range(5)}
categorical_features = {
    f"cat_feature_{i}": rng.choice(["A", "B", "C"], n_samples) for i in range(5)
}
df = pd.DataFrame({**numeric_features, **categorical_features})

# Multi-series input: the full frame and its first 1,000 rows.
dict_dfs = {'series_1': df, 'series_2': df.iloc[:1000]}

In [4]:
from pprint import pprint
# Compute the value range of every column of the single DataFrame `df`.
# In the printed result, categorical columns map to the set of observed levels
# and numeric columns to a pair of floats (apparently the observed min and max).
ranges = get_features_range(df)
pprint(ranges)

{'cat_feature_0': {'B', 'A', 'C'},
 'cat_feature_1': {'B', 'A', 'C'},
 'cat_feature_2': {'B', 'A', 'C'},
 'cat_feature_3': {'B', 'A', 'C'},
 'cat_feature_4': {'B', 'A', 'C'},
 'feature_0': (np.float64(2.315605896519024e-05),
               np.float64(0.9999567115696526)),
 'feature_1': (np.float64(7.477701474944976e-05), np.float64(0.99992123302979)),
 'feature_2': (np.float64(4.37062293138224e-05),
               np.float64(0.9993282917950216)),
 'feature_3': (np.float64(9.610341122223698e-05),
               np.float64(0.9998782362887609)),
 'feature_4': (np.float64(2.909597788203211e-05),
               np.float64(0.9999325738578635))}


In [5]:
# Same computation for a dict of DataFrames: the printed result is nested, with
# one entry per dict key (series name) holding that frame's column ranges.
ranges_dict = get_features_range(dict_dfs)
pprint(ranges_dict)


{'series_1': {'cat_feature_0': {'B', 'A', 'C'},
              'cat_feature_1': {'B', 'A', 'C'},
              'cat_feature_2': {'B', 'A', 'C'},
              'cat_feature_3': {'B', 'A', 'C'},
              'cat_feature_4': {'B', 'A', 'C'},
              'feature_0': (np.float64(2.315605896519024e-05),
                            np.float64(0.9999567115696526)),
              'feature_1': (np.float64(7.477701474944976e-05),
                            np.float64(0.99992123302979)),
              'feature_2': (np.float64(4.37062293138224e-05),
                            np.float64(0.9993282917950216)),
              'feature_3': (np.float64(9.610341122223698e-05),
                            np.float64(0.9998782362887609)),
              'feature_4': (np.float64(2.909597788203211e-05),
                            np.float64(0.9999325738578635))},
 'series_2': {'cat_feature_0': {'B', 'A', 'C'},
              'cat_feature_1': {'B', 'A', 'C'},
              'cat_feature_2': {'B', 'A', 'C'}

In [6]:
# New observations used to exercise the range check.
# Valid case: every numeric value is 0.1 and only the levels A, B and C appear.
new_data_valid = pd.DataFrame({
    'feature_0': [0.1] * 5,
    'cat_feature_0': list('ABCAB'),
})

# Invalid case: the last row carries a numeric value of 10 and the level 'D'.
new_data_invalid = pd.DataFrame({
    'feature_0': [0.1] * 4 + [10],
    'cat_feature_0': list('ABCAD'),
})

# Multi-series variant: both series names point at the same invalid frame.
new_data_invalid_dict = {
    series_name: new_data_invalid for series_name in ('series_1', 'series_2')
}

# Check both frames against the ranges computed from `df`.
# `new_data_valid` only holds 0.1 and the levels A/B/C; `new_data_invalid` also
# holds the value 10 and the level 'D'.
# NOTE(review): no output is recorded for this cell, so how a violation is
# reported (warning, return value, ...) cannot be told from here - confirm.
check_features_range(ranges, new_data_valid)
check_features_range(ranges, new_data_invalid)

In [7]:
# Multi-series check: the per-series ranges are compared with new data supplied
# as a dict keyed by the same series names ('series_1', 'series_2').
check_features_range(ranges_dict, new_data_invalid_dict)

In [8]:
# Libraries
# ==============================================================================
import pandas as pd
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from skforecast.datasets import fetch_dataset
from skforecast.preprocessing import RollingFeatures
from skforecast.recursive import ForecasterRecursive
from skforecast.plot import plot_prediction_intervals, set_dark_theme

In [9]:
# Download data
# ==============================================================================
# Per the dataset description printed by this cell: monthly series with the
# target `y` and two simulated exogenous variables (exog_1, exog_2), shape (195, 3).
data = fetch_dataset(name='h2o_exog', raw=False)
data.index.name = 'datetime'
# Split data in train and test
# ==============================================================================
# The last `steps` rows are held out as the test partition; everything before
# them is used for training.
steps = 36
data_train = data.iloc[:-steps, :]
data_test  = data.iloc[-steps:, :]

# Create and fit forecaster
# ==============================================================================
# LightGBM regressor fed with 15 lags plus a rolling mean over a window of 10;
# no transformer is applied to the target.
forecaster = ForecasterRecursive(
                 regressor       = LGBMRegressor(random_state=123, verbose=-1),
                 lags            = 15,
                 window_features = RollingFeatures(stats=['mean'], window_sizes=10),
                 transformer_y   = None, 
             )

# Fit on the training target, using every other training column as exogenous input.
forecaster.fit(y=data_train['y'], exog=data_train.drop(columns='y'))
# Last expression of the cell: display the forecaster.
forecaster

h2o_exog
--------
Monthly expenditure ($AUD) on corticosteroid drugs that the Australian health
system had between 1991 and 2008. Two additional variables (exog_1, exog_2) are
simulated.
Hyndman R (2023). fpp3: Data for Forecasting: Principles and Practice (3rd
Edition). http://pkg.robjhyndman.com/fpp3package/,
https://github.com/robjhyndman/fpp3package, http://OTexts.com/fpp3.
Shape of the dataset: (195, 3)


In [10]:
# Value range stored on the forecaster for the target series; the recorded
# output is a dict keyed by series name with a pair of floats per entry.
# NOTE(review): presumably populated during fit() - confirm.
forecaster.series_values_range_


{'y': (np.float64(0.361801), np.float64(1.257238))}

In [11]:
# Value range stored on the forecaster for each exogenous variable; the recorded
# output has one pair of floats per exog column (exog_1, exog_2).
# NOTE(review): presumably populated during fit() - confirm.
forecaster.exog_values_range_

{'exog_1': (np.float64(0.949715355), np.float64(1.534778297)),
 'exog_2': (np.float64(1.0679422), np.float64(1.8918816))}

In [12]:
# Work on a copy so the assignments below are made on an independent frame
# rather than on a slice of `data`.
data_test = data_test.copy()
# Overwrite both exogenous variables on one date with values (1000 and -1000)
# far from the rest of the column, to simulate out-of-range inputs.
data_test.loc['2006-11-01', 'exog_1'] = 1000
data_test.loc['2006-11-01', 'exog_2'] = -1000
data_test

Unnamed: 0_level_0,y,exog_1,exog_2
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005-07-01,0.874336,1.415756,1.446988
2005-08-01,1.006497,1.398304,1.51777
2005-09-01,1.094736,1.386174,1.602616
2005-10-01,1.027043,1.363155,1.668975
2005-11-01,1.149232,1.361009,1.730369
2005-12-01,1.160712,1.417316,1.787644
2006-01-01,1.230691,1.475126,1.832483
2006-02-01,0.587135,1.466789,1.730963
2006-03-01,0.706959,1.46796,1.666946
2006-04-01,0.639641,1.447698,1.565028


In [13]:
# Compare the (modified) test exogenous variables with the ranges stored on the
# fitted forecaster. The target column is dropped so only exog columns are checked.
check_features_range(
    forecaster.exog_values_range_, 
    data_test.drop(columns='y')
)

In [14]:
# Forecast `steps` periods using the modified exogenous data, passing the
# forecaster's own stored last window explicitly.
# NOTE(review): `warning_drift=True` presumably makes predict() warn about
# inputs outside the ranges seen in training - confirm against the predict()
# signature; no warning text is recorded in this cell's output.
forecaster.predict(
    steps=steps,
    exog=data_test.drop(columns='y'),
    last_window=forecaster.last_window_,
    warning_drift=True
)



2005-07-01    1.022492
2005-08-01    1.038417
2005-09-01    1.121040
2005-10-01    1.137967
2005-11-01    1.128166
2005-12-01    1.159866
2006-01-01    1.113893
2006-02-01    0.616014
2006-03-01    0.665318
2006-04-01    0.681968
2006-05-01    0.730368
2006-06-01    0.873993
2006-07-01    1.032972
2006-08-01    1.090603
2006-09-01    1.127225
2006-10-01    1.137967
2006-11-01    1.087979
2006-12-01    1.146875
2007-01-01    1.113893
2007-02-01    0.654316
2007-03-01    0.665181
2007-04-01    0.698342
2007-05-01    0.734199
2007-06-01    0.878402
2007-07-01    1.016443
2007-08-01    1.082752
2007-09-01    1.134024
2007-10-01    1.141134
2007-11-01    1.129810
2007-12-01    1.146875
2008-01-01    1.119187
2008-02-01    0.679257
2008-03-01    0.685721
2008-04-01    0.736172
2008-05-01    0.750875
2008-06-01    0.894247
Freq: MS, Name: pred, dtype: float64

In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Parent directory of the current working directory; it is printed so the
# resolved location is visible in the cell output.
path = str(Path.cwd().parent)
print(path)
# Insert that directory at position 1 of the module search path.
# NOTE(review): presumably this makes the local skforecast checkout importable
# ahead of any installed copy - confirm. Re-running the cell inserts the same
# entry again.
sys.path.insert(1, path)

/home/joaquin/Documents/GitHub/skforecast


In [2]:
from skforecast.recursive import ForecasterRecursive, ForecasterRecursiveMultiSeries
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
import timeit
import skforecast
import warnings

# Synthetic benchmark data
# ==============================================================================
# A fixed seed makes every benchmark run fit exactly the same values, so timing
# differences between runs are not caused by different data.
rng = np.random.default_rng(123)

# The two hourly indexes are built once and shared, instead of constructing an
# identical `pd.date_range` for every single column.
index_long = pd.date_range("2020-01-01", periods=10_000, freq="h")
index_short = pd.date_range("2020-01-01", periods=1_000, freq="h")

# Single-series case: one target of 10,000 observations and 50 exogenous columns.
y = pd.Series(rng.standard_normal(10_000), index=index_long)
exog = pd.DataFrame(
    {f"exog_{i}": rng.standard_normal(10_000) for i in range(50)},
    index=index_long,
)

# Multi-series case: 500 series of 1,000 observations each ...
series = {
    f"series_{i}": pd.Series(rng.standard_normal(1_000), index=index_short)
    for i in range(500)
}

# ... and, for every series, its own frame of 50 exogenous columns.
exog_dict = {
    f"series_{i}": pd.DataFrame(
        {f"exog_{j}": rng.standard_normal(1_000) for j in range(50)},
        index=index_short,
    )
    for i in range(500)
}


In [3]:
def run_dummy():
    """Build a ForecasterRecursive around a DummyRegressor (5 lags) and fit it on `y` with `exog`."""
    forecaster = ForecasterRecursive(regressor=DummyRegressor(constant=0.1), lags=5)
    forecaster.fit(y=y, exog=exog)


def run_linear():
    """Build a ForecasterRecursive around a LinearRegression (5 lags) and fit it on `y` with `exog`."""
    forecaster = ForecasterRecursive(regressor=LinearRegression(), lags=5)
    forecaster.fit(y=y, exog=exog)


# Each timing is the total duration of 3 consecutive calls; 10 timings are
# collected per benchmark and summarised by their mean and standard deviation.
# Output order: dummy regressor first, linear regression second.
for benchmark in (run_dummy, run_linear):
    timings = timeit.Timer(benchmark).repeat(repeat=10, number=3)
    print(f"Average: {np.mean(timings):.4f} s, Std: {np.std(timings):.4f} s")


Average: 0.0360 s, Std: 0.0063 s
Average: 0.8190 s, Std: 0.2299 s


In [4]:
def run_dummy():
    """Fit a ForecasterRecursiveMultiSeries with a DummyRegressor (5 lags) on `series` / `exog_dict`.

    DataTransformationWarning is set to 'ignore' before the forecaster is built.
    """
    warnings.simplefilter('ignore', category=skforecast.exceptions.DataTransformationWarning)
    forecaster = ForecasterRecursiveMultiSeries(regressor=DummyRegressor(constant=0.1), lags=5)
    forecaster.fit(series=series, exog=exog_dict, suppress_warnings=True)


def run_linear():
    """Fit a ForecasterRecursiveMultiSeries with a LinearRegression (5 lags) on `series` / `exog_dict`.

    DataTransformationWarning is set to 'ignore' before the forecaster is built.
    """
    warnings.simplefilter('ignore', category=skforecast.exceptions.DataTransformationWarning)
    forecaster = ForecasterRecursiveMultiSeries(regressor=LinearRegression(), lags=5)
    forecaster.fit(series=series, exog=exog_dict, suppress_warnings=True)


# Each timing is the total duration of 3 consecutive calls; 5 timings are
# collected per benchmark and summarised by their mean and standard deviation.
# Output order: dummy regressor first, linear regression second.
for benchmark in (run_dummy, run_linear):
    timings = timeit.Timer(benchmark).repeat(repeat=5, number=3)
    print(f"Average: {np.mean(timings):.4f} s, Std: {np.std(timings):.4f} s")

Average: 9.2105 s, Std: 0.5594 s
Average: 13.6214 s, Std: 1.4057 s


## Dos opciones

+ Añadir al método `fit` un argumento llamado `fit_drift_detector`.
+ En la inicialización, pasar un objeto de la clase `DriftDetector`.

In [13]:
from skforecast.utils import DriftDetector
import pandas as pd
import numpy as np

drift_detector = DriftDetector()

# Reference data
# ==============================================================================
# A seeded generator (kept under its own name so that no other generator in the
# session is rebound) makes the reference data identical on every run.
rng_drift = np.random.default_rng(123)
n_samples = 5000

# Five numeric features drawn uniformly from [0, 1) followed by five categorical
# features with levels "A", "B" and "C", assembled in a single constructor call.
numeric_features = {f"feature_{i}": rng_drift.random(n_samples) for i in range(5)}
categorical_features = {
    f"cat_feature_{i}": rng_drift.choice(["A", "B", "C"], n_samples) for i in range(5)
}
df = pd.DataFrame({**numeric_features, **categorical_features})

# Multi-series variant of the same data (not used further in this cell).
dict_dfs = {'series_1': df, 'series_2': df.iloc[:1000]}

# Fit the detector on the single reference DataFrame.
drift_detector.fit(df)

# New observations
# ==============================================================================
# Valid case: every value also occurs in, or lies inside, the reference data
# (not used further in this cell).
new_data_valid = pd.DataFrame({
    'feature_0': [0.1, 0.1, 0.1, 0.1, 0.1],
    'cat_feature_0': ['A', 'B', 'C', 'A', 'B']
})

# Invalid case: the last row holds the numeric value 10 and the level 'D',
# neither of which can occur in the reference data.
new_data_invalid = pd.DataFrame({
    'feature_0': [0.1, 0.1, 0.1, 0.1, 10],
    'cat_feature_0': ['A', 'B', 'C', 'A', 'D']
})

# NOTE(review): no output is recorded for this cell, so what predict() returns
# or emits for out-of-range data cannot be told from here - confirm.
drift_detector.predict(new_data_invalid)