In [15]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path so that
# the `skforecast` imports in the following cell can resolve from there
# (presumably the local repository checkout — see the printed path).
path = str(Path.cwd().parent)
print(path)
# Inserting at index 1 leaves sys.path[0] untouched and searches `path` right after it.
sys.path.insert(1, path)

import numpy as np
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/home/joaquin/Documents/GitHub/skforecast


In [16]:
from skforecast.recursive import ForecasterRecursive
from sklearn.linear_model import LinearRegression

In [17]:
def _create_lags(
    self,
    y: np.ndarray,
    X_as_pandas: bool = False,
    train_index: pd.Index | None = None
) -> tuple[np.ndarray | pd.DataFrame | None, np.ndarray]:
    """
    Create the lagged values and their target variable from a time series.    
    Note that the returned matrix `X_data` contains the lag 1 in the first 
    column, the lag 2 in the in the second column and so on.
    
    Parameters
    ----------
    y : numpy ndarray
        Training time series values.
    X_as_pandas : bool, default False
        If `True`, the returned matrix `X_data` is a pandas DataFrame.
    train_index : pandas Index, default None
        Index of the training data. It is used to create the pandas DataFrame
        `X_data` when `X_as_pandas` is `True`.

    Returns
    -------
    X_data : numpy ndarray, pandas DataFrame, None
        Lagged values (predictors).
    y_data : numpy ndarray
        Values of the time series related to each row of `X_data`.
    
    """

    X_data = None
    if self.lags is not None:
        n_rows = len(y) - self.window_size
        X_data = np.full(
            shape=(n_rows, len(self.lags)), fill_value=np.nan, order='F', dtype=float
        )
        for i, lag in enumerate(self.lags):
            X_data[:, i] = y[self.window_size - lag: -lag]

        if X_as_pandas:
            X_data = pd.DataFrame(
                         data    = X_data,
                         columns = self.lags_names,
                         index   = train_index
                     )

    y_data = y[self.window_size:]

    return X_data, y_data

In [None]:
import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import as_strided

def _create_lags_new(
    self,
    y: np.ndarray,
    X_as_pandas: bool = False,
    train_index: pd.Index | None = None
) -> tuple[np.ndarray | pd.DataFrame | None, np.ndarray]:
    """
    Create the lagged values and their target variable from a time series,
    optimized with stride tricks (no Python loops).
    """
    X_data = None
    if self.lags is not None:
        n_rows = len(y) - self.window_size

        # Rolling window view: shape = (n_rows, window_size)
        y_strided = as_strided(
            y,
            shape=(n_rows, self.window_size),
            strides=(y.strides[0], y.strides[0]),
            writeable=False
        )

        # Column indices corresponding to desired lags
        cols = [self.window_size - lag for lag in self.lags]
        X_data = y_strided[:, cols]

        if X_as_pandas:
            X_data = pd.DataFrame(
                data=X_data,
                columns=self.lags_names,
                index=train_index
            )

    # Target values
    y_data = y[self.window_size:]

    return X_data, y_data


In [19]:
# Benchmark fixtures: a 10,000-point series and the forecaster that is passed
# as `self` to the lag-building functions timed in the following cells.
# NOTE(review): np.arange without `dtype` yields an integer array, while
# `_create_lags` always fills a float matrix — keep this in mind when comparing
# the outputs of the two implementations.
y = np.arange(10000)
forecaster = ForecasterRecursive(
    regressor=LinearRegression(),
    lags=100  # presumably expanded by skforecast to lags 1..100 — confirm
)

In [20]:
%%timeit
# Baseline: loop-based implementation, with `forecaster` standing in for `self`.
# No optional arguments are passed, so `X_as_pandas` keeps its default (False).
_create_lags(
    forecaster,
    y
)

2.04 ms ± 24.1 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [21]:
%%timeit
# Candidate: window-view implementation, called with the same `forecaster` and
# `y` as the baseline cell so the two timings are comparable.
_create_lags_new(
    forecaster,
    y
)

847 μs ± 10.1 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
