# Profiling

In [12]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'/home/ximo/GitHub/skforecast'

In [13]:
import platform
import psutil
import skforecast
import pandas as pd
import numpy as np
import scipy
import sklearn

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor

from skforecast.recursive import ForecasterRecursiveMultiSeries
from skforecast.model_selection import grid_search_forecaster_multiseries
from skforecast.model_selection import bayesian_search_forecaster_multiseries
from skforecast.model_selection import backtesting_forecaster_multiseries
from skforecast.utils import *

from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from skforecast.preprocessing import series_long_to_dict
from skforecast.preprocessing import exog_long_to_dict
from skforecast.datasets import fetch_dataset

%load_ext pyinstrument
%load_ext line_profiler

The pyinstrument extension is already loaded. To reload it, use:
  %reload_ext pyinstrument
The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


# System and library information

In [14]:
# Versions
# ==============================================================================
# Each entry is one output line; the trailing empty string reproduces the blank
# separator line printed between the two sections.
version_report = [
    f"Python version: {platform.python_version()}",
    f"scikit-learn version: {sklearn.__version__}",
    f"skforecast version: {skforecast.__version__}",
    f"pandas version: {pd.__version__}",
    f"numpy version: {np.__version__}",
    f"scipy version: {scipy.__version__}",
    f"psutil version: {psutil.__version__}",
    "",
]
print("\n".join(version_report))

# Computer information
# ==============================================================================
# Host, platform and CPU-count details as reported by `platform` and `psutil`.
system_report = [
    f"Computer network name: {platform.node()}",
    f"Machine type: {platform.machine()}",
    f"Processor type: {platform.processor()}",
    f"Platform type: {platform.platform()}",
    f"Operating system: {platform.system()}",
    f"Operating system release: {platform.release()}",
    f"Operating system version: {platform.version()}",
    f"Number of physical cores: {psutil.cpu_count(logical=False)}",
    f"Number of logical cores: {psutil.cpu_count(logical=True)}",
]
print("\n".join(system_report))

Python version: 3.12.9
scikit-learn version: 1.6.1
skforecast version: 0.16.0
pandas version: 2.2.3
numpy version: 2.0.2
scipy version: 1.15.2
psutil version: 5.9.0

Computer network name: ximo
Machine type: x86_64
Processor type: x86_64
Platform type: Linux-6.8.0-57-generic-x86_64-with-glibc2.39
Operating system: Linux
Operating system release: 6.8.0-57-generic
Operating system version: #59-Ubuntu SMP PREEMPT_DYNAMIC Sat Mar 15 17:40:59 UTC 2025
Number of physical cores: 4
Number of logical cores: 8


# ForecasterRecursiveMultiSeries (formerly ForecasterAutoregMultiSeries)

In [37]:
n_series = 200
len_series = (2000, 5000)
rng = np.random.default_rng(321)

# Simulated hourly series: each one gets a random length drawn from
# [len_series[0], len_series[1]) and normally distributed values. The length is
# drawn before the values so the random stream is consumed in a fixed order.
series_dict = {}
for i in range(n_series):
    n = rng.integers(low=len_series[0], high=len_series[1])
    series_name = f'series_{i}'
    hourly_index = pd.date_range(start='2010-01-01', periods=n, freq='h')
    values = rng.normal(loc=20, scale=5, size=n)
    series_dict[series_name] = pd.Series(
        data=values, index=hourly_index, name=series_name
    )

# Calendar features derived from each series' own index. The generator is
# re-created here as in the original cell, although no random draw follows.
rng = np.random.default_rng(321)
exog_dict = {}
for k, single_series in series_dict.items():
    calendar_index = single_series.index
    exog = pd.DataFrame(index=calendar_index).assign(
        day_of_week=calendar_index.dayofweek,
        week_of_year=calendar_index.isocalendar().week.astype(int),
        month=calendar_index.month,
    )
    exog_dict[k] = exog

# Earliest start and latest end across all simulated series.
first_date = min(s.index.min() for s in series_dict.values())
last_date = max(s.index.max() for s in series_dict.values())
print(
    f"Range of dates: "
    f"{first_date} - "
    f"{last_date}"
)

Range of dates: 2010-01-01 00:00:00 - 2010-07-27 09:00:00


In [38]:
# Presumably the last timestamp of the training partition. NOTE(review): the
# split objects used by later cells (series_dict_train, exog_dict_train,
# exog_dict_valid) are not created in any visible cell — TODO confirm where
# they are built so the notebook survives Restart & Run All.
end_train = '2010-06-01 00:00:00'

In [None]:
# Forecaster
# ==============================================================================
# Forecaster instance that the profiling cells below pass to `%lprun`.
# It wraps a LightGBM regressor with a fixed random_state and uses 50 lags.
forecaster = ForecasterRecursiveMultiSeries(
    regressor=LGBMRegressor(random_state=8520, verbose=-1),
    lags=50,
    # Both transformers are disabled for this profiling run; uncomment to
    # include scaling in the measured code path.
    # transformer_series=StandardScaler(),
    # transformer_exog=StandardScaler(),
    encoding="ordinal"
)

In [53]:
#%%pyinstrument

# forecaster.fit(series=series_dict, exog=exog_dict)

In [54]:
# Profiling fit()
# ==============================================================================
# def funt_to_profile(forecaster, series, exog):
#     forecaster.fit(series=series, exog=exog)

# %lprun -f forecaster.fit funt_to_profile(forecaster, series_dict, exog_dict)

In [55]:
# Profiling _create_train_X_y()
# ==============================================================================
def funt_to_profile(forecaster, series, exog):
    """
    Entry point handed to `%lprun`: call `forecaster._create_train_X_y` once.

    Parameters
    ----------
    forecaster : object
        Object exposing a `_create_train_X_y(series=..., exog=...)` method.
    series : object
        Forwarded unchanged as the `series` keyword argument.
    exog : object
        Forwarded unchanged as the `exog` keyword argument.

    Returns
    -------
    None
        The result of the call is discarded; only the execution is of interest.
    """
    build_training_matrices = forecaster._create_train_X_y
    build_training_matrices(series=series, exog=exog)

%lprun -f forecaster._create_train_X_y funt_to_profile(forecaster, series_dict, exog_dict)

Timer unit: 1e-09 s

Total time: 1.0561 s
File: /home/ximo/GitHub/skforecast/skforecast/recursive/_forecaster_recursive_multiseries.py
Function: _create_train_X_y at line 961

Line #      Hits         Time  Per Hit   % Time  Line Contents
   961                                               def _create_train_X_y(
   962                                                   self,
   963                                                   series: pd.DataFrame | dict[str, pd.Series | pd.DataFrame],
   964                                                   exog: pd.Series | pd.DataFrame | dict[str, pd.Series | pd.DataFrame] | None = None,
   965                                                   store_last_window: bool | list[str] = True,
   966                                               ) -> tuple[
   967                                                   pd.DataFrame,
   968                                                   pd.Series,
   969                                                   di

In [None]:
# Profiling _create_train_X_y_single_series()
# ==============================================================================
def funt_to_profile(forecaster, series, exog):
    """
    Entry point handed to `%lprun`: call
    `forecaster._create_train_X_y_single_series` once for a single series.

    The returned matrices are not needed for profiling, so the result is
    discarded instead of being unpacked into unused locals. This keeps the
    harness working regardless of how many objects the private method returns.

    Parameters
    ----------
    forecaster : object
        Object exposing `_create_train_X_y_single_series(y, exog, ignore_exog)`.
    series : object
        Forwarded as the `y` keyword argument.
    exog : object
        Forwarded as the `exog` keyword argument.

    Returns
    -------
    None
    """
    forecaster._create_train_X_y_single_series(
        y=series,
        exog=exog,
        ignore_exog=False,
    )

%lprun -f forecaster._create_train_X_y_single_series funt_to_profile(forecaster, series_dict['series_0'], exog_dict['series_0'])

Timer unit: 1e-09 s

Total time: 0.00234685 s
File: /home/ximo/GitHub/skforecast/skforecast/recursive/_forecaster_recursive_multiseries.py
Function: _create_train_X_y_single_series at line 836

Line #      Hits         Time  Per Hit   % Time  Line Contents
   836                                               def _create_train_X_y_single_series(
   837                                                   self,
   838                                                   y: pd.Series,
   839                                                   ignore_exog: bool,
   840                                                   exog: pd.DataFrame | None = None
   841                                               ) -> tuple[pd.DataFrame, list[str], pd.DataFrame, pd.Series]:
   842                                                   """
   843                                                   Create training matrices from univariate time series and exogenous
   844                                               

In [18]:
# Profiling predict()
# ==============================================================================
def funt_to_profile(forecaster, steps, exog):
    """
    Entry point handed to `%lprun`: run a single `forecaster.predict` call.

    Parameters
    ----------
    forecaster : object
        Object exposing a `predict` method.
    steps : object
        Forwarded as the `steps` keyword argument.
    exog : object
        Forwarded as the `exog` keyword argument.

    Returns
    -------
    None
        Predictions are discarded; `suppress_warnings=True` is always passed.
    """
    predict_kwargs = {"steps": steps, "exog": exog, "suppress_warnings": True}
    forecaster.predict(**predict_kwargs)

%lprun -f forecaster.predict funt_to_profile(forecaster, 7, exog_dict_valid)

Timer unit: 1e-09 s

Total time: 36.6131 s
File: /home/ubuntu/varios/skforecast/skforecast/ForecasterAutoregMultiSeries/ForecasterAutoregMultiSeries.py
Function: predict at line 1537

Line #      Hits         Time  Per Hit   % Time  Line Contents
  1537                                               def predict(
  1538                                                   self,
  1539                                                   steps: int,
  1540                                                   levels: Optional[Union[str, list]]=None,
  1541                                                   last_window: Optional[pd.DataFrame]=None,
  1542                                                   exog: Optional[Union[pd.Series, pd.DataFrame, dict]]=None,
  1544                                               ) -> pd.DataFrame:
  1545                                                   """
  1546                                                   Predict n steps ahead. It is an recursive process in

In [19]:
# Functions to profile:
# ==============================================================================
# check_preprocess_exog_multiseries
# align_series_and_exog_multiseries
# _create_train_X_y_single_series
# _create_predict_inputs
# _recursive_predict

In [20]:
# Profiling align_series_and_exog_multiseries()
# ==============================================================================
def funt_to_profile(series_dict, input_series_is_dict, exog_dict):
    """
    Entry point handed to `%lprun`: call `align_series_and_exog_multiseries`
    once with the given arguments and discard whatever it returns.

    Parameters
    ----------
    series_dict : object
        Forwarded as the `series_dict` keyword argument.
    input_series_is_dict : object
        Forwarded as the `input_series_is_dict` keyword argument.
    exog_dict : object
        Forwarded as the `exog_dict` keyword argument.

    Returns
    -------
    None
    """
    align_kwargs = dict(
        series_dict=series_dict,
        input_series_is_dict=input_series_is_dict,
        exog_dict=exog_dict,
    )
    align_series_and_exog_multiseries(**align_kwargs)

%lprun -f align_series_and_exog_multiseries funt_to_profile(series_dict_train, True, exog_dict_train)

Timer unit: 1e-09 s

Total time: 2.26484 s
File: /home/ubuntu/varios/skforecast/skforecast/utils/utils.py
Function: align_series_and_exog_multiseries at line 2532

Line #      Hits         Time  Per Hit   % Time  Line Contents
  2532                                           def align_series_and_exog_multiseries(
  2533                                               series_dict: dict,
  2534                                               input_series_is_dict: bool,
  2535                                               exog_dict: dict=None
  2536                                           ) -> Tuple[Union[pd.Series, pd.DataFrame], Union[pd.Series, pd.DataFrame]]:
  2537                                               """
  2538                                               Align series and exog according to their index. If needed, reindexing is
  2539                                               applied. Heading and trailing NaNs are removed from all series in 
  2540                        

In [23]:
# Profiling check_preprocess_exog_multiseries()
# ==============================================================================
series_indexes = {k: v.index for k, v  in series_dict_train.items()}
series_col_names = list(series_dict_train.keys())

def funt_to_profile(input_series_is_dict, series_indexes, series_col_names, exog, exog_dict):
    check_preprocess_exog_multiseries(
        input_series_is_dict = input_series_is_dict,
        series_indexes = series_indexes,
        series_col_names = series_col_names,
        exog = exog_dict_train,
        exog_dict = exog_dict_train,
    )

%lprun -f check_preprocess_exog_multiseries funt_to_profile(True, series_indexes, series_col_names, exog, exog_dict)

Timer unit: 1e-09 s

Total time: 0.911268 s
File: /home/ubuntu/varios/skforecast/skforecast/utils/utils.py
Function: check_preprocess_exog_multiseries at line 2361

Line #      Hits         Time  Per Hit   % Time  Line Contents
  2361                                           def check_preprocess_exog_multiseries(
  2362                                               input_series_is_dict: bool,
  2363                                               series_indexes: dict,
  2364                                               series_col_names: list,
  2365                                               exog: Union[pd.Series, pd.DataFrame, dict],
  2366                                               exog_dict: dict,
  2367                                           ) -> Tuple[dict, list]:
  2368                                               """
  2369                                               Check and preprocess `exog` argument in `ForecasterAutoregMultiSeries` and
  2370                    

In [None]:
import pandas as pd
import numpy as np
import time

def fun_original():
    """Placeholder for the baseline implementation to be timed."""
    pass

def fun_optimized_1():
    """Placeholder for the first optimized implementation to be timed."""
    pass

def time_call(label, func):
    """
    Run `func()` once, print its elapsed time and return it.

    Parameters
    ----------
    label : str
        Text placed before "execution time" in the printed message.
    func : callable
        Zero-argument callable to time.

    Returns
    -------
    float
        Elapsed seconds measured with `time.perf_counter`, a monotonic
        high-resolution clock intended for measuring short durations.

    Notes
    -----
    A `TypeError` raised by `func` is printed and swallowed so the timing is
    still reported; any other exception propagates.
    """
    start_time = time.perf_counter()
    try:
        func()
    except TypeError as e:
        print(e)
    finally:
        # Taken in `finally` so the clock stops on success and failure alike.
        elapsed = time.perf_counter() - start_time
    print(f"{label} execution time: {elapsed} seconds")
    return elapsed

original_time = time_call("Original code", fun_original)
optimized_1_time = time_call("Optimized code 1", fun_optimized_1)


Original code execution time: 9.608268737792969e-05 seconds
Optimized code 1 execution time: 7.390975952148438e-05 seconds


In [6]:
import numpy as np
import time
from sklearn.preprocessing import OrdinalEncoder

# Fast Ordinal Encoder implementation
class FastOrdinalEncoder:
    """
    Dictionary-backed ordinal encoder.

    After `fit`, `category_map` maps each unique value (in the sorted order
    produced by `np.unique`) to its integer position, and
    `inverse_category_map` holds the reverse mapping. Both are `None` until
    `fit` is called.
    """

    def __init__(self):
        self.category_map = None
        self.inverse_category_map = None

    def fit(self, data):
        """Learn the category <-> code mappings from the unique values of `data`."""
        levels = np.unique(data)
        code_to_level = dict(enumerate(levels))
        self.category_map = {level: code for code, level in code_to_level.items()}
        self.inverse_category_map = code_to_level

    def transform(self, data):
        """Look up the integer code of every element of `data`."""
        encode = np.vectorize(self.category_map.get)
        return encode(data)

    def fit_transform(self, data):
        """Fit on `data`, then return its encoded form."""
        self.fit(data)
        return self.transform(data)

    def inverse_transform(self, encoded_data):
        """Look up the original category of every code in `encoded_data`."""
        decode = np.vectorize(self.inverse_category_map.get)
        return decode(encoded_data)
    

import numpy as np

class UltraFastOrdinalEncoder:
    """
    Ordinal encoder backed by a sorted category array and binary search.

    `fit` stores the sorted unique values of the data in `categories`;
    `transform` locates each value with `np.searchsorted` and returns its
    position, or -1 when the value was not seen during `fit`.

    Attributes
    ----------
    categories : numpy.ndarray or None
        Sorted unique values learned by `fit`.
    category_map : dict or None
        Mapping category -> integer code (kept for inspection; not used by
        `transform`).
    inverse_category_map : dict or None
        Mapping integer code -> category (kept for inspection; not used by
        `inverse_transform`).
    """

    def __init__(self):
        self.category_map = None
        self.inverse_category_map = None
        self.categories = None

    def fit(self, data):
        """Learn the sorted unique categories present in `data`."""
        # np.unique returns the unique values already sorted, which is what
        # the binary search in `transform` requires.
        self.categories = np.unique(data)

        # Create category map: category -> integer
        self.category_map = {category: idx for idx, category in enumerate(self.categories)}

        # Create inverse category map: integer -> category
        self.inverse_category_map = {idx: category for idx, category in enumerate(self.categories)}

    def transform(self, data):
        """
        Encode `data` as integer codes; unknown categories are encoded as -1.

        `np.searchsorted` only returns an insertion point, so a value that was
        not seen during `fit` but sorts before the last category would get the
        code of a neighbouring category. Each hit is therefore verified
        against the stored category before being accepted.
        """
        data = np.asarray(data)
        n_categories = len(self.categories)
        if n_categories == 0:
            # Nothing was learned: every value is unknown.
            return np.full(data.shape, -1, dtype=np.intp)

        positions = np.searchsorted(self.categories, data)
        # Values greater than every category land at `n_categories`; clip so
        # the verification lookup below stays in bounds.
        positions = np.minimum(positions, n_categories - 1)
        is_known = self.categories[positions] == data
        return np.where(is_known, positions, -1)

    def fit_transform(self, data):
        """Fit and transform the data."""
        self.fit(data)
        return self.transform(data)

    def inverse_transform(self, encoded_data):
        """
        Map integer codes back to the original categories.

        Raises
        ------
        IndexError
            If any code is outside `[0, len(categories))`. This includes the
            -1 produced by `transform` for unknown values, which plain
            negative indexing would otherwise decode as the last category.
        """
        codes = np.asarray(encoded_data)
        if codes.size == 0:
            # An empty list converts to a float array, which cannot be used
            # as an index; force an integer dtype for the empty case.
            codes = codes.astype(np.intp)
        elif ((codes < 0) | (codes >= len(self.categories))).any():
            raise IndexError(
                "encoded_data contains codes outside the range of fitted "
                "categories (unknown values are encoded as -1 and cannot be "
                "inverse transformed)."
            )
        return self.categories[codes]


# Create a large synthetic dataset of strings (categories)
np.random.seed(42)
categories = ['red', 'green', 'blue', 'yellow', 'purple', 'orange', 'pink']
data = np.random.choice(categories, size=1000000)

# Timings use time.perf_counter(): unlike time.time() it is monotonic and has
# the highest available resolution, so the measurements are not distorted by
# system clock adjustments.

# Benchmark sklearn OrdinalEncoder (expects 2D input, hence the reshape)
start_time = time.perf_counter()
sklearn_encoder = OrdinalEncoder()
sklearn_encoded_data = sklearn_encoder.fit_transform(data.reshape(-1, 1))
sklearn_time = time.perf_counter() - start_time
print(f"Sklearn OrdinalEncoder time: {sklearn_time:.4f} seconds")

# Benchmark FastOrdinalEncoder
start_time = time.perf_counter()
fast_encoder = FastOrdinalEncoder()
fast_encoded_data = fast_encoder.fit_transform(data)
fast_time = time.perf_counter() - start_time
print(f"FastOrdinalEncoder time: {fast_time:.4f} seconds")

# Benchmark UltraFastOrdinalEncoder
start_time = time.perf_counter()
ultra_fast_encoder = UltraFastOrdinalEncoder()
ultra_fast_encoded_data = ultra_fast_encoder.fit_transform(data)
ultra_fast_time = time.perf_counter() - start_time
print(f"UltraFastOrdinalEncoder time: {ultra_fast_time:.4f} seconds")



Sklearn OrdinalEncoder time: 0.6688 seconds
FastOrdinalEncoder time: 0.2887 seconds
UltraFastOrdinalEncoder time: 0.1600 seconds
