# Profiling

In [1]:
# Enable the IPython autoreload extension; mode 2 re-imports edited modules
# so changes to source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Add the parent of the current working directory to the import path, at
# position 1 (right after the first entry).
# NOTE(review): presumably this makes the local skforecast checkout
# importable from this notebook's folder — confirm.
sys.path.insert(1, str(Path.cwd().parent))
# Last expression of the cell: displays the directory that was just added.
str(Path.cwd().parent)

'c:\\Users\\jaesc2\\GitHub\\skforecast'

In [2]:
import platform
import psutil
import skforecast
import pandas as pd
import numpy as np
import scipy
import sklearn

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor

from skforecast.recursive import ForecasterRecursiveMultiSeries
from skforecast.model_selection import grid_search_forecaster_multiseries
from skforecast.model_selection import bayesian_search_forecaster_multiseries
from skforecast.model_selection import backtesting_forecaster_multiseries
from skforecast.utils import *

from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from skforecast.preprocessing import series_long_to_dict
from skforecast.preprocessing import exog_long_to_dict
from skforecast.datasets import fetch_dataset

%load_ext pyinstrument
%load_ext line_profiler

# System and library information

In [3]:
# Versions
# ==============================================================================
# Report the interpreter version and the versions of the libraries imported
# at the top of the notebook, so profiling results can be tied to a specific
# environment.
print(f"Python version: {platform.python_version()}")
print(f"scikit-learn version: {sklearn.__version__}")
print(f"skforecast version: {skforecast.__version__}")
print(f"pandas version: {pd.__version__}")
print(f"numpy version: {np.__version__}")
print(f"scipy version: {scipy.__version__}")
print(f"psutil version: {psutil.__version__}")
# Blank line separating the two sections of the report
print("")

# Computer information
# ==============================================================================
# Report hardware and operating-system details as returned by `platform`
# and `psutil`.
# Computer network name
print(f"Computer network name: {platform.node()}")
# Machine type
print(f"Machine type: {platform.machine()}")
# Processor type
print(f"Processor type: {platform.processor()}")
# Platform type
print(f"Platform type: {platform.platform()}")
# Operating system
print(f"Operating system: {platform.system()}")
# Operating system release
print(f"Operating system release: {platform.release()}")
# Operating system version
print(f"Operating system version: {platform.version()}")
# Physical cores (logical=False)
print(f"Number of physical cores: {psutil.cpu_count(logical=False)}")
# Logical cores (logical=True)
print(f"Number of logical cores: {psutil.cpu_count(logical=True)}")

Python version: 3.11.10
scikit-learn version: 1.6.1
skforecast version: 0.16.0
pandas version: 2.2.3
numpy version: 2.1.3
scipy version: 1.15.2
psutil version: 7.0.0

Computer network name: ITES015-NB0029
Machine type: AMD64
Processor type: Intel64 Family 6 Model 141 Stepping 1, GenuineIntel
Platform type: Windows-10-10.0.19045-SP0
Operating system: Windows
Operating system release: 10
Operating system version: 10.0.19045
Number of physical cores: 8
Number of logical cores: 16


In [None]:
import numpy as np
import time
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

class FastOrdinalEncoder:
    """
    Encode categorical values as an integer array, with integer values
    from 0 to n_categories - 1.

    This encoder mimics the behavior of sklearn's OrdinalEncoder but during the
    fit, categories are not learned from the data. Instead, the user must provide
    a list of unique categories. This is useful when the categories are known
    beforehand and the data is large.

    Parameters
    ----------

    Attributes
    ----------
    categories_ : np.ndarray
        Unique categories in the data.
    category_map_ : dict
        Mapping of categories to integers.
    inverse_category_map_ : dict
        Mapping of integers to categories.
    unknown_value : int | float, default=-1
        Value to use for unknown categories.
    
    """


    def __init__(self, unknown_value: int | float = -1):

        self.unknown_value = unknown_value
        self.categories_ = None
        self.category_map_ = None
        self.inverse_category_map_ = None
        
    def fit(self, categories: list | np.ndarray) -> None:
        """
        Fit the encoder using the provided categories.

        Parameters
        ----------
        categories : list | np.ndarray
            Unique categories used to fit the encoder.
        """

        if not isinstance(categories, (list, np.ndarray)):
            raise ValueError("Categories must be a list or numpy array.")
        if len(categories) == 0:
            raise ValueError("Categories cannot be empty.")

        self.categories_ = np.sort(categories)
        self.category_map_ = {category: idx for idx, category in enumerate(self.categories_)}
        self.inverse_category_map_ = {idx: category for idx, category in enumerate(self.categories_)}
    
    def transform(self, X: np.ndarray | pd.Series) -> pd.Series:
        """
        Transform the data to ordinal values using direct indexing.

        Parameters
        ----------
        X : np.ndarray | pd.Series
            Input data to transform.

        Returns
        -------
        pd.Series
            Transformed data with ordinal values.

        """

        if self.categories_ is None:
            raise ValueError(
                "The encoder has not been fitted yet. Call 'fit' before 'transform'."
            )
        if not isinstance(X, (np.ndarray, pd.Series)):
            raise ValueError("Input data must be a numpy array or pandas Series.")
        
        encoded_data = pd.Series(X).map(self.category_map_)

        return encoded_data

    
    def inverse_transform(self, X: np.ndarray | pd.Series) -> pd.Series:
        """
        Inverse transform the encoded data back to original categories.

        Parameters
        ----------
        X : np.ndarray | pd.Series
            Encoded data to inverse transform.

        Returns
        -------
        pd.Series
            Inverse transformed data with original categories.
        """

        if self.categories_ is None:
            raise ValueError(
                "The encoder has not been fitted yet. Call 'fit' before 'inverse_transform'."
            )
        if not isinstance(X, (np.ndarray, pd.Series)):
            raise ValueError("Input data must be a numpy array or pandas Series.")
        
        inverse_encoded_data = (
            pd.Series(X)
            .map(self.inverse_category_map_)
        )

        return inverse_encoded_data
    

# Create a large synthetic dataset of strings (categories): 500 distinct labels,
# each repeated 1000 times (500,000 rows in total).
# The seed is kept from the original cell although nothing below draws random
# numbers, so the global NumPy random state stays the same for any later cell.
np.random.seed(42)
categories = [f"category_{i}" for i in range(500)]
data = np.repeat(categories, 1000)
data_series = pd.Series(data)
data_df = pd.DataFrame(data, columns=['y'])

# Timings use time.perf_counter(): it is monotonic and has the highest
# available resolution, whereas time.time() follows the adjustable wall clock
# and can be too coarse for sub-second measurements.

# Benchmark sklearn OrdinalEncoder (categories are learned from the data)
start_time = time.perf_counter()
sklearn_encoder = OrdinalEncoder()
sklearn_encoded_data = sklearn_encoder.fit_transform(data_df)
sklearn_time = time.perf_counter() - start_time
print(f"Sklearn OrdinalEncoder time: {sklearn_time:.4f} seconds")

# Benchmark FastOrdinalEncoder (categories are supplied up front)
start_time = time.perf_counter()
fast_encoder = FastOrdinalEncoder()
fast_encoder.fit(categories)
fast_encoded_data = fast_encoder.transform(data_series)
fast_time = time.perf_counter() - start_time
print(f"FastOrdinalEncoder time: {fast_time:.4f} seconds")

# Check if the results are the same. sklearn returns a 2D array with one
# column, so it is flattened before the element-wise comparison.
assert np.array_equal(sklearn_encoded_data.flatten(), fast_encoded_data)
print("All encoders produced the same results.")

# Check that the inverse transform recovers the original labels
assert np.array_equal(data, fast_encoder.inverse_transform(fast_encoded_data))
print("Inverse transform works correctly.")

Sklearn OrdinalEncoder time: 0.1126 seconds
FastOrdinalEncoder time: 0.0297 seconds
All encoders produced the same results.
Inverse transform works correctly.


In [6]:
# Rebuild the synthetic categorical sample: 500 labels, each repeated 1000
# consecutive times, wrapped in a pandas Series.
categories = [f"category_{idx}" for idx in range(500)]
data = np.repeat(categories, repeats=1000)
data_series = pd.Series(data=data)
# Last expression of the cell: display the Series
data_series

0           category_0
1           category_0
2           category_0
3           category_0
4           category_0
              ...     
499995    category_499
499996    category_499
499997    category_499
499998    category_499
499999    category_499
Length: 500000, dtype: object