In [7]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Make the parent of the current working directory importable from this
# notebook by adding it to the module search path.
path = str(Path.cwd().parent)
print(path)
# Inserted at index 1, i.e. right after the first existing sys.path entry.
sys.path.insert(1, path)

import numpy as np
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/home/joaquin/Documents/GitHub/skforecast


In [8]:
import pandas as pd
import warnings

def get_features_range(df: pd.DataFrame | pd.Series) -> dict:
    """
    Get a summary of the features in the DataFrame or Series. For numeric features,
    it returns the min and max values. For categorical features, it returns the
    unique values.

    Arguments
    ---------
    df : pd.DataFrame or pd.Series
        Input data to summarize.


    Returns
    -------
    dict
        Summary of the features in the input data.

    """
    if isinstance(df, pd.Series):
        df = df.to_frame()

    num_cols = [col for col, dt in df.dtypes.items() if np.issubdtype(dt, np.number)]
    cat_cols = [col for col in df.columns if col not in num_cols]
    
    features_ranges = {col: (df[col].min(), df[col].max()) for col in num_cols}
    features_ranges.update({col: set(df[col].dropna().unique()) for col in cat_cols})

    return features_ranges


def check_features_range(features_ranges: dict, X: pd.DataFrame | pd.Series) -> bool:
    """
    Check if there is any value outside the training range. For numeric features,
    it checks if the values are within the min and max range. For categorical features,
    it checks if the values are among the seen categories.

    Parameters
    ----------
    features_ranges : dict
        Output from get_feature_summary()
    X : pd.DataFrame or pd.Series
        New data to validate
    """

    if isinstance(X, pd.Series):
        X = X.to_frame()

    for col in set(X.columns).intersection(features_ranges.keys()):
        rule = features_ranges[col]
        if isinstance(rule, tuple):  # numeric
            if X[col].min() < rule[0] or X[col].max() > rule[1]:
                warnings.warn(
                    f"{col} has values outside the range seen during training: "
                    f"[{rule[0]:.5f}, {rule[1]:.5f}]. "
                    f"This may affect the accuracy of the predictions."
                )
        else:  # categorical
            if (~X[col].isin(rule)).any():
                warnings.warn(
                    f"{col} has values not seen during training: "
                    f"{rule}. This may affect the accuracy of the predictions."
                )

    return



# Demo: build synthetic training data, summarize it, and validate two small
# frames against that summary.

# Seeded generator so that Restart & Run All reproduces the same data.
rng = np.random.default_rng(123)
n_samples = 5000

# Five numeric features drawn uniformly from [0, 1) followed by five
# categorical features with levels A/B/C, assembled in a single constructor
# call instead of growing the frame column by column.
numeric_features = {f"feature_{i}": rng.random(n_samples) for i in range(5)}
categorical_features = {
    f"cat_feature_{i}": rng.choice(["A", "B", "C"], n_samples) for i in range(5)
}
df = pd.DataFrame({**numeric_features, **categorical_features})

# Every categorical level here is one of A/B/C, and 0.1 should fall inside
# the sampled numeric range.
new_data_valid = pd.DataFrame({
    'feature_0': [0.1, 0.1, 0.1, 0.1, 0.1],
    'cat_feature_0': ['A', 'B', 'C', 'A', 'B']
})

# 10 is above anything drawn from [0, 1) and 'D' is not one of the training
# levels, so both columns should trigger a warning.
new_data_invalid = pd.DataFrame({
    'feature_0': [0.1, 0.1, 0.1, 0.1, 10],
    'cat_feature_0': ['A', 'B', 'C', 'A', 'D']
})

ranges = get_features_range(df)
check_features_range(ranges, new_data_valid)
check_features_range(ranges, new_data_invalid)

