In [18]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Resolve the parent of the current working directory and put it on the import
# path (presumably so the local repository checkout can be imported from this
# notebook -- confirm against the project layout).
path = str(Path.cwd().parent)
print(path)
# Inserted at index 1, i.e. right after the first sys.path entry.
sys.path.insert(1, path)

import numpy as np
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/home/joaquin/Documents/GitHub/skforecast


In [5]:
import numpy as np
import pandas as pd
from typing import List, Optional
from scipy.spatial.distance import jensenshannon

class DriftDetector:
    """
    Univariate drift detector per feature using training histograms only (no raw data kept).

    fit():
      - Builds per-feature histograms (equal-width, ``bins`` bins) on numeric columns.
      - Precomputes per-bin lookup tables for:
        ks, jsd, kl, psi, p_bin and a fast Wasserstein-by-center approximation.

    predict(df: single-row DataFrame):
      - Bins each value, then returns metrics by simple array indexing.
      - Columns: ks, wasserstein, jsd, kl, psi, p_bin.

    Parameters
    ----------
    bins : int
        Number of equal-width bins per feature (positive integer; NumPy
        integer scalars are accepted and converted to ``int``).
    eps : float
        Smoothing constant used to keep probabilities strictly positive
        before taking logarithms.
    wd_mode : {"center", "exact"}
        How the Wasserstein column is computed at predict time.

    Notes
    - Only numeric columns are used for fitting. Same set must exist at predict time.
    - NaN and +/-inf training values are ignored when building the histograms.
    - A feature without any finite training value is marked invalid: every
      metric returned for it is NaN.
    - Non-finite values in the new observation yield NaN metrics for those features.
    - Values outside the training range are assigned to the first / last bin.
    - PSI uses the standard definition: sum((actual - expected) * ln(actual/expected)).
    - JSD is the Jensen-Shannon *distance* returned by
      ``scipy.spatial.distance.jensenshannon`` with its default (natural) log base.
    - Wasserstein default is a fast per-bin-center approximation (no raw values).
      Set wd_mode='exact' to compute E[|X - x0|] against centers at predict-time
      (still histogram-based; cost O(bins) per feature).
    """

    def __init__(self, bins: int = 20, eps: float = 1e-12, wd_mode: str = "center"):
        """
        Store the configuration and initialise the (still empty) learned state.

        Raises
        ------
        ValueError
            If ``bins`` is not a positive integer or ``wd_mode`` is unknown.
        """
        # np.integer is accepted so values such as np.int64(20) (e.g. coming
        # from a parameter grid) are not rejected.
        if not isinstance(bins, (int, np.integer)) or bins <= 0:
            raise ValueError("bins must be a positive integer")
        if wd_mode not in ("center", "exact"):
            raise ValueError("wd_mode must be 'center' or 'exact'")
        self.bins = int(bins)
        self.eps = eps
        self.wd_mode = wd_mode

        # Learned state
        self.features: Optional[List[str]] = None
        self.hist_array: Optional[np.ndarray] = None            # (n_feat, B)
        self.bin_edges_array: Optional[List[np.ndarray]] = None # list length n_feat, each (B+1,)
        self.bin_centers_array: Optional[np.ndarray] = None     # (n_feat, B)
        self.cdf_array: Optional[np.ndarray] = None             # (n_feat, B)

        # Lookups for O(1) predict (by bin index)
        self.ks_lookup: Optional[np.ndarray] = None             # (n_feat, B)
        self.jsd_lookup: Optional[np.ndarray] = None            # (n_feat, B)
        self.kl_lookup: Optional[np.ndarray] = None             # (n_feat, B)
        self.psi_lookup: Optional[np.ndarray] = None            # (n_feat, B)
        self.wd_lookup: Optional[np.ndarray] = None             # (n_feat, B) Wasserstein-by-center
        # p_bin is just hist_array

    def fit(self, df: pd.DataFrame):
        """
        Fit per-feature histograms on numeric columns and precompute lookups.

        Parameters
        ----------
        df : pd.DataFrame
            Training data. Non-numeric columns are ignored.

        Returns
        -------
        DriftDetector
            ``self``, to allow ``DriftDetector().fit(df)`` chaining.

        Raises
        ------
        ValueError
            If ``df`` has no numeric column.
        """
        num_features = df.select_dtypes(include=[np.number]).columns
        if len(num_features) == 0:
            # Without this guard np.stack fails below with an unrelated message.
            raise ValueError("df must contain at least one numeric column")
        self.features = list(num_features)
        B = self.bins

        hists, edges, centers, cdfs = [], [], [], []
        for col in self.features:
            values = np.asarray(df[col].dropna().to_numpy(), dtype=float)
            # dropna() keeps +/-inf, which np.histogram rejects (non-finite range).
            values = values[np.isfinite(values)]
            if values.size == 0:
                # Mark invalid feature with NaNs so predict returns NaNs
                hist = np.full(B, np.nan, dtype=float)
                bin_edges = np.linspace(0.0, 1.0, B + 1)
                cdf = np.full(B, np.nan, dtype=float)
            else:
                hist, bin_edges = np.histogram(values, bins=B)
                hist = hist.astype(float)
                hist = hist / (hist.sum() + self.eps)
                cdf = np.cumsum(hist)
            bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

            hists.append(hist)
            edges.append(bin_edges)
            centers.append(bin_centers)
            cdfs.append(cdf)

        self.hist_array = np.stack(hists, axis=0)                          # (F, B)
        self.bin_edges_array = edges
        self.bin_centers_array = np.stack(centers, axis=0)                 # (F, B)
        self.cdf_array = np.stack(cdfs, axis=0)                            # (F, B)

        F = self.hist_array.shape[0]
        # Rows whose histogram is all-NaN (no finite training value).
        invalid = ~np.isfinite(self.hist_array).all(axis=1)                # (F,)

        # Precompute KS per bin: max(CDF[k-1], 1 - CDF[k])
        cdf = self.cdf_array
        left = np.concatenate([np.zeros((F, 1)), cdf[:, :-1]], axis=1)     # CDF before step
        right_gap = 1.0 - cdf
        self.ks_lookup = np.maximum(left, right_gap)

        # Smoothing for q (one-hot with epsilon)
        denom = 1.0 + (B - 1) * self.eps
        q_k_val = 1.0 / denom
        q_other_val = self.eps / denom

        p = np.clip(self.hist_array, self.eps, None)                       # avoid zeros
        log_p = np.log(p)

        # KL(p || q_k) vectorized using constants
        log_q_k = np.log(q_k_val)
        log_q_other = np.log(q_other_val)
        sum_p_logp = np.nansum(p * log_p, axis=1, keepdims=True)           # (F,1)
        # kl[:, k] = sum p log p - [(1 - p_k)*log_q_other + p_k*log_q_k]
        # (NaN rows stay NaN through the p-dependent second term.)
        self.kl_lookup = sum_p_logp - ((1.0 - p) * log_q_other + p * log_q_k)

        # Precompute Wasserstein-by-center lookup (fast mode): wd[k] = sum_j |c_j - c_k| p_j
        wd = np.full((F, B), np.nan, dtype=float)
        for i in np.flatnonzero(~invalid):
            ci = self.bin_centers_array[i]
            diff = np.abs(ci[:, None] - ci[None, :])  # (B,B) where [j,k] = |c_j - c_k|
            # wd[i, k] = sum_j p_j * |c_j - c_k|
            wd[i, :] = p[i] @ diff
        self.wd_lookup = wd

        # Precompute JSD and PSI lookups (fit-time only, predict stays O(1)).
        # PSI(actual=q, expected=p): sum (q - p) * ln(q/p), where for each k
        # q_i = q_other_val (i != k) and q_k = q_k_val.
        psi = np.full((F, B), np.nan, dtype=float)
        jsd = np.full((F, B), np.nan, dtype=float)
        valid_rows = np.flatnonzero(~invalid)
        for k in range(B):
            q = np.full_like(p, q_other_val)  # (F,B)
            q[:, k] = q_k_val
            log_q = np.log(q)

            # PSI
            psi[:, k] = np.nansum((q - p) * (log_q - log_p), axis=1)

            # Jensen-Shannon distance (square root of the divergence, natural log)
            for i in valid_rows:
                jsd[i, k] = jensenshannon(p[i], q[i])

        # np.nansum turns an all-NaN row into 0.0, which would report "no drift"
        # for a feature that has no training data at all; restore the NaN marker.
        psi[invalid, :] = np.nan

        self.psi_lookup = psi
        self.jsd_lookup = jsd

        return self

    def _as_frame(self, ks, wd, jsd, kl, psi, p_bin) -> pd.DataFrame:
        """Assemble the per-feature metric vectors into the output DataFrame."""
        return pd.DataFrame(
            {"ks": ks, "wasserstein": wd, "jsd": jsd, "kl": kl, "psi": psi, "p_bin": p_bin},
            index=self.features,
        )

    def predict(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Compute drift metrics for a single-row DataFrame using O(1) lookups per feature.

        Parameters
        ----------
        df : pd.DataFrame
            Exactly one row containing (at least) every fitted feature.

        Returns
        -------
        pd.DataFrame
            Indexed by feature name, with columns
            ks, wasserstein, jsd, kl, psi, p_bin.

        Raises
        ------
        RuntimeError
            If the detector has not been fitted.
        ValueError
            If ``df`` does not have exactly one row or lacks a fitted feature.
        """
        if self.hist_array is None:
            raise RuntimeError("DriftDetector is not fitted. Call fit() first.")
        if len(df) != 1:
            raise ValueError("Only single-row dataframes are supported.")
        if self.features is None:
            raise RuntimeError("Missing fitted features state.")

        missing = set(self.features) - set(df.columns)
        if missing:
            raise ValueError(f"Missing required features: {sorted(missing)}")

        row = df[self.features].iloc[0].astype(float).values  # (F,)
        F, B = self.hist_array.shape

        # Init outputs; entries that are never filled stay NaN.
        ks = np.full(F, np.nan, dtype=float)
        wd = np.full(F, np.nan, dtype=float)
        jsd = np.full(F, np.nan, dtype=float)
        kl = np.full(F, np.nan, dtype=float)
        psi = np.full(F, np.nan, dtype=float)
        p_bin = np.full(F, np.nan, dtype=float)

        valid_idx = np.where(np.isfinite(row))[0]
        if valid_idx.size == 0:
            return self._as_frame(ks, wd, jsd, kl, psi, p_bin)

        # Compute bin index with searchsorted; clip to [0, B-1] so values outside
        # the training range (and the right-most edge itself) map to an edge bin.
        bin_idx = np.array([
            np.searchsorted(self.bin_edges_array[i], row[i], side="right") - 1
            for i in valid_idx
        ])
        bin_idx = np.clip(bin_idx, 0, B - 1)

        # Gather from lookups
        ks[valid_idx] = self.ks_lookup[valid_idx, bin_idx]
        jsd[valid_idx] = self.jsd_lookup[valid_idx, bin_idx]
        kl[valid_idx] = self.kl_lookup[valid_idx, bin_idx]
        psi[valid_idx] = self.psi_lookup[valid_idx, bin_idx]
        p_bin[valid_idx] = self.hist_array[valid_idx, bin_idx]

        # Wasserstein
        if self.wd_mode == "center":
            wd[valid_idx] = self.wd_lookup[valid_idx, bin_idx]
        else:
            # exact against x0 using centers and training p (still histogram-based)
            centers_valid = self.bin_centers_array[valid_idx]
            hist_valid = self.hist_array[valid_idx]
            wd[valid_idx] = np.sum(np.abs(centers_valid - row[valid_idx, None]) * hist_valid, axis=1)

        return self._as_frame(ks, wd, jsd, kl, psi, p_bin)

In [6]:
# Synthetic example: fit the detector on normally distributed training data and
# score one new observation. A seeded generator makes the printed metrics
# reproducible across kernel restarts.
SEED = 42
N_TRAIN = 10_000
rng = np.random.default_rng(SEED)

train_df = pd.DataFrame({
    "age": rng.normal(40, 10, N_TRAIN),
    "income": rng.normal(50000, 12000, N_TRAIN),
    "score": rng.normal(700, 50, N_TRAIN)
})

# Single observation to score against the training distribution.
new_df = pd.DataFrame({
    "age": [45],
    "income": [52000],
    "score": [720]
})

detector = DriftDetector(bins=20).fit(train_df)
metrics = detector.predict(new_df)
print(metrics)


            ks   wasserstein       jsd         kl        psi   p_bin
age     0.6803      9.543505  0.701133  21.724580  23.765570  0.1299
income  0.5405  10059.659374  0.688611  21.194249  23.115662  0.1464
score   0.5368     41.190276  0.687045  21.138187  23.045357  0.1485
