## scikit-learn sample_weight compliance report

This notebook runs compliance tests on all scikit-learn estimators. Estimators are inspected to check whether they are expected to have a stochastic fit or not. If the fit is stochastic, a dedicated statistical test is performed; otherwise, a deterministic estimator check is run instead.

In [1]:
import os
# Set in the very first cell, before any later cell imports sklearn.
# NOTE(review): presumably this opts SciPy into its array API support and has
# to be in the environment before SciPy is first imported -- confirm against
# the SciPy documentation.
os.environ["SCIPY_ARRAY_API"] = "1"

In [2]:
import sklearn

# Record the interpreter, dependency and threadpool versions next to the
# results so the report can be tied to a specific environment.
sklearn.show_versions()


System:
    python: 3.12.8 | packaged by conda-forge | (main, Dec  5 2024, 14:19:53) [Clang 18.1.8 ]
executable: /Users/ogrisel/miniforge3/envs/dev/bin/python
   machine: macOS-15.2-arm64-arm-64bit

Python dependencies:
      sklearn: 1.7.dev0
          pip: 25.0
   setuptools: 75.8.0
        numpy: 2.1.3
        scipy: 1.15.1
       Cython: 3.0.11
       pandas: 2.2.3
   matplotlib: 3.10.0
       joblib: 1.5.dev0
threadpoolctl: 3.5.0

Built with OpenMP: True

threadpoolctl info:
       user_api: openmp
   internal_api: openmp
    num_threads: 8
         prefix: libomp
       filepath: /Users/ogrisel/miniforge3/envs/dev/lib/libomp.dylib
        version: None


In [None]:
from inspect import signature
import traceback
import warnings
import pandas as pd
from sklearn.utils import all_estimators
from sklearn.utils.estimator_checks import check_sample_weight_equivalence_on_dense_data
from sklearn.exceptions import ConvergenceWarning
import threadpoolctl

from sample_weight_audit import check_weighted_repeated_estimator_fit_equivalence
from sample_weight_audit.exceptions import UnexpectedDeterministicPredictions
from sample_weight_audit.sklearn_stochastic_params import STOCHASTIC_FIT_PARAMS

# Cap OpenMP at a single thread for the rest of the session:
# HistGradientBoostingClassifier thrashes the OpenMP thread pool on repeated
# small fits.
threadpoolctl.threadpool_limits(limits=1, user_api="openmp")

# Silence warning categories that are expected noise during the audit.
for ignored_category in (
    RuntimeWarning,  # division by zero in AdaBoost
    ConvergenceWarning,  # liblinear can fail to converge
    UserWarning,  # KBinsDiscretizer with collapsed bins
):
    warnings.filterwarnings("ignore", category=ignored_category)

In [4]:
from sklearn.linear_model import LogisticRegressionCV

# Estimator classes excluded from the audit; each entry carries the reason
# for the exclusion as an inline comment.
ESTIMATORS_TO_SKIP = [
    LogisticRegressionCV,  # too slow and already somewhat tested by LogisticRegression
]

In [None]:
N_STOCHASTIC_FITS = 100  # forwarded as n_stochastic_fits to the statistical check
TEST_THRESHOLD = 0.05  # a p-value must be strictly above this to pass


# Result accumulators filled by the loop below:
# - statistical_test_results: result objects of the statistical check
# - deterministic_test_results: (estimator, exception or None) pairs
# - missing_sample_weight_support: names of estimators whose fit lacks sample_weight
# - errors: (estimator, exception) pairs for unexpected failures
statistical_test_results = []
deterministic_test_results = []
missing_sample_weight_support = []
errors = []


def _run_deterministic_check(est_name, est, **param_overrides):
    """Run the deterministic sample_weight equivalence check on ``est``.

    Parameters
    ----------
    est_name : str
        Name passed through to the sklearn estimator check.
    est : estimator instance
        Estimator to check.
    **param_overrides
        Optional parameters applied with ``est.set_params`` right before the
        check; a failure while applying them is reported like a check failure.

    Returns
    -------
    tuple
        ``(est, None)`` when the check passes, ``(est, exception)`` otherwise.
        A one-line status is printed in both cases.
    """
    try:
        checked_est = est.set_params(**param_overrides) if param_overrides else est
        check_sample_weight_equivalence_on_dense_data(est_name, checked_est)
        print(f"✅ {est} passed the deterministic check")
        return est, None
    except Exception as e:
        print(f"❌ {est} failed the deterministic check")
        return est, e


for est_name, est_class in all_estimators(
    type_filter=["classifier", "regressor", "cluster", "transformer"]
):
    if est_class in ESTIMATORS_TO_SKIP:
        print(f"Skipping {est_name}")
        continue

    if "sample_weight" not in signature(est_class.fit).parameters:
        print(f"⚠ {est_name} does not support sample_weight")
        missing_sample_weight_support.append(est_name)
        continue

    try:
        est = est_class(**STOCHASTIC_FIT_PARAMS.get(est_class, {}))
    except TypeError as e:
        # Only reported as a warning; not recorded in any of the result lists.
        print(f"⚠ {est_name} failed to instantiate: {e}")
        continue

    if "random_state" not in est.get_params():
        # No random_state parameter: run the deterministic check only.
        # TODO: leverage sklearn's PER_ESTIMATOR_CHECKS_PARAMS config to run
        # this check on valid parametrizations for the deterministic case.
        deterministic_test_results.append(_run_deterministic_check(est_name, est))
        continue

    print(f"Evaluating {est}")
    try:
        result = check_weighted_repeated_estimator_fit_equivalence(
            est,
            test_name="kstest",
            n_stochastic_fits=N_STOCHASTIC_FITS,
            random_state=0,
        )
        pass_or_fail = "✅" if result.p_value > TEST_THRESHOLD else "❌"
        print(
            f"{pass_or_fail} {est_name}: (p_value: {result.p_value:.3f})"
        )
        statistical_test_results.append(result)
    except UnexpectedDeterministicPredictions:
        # The estimator parametrization led to deterministic behavior, which is
        # unexpected. Run the deterministic check to investigate instead.
        print(f"⚠ {est_name} with different random states led to the same predictions")
        deterministic_test_results.append(
            _run_deterministic_check(est_name, est, random_state=0)
        )
    except Exception as e:
        print(f"❌ {est} error with: {e}")
        errors.append((est, e))

results_df = pd.DataFrame([r.to_dict() for r in statistical_test_results])

⚠ ARDRegression does not support sample_weight
Evaluating AdaBoostClassifier(estimator=DecisionTreeClassifier(max_features=0.5,
                                                    min_weight_fraction_leaf=0.1))


100%|██████████| 100/100 [00:07<00:00, 13.79it/s]


✅ AdaBoostClassifier: (p_value: 0.815)
Evaluating AdaBoostRegressor(estimator=DecisionTreeRegressor(max_features=0.5,
                                                  min_weight_fraction_leaf=0.1))


 46%|████▌     | 46/100 [00:02<00:03, 17.01it/s]

In [None]:
# Deterministic results are (estimator, error) pairs; error is None on success.
n_deterministic_passed = sum(
    error is None for _, error in deterministic_test_results
)
n_deterministic_failed = len(deterministic_test_results) - n_deterministic_passed

# A statistical result passes only when its p-value is strictly above the
# threshold. Failures are counted as the complement so that every result is
# counted exactly once, including a NaN p-value (which compares False both
# ways and would otherwise be counted as neither passed nor failed).
n_statistical_passed = sum(
    bool(r.p_value > TEST_THRESHOLD) for r in statistical_test_results
)
n_statistical_failed = len(statistical_test_results) - n_statistical_passed

print(f"✅ {n_deterministic_passed} passed the deterministic test")
print(f"❌ {n_deterministic_failed} failed the deterministic test")
print(f"✅ {n_statistical_passed} passed the statistical test")
print(f"❌ {n_statistical_failed} failed the statistical test")
print(f"❌ {len(errors)} other errors")
print(
    f"⚠ {len(missing_sample_weight_support)} estimators lack sample_weight "
    "support"
)
# Rebuilt here so the detail cells below work from the results collected so far.
results_df = pd.DataFrame([r.to_dict() for r in statistical_test_results])

## Details on the statistical test results

In [None]:
# Smallest p-values first, so the estimators failing the statistical test
# appear at the top of the table.
results_df[["estimator_name", "p_value"]].sort_values("p_value")

## Details on deterministic test errors

In [None]:
import sys

# Keep only the deterministic checks that recorded an exception, then print
# each one with its full traceback.
failed_deterministic_checks = [
    (est, exc) for est, exc in deterministic_test_results if exc is not None
]
for est, exc in failed_deterministic_checks:
    print(f"❌ {est}: {exc}")
    traceback.print_exception(exc, file=sys.stdout)
    print()

## Details on other errors

In [None]:
import sys

# Print every entry recorded in `errors` together with its full traceback,
# separated by a blank line.
for failed_est, exc in errors:
    print(f"❌ {failed_est}: {exc}")
    traceback.print_exception(exc, file=sys.stdout)
    print()

## List of estimators with missing sample_weight support

In [None]:
# One estimator name per line, in the order in which they were recorded.
for name in missing_sample_weight_support:
    print(name)