In [1]:
import os
# Opt in to SciPy's array API support. The flag is read when SciPy is imported,
# so it lives in its own cell ahead of every scikit-learn / SciPy import.
os.environ.update({"SCIPY_ARRAY_API": "1"})

In [2]:
import pandas as pd
import warnings
from inspect import signature
from sklearn.utils import all_estimators
from sklearn.exceptions import ConvergenceWarning
import threadpoolctl

from sample_weight_audit import check_weighted_repeated_estimator_fit_equivalence
from sample_weight_audit.sklearn_stochastic_params import STOCHASTIC_FIT_PARAMS 


# Silence the warnings that are expected while fitting many estimators in a row,
# so that the per-estimator report stays readable.
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)  # liblinear may not converge
warnings.filterwarnings(action="ignore", category=RuntimeWarning)  # AdaBoost divides by zero
# Cap OpenMP at a single thread: HistGradientBoostingClassifier is thrashing otherwise.
threadpoolctl.threadpool_limits(limits=1, user_api="openmp")

In [None]:
# Audit every scikit-learn classifier / regressor / transformer that is both
# stochastic (exposes `random_state`) and weight-aware (`fit` accepts
# `sample_weight`), and collect the per-estimator results in `results_df`.

# Number of repeated fits requested from the equivalence check.
N_STOCHASTIC_FITS = 30
# An estimator passes when its smallest p-value is strictly above this level.
P_VALUE_THRESHOLD = 0.05

results = []
# XXX: how to handle clustering estimators?
for est_name, est_class in all_estimators(
    type_filter=["classifier", "regressor", "transformer"]
):
    if "random_state" not in signature(est_class.__init__).parameters:
        # Skip estimators with a deterministic fit: they are better tested via
        # the check_sample_weight_equivalence_on_dense_data estimator check run
        # as part of the scikit-learn test suite.
        continue

    if "sample_weight" not in signature(est_class.fit).parameters:
        print(f"❌ {est_name} does not support sample_weight")
        continue

    print(f"Evaluating {est_name}")
    # Per-class constructor overrides; estimators without an entry use defaults.
    est = est_class(**STOCHASTIC_FIT_PARAMS.get(est_class, {}))

    # Only the check itself is guarded, so that a ValueError coming from the
    # reporting code below is never mistaken for a failed check.
    try:
        result = check_weighted_repeated_estimator_fit_equivalence(
            est,
            test_name="kstest",
            n_stochastic_fits=N_STOCHASTIC_FITS,
        )
    except ValueError as exc:
        # XXX: Use the deterministic check instead?
        # Presumably the check raises ValueError when different random states
        # lead to the same predictions, but any other ValueError (e.g. invalid
        # input for a given estimator) lands here as well: report the real
        # message instead of assuming the cause.
        print(f"❌ {est_name} could not be audited (ValueError: {exc})")
        continue

    pass_or_fail = "✅" if result.min_p_value > P_VALUE_THRESHOLD else "❌"
    print(
        f"{pass_or_fail} {est_name}: (min_p_value: {result.min_p_value:.3f}, "
        f"mean_p_value={result.mean_p_value:.3f})"
    )
    results.append(result)


# One row per successfully audited estimator; skipped or errored estimators
# are only reported in the printed log above.
results_df = pd.DataFrame([r.to_dict() for r in results])

Evaluating AdaBoostClassifier


100%|██████████| 30/30 [00:02<00:00, 10.44it/s]


❌ AdaBoostClassifier with different random states led to the same predictions
Evaluating AdaBoostRegressor


100%|██████████| 30/30 [00:02<00:00, 11.29it/s]


✅ AdaBoostRegressor: (min_p_value: 0.135, mean_p_value=0.724)
Evaluating BaggingClassifier


100%|██████████| 30/30 [00:00<00:00, 46.76it/s]


❌ BaggingClassifier: (min_p_value: 0.000, mean_p_value=0.324)
Evaluating BaggingRegressor


100%|██████████| 30/30 [00:00<00:00, 34.59it/s]


❌ BaggingRegressor: (min_p_value: 0.000, mean_p_value=0.214)
❌ BernoulliRBM does not support sample_weight
Evaluating BisectingKMeans


100%|██████████| 30/30 [00:00<00:00, 126.01it/s]


❌ BisectingKMeans with different random states led to the same predictions
❌ ClassifierChain does not support sample_weight
Evaluating DecisionTreeClassifier


100%|██████████| 30/30 [00:00<00:00, 265.80it/s]


✅ DecisionTreeClassifier: (min_p_value: 1.000, mean_p_value=1.000)
Evaluating DecisionTreeRegressor


100%|██████████| 30/30 [00:00<00:00, 334.03it/s]


❌ DecisionTreeRegressor: (min_p_value: 0.007, mean_p_value=0.435)
❌ DictionaryLearning does not support sample_weight
Evaluating DummyClassifier


100%|██████████| 30/30 [00:00<00:00, 533.87it/s]


✅ DummyClassifier: (min_p_value: 1.000, mean_p_value=1.000)
Evaluating ElasticNet


100%|██████████| 30/30 [00:00<00:00, 444.23it/s]


✅ ElasticNet: (min_p_value: 1.000, mean_p_value=1.000)
Evaluating ElasticNetCV


100%|██████████| 30/30 [00:01<00:00, 27.96it/s]


✅ ElasticNetCV: (min_p_value: 1.000, mean_p_value=1.000)
Evaluating ExtraTreeClassifier


100%|██████████| 30/30 [00:00<00:00, 621.85it/s]


✅ ExtraTreeClassifier: (min_p_value: 1.000, mean_p_value=1.000)
Evaluating ExtraTreeRegressor


100%|██████████| 30/30 [00:00<00:00, 692.62it/s]


✅ ExtraTreeRegressor: (min_p_value: 0.393, mean_p_value=0.752)
Evaluating ExtraTreesClassifier


100%|██████████| 30/30 [00:03<00:00,  9.03it/s]


✅ ExtraTreesClassifier: (min_p_value: 1.000, mean_p_value=1.000)
Evaluating ExtraTreesRegressor


100%|██████████| 30/30 [00:03<00:00,  8.94it/s]


✅ ExtraTreesRegressor: (min_p_value: 0.135, mean_p_value=0.629)
❌ FactorAnalysis does not support sample_weight
❌ FastICA does not support sample_weight
❌ GaussianProcessClassifier does not support sample_weight
❌ GaussianProcessRegressor does not support sample_weight
❌ GaussianRandomProjection does not support sample_weight
Evaluating GradientBoostingClassifier


100%|██████████| 30/30 [00:07<00:00,  4.00it/s]


✅ GradientBoostingClassifier: (min_p_value: 0.239, mean_p_value=0.932)
Evaluating GradientBoostingRegressor


100%|██████████| 30/30 [00:01<00:00, 21.49it/s]


✅ GradientBoostingRegressor: (min_p_value: 0.393, mean_p_value=0.833)
Evaluating HistGradientBoostingClassifier


100%|██████████| 30/30 [00:03<00:00,  9.88it/s]


❌ HistGradientBoostingClassifier: (min_p_value: 0.000, mean_p_value=0.055)
Evaluating HistGradientBoostingRegressor


100%|██████████| 30/30 [00:01<00:00, 27.25it/s]


❌ HistGradientBoostingRegressor: (min_p_value: 0.000, mean_p_value=0.092)
Evaluating KBinsDiscretizer
❌ KBinsDiscretizer with different random states led to the same predictions
Evaluating KMeans


100%|██████████| 30/30 [00:00<00:00, 359.23it/s]


❌ KMeans with different random states led to the same predictions
❌ KernelPCA does not support sample_weight
❌ Lars does not support sample_weight
Evaluating Lasso


100%|██████████| 30/30 [00:00<00:00, 303.76it/s]


✅ Lasso: (min_p_value: 1.000, mean_p_value=1.000)
Evaluating LassoCV


100%|██████████| 30/30 [00:00<00:00, 30.85it/s]


✅ LassoCV: (min_p_value: 1.000, mean_p_value=1.000)
❌ LassoLars does not support sample_weight
❌ LatentDirichletAllocation does not support sample_weight
Evaluating LinearSVC


100%|██████████| 30/30 [00:00<00:00, 53.27it/s]


❌ LinearSVC: (min_p_value: 0.000, mean_p_value=0.000)
Evaluating LinearSVR


100%|██████████| 30/30 [00:00<00:00, 333.55it/s]


❌ LinearSVR: (min_p_value: 0.000, mean_p_value=0.011)
❌ LocallyLinearEmbedding does not support sample_weight
Evaluating LogisticRegression


100%|██████████| 30/30 [00:00<00:00, 73.00it/s]


❌ LogisticRegression: (min_p_value: 0.000, mean_p_value=0.113)
Evaluating LogisticRegressionCV


 60%|██████    | 18/30 [02:17<01:32,  7.69s/it]

In [None]:
# Show the audited estimators ordered by ascending minimum p-value, i.e. the
# ones furthest below the pass threshold come first.
results_df.sort_values(by="min_p_value")