From 6b2ca6e8b10722b865973a204de141f995854946 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 15 Feb 2024 13:08:34 -0500 Subject: [PATCH 01/22] Draft of the metadata routing for bagging Signed-off-by: Adam Li --- doc/metadata_routing.rst | 4 +- doc/whats_new/v1.5.rst | 6 ++ sklearn/ensemble/_bagging.py | 63 +++++++++-- sklearn/ensemble/tests/test_bagging.py | 100 ++++++++++++++++++ .../test_metaestimators_metadata_routing.py | 4 - 5 files changed, 165 insertions(+), 12 deletions(-) diff --git a/doc/metadata_routing.rst b/doc/metadata_routing.rst index 658357d910464..7f2719606a4dd 100644 --- a/doc/metadata_routing.rst +++ b/doc/metadata_routing.rst @@ -274,6 +274,8 @@ Meta-estimators and functions supporting metadata routing: - :class:`sklearn.calibration.CalibratedClassifierCV` - :class:`sklearn.compose.ColumnTransformer` +- :class:`sklearn.ensemble.BaggingClassifier` +- :class:`sklearn.ensemble.BaggingRegressor` - :class:`sklearn.feature_selection.SelectFromModel` - :class:`sklearn.impute.IterativeImputer` - :class:`sklearn.linear_model.ElasticNetCV` @@ -306,8 +308,6 @@ Meta-estimators and tools not supporting metadata routing yet: - :class:`sklearn.covariance.GraphicalLassoCV` - :class:`sklearn.ensemble.AdaBoostClassifier` - :class:`sklearn.ensemble.AdaBoostRegressor` -- :class:`sklearn.ensemble.BaggingClassifier` -- :class:`sklearn.ensemble.BaggingRegressor` - :class:`sklearn.ensemble.StackingClassifier` - :class:`sklearn.ensemble.StackingRegressor` - :class:`sklearn.ensemble.VotingClassifier` diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index c0c96f14ae612..6157609512555 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -83,6 +83,12 @@ Changelog pre-sorting the data before finding the thresholds for binning. :pr:`28102` by :user:`Christian Lorentzen `. +- |Enhancement| :class:`ensemble.BaggingClassifier` and :class:`ensemble.BaggingRegressor` + now supports metadata routing. The fit methods now + accepts ``**fit_params`` which are passed to the underlying estimators + via their `fit` methods. + :pr:`` by :user:`Adam Li `. + :mod:`sklearn.feature_extraction` ................................. diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index da340ceec6fe4..d1d1ba3a97680 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -13,7 +13,7 @@ import numpy as np -from ..base import ClassifierMixin, RegressorMixin, _fit_context +from ..base import Bunch, ClassifierMixin, RegressorMixin, _fit_context from ..metrics import accuracy_score, r2_score from ..tree import DecisionTreeClassifier, DecisionTreeRegressor from ..utils import check_random_state, column_or_1d, indices_to_mask @@ -21,7 +21,11 @@ from ..utils._tags import _safe_tags from ..utils.metadata_routing import ( _raise_for_unsupported_routing, - _RoutingNotSupportedMixin, + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, ) from ..utils.metaestimators import available_if from ..utils.multiclass import check_classification_targets @@ -298,7 +302,7 @@ def __init__( # BaseBagging.estimator is not validated yet prefer_skip_nested_validation=False ) - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, **fit_params): """Build a Bagging ensemble of estimators from the training set (X, y). Parameters @@ -316,12 +320,35 @@ def fit(self, X, y, sample_weight=None): Note that this is supported only if the base estimator supports sample weighting. 
+ **fit_params : dict + Parameters to pass to the underlying estimators. + + .. versionadded:: 1.5 + + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + Returns ------- self : object Fitted estimator. """ - _raise_for_unsupported_routing(self, "fit", sample_weight=sample_weight) + _raise_for_params(self, "fit") + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + routed_params = Bunch() + for name in names: + routed_params[name] = Bunch(fit={}) + if "sample_weight" in fit_params: + routed_params[name].fit["sample_weight"] = fit_params[ + "sample_weight" + ] + # Convert data (X is required to be 2d and indexable) X, y = self._validate_data( X, @@ -537,8 +564,32 @@ def estimators_samples_(self): """ return [sample_indices for _, sample_indices in self._get_estimators_indices()] + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.5 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__) + + # `self.estimators` is a list of (name, est) tuples + for name, estimator in self.estimators: + router.add( + **{name: estimator}, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + return router + -class BaggingClassifier(_RoutingNotSupportedMixin, ClassifierMixin, BaseBagging): +class BaggingClassifier(ClassifierMixin, BaseBagging): """A Bagging classifier. A Bagging classifier is an ensemble meta-estimator that fits base @@ -970,7 +1021,7 @@ def _more_tags(self): return {"allow_nan": _safe_tags(estimator, "allow_nan")} -class BaggingRegressor(_RoutingNotSupportedMixin, RegressorMixin, BaseBagging): +class BaggingRegressor(RegressorMixin, BaseBagging): """A Bagging regressor. 
A Bagging regressor is an ensemble meta-estimator that fits base diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 2c1e308cee33b..5d7c4806ab105 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -5,12 +5,14 @@ # Author: Gilles Louppe # License: BSD 3 clause from itertools import cycle, product +import re import joblib import numpy as np import pytest from sklearn.base import BaseEstimator +from sklearn.calibration import CalibratedClassifierCV from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2 from sklearn.dummy import DummyClassifier, DummyRegressor from sklearn.ensemble import ( @@ -31,6 +33,12 @@ from sklearn.utils import check_random_state from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingRegressor, + _Registry, + check_recorded_metadata, +) rng = check_random_state(0) @@ -936,3 +944,95 @@ def fit(self, X, y): def test_bagging_allow_nan_tag(bagging, expected_allow_nan): """Check that bagging inherits allow_nan tag.""" assert bagging._get_tags()["allow_nan"] == expected_allow_nan + + +# Metadata Routing Tests +# ====================== + + +@pytest.mark.parametrize( + "Estimator, BaseEst", + [(BaggingClassifier, ConsumingClassifier), (BaggingRegressor, ConsumingRegressor)], +) +def test_routing_passed_metadata_not_supported(Estimator, BaseEst): + """Test that the right error message is raised when metadata is passed while + not supported when `enable_metadata_routing=False`.""" + + X = np.array([[0, 1], [2, 2], [4, 6]]) + y = [1, 2, 3] + + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + Estimator(estimator=BaseEst()).fit(X, y, sample_weight=[1, 1, 1], metadata="a") + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize( + "Estimator, BaseEst", + [(BaggingClassifier, ConsumingClassifier), (BaggingRegressor, ConsumingRegressor)], +) +def test_get_metadata_routing_without_fit(Estimator, BaseEst): + # Test that metadata_routing() doesn't raise when called before fit. 
+ est = Estimator(estimator=BaseEst()) + est.get_metadata_routing() + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize( + "Estimator, BaseEst", + [(BaggingClassifier, ConsumingClassifier), (BaggingRegressor, ConsumingRegressor)], +) +@pytest.mark.parametrize("prop", ["sample_weight", "metadata"]) +def test_metadata_routing_for_bagging_estimators(Estimator, BaseEst, prop): + """Test that metadata is routed correctly for Bagging*.""" + X = np.array([[0, 1], [2, 2], [4, 6]]) + y = [1, 2, 3] + sample_weight, metadata = [1, 1, 1], "a" + + est = Estimator( + estimator=BaseEst(registry=_Registry()).set_fit_request(**{prop: True}) + ) + + est.fit(X, y, **{prop: sample_weight if prop == "sample_weight" else metadata}) + + for estimator in est.estimators: + if prop == "sample_weight": + kwargs = {prop: sample_weight} + else: + kwargs = {prop: metadata} + # access sub-estimator in (name, est) with estimator[1] + registry = estimator[1].registry + assert len(registry) + for sub_est in registry: + check_recorded_metadata( + obj=sub_est, + method="fit", + **kwargs, + ) + + +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize( + "Estimator, BaseEst", + [(BaggingClassifier, ConsumingClassifier), (BaggingClassifier, ConsumingRegressor)], +) +def test_metadata_routing_error_for_bagging_estimators(Estimator, BaseEst): + """Test that the right error is raised when metadata is not requested.""" + X = np.array([[0, 1], [2, 2], [4, 6]]) + y = [1, 2, 3] + sample_weight, metadata = [1, 1, 1], "a" + + est = Estimator(estimator=BaseEst()) + + error_message = ( + "[sample_weight, metadata] are passed but are not explicitly set as requested" + f" or not for {BaseEst.__name__}.fit" + ) + + with pytest.raises(ValueError, match=re.escape(error_message)): + est.fit(X, y, sample_weight=sample_weight, metadata=metadata) + + +# End of Metadata Routing Tests +# ============================= diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 9aa305e11e6fd..880b6217ba38f 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -12,8 +12,6 @@ from sklearn.ensemble import ( AdaBoostClassifier, AdaBoostRegressor, - BaggingClassifier, - BaggingRegressor, StackingClassifier, StackingRegressor, VotingClassifier, @@ -336,8 +334,6 @@ def enable_slep006(): UNSUPPORTED_ESTIMATORS = [ AdaBoostClassifier(), AdaBoostRegressor(), - BaggingClassifier(), - BaggingRegressor(), FeatureUnion([]), GraphicalLassoCV(), RANSACRegressor(), From 6fc50fcc986c7d81f480256099f3c15026daeaf7 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 15 Feb 2024 16:46:30 -0500 Subject: [PATCH 02/22] Adding metadata routing Signed-off-by: Adam Li --- doc/whats_new/v1.5.rst | 2 +- sklearn/ensemble/_bagging.py | 111 +++++++++++++++-------- sklearn/ensemble/tests/test_bagging.py | 18 ++-- sklearn/tests/metadata_routing_common.py | 10 +- 4 files changed, 88 insertions(+), 53 deletions(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 6157609512555..38994f361dcc3 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -87,7 +87,7 @@ Changelog now supports metadata routing. The fit methods now accepts ``**fit_params`` which are passed to the underlying estimators via their `fit` methods. - :pr:`` by :user:`Adam Li `. + :pr:`28432` by :user:`Adam Li `. :mod:`sklearn.feature_extraction` ................................. 
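For context, the user-facing behaviour that the changelog entry above describes looks roughly like this (a minimal sketch, not part of the diff; the choice of `LogisticRegression` as base estimator is arbitrary, while `set_config` and `set_fit_request` are pre-existing scikit-learn routing APIs):

    import numpy as np
    from sklearn import set_config
    from sklearn.ensemble import BaggingClassifier
    from sklearn.linear_model import LogisticRegression

    # Opt in to SLEP006 metadata routing; without this, passing extra
    # fit params to BaggingClassifier.fit raises an error.
    set_config(enable_metadata_routing=True)

    X = np.array([[0.0, 1.0], [2.0, 2.0], [4.0, 6.0],
                  [1.0, 3.0], [3.0, 1.0], [5.0, 2.0]])
    y = np.array([0, 1, 0, 1, 0, 1])

    # The base estimator must explicitly request the metadata it consumes.
    base = LogisticRegression().set_fit_request(sample_weight=True)
    bagging = BaggingClassifier(estimator=base, n_estimators=3, random_state=0)

    # `sample_weight` is routed to each sub-estimator's `fit`, folded
    # together with the per-estimator bootstrap weights.
    bagging.fit(X, y, sample_weight=np.ones(len(y)))
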
diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index d1d1ba3a97680..88e2a74d9b8a4 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -13,17 +13,21 @@ import numpy as np -from ..base import Bunch, ClassifierMixin, RegressorMixin, _fit_context +from ..base import ClassifierMixin, RegressorMixin, _fit_context from ..metrics import accuracy_score, r2_score from ..tree import DecisionTreeClassifier, DecisionTreeRegressor -from ..utils import check_random_state, column_or_1d, indices_to_mask +from ..utils import ( + Bunch, + _safe_indexing, + check_random_state, + column_or_1d, + indices_to_mask, +) from ..utils._param_validation import HasMethods, Interval, RealNotInt from ..utils._tags import _safe_tags from ..utils.metadata_routing import ( - _raise_for_unsupported_routing, MetadataRouter, MethodMapping, - _raise_for_params, _routing_enabled, process_routing, ) @@ -31,7 +35,13 @@ from ..utils.multiclass import check_classification_targets from ..utils.parallel import Parallel, delayed from ..utils.random import sample_without_replacement -from ..utils.validation import _check_sample_weight, check_is_fitted, has_fit_parameter +from ..utils.validation import ( + _check_method_params, + _check_sample_weight, + _deprecate_positional_args, + check_is_fitted, + has_fit_parameter, +) from ._base import BaseEnsemble, _partition_estimators __all__ = ["BaggingClassifier", "BaggingRegressor"] @@ -81,11 +91,11 @@ def _parallel_build_estimators( ensemble, X, y, - sample_weight, seeds, total_n_estimators, verbose, check_input, + fit_params, ): """Private function used to build a batch of estimators within a job.""" # Retrieve settings @@ -94,17 +104,16 @@ def _parallel_build_estimators( max_samples = ensemble._max_samples bootstrap = ensemble.bootstrap bootstrap_features = ensemble.bootstrap_features - support_sample_weight = has_fit_parameter(ensemble.estimator_, "sample_weight") has_check_input = has_fit_parameter(ensemble.estimator_, "check_input") requires_feature_indexing = bootstrap_features or max_features != n_features - if not support_sample_weight and sample_weight is not None: - raise ValueError("The base estimator doesn't support sample weight") - # Build estimators estimators = [] estimators_features = [] + # Row sampling can be achieved either through setting sample_weight or + # by indexing. The former is more efficient. Therefore, use this method + # if possible, otherwise use indexing. for i in range(n_estimators): if verbose > 1: print( @@ -131,12 +140,14 @@ def _parallel_build_estimators( max_samples, ) + # TODO(SLEP6): remove if condition for unrouted sample_weight when metadata + # routing can't be disabled. 
# Draw samples, using sample weights, and then fit - if support_sample_weight: - if sample_weight is None: - curr_sample_weight = np.ones((n_samples,)) - else: - curr_sample_weight = sample_weight.copy() + if not _routing_enabled() and "sample_weight" in fit_params: + # row subsampling via sample_weight + curr_sample_weight = _check_sample_weight( + fit_params.pop("sample_weight", None), X + ).copy() if bootstrap: sample_counts = np.bincount(indices, minlength=n_samples) @@ -146,10 +157,15 @@ def _parallel_build_estimators( curr_sample_weight[not_indices_mask] = 0 X_ = X[:, features] if requires_feature_indexing else X - estimator_fit(X_, y, sample_weight=curr_sample_weight) + estimator_fit(X_, y, sample_weight=curr_sample_weight, **fit_params) else: - X_ = X[indices][:, features] if requires_feature_indexing else X[indices] - estimator_fit(X_, y[indices]) + # cannot use sample_weight, use indexing + y_ = _safe_indexing(y, indices) + X_ = _safe_indexing(X, indices) + fit_params = _check_method_params(X, params=fit_params, indices=indices) + if requires_feature_indexing: + X_ = X_[:, features] + estimator_fit(X_, y_, **fit_params) estimators.append(estimator) estimators_features.append(features) @@ -298,11 +314,15 @@ def __init__( self.random_state = random_state self.verbose = verbose + # TODO(1.7): remove `sample_weight` from the signature after deprecation + # cycle; pop it from `fit_params` before the `_raise_for_params` check and + # reinsert later, for backwards compatibility + @_deprecate_positional_args(version="1.7") @_fit_context( # BaseBagging.estimator is not validated yet prefer_skip_nested_validation=False ) - def fit(self, X, y, **fit_params): + def fit(self, X, y, *, sample_weight=None, **fit_params): """Build a Bagging ensemble of estimators from the training set (X, y). Parameters @@ -336,18 +356,12 @@ def fit(self, X, y, **fit_params): self : object Fitted estimator. """ - _raise_for_params(self, "fit") - - if _routing_enabled(): - routed_params = process_routing(self, "fit", **fit_params) - else: - routed_params = Bunch() - for name in names: - routed_params[name] = Bunch(fit={}) - if "sample_weight" in fit_params: - routed_params[name].fit["sample_weight"] = fit_params[ - "sample_weight" - ] + if fit_params and not _routing_enabled(): + raise ValueError( + "params is only supported if enable_metadata_routing=True. See the User" + " Guide at https://scikit-learn.org/stable/metadata_routing.html for" + " more information." + ) # Convert data (X is required to be 2d and indexable) X, y = self._validate_data( @@ -358,7 +372,10 @@ def fit(self, X, y, **fit_params): force_all_finite=False, multi_output=True, ) - return self._fit(X, y, self.max_samples, sample_weight=sample_weight) + + return self._fit( + X, y, self.max_samples, sample_weight=sample_weight, **fit_params + ) def _parallel_args(self): return {} @@ -371,6 +388,7 @@ def _fit( max_depth=None, sample_weight=None, check_input=True, + **fit_params, ): """Build a Bagging ensemble of estimators from the training set (X, y). @@ -400,6 +418,12 @@ def _fit( check_input : bool, default=True Override value used when fitting base estimator. Only supported if the base estimator has a check_input parameter for fit function. + If the meta-estimator already checks the input, set this value to + False to prevent redundant input validation. + + **fit_params : dict, default=None + Parameters to pass to the :term:`fit` method of the underlying + estimator. 
Returns ------- @@ -419,6 +443,16 @@ def _fit( # Check parameters self._validate_estimator() + if _routing_enabled(): + routed_params = process_routing( + self, "fit", sample_weight=sample_weight, **fit_params + ) + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit=fit_params) + if "sample_weight" != None: + routed_params.estimator.fit["sample_weight"] = sample_weight + if max_depth is not None: self.estimator_.max_depth = max_depth @@ -501,11 +535,11 @@ def _fit( self, X, y, - sample_weight, seeds[starts[i] : starts[i + 1]], total_n_estimators, verbose=self.verbose, check_input=check_input, + fit_params=routed_params.estimator.fit, ) for i in range(n_jobs) ) @@ -579,13 +613,10 @@ def get_metadata_routing(self): routing information. """ router = MetadataRouter(owner=self.__class__.__name__) - - # `self.estimators` is a list of (name, est) tuples - for name, estimator in self.estimators: - router.add( - **{name: estimator}, - method_mapping=MethodMapping().add(callee="fit", caller="fit"), - ) + router.add( + estimator=self.estimator, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) return router diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 5d7c4806ab105..3b755edd5f41b 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -4,15 +4,14 @@ # Author: Gilles Louppe # License: BSD 3 clause -from itertools import cycle, product import re +from itertools import cycle, product import joblib import numpy as np import pytest from sklearn.base import BaseEstimator -from sklearn.calibration import CalibratedClassifierCV from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2 from sklearn.dummy import DummyClassifier, DummyRegressor from sklearn.ensemble import ( @@ -29,16 +28,16 @@ from sklearn.preprocessing import FunctionTransformer, scale from sklearn.random_projection import SparseRandomProjection from sklearn.svm import SVC, SVR -from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor -from sklearn.utils import check_random_state -from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal -from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS from sklearn.tests.metadata_routing_common import ( ConsumingClassifier, ConsumingRegressor, _Registry, check_recorded_metadata, ) +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS rng = check_random_state(0) @@ -988,7 +987,7 @@ def test_metadata_routing_for_bagging_estimators(Estimator, BaseEst, prop): """Test that metadata is routed correctly for Bagging*.""" X = np.array([[0, 1], [2, 2], [4, 6]]) y = [1, 2, 3] - sample_weight, metadata = [1, 1, 1], "a" + sample_weight, metadata = [1.0, 1.0, 1.0], "a" est = Estimator( estimator=BaseEst(registry=_Registry()).set_fit_request(**{prop: True}) @@ -996,13 +995,12 @@ def test_metadata_routing_for_bagging_estimators(Estimator, BaseEst, prop): est.fit(X, y, **{prop: sample_weight if prop == "sample_weight" else metadata}) - for estimator in est.estimators: + for estimator in est.estimators_: if prop == "sample_weight": kwargs = {prop: sample_weight} else: kwargs = {prop: metadata} - # access sub-estimator in (name, est) with estimator[1] - registry = estimator[1].registry + registry = estimator.registry 
assert len(registry) for sub_est in registry: check_recorded_metadata( diff --git a/sklearn/tests/metadata_routing_common.py b/sklearn/tests/metadata_routing_common.py index e330cd3960aeb..c075cbca9acbe 100644 --- a/sklearn/tests/metadata_routing_common.py +++ b/sklearn/tests/metadata_routing_common.py @@ -1,6 +1,7 @@ from functools import partial import numpy as np +from numpy.testing import assert_array_equal from sklearn.base import ( BaseEstimator, @@ -56,7 +57,9 @@ def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): of the original values. """ records = getattr(obj, "_records", dict()).get(method, dict()) - assert set(kwargs.keys()) == set(records.keys()) + assert set(kwargs.keys()) == set( + records.keys() + ), f"Expected {kwargs.keys()} vs {records.keys()}" for key, value in kwargs.items(): recorded_value = records[key] # The following condition is used to check for any specified parameters @@ -64,7 +67,10 @@ def check_recorded_metadata(obj, method, split_params=tuple(), **kwargs): if key in split_params and recorded_value is not None: assert np.isin(recorded_value, value).all() else: - assert recorded_value is value + if isinstance(recorded_value, np.ndarray): + assert_array_equal(recorded_value, value) + else: + assert recorded_value is value, f"Expected {recorded_value} vs {value}" record_metadata_not_default = partial(record_metadata, record_default=False) From a14913a04f5b8da3acbe7ddf14f809ba5d26394d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 15 Feb 2024 16:51:46 -0500 Subject: [PATCH 03/22] Raise for params Signed-off-by: Adam Li --- sklearn/ensemble/_bagging.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 88e2a74d9b8a4..49400fe42f92c 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -30,6 +30,7 @@ MethodMapping, _routing_enabled, process_routing, + _raise_for_params, ) from ..utils.metaestimators import available_if from ..utils.multiclass import check_classification_targets @@ -356,12 +357,7 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): self : object Fitted estimator. """ - if fit_params and not _routing_enabled(): - raise ValueError( - "params is only supported if enable_metadata_routing=True. See the User" - " Guide at https://scikit-learn.org/stable/metadata_routing.html for" - " more information." 
- ) + _raise_for_params(fit_params, self, "fit") # Convert data (X is required to be 2d and indexable) X, y = self._validate_data( From 351f2b29e2f47ef3e0bcf5728dd633a2859a60f4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 15 Feb 2024 16:55:20 -0500 Subject: [PATCH 04/22] Fix lint Signed-off-by: Adam Li --- sklearn/ensemble/_bagging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 49400fe42f92c..eef5e53977bc6 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -28,9 +28,9 @@ from ..utils.metadata_routing import ( MetadataRouter, MethodMapping, + _raise_for_params, _routing_enabled, process_routing, - _raise_for_params, ) from ..utils.metaestimators import available_if from ..utils.multiclass import check_classification_targets From 52e99a938605e4b9561cf8625274b9377f8dbed4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 15 Feb 2024 22:54:45 -0500 Subject: [PATCH 05/22] Address final test Signed-off-by: Adam Li --- sklearn/ensemble/_bagging.py | 76 +++++++++++++++++++++++--- sklearn/ensemble/tests/test_bagging.py | 14 +++-- 2 files changed, 78 insertions(+), 12 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index eef5e53977bc6..8749c741922b6 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -31,6 +31,7 @@ _raise_for_params, _routing_enabled, process_routing, + get_routing_for_object, ) from ..utils.metaestimators import available_if from ..utils.multiclass import check_classification_targets @@ -87,6 +88,14 @@ def _generate_bagging_indices( return feature_indices, sample_indices +def _supports_sample_weight(request_or_router): + """Check if the fit method supports sample_weight""" + param_names = request_or_router._get_param_names( + method="fit", return_alias=True, ignore_self_request=False + ) + return "sample_weight" in param_names + + def _parallel_build_estimators( n_estimators, ensemble, @@ -105,6 +114,7 @@ def _parallel_build_estimators( max_samples = ensemble._max_samples bootstrap = ensemble.bootstrap bootstrap_features = ensemble.bootstrap_features + support_sample_weight = has_fit_parameter(ensemble.estimator_, "sample_weight") has_check_input = has_fit_parameter(ensemble.estimator_, "check_input") requires_feature_indexing = bootstrap_features or max_features != n_features @@ -112,6 +122,17 @@ def _parallel_build_estimators( estimators = [] estimators_features = [] + request_or_router = get_routing_for_object(ensemble.estimator_) + + if ( + not _supports_sample_weight(request_or_router) + and fit_params.get("sample_weight") is not None + ): + raise ValueError( + "The base estimator doesn't support sample weight, but sample_weight is " + "passed to the fit method." + ) + # Row sampling can be achieved either through setting sample_weight or # by indexing. The former is more efficient. Therefore, use this method # if possible, otherwise use indexing. @@ -125,6 +146,7 @@ def _parallel_build_estimators( random_state = seeds[i] estimator = ensemble._make_estimator(append=False, random_state=random_state) + records = getattr(estimator, "_records", dict()).get('fit', dict()) if has_check_input: estimator_fit = partial(estimator.fit, check_input=check_input) else: @@ -141,13 +163,14 @@ def _parallel_build_estimators( max_samples, ) + fit_params_ = fit_params.copy() # TODO(SLEP6): remove if condition for unrouted sample_weight when metadata # routing can't be disabled. 
# Draw samples, using sample weights, and then fit - if not _routing_enabled() and "sample_weight" in fit_params: + if support_sample_weight: # row subsampling via sample_weight curr_sample_weight = _check_sample_weight( - fit_params.pop("sample_weight", None), X + fit_params_.pop("sample_weight", None), X ).copy() if bootstrap: @@ -157,17 +180,26 @@ def _parallel_build_estimators( not_indices_mask = ~indices_to_mask(indices, n_samples) curr_sample_weight[not_indices_mask] = 0 + fit_params_["sample_weight"] = curr_sample_weight X_ = X[:, features] if requires_feature_indexing else X - estimator_fit(X_, y, sample_weight=curr_sample_weight, **fit_params) + if not _routing_enabled(): + estimator_fit(X_, y, sample_weight=curr_sample_weight) + else: + estimator_fit(X_, y, **fit_params_) else: + # if sample_weight is not supported in the Bagging estimator, pop it from fit_params + fit_params_.pop("sample_weight", None) + # cannot use sample_weight, use indexing y_ = _safe_indexing(y, indices) X_ = _safe_indexing(X, indices) - fit_params = _check_method_params(X, params=fit_params, indices=indices) + fit_params_ = _check_method_params(X, params=fit_params_, indices=indices) if requires_feature_indexing: X_ = X_[:, features] - estimator_fit(X_, y_, **fit_params) + estimator_fit(X_, y_, **fit_params_) + records = getattr(estimator, "_records", dict()).get('fit', dict()) + print('Next here...' , records) estimators.append(estimator) estimators_features.append(features) @@ -615,6 +647,10 @@ def get_metadata_routing(self): ) return router + @abstractmethod + def _get_estimator(self): + """Resolve which estimator to return.""" + class BaggingClassifier(ClassifierMixin, BaseBagging): """A Bagging classifier. @@ -815,7 +851,21 @@ def __init__( def _validate_estimator(self): """Check the estimator and set the estimator_ attribute.""" - super()._validate_estimator(default=DecisionTreeClassifier()) + super()._validate_estimator(default=self._get_estimator()) + + def _get_estimator(self): + """Resolve which estimator to return (default is DecisionTreeClassifier)""" + estimator = self.estimator + + if estimator is None: + # we want all classifiers that don't expose a random_state + # to be deterministic (and we don't want to expose this one). + estimator = DecisionTreeClassifier() + + if _routing_enabled(): + estimator.set_fit_request(sample_weight=True) + + return estimator def _set_oob_score(self, X, y): n_samples = y.shape[0] @@ -1282,7 +1332,7 @@ def predict(self, X): def _validate_estimator(self): """Check the estimator and set the estimator_ attribute.""" - super()._validate_estimator(default=DecisionTreeRegressor()) + super()._validate_estimator(default=self._get_estimator()) def _set_oob_score(self, X, y): n_samples = y.shape[0] @@ -1318,3 +1368,15 @@ def _more_tags(self): else: estimator = self.estimator return {"allow_nan": _safe_tags(estimator, "allow_nan")} + + def _get_estimator(self): + """Resolve which estimator to return (default is DecisionTreeClassifier)""" + estimator = self.estimator + + if estimator is None: + # we want all classifiers that don't expose a random_state + # to be deterministic (and we don't want to expose this one). 
+            estimator = DecisionTreeRegressor()
+        if _routing_enabled():
+            estimator.set_fit_request(sample_weight=True)
+        return estimator
diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py
index 3b755edd5f41b..737efb06e71e1 100644
--- a/sklearn/ensemble/tests/test_bagging.py
+++ b/sklearn/ensemble/tests/test_bagging.py
@@ -38,6 +38,8 @@
 from sklearn.utils import check_random_state
 from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal
 from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
+from sklearn.utils.validation import has_fit_parameter
+
 
 rng = check_random_state(0)
 
@@ -993,6 +995,7 @@ def test_metadata_routing_for_bagging_estimators(Estimator, BaseEst, prop):
         estimator=BaseEst(registry=_Registry()).set_fit_request(**{prop: True})
     )
 
+    print({prop: sample_weight if prop == "sample_weight" else metadata})
     est.fit(X, y, **{prop: sample_weight if prop == "sample_weight" else metadata})
 
     for estimator in est.estimators_:
@@ -1003,11 +1006,12 @@ def test_metadata_routing_for_bagging_estimators(Estimator, BaseEst, prop):
         registry = estimator.registry
         assert len(registry)
         for sub_est in registry:
-            check_recorded_metadata(
-                obj=sub_est,
-                method="fit",
-                **kwargs,
-            )
+            if not has_fit_parameter(sub_est, "sample_weight"):
+                check_recorded_metadata(
+                    obj=sub_est,
+                    method="fit",
+                    **kwargs,
+                )
 
 
 @pytest.mark.usefixtures("enable_slep006")

From 9558c18697ebbffefee921be17e19db6eff08099 Mon Sep 17 00:00:00 2001
From: Adam Li 
Date: Thu, 15 Feb 2024 23:06:31 -0500
Subject: [PATCH 06/22] Fix lint

Signed-off-by: Adam Li 
---
 sklearn/ensemble/_bagging.py           | 11 ++++++-----
 sklearn/ensemble/tests/test_bagging.py |  1 -
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py
index 8749c741922b6..bd8156f7f687e 100644
--- a/sklearn/ensemble/_bagging.py
+++ b/sklearn/ensemble/_bagging.py
@@ -30,8 +30,8 @@
     MethodMapping,
     _raise_for_params,
     _routing_enabled,
-    process_routing,
     get_routing_for_object,
+    process_routing,
 )
 from ..utils.metaestimators import available_if
 from ..utils.multiclass import check_classification_targets
@@ -146,7 +146,7 @@ def _parallel_build_estimators(
         random_state = seeds[i]
         estimator = ensemble._make_estimator(append=False, random_state=random_state)
 
-        records = getattr(estimator, "_records", dict()).get('fit', dict())
+        records = getattr(estimator, "_records", dict()).get("fit", dict())
         if has_check_input:
             estimator_fit = partial(estimator.fit, check_input=check_input)
         else:
@@ -187,7 +187,8 @@ def _parallel_build_estimators(
         else:
-            # if sample_weight is not supported in the Bagging estimator, pop it from fit_params
+            # if sample_weight is not supported in the Bagging estimator, pop it
+            # from fit_params
             fit_params_.pop("sample_weight", None)
 
             # cannot use sample_weight, use indexing
@@ -198,8 +199,8 @@ def _parallel_build_estimators(
                 X_ = X_[:, features]
             estimator_fit(X_, y_, **fit_params_)
 
-            records = getattr(estimator, "_records", dict()).get('fit', dict())
-            print('Next here...'
, records) + records = getattr(estimator, "_records", dict()).get("fit", dict()) + print("Next here...", records) estimators.append(estimator) estimators_features.append(features) diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 737efb06e71e1..01bf2991eabd3 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -40,7 +40,6 @@ from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS from sklearn.utils.validation import has_fit_parameter - rng = check_random_state(0) # also load the iris dataset From fc77399ee6cddc5aa8e874d1741b82a1c4bc82d1 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Feb 2024 10:06:36 -0500 Subject: [PATCH 07/22] Fix iforest and print statements Signed-off-by: Adam Li --- sklearn/ensemble/_bagging.py | 1 - sklearn/ensemble/_iforest.py | 12 +++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index bd8156f7f687e..e702bb91682c2 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -200,7 +200,6 @@ def _parallel_build_estimators( estimator_fit(X_, y_, **fit_params_) records = getattr(estimator, "_records", dict()).get("fit", dict()) - print("Next here...", records) estimators.append(estimator) estimators_features.append(features) diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index c975f121798f0..870bd9cfd9b6c 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -232,9 +232,7 @@ def __init__( warm_start=False, ): super().__init__( - estimator=ExtraTreeRegressor( - max_features=1, splitter="random", random_state=random_state - ), + estimator=None, # here above max_features has no links with self.max_features bootstrap=bootstrap, bootstrap_features=False, @@ -249,6 +247,14 @@ def __init__( self.contamination = contamination + def _get_estimator(self): + return ExtraTreeRegressor( + # here max_features has no links with self.max_features + max_features=1, + splitter="random", + random_state=self.random_state, + ) + def _set_oob_score(self, X, y): raise NotImplementedError("OOB score not supported by iforest") From e7a392cf1f389bff590cf0278057245e7d900ecd Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Feb 2024 10:10:23 -0500 Subject: [PATCH 08/22] Fix lint Signed-off-by: Adam Li --- sklearn/ensemble/_bagging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index e702bb91682c2..cf9ec3a3bd460 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -146,7 +146,7 @@ def _parallel_build_estimators( random_state = seeds[i] estimator = ensemble._make_estimator(append=False, random_state=random_state) - records = getattr(estimator, "_records", dict()).get("fit", dict()) + getattr(estimator, "_records", dict()).get("fit", dict()) if has_check_input: estimator_fit = partial(estimator.fit, check_input=check_input) else: @@ -199,7 +199,7 @@ def _parallel_build_estimators( X_ = X_[:, features] estimator_fit(X_, y_, **fit_params_) - records = getattr(estimator, "_records", dict()).get("fit", dict()) + getattr(estimator, "_records", dict()).get("fit", dict()) estimators.append(estimator) estimators_features.append(features) From 9d23eb2f5ca74b6242188335424c9ed1ff786c97 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Feb 2024 10:29:12 -0500 Subject: [PATCH 09/22] Fix bagging and isolation forest Signed-off-by: Adam 
Li --- sklearn/ensemble/_bagging.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index cf9ec3a3bd460..134c5c4bde7c9 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -469,7 +469,7 @@ def _fit( y = self._validate_y(y) # Check parameters - self._validate_estimator() + self._validate_estimator(self._get_estimator()) if _routing_enabled(): routed_params = process_routing( @@ -849,10 +849,6 @@ def __init__( verbose=verbose, ) - def _validate_estimator(self): - """Check the estimator and set the estimator_ attribute.""" - super()._validate_estimator(default=self._get_estimator()) - def _get_estimator(self): """Resolve which estimator to return (default is DecisionTreeClassifier)""" estimator = self.estimator @@ -1330,10 +1326,6 @@ def predict(self, X): return y_hat - def _validate_estimator(self): - """Check the estimator and set the estimator_ attribute.""" - super()._validate_estimator(default=self._get_estimator()) - def _set_oob_score(self, X, y): n_samples = y.shape[0] From f0a93e58c5a23e4fcc2b1b6169f17244858b0ea4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 17 Feb 2024 12:17:03 -0500 Subject: [PATCH 10/22] Fix test Signed-off-by: Adam Li --- sklearn/ensemble/_bagging.py | 50 ++++++++++---------------- sklearn/ensemble/tests/test_bagging.py | 26 ++++++++++++-- 2 files changed, 43 insertions(+), 33 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 134c5c4bde7c9..ddeb299b6e2bb 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -164,10 +164,17 @@ def _parallel_build_estimators( ) fit_params_ = fit_params.copy() + # TODO(SLEP6): remove if condition for unrouted sample_weight when metadata # routing can't be disabled. + # 1. If routing is not enabled, we will check if the base + # estimator supports sample_weight and use it if it does. + # 2. If routing is enabled, we will check if the routing supports sample + # weight and use it if it does. 
# Draw samples, using sample weights, and then fit - if support_sample_weight: + if (_routing_enabled() and "sample_weight" in fit_params_) or ( + not _routing_enabled() and support_sample_weight + ): # row subsampling via sample_weight curr_sample_weight = _check_sample_weight( fit_params_.pop("sample_weight", None), X @@ -182,15 +189,8 @@ def _parallel_build_estimators( fit_params_["sample_weight"] = curr_sample_weight X_ = X[:, features] if requires_feature_indexing else X - if not _routing_enabled(): - estimator_fit(X_, y, sample_weight=curr_sample_weight) - else: - estimator_fit(X_, y, **fit_params_) + estimator_fit(X_, y, **fit_params_) else: - # if sample_weight is not supported in the Bagging estimator, pop it - # from fit_params - fit_params_.pop("sample_weight", None) - # cannot use sample_weight, use indexing y_ = _safe_indexing(y, indices) X_ = _safe_indexing(X, indices) @@ -401,9 +401,11 @@ def fit(self, X, y, *, sample_weight=None, **fit_params): multi_output=True, ) - return self._fit( - X, y, self.max_samples, sample_weight=sample_weight, **fit_params - ) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=None) + fit_params["sample_weight"] = sample_weight + + return self._fit(X, y, max_samples=self.max_samples, **fit_params) def _parallel_args(self): return {} @@ -414,7 +416,6 @@ def _fit( y, max_samples=None, max_depth=None, - sample_weight=None, check_input=True, **fit_params, ): @@ -438,11 +439,6 @@ def _fit( Override value used when constructing base estimator. Only supported if the base estimator has a max_depth parameter. - sample_weight : array-like of shape (n_samples,), default=None - Sample weights. If None, then samples are equally weighted. - Note that this is supported only if the base estimator supports - sample weighting. - check_input : bool, default=True Override value used when fitting base estimator. Only supported if the base estimator has a check_input parameter for fit function. @@ -460,9 +456,6 @@ def _fit( """ random_state = check_random_state(self.random_state) - if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X, dtype=None) - # Remap output n_samples = X.shape[0] self._n_samples = n_samples @@ -472,14 +465,14 @@ def _fit( self._validate_estimator(self._get_estimator()) if _routing_enabled(): - routed_params = process_routing( - self, "fit", sample_weight=sample_weight, **fit_params - ) + routed_params = process_routing(self, "fit", **fit_params) else: routed_params = Bunch() routed_params.estimator = Bunch(fit=fit_params) - if "sample_weight" != None: - routed_params.estimator.fit["sample_weight"] = sample_weight + if "sample_weight" in fit_params: + routed_params.estimator.fit["sample_weight"] = fit_params[ + "sample_weight" + ] if max_depth is not None: self.estimator_.max_depth = max_depth @@ -858,9 +851,6 @@ def _get_estimator(self): # to be deterministic (and we don't want to expose this one). estimator = DecisionTreeClassifier() - if _routing_enabled(): - estimator.set_fit_request(sample_weight=True) - return estimator def _set_oob_score(self, X, y): @@ -1369,6 +1359,4 @@ def _get_estimator(self): # we want all classifiers that don't expose a random_state # to be deterministic (and we don't want to expose this one). 
estimator = DecisionTreeRegressor() - if _routing_enabled(): - estimator.set_fit_request(sample_weight=True) return estimator diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 01bf2991eabd3..09d5c8f994004 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -978,6 +978,24 @@ def test_get_metadata_routing_without_fit(Estimator, BaseEst): est.get_metadata_routing() +@pytest.mark.usefixtures("enable_slep006") +@pytest.mark.parametrize( + "Estimator", + [BaggingClassifier, BaggingRegressor], +) +def test_get_metadata_routing_with_default_estimator(Estimator): + """Test that metadata routing is off by default estimator.""" + X = np.array([[0, 1], [2, 2], [4, 6]]) + y = [1, 2, 3] + sample_weight = [1.0, 1.0, 1.0] + est = Estimator(estimator=None, n_estimators=2, random_state=0) + + with pytest.raises( + TypeError, match="got unexpected argument\(s\) {'sample_weight'}" + ): + est.fit(X, y, sample_weight=sample_weight) + + @pytest.mark.usefixtures("enable_slep006") @pytest.mark.parametrize( "Estimator, BaseEst", @@ -994,7 +1012,6 @@ def test_metadata_routing_for_bagging_estimators(Estimator, BaseEst, prop): estimator=BaseEst(registry=_Registry()).set_fit_request(**{prop: True}) ) - print({prop: sample_weight if prop == "sample_weight" else metadata}) est.fit(X, y, **{prop: sample_weight if prop == "sample_weight" else metadata}) for estimator in est.estimators_: @@ -1005,7 +1022,12 @@ def test_metadata_routing_for_bagging_estimators(Estimator, BaseEst, prop): registry = estimator.registry assert len(registry) for sub_est in registry: - if not has_fit_parameter(sub_est, "sample_weight"): + # When sample weight is passed in, the sub-estimators use + # bootstrapping to sample from the dataset, so we can't check + # that sample weight is the same within all sub-estimators. + # + # Note: We can check all other metadata is the same though. 
+            if prop != "sample_weight":
                 check_recorded_metadata(
                     obj=sub_est,
                     method="fit",

From 294c8f179c0737a3e003a62fba6f28cda8541a2e Mon Sep 17 00:00:00 2001
From: Adam Li 
Date: Sat, 17 Feb 2024 12:17:21 -0500
Subject: [PATCH 11/22] Fix lint

Signed-off-by: Adam Li 
---
 sklearn/ensemble/tests/test_bagging.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py
index 09d5c8f994004..6f15dc95b7761 100644
--- a/sklearn/ensemble/tests/test_bagging.py
+++ b/sklearn/ensemble/tests/test_bagging.py
@@ -38,7 +38,6 @@
 from sklearn.utils import check_random_state
 from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal
 from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
-from sklearn.utils.validation import has_fit_parameter
 
 rng = check_random_state(0)
 
@@ -991,7 +990,7 @@ def test_get_metadata_routing_with_default_estimator(Estimator):
     est = Estimator(estimator=None, n_estimators=2, random_state=0)
 
     with pytest.raises(
-        TypeError, match="got unexpected argument\(s\) {'sample_weight'}"
+        TypeError, match=r"got unexpected argument\(s\) {'sample_weight'}"
     ):
         est.fit(X, y, sample_weight=sample_weight)
 

From a675a658929b31ccea0b57f0cd40f944756e3970 Mon Sep 17 00:00:00 2001
From: Adam Li 
Date: Mon, 19 Feb 2024 10:10:53 -0500
Subject: [PATCH 12/22] Apply suggestions from code review

Co-authored-by: Adrin Jalali 
---
 sklearn/ensemble/_bagging.py | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py
index ddeb299b6e2bb..a75460e9553fa 100644
--- a/sklearn/ensemble/_bagging.py
+++ b/sklearn/ensemble/_bagging.py
@@ -635,7 +635,7 @@ def get_metadata_routing(self):
         """
         router = MetadataRouter(owner=self.__class__.__name__)
         router.add(
-            estimator=self.estimator,
+            estimator=self._get_estimator(),
             method_mapping=MethodMapping().add(callee="fit", caller="fit"),
         )
         return router
@@ -844,14 +844,7 @@ def __init__(
 
     def _get_estimator(self):
         """Resolve which estimator to return (default is DecisionTreeClassifier)"""
-        estimator = self.estimator
-
-        if estimator is None:
-            # we want all classifiers that don't expose a random_state
-            # to be deterministic (and we don't want to expose this one).
-            estimator = DecisionTreeClassifier()
-
-        return estimator
+        return self.estimator or DecisionTreeClassifier()
 
     def _set_oob_score(self, X, y):
         n_samples = y.shape[0]
@@ -1353,10 +1346,4 @@ def _more_tags(self):
 
     def _get_estimator(self):
         """Resolve which estimator to return (default is DecisionTreeRegressor)"""
-        estimator = self.estimator
-
-        if estimator is None:
-            # we want all classifiers that don't expose a random_state
-            # to be deterministic (and we don't want to expose this one).
-            estimator = DecisionTreeRegressor()
-        return estimator
+        return self.estimator or DecisionTreeRegressor()

From 134a60f548b7b6eea3eb51303d32bd6af8390fab Mon Sep 17 00:00:00 2001
From: Adam Li 
Date: Mon, 19 Feb 2024 10:56:19 -0500
Subject: [PATCH 13/22] Address adrin comments

Signed-off-by: Adam Li 
---
 doc/whats_new/v1.5.rst                      | 11 +++---
 sklearn/ensemble/_bagging.py                | 39 +++++++------------
 sklearn/ensemble/tests/test_bagging.py      |  3 +-
 .../_middle_term_computer.pyx.tp            |  8 +---
 .../_radius_neighbors.pyx.tp                |  7 +---
 .../test_metaestimators_metadata_routing.py | 18 +++++++++
 6 files changed, 42 insertions(+), 44 deletions(-)

diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst
index d9a196aba349e..9ff336807449d 100644
--- a/doc/whats_new/v1.5.rst
+++ b/doc/whats_new/v1.5.rst
@@ -42,6 +42,11 @@ more details.
 - |Feature| :class:`impute.IterativeImputer` now supports metadata routing in
   its `fit` method. :pr:`28187` by :user:`Stefanie Senger `.
 
+- |Feature| :class:`ensemble.BaggingClassifier` and :class:`ensemble.BaggingRegressor`
+  now support metadata routing. The `fit` methods now
+  accept ``**fit_params``, which are passed to the underlying estimators
+  via their `fit` methods.
+  :pr:`28432` by :user:`Adam Li ` and :user:`Benjamin Bossan `.
 
 Changelog
 ---------
@@ -83,12 +88,6 @@ Changelog
   pre-sorting the data before finding the thresholds for binning.
   :pr:`28102` by :user:`Christian Lorentzen `.
 
-- |Enhancement| :class:`ensemble.BaggingClassifier` and :class:`ensemble.BaggingRegressor`
-  now supports metadata routing. The fit methods now
-  accepts ``**fit_params`` which are passed to the underlying estimators
-  via their `fit` methods.
-  :pr:`28432` by :user:`Adam Li `.
-
 :mod:`sklearn.feature_extraction`
 .................................
 
diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py
index a75460e9553fa..cfb9621b3d35d 100644
--- a/sklearn/ensemble/_bagging.py
+++ b/sklearn/ensemble/_bagging.py
@@ -88,14 +88,6 @@ def _generate_bagging_indices(
     return feature_indices, sample_indices
 
 
-def _supports_sample_weight(request_or_router):
-    """Check if the fit method supports sample_weight"""
-    param_names = request_or_router._get_param_names(
-        method="fit", return_alias=True, ignore_self_request=False
-    )
-    return "sample_weight" in param_names
-
-
 def _parallel_build_estimators(
     n_estimators,
     ensemble,
@@ -124,18 +116,16 @@ def _parallel_build_estimators(
 
     request_or_router = get_routing_for_object(ensemble.estimator_)
 
-    if (
-        not _supports_sample_weight(request_or_router)
-        and fit_params.get("sample_weight") is not None
+    # TODO: (slep6) remove if condition for unrouted sample_weight when metadata
+    # routing can't be disabled.
+    if not _routing_enabled() and (
+        not support_sample_weight and fit_params.get("sample_weight") is not None
     ):
         raise ValueError(
             "The base estimator doesn't support sample weight, but sample_weight is "
            "passed to the fit method."
         )
 
-    # Row sampling can be achieved either through setting sample_weight or
-    # by indexing. The former is more efficient. Therefore, use this method
-    # if possible, otherwise use indexing.
for i in range(n_estimators): if verbose > 1: print( @@ -146,7 +136,6 @@ def _parallel_build_estimators( random_state = seeds[i] estimator = ensemble._make_estimator(append=False, random_state=random_state) - getattr(estimator, "_records", dict()).get("fit", dict()) if has_check_input: estimator_fit = partial(estimator.fit, check_input=check_input) else: @@ -171,11 +160,14 @@ def _parallel_build_estimators( # estimator supports sample_weight and use it if it does. # 2. If routing is enabled, we will check if the routing supports sample # weight and use it if it does. - # Draw samples, using sample weights, and then fit - if (_routing_enabled() and "sample_weight" in fit_params_) or ( - not _routing_enabled() and support_sample_weight - ): - # row subsampling via sample_weight + + # Note: Row sampling can be achieved either through setting sample_weight or + # by indexing. The former is more efficient. Therefore, use this method + # if possible, otherwise use indexing. + if ( + _routing_enabled() and request_or_router.consumes("fit", ("sample_weight",)) + ) or (not _routing_enabled() and support_sample_weight): + # Draw sub samples, using sample weights, and then fit curr_sample_weight = _check_sample_weight( fit_params_.pop("sample_weight", None), X ).copy() @@ -191,7 +183,7 @@ def _parallel_build_estimators( X_ = X[:, features] if requires_feature_indexing else X estimator_fit(X_, y, **fit_params_) else: - # cannot use sample_weight, use indexing + # cannot use sample_weight, so use indexing y_ = _safe_indexing(y, indices) X_ = _safe_indexing(X, indices) fit_params_ = _check_method_params(X, params=fit_params_, indices=indices) @@ -199,7 +191,6 @@ def _parallel_build_estimators( X_ = X_[:, features] estimator_fit(X_, y_, **fit_params_) - getattr(estimator, "_records", dict()).get("fit", dict()) estimators.append(estimator) estimators_features.append(features) @@ -844,7 +835,7 @@ def __init__( def _get_estimator(self): """Resolve which estimator to return (default is DecisionTreeClassifier)""" - return self.estimator or DecisionTreeClassifier() + return self.estimator or DecisionTreeClassifier() def _set_oob_score(self, X, y): n_samples = y.shape[0] @@ -1346,4 +1337,4 @@ def _more_tags(self): def _get_estimator(self): """Resolve which estimator to return (default is DecisionTreeClassifier)""" - return self.estimator or DecisionTreeRegressor() + return self.estimator or DecisionTreeRegressor() diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 6f15dc95b7761..a8f0f4f973d65 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -990,7 +990,8 @@ def test_get_metadata_routing_with_default_estimator(Estimator): est = Estimator(estimator=None, n_estimators=2, random_state=0) with pytest.raises( - TypeError, match=r"got unexpected argument\(s\) {'sample_weight'}" + Exception, + match=r"\[sample_weight\] are passed but are not explicitly set as requested", ): est.fit(X, y, sample_weight=sample_weight) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp index bfe465394c4d2..1fca2d674720c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp @@ -14,6 +14,7 @@ implementation_specific_values = [ }} from libcpp.vector cimport vector +from libcpp.algorithm cimport fill from 
...utils._cython_blas cimport ( BLAS_Order, @@ -25,13 +26,6 @@ from ...utils._cython_blas cimport ( ) from ...utils._typedefs cimport float64_t, float32_t, int32_t, intp_t -# TODO: change for `libcpp.algorithm.fill` once Cython 3 is used -# Introduction in Cython: -# -# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa -cdef extern from "" namespace "std" nogil: - void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa - import numpy as np from scipy.sparse import issparse, csr_matrix diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index 8695baad172e0..cd25b010cc81b 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -4,6 +4,7 @@ import warnings from libcpp.memory cimport shared_ptr, make_shared from libcpp.vector cimport vector +from libcpp.algorithm cimport move from cython cimport final from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange @@ -19,12 +20,6 @@ from ...utils.fixes import threadpool_limits cnp.import_array() -# TODO: change for `libcpp.algorithm.move` once Cython 3 is used -# Introduction in Cython: -# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 #noqa -cdef extern from "" namespace "std" nogil: - OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first) except + #noqa - ###################### cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 880b6217ba38f..9d03e2dec7052 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -12,6 +12,8 @@ from sklearn.ensemble import ( AdaBoostClassifier, AdaBoostRegressor, + BaggingClassifier, + BaggingRegressor, StackingClassifier, StackingRegressor, VotingClassifier, @@ -296,6 +298,22 @@ def enable_slep006(): "y": y, "estimator_routing_methods": ["fit"], }, + { + "metaestimator": BaggingClassifier, + "estimator_name": "estimator", + "estimator": ConsumingClassifier, + "X": X, + "y": y, + "estimator_routing_methods": ["fit"], + }, + { + "metaestimator": BaggingRegressor, + "estimator_name": "estimator", + "estimator": ConsumingRegressor, + "X": X, + "y": y, + "estimator_routing_methods": ["fit"], + }, ] """List containing all metaestimators to be tested and their settings From 2ffd7b7616116a15d768556b25078ab610d5f606 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 19 Feb 2024 17:53:15 -0500 Subject: [PATCH 14/22] Address adrin's comment Signed-off-by: Adam Li --- sklearn/ensemble/_bagging.py | 2 +- sklearn/ensemble/tests/test_bagging.py | 84 ++----------------- .../test_metaestimators_metadata_routing.py | 2 + 3 files changed, 8 insertions(+), 80 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index cfb9621b3d35d..41205aa3e6d22 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -106,7 +106,6 @@ def _parallel_build_estimators( max_samples = ensemble._max_samples bootstrap = ensemble.bootstrap bootstrap_features = ensemble.bootstrap_features - support_sample_weight = has_fit_parameter(ensemble.estimator_, 
"sample_weight") has_check_input = has_fit_parameter(ensemble.estimator_, "check_input") requires_feature_indexing = bootstrap_features or max_features != n_features @@ -118,6 +117,7 @@ def _parallel_build_estimators( # TODO: (slep6) remove if condition for unrouted sample_weight when metadata # routing can't be disabled. + support_sample_weight = has_fit_parameter(ensemble.estimator_, "sample_weight") if not _routing_enabled() and ( not support_sample_weight and fit_params.get("sample_weight") is not None ): diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index a8f0f4f973d65..b24f0ebb25ee8 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -4,7 +4,6 @@ # Author: Gilles Louppe # License: BSD 3 clause -import re from itertools import cycle, product import joblib @@ -945,57 +944,6 @@ def test_bagging_allow_nan_tag(bagging, expected_allow_nan): assert bagging._get_tags()["allow_nan"] == expected_allow_nan -# Metadata Routing Tests -# ====================== - - -@pytest.mark.parametrize( - "Estimator, BaseEst", - [(BaggingClassifier, ConsumingClassifier), (BaggingRegressor, ConsumingRegressor)], -) -def test_routing_passed_metadata_not_supported(Estimator, BaseEst): - """Test that the right error message is raised when metadata is passed while - not supported when `enable_metadata_routing=False`.""" - - X = np.array([[0, 1], [2, 2], [4, 6]]) - y = [1, 2, 3] - - with pytest.raises( - ValueError, match="is only supported if enable_metadata_routing=True" - ): - Estimator(estimator=BaseEst()).fit(X, y, sample_weight=[1, 1, 1], metadata="a") - - -@pytest.mark.usefixtures("enable_slep006") -@pytest.mark.parametrize( - "Estimator, BaseEst", - [(BaggingClassifier, ConsumingClassifier), (BaggingRegressor, ConsumingRegressor)], -) -def test_get_metadata_routing_without_fit(Estimator, BaseEst): - # Test that metadata_routing() doesn't raise when called before fit. - est = Estimator(estimator=BaseEst()) - est.get_metadata_routing() - - -@pytest.mark.usefixtures("enable_slep006") -@pytest.mark.parametrize( - "Estimator", - [BaggingClassifier, BaggingRegressor], -) -def test_get_metadata_routing_with_default_estimator(Estimator): - """Test that metadata routing is off by default estimator.""" - X = np.array([[0, 1], [2, 2], [4, 6]]) - y = [1, 2, 3] - sample_weight = [1.0, 1.0, 1.0] - est = Estimator(estimator=None, n_estimators=2, random_state=0) - - with pytest.raises( - Exception, - match=r"\[sample_weight\] are passed but are not explicitly set as requested", - ): - est.fit(X, y, sample_weight=sample_weight) - - @pytest.mark.usefixtures("enable_slep006") @pytest.mark.parametrize( "Estimator, BaseEst", @@ -1003,7 +951,11 @@ def test_get_metadata_routing_with_default_estimator(Estimator): ) @pytest.mark.parametrize("prop", ["sample_weight", "metadata"]) def test_metadata_routing_for_bagging_estimators(Estimator, BaseEst, prop): - """Test that metadata is routed correctly for Bagging*.""" + """Test that metadata is routed correctly for Bagging*. + + Bagging differs in that it should preserve all meta-data except for + ``sample_weight``. 
+ """ X = np.array([[0, 1], [2, 2], [4, 6]]) y = [1, 2, 3] sample_weight, metadata = [1.0, 1.0, 1.0], "a" @@ -1033,29 +985,3 @@ def test_metadata_routing_for_bagging_estimators(Estimator, BaseEst, prop): method="fit", **kwargs, ) - - -@pytest.mark.usefixtures("enable_slep006") -@pytest.mark.parametrize( - "Estimator, BaseEst", - [(BaggingClassifier, ConsumingClassifier), (BaggingClassifier, ConsumingRegressor)], -) -def test_metadata_routing_error_for_bagging_estimators(Estimator, BaseEst): - """Test that the right error is raised when metadata is not requested.""" - X = np.array([[0, 1], [2, 2], [4, 6]]) - y = [1, 2, 3] - sample_weight, metadata = [1, 1, 1], "a" - - est = Estimator(estimator=BaseEst()) - - error_message = ( - "[sample_weight, metadata] are passed but are not explicitly set as requested" - f" or not for {BaseEst.__name__}.fit" - ) - - with pytest.raises(ValueError, match=re.escape(error_message)): - est.fit(X, y, sample_weight=sample_weight, metadata=metadata) - - -# End of Metadata Routing Tests -# ============================= diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 9d03e2dec7052..2cffa5125e3c2 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -304,6 +304,7 @@ def enable_slep006(): "estimator": ConsumingClassifier, "X": X, "y": y, + "preserves_metadata": False, "estimator_routing_methods": ["fit"], }, { @@ -312,6 +313,7 @@ def enable_slep006(): "estimator": ConsumingRegressor, "X": X, "y": y, + "preserves_metadata": False, "estimator_routing_methods": ["fit"], }, ] From e2dbc63d0343bf4bfcb44a8fb250a9c8d9533d1d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 19 Feb 2024 18:00:06 -0500 Subject: [PATCH 15/22] clean up test to make pass Signed-off-by: Adam Li --- sklearn/tests/test_metaestimators_metadata_routing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index 2cffa5125e3c2..fa0b8045d5f66 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -1,6 +1,6 @@ import copy import re - +from typing import Iterable import numpy as np import pytest From 00625668fd8ae82d90495952025d4c5a12097e26 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 19 Feb 2024 19:57:58 -0500 Subject: [PATCH 16/22] Fix lint Signed-off-by: Adam Li --- sklearn/tests/test_metaestimators_metadata_routing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_metaestimators_metadata_routing.py b/sklearn/tests/test_metaestimators_metadata_routing.py index fa0b8045d5f66..2cffa5125e3c2 100644 --- a/sklearn/tests/test_metaestimators_metadata_routing.py +++ b/sklearn/tests/test_metaestimators_metadata_routing.py @@ -1,6 +1,6 @@ import copy import re -from typing import Iterable + import numpy as np import pytest From 7dd4fb5dab61c6dfd5e3aac5d29667f9774467a3 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 20 Feb 2024 10:45:00 -0500 Subject: [PATCH 17/22] Cleanup Signed-off-by: Adam Li --- sklearn/ensemble/tests/test_bagging.py | 49 -------------------------- 1 file changed, 49 deletions(-) diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index b24f0ebb25ee8..2c1e308cee33b 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ 
b/sklearn/ensemble/tests/test_bagging.py @@ -27,12 +27,6 @@ from sklearn.preprocessing import FunctionTransformer, scale from sklearn.random_projection import SparseRandomProjection from sklearn.svm import SVC, SVR -from sklearn.tests.metadata_routing_common import ( - ConsumingClassifier, - ConsumingRegressor, - _Registry, - check_recorded_metadata, -) from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils import check_random_state from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal @@ -942,46 +936,3 @@ def fit(self, X, y): def test_bagging_allow_nan_tag(bagging, expected_allow_nan): """Check that bagging inherits allow_nan tag.""" assert bagging._get_tags()["allow_nan"] == expected_allow_nan - - -@pytest.mark.usefixtures("enable_slep006") -@pytest.mark.parametrize( - "Estimator, BaseEst", - [(BaggingClassifier, ConsumingClassifier), (BaggingRegressor, ConsumingRegressor)], -) -@pytest.mark.parametrize("prop", ["sample_weight", "metadata"]) -def test_metadata_routing_for_bagging_estimators(Estimator, BaseEst, prop): - """Test that metadata is routed correctly for Bagging*. - - Bagging differs in that it should preserve all meta-data except for - ``sample_weight``. - """ - X = np.array([[0, 1], [2, 2], [4, 6]]) - y = [1, 2, 3] - sample_weight, metadata = [1.0, 1.0, 1.0], "a" - - est = Estimator( - estimator=BaseEst(registry=_Registry()).set_fit_request(**{prop: True}) - ) - - est.fit(X, y, **{prop: sample_weight if prop == "sample_weight" else metadata}) - - for estimator in est.estimators_: - if prop == "sample_weight": - kwargs = {prop: sample_weight} - else: - kwargs = {prop: metadata} - registry = estimator.registry - assert len(registry) - for sub_est in registry: - # When sample weight is passed in, the sub-estimators use - # bootstrapping to sample from the dataset, so we can't check - # that sample weight is the same within all sub-estimators. - # - # Note: We can check all other metadata is the same though. 
- if prop != "sample_weight": - check_recorded_metadata( - obj=sub_est, - method="fit", - **kwargs, - ) From 926e5ee9c0a1fabc427454b4d9d69d3b22e114f0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 20 Feb 2024 10:47:22 -0500 Subject: [PATCH 18/22] Remove diff Signed-off-by: Adam Li --- .../_middle_term_computer.pyx.tp | 7 ++++++- .../_pairwise_distances_reduction/_radius_neighbors.pyx.tp | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp index 1fca2d674720c..e81e6fa20e877 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp @@ -14,7 +14,6 @@ implementation_specific_values = [ }} from libcpp.vector cimport vector -from libcpp.algorithm cimport fill from ...utils._cython_blas cimport ( BLAS_Order, @@ -29,6 +28,12 @@ from ...utils._typedefs cimport float64_t, float32_t, int32_t, intp_t import numpy as np from scipy.sparse import issparse, csr_matrix +# TODO: change for `libcpp.algorithm.fill` once Cython 3 is used +# Introduction in Cython: +# +# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L50 #noqa +cdef extern from "" namespace "std" nogil: + void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa cdef void _middle_term_sparse_sparse_64( const float64_t[:] X_data, diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp index cd25b010cc81b..8695baad172e0 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -4,7 +4,6 @@ import warnings from libcpp.memory cimport shared_ptr, make_shared from libcpp.vector cimport vector -from libcpp.algorithm cimport move from cython cimport final from cython.operator cimport dereference as deref from cython.parallel cimport parallel, prange @@ -20,6 +19,12 @@ from ...utils.fixes import threadpool_limits cnp.import_array() +# TODO: change for `libcpp.algorithm.move` once Cython 3 is used +# Introduction in Cython: +# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 #noqa +cdef extern from "" namespace "std" nogil: + OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first) except + #noqa + ###################### cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( From 312be7186ca39943f07033e1cccb5c0dddbd9bef Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 20 Feb 2024 10:47:58 -0500 Subject: [PATCH 19/22] Remove diff Signed-off-by: Adam Li --- .../_middle_term_computer.pyx.tp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp index e81e6fa20e877..bfe465394c4d2 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp +++ b/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp @@ -25,9 +25,6 @@ from ...utils._cython_blas cimport ( ) from ...utils._typedefs cimport float64_t, float32_t, int32_t, intp_t -import numpy as np -from scipy.sparse import 
issparse, csr_matrix - # TODO: change for `libcpp.algorithm.fill` once Cython 3 is used # Introduction in Cython: # @@ -35,6 +32,10 @@ from scipy.sparse import issparse, csr_matrix cdef extern from "" namespace "std" nogil: void fill[Iter, T](Iter first, Iter last, const T& value) except + #noqa +import numpy as np +from scipy.sparse import issparse, csr_matrix + + cdef void _middle_term_sparse_sparse_64( const float64_t[:] X_data, const int32_t[:] X_indices, From 5dc9a3d8ffff18a30e8f7ec1bfa345be53a36494 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 21 Feb 2024 09:05:43 -0500 Subject: [PATCH 20/22] Update v1.5.rst Co-authored-by: Omar Salman --- doc/whats_new/v1.5.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 911d72b7a0e01..66394288e60ce 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -43,7 +43,7 @@ more details. its `fit` method. :pr:`28187` by :user:`Stefanie Senger `. - |Feature| :class:`ensemble.BaggingClassifier` and :class:`ensemble.BaggingRegressor` - now supports metadata routing. The fit methods now + now support metadata routing. The fit methods now accepts ``**fit_params`` which are passed to the underlying estimators via their `fit` methods. :pr:`28432` by :user:`Adam Li ` and :user:`Benjamin Bossan `. From 3a3d4441dbdbece0914b1c6c58eaf22c6bd42253 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 21 Feb 2024 09:05:52 -0500 Subject: [PATCH 21/22] Update v1.5.rst Co-authored-by: Omar Salman --- doc/whats_new/v1.5.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 66394288e60ce..8c6623bd7e359 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -44,7 +44,7 @@ more details. - |Feature| :class:`ensemble.BaggingClassifier` and :class:`ensemble.BaggingRegressor` now support metadata routing. The fit methods now - accepts ``**fit_params`` which are passed to the underlying estimators + accept ``**fit_params`` which are passed to the underlying estimators via their `fit` methods. :pr:`28432` by :user:`Adam Li ` and :user:`Benjamin Bossan `. From 01a64e4b235755448662840ebf68b3c569b69337 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 21 Feb 2024 09:06:11 -0500 Subject: [PATCH 22/22] Update _bagging.py Co-authored-by: Omar Salman --- sklearn/ensemble/_bagging.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 41205aa3e6d22..878e2ea809c01 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -156,10 +156,10 @@ def _parallel_build_estimators( # TODO(SLEP6): remove if condition for unrouted sample_weight when metadata # routing can't be disabled. - # 1. If routing is not enabled, we will check if the base - # estimator supports sample_weight and use it if it does. - # 2. If routing is enabled, we will check if the routing supports sample + # 1. If routing is enabled, we will check if the routing supports sample # weight and use it if it does. + # 2. If routing is not enabled, we will check if the base + # estimator supports sample_weight and use it if it does. # Note: Row sampling can be achieved either through setting sample_weight or # by indexing. The former is more efficient. Therefore, use this method
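
A minimal end-to-end sketch of the routing path this series adds (illustrative
only, not part of the patch; it assumes SLEP006 routing is switched on
globally, and LogisticRegression stands in for any sub-estimator whose fit
accepts sample_weight):

    import numpy as np
    import sklearn
    from sklearn.ensemble import BaggingClassifier
    from sklearn.linear_model import LogisticRegression

    # fit_params are only routed when metadata routing is enabled globally.
    sklearn.set_config(enable_metadata_routing=True)

    X = np.array([[0, 1], [2, 2], [4, 6], [6, 1]])
    y = np.array([0, 0, 1, 1])
    sample_weight = np.array([1.0, 2.0, 1.0, 0.5])

    # The sub-estimator must explicitly request the metadata; otherwise fit
    # raises "[sample_weight] are passed but are not explicitly set as
    # requested", as asserted in the tests above.
    clf = BaggingClassifier(
        estimator=LogisticRegression().set_fit_request(sample_weight=True),
        n_estimators=3,
        random_state=0,
    ).fit(X, y, sample_weight=sample_weight)

Because each sub-estimator sees a bootstrap draw rather than the raw rows, a
routed sample_weight is combined with the resampling, which is why both
Bagging* entries in the common tests set ``preserves_metadata: False``.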
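
The "Row sampling can be achieved either through setting sample_weight or by
indexing" comment in ``_parallel_build_estimators`` is the crux of the fast
path. A self-contained sketch of the equivalence it relies on (illustration
only, not patch code):

    import numpy as np

    rng = np.random.RandomState(0)
    n_samples = 6
    X = np.arange(n_samples * 2, dtype=float).reshape(n_samples, 2)
    indices = rng.randint(0, n_samples, n_samples)  # one bootstrap draw

    # Indexing route: materialize a resampled copy of the data.
    X_indexed = X[indices]

    # sample_weight route: keep X in place and fold the draw into weights.
    sample_counts = np.bincount(indices, minlength=n_samples)
    curr_sample_weight = np.ones(n_samples) * sample_counts  # 0 => not drawn

    # For an estimator that honors sample_weight, fitting on
    # (X, curr_sample_weight) matches fitting on X_indexed, without the copy.

This is why the branch prefers the weight path whenever the request (or, with
routing disabled, the base estimator's signature) can consume
``sample_weight``, and only falls back to ``_safe_indexing`` otherwise.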
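
The ``request_or_router.consumes("fit", ("sample_weight",))`` check can also
be exercised directly. A hedged sketch (it assumes the router layout from the
final revision of this series; ``consumes`` reports the subset of the given
params that would actually be routed for a method):

    import sklearn
    from sklearn.ensemble import BaggingRegressor
    from sklearn.linear_model import Ridge
    from sklearn.utils.metadata_routing import get_routing_for_object

    sklearn.set_config(enable_metadata_routing=True)

    est = BaggingRegressor(estimator=Ridge().set_fit_request(sample_weight=True))
    router = get_routing_for_object(est)

    # A non-empty result means bagging takes the sample_weight fast path.
    print(router.consumes("fit", ("sample_weight",)))  # {'sample_weight'}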