FIX raise error for max_samples if no bootstrap & optimize forest tests (#21295)

* FIX raise error for max_samples if no bootstrap

* ENH move check position by suggestion

* FIX add bootstrap parameter to tests

* FIX resolve test error & DOC add log

* FIX add test coverage for bootstrap check

* DOC optimize error message

* FIX resolve test error

* ENH restrict sample bootstrap

* ENH optimize bootstrap conditions
PSSF23 committed Nov 25, 2021
1 parent c040ab1 commit f96ce58
Showing 3 changed files with 48 additions and 9 deletions.
10 changes: 10 additions & 0 deletions doc/whats_new/v1.1.rst
@@ -117,6 +117,16 @@ Changelog
   (which behaves like `'arbitrary-variance'`) to `'unit-variance'` in version 1.3.
   :pr:`19490` by :user:`Facundo Ferrin <fferrin>` and :user:`Julien Jerphanion <jjerphan>`

+:mod:`sklearn.ensemble`
+.......................
+
+- |Fix| :class:`ensemble.RandomForestClassifier`,
+  :class:`ensemble.RandomForestRegressor`,
+  :class:`ensemble.ExtraTreesClassifier`, :class:`ensemble.ExtraTreesRegressor`,
+  and :class:`ensemble.RandomTreesEmbedding` now raise a ``ValueError`` when
+  ``bootstrap=False`` and ``max_samples`` is not ``None``.
+  :pr:`21295` by :user:`Haoyin Xu <PSSF23>`.
+
 :mod:`sklearn.impute`
 .....................
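
To make the changelog entry above concrete, here is a minimal sketch of the new validation behaviour (assuming a scikit-learn build that includes this change; the dataset and estimator choices are illustrative only): combining `max_samples` with `bootstrap=False` now raises a `ValueError`, while keeping bootstrapping on or leaving `max_samples=None` still fits normally.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=100, random_state=0)

# max_samples only controls the size of the bootstrap sample drawn for each
# tree, so setting it with bootstrap=False is now rejected instead of being
# silently ignored.
forest = RandomForestClassifier(bootstrap=False, max_samples=0.5)
try:
    forest.fit(X, y)
except ValueError as exc:
    print(exc)

# Either of these combinations remains valid.
RandomForestClassifier(bootstrap=True, max_samples=0.5).fit(X, y)
RandomForestClassifier(bootstrap=False, max_samples=None).fit(X, y)
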
16 changes: 12 additions & 4 deletions sklearn/ensemble/_forest.py
@@ -375,10 +375,18 @@ def fit(self, X, y, sample_weight=None):
         else:
             sample_weight = expanded_class_weight

-        # Get bootstrap sample size
-        n_samples_bootstrap = _get_n_samples_bootstrap(
-            n_samples=X.shape[0], max_samples=self.max_samples
-        )
+        if not self.bootstrap and self.max_samples is not None:
+            raise ValueError(
+                "`max_samples` cannot be set if `bootstrap=False`. "
+                "Either switch to `bootstrap=True` or set "
+                "`max_samples=None`."
+            )
+        elif self.bootstrap:
+            n_samples_bootstrap = _get_n_samples_bootstrap(
+                n_samples=X.shape[0], max_samples=self.max_samples
+            )
+        else:
+            n_samples_bootstrap = None

         # Check parameters
         self._validate_estimator()
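
When `bootstrap=True`, the `n_samples_bootstrap` value computed above is the number of rows drawn with replacement for each tree. The helper below is a rough, hypothetical sketch of how a `max_samples` setting maps to that size; it is not scikit-learn's private `_get_n_samples_bootstrap`, which additionally validates the type and range of `max_samples`.

def n_samples_bootstrap_sketch(n_samples, max_samples):
    # Illustration only: None means the full training-set size, an int is an
    # absolute count, and a float is treated as a fraction of the rows.
    if max_samples is None:
        return n_samples
    if isinstance(max_samples, int):
        return max_samples
    return max(round(n_samples * max_samples), 1)

print(n_samples_bootstrap_sketch(1000, None))  # 1000
print(n_samples_bootstrap_sketch(1000, 200))   # 200
print(n_samples_bootstrap_sketch(1000, 0.25))  # 250
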
31 changes: 26 additions & 5 deletions sklearn/ensemble/tests/test_forest.py
@@ -1613,6 +1613,19 @@ def test_forest_degenerate_feature_importances():
     assert_array_equal(gbr.feature_importances_, np.zeros(10, dtype=np.float64))


+@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS)
+def test_max_samples_bootstrap(name):
+    # Check invalid `max_samples` values
+    est = FOREST_CLASSIFIERS_REGRESSORS[name](bootstrap=False, max_samples=0.5)
+    err_msg = (
+        r"`max_samples` cannot be set if `bootstrap=False`. "
+        r"Either switch to `bootstrap=True` or set "
+        r"`max_samples=None`."
+    )
+    with pytest.raises(ValueError, match=err_msg):
+        est.fit(X, y)
+
+
 @pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS)
 @pytest.mark.parametrize(
     "max_samples, exc_type, exc_msg",
@@ -1660,7 +1673,7 @@ def test_forest_degenerate_feature_importances():
 )
 def test_max_samples_exceptions(name, max_samples, exc_type, exc_msg):
     # Check invalid `max_samples` values
-    est = FOREST_CLASSIFIERS_REGRESSORS[name](max_samples=max_samples)
+    est = FOREST_CLASSIFIERS_REGRESSORS[name](bootstrap=True, max_samples=max_samples)
     with pytest.raises(exc_type, match=exc_msg):
         est.fit(X, y)

@@ -1671,10 +1684,14 @@ def test_max_samples_boundary_regressors(name):
         X_reg, y_reg, train_size=0.7, test_size=0.3, random_state=0
     )

-    ms_1_model = FOREST_REGRESSORS[name](max_samples=1.0, random_state=0)
+    ms_1_model = FOREST_REGRESSORS[name](
+        bootstrap=True, max_samples=1.0, random_state=0
+    )
     ms_1_predict = ms_1_model.fit(X_train, y_train).predict(X_test)

-    ms_None_model = FOREST_REGRESSORS[name](max_samples=None, random_state=0)
+    ms_None_model = FOREST_REGRESSORS[name](
+        bootstrap=True, max_samples=None, random_state=0
+    )
     ms_None_predict = ms_None_model.fit(X_train, y_train).predict(X_test)

     ms_1_ms = mean_squared_error(ms_1_predict, y_test)
@@ -1689,10 +1706,14 @@ def test_max_samples_boundary_classifiers(name):
         X_large, y_large, random_state=0, stratify=y_large
     )

-    ms_1_model = FOREST_CLASSIFIERS[name](max_samples=1.0, random_state=0)
+    ms_1_model = FOREST_CLASSIFIERS[name](
+        bootstrap=True, max_samples=1.0, random_state=0
+    )
     ms_1_proba = ms_1_model.fit(X_train, y_train).predict_proba(X_test)

-    ms_None_model = FOREST_CLASSIFIERS[name](max_samples=None, random_state=0)
+    ms_None_model = FOREST_CLASSIFIERS[name](
+        bootstrap=True, max_samples=None, random_state=0
+    )
     ms_None_proba = ms_None_model.fit(X_train, y_train).predict_proba(X_test)

     np.testing.assert_allclose(ms_1_proba, ms_None_proba)
