FIX raise error for max_samples if no bootstrap & optimize forest tests (#21295)

* FIX raise error for max_samples if no bootstrap

* ENH move check position by suggestion

* FIX add bootstrap parameter to tests

* FIX resolve test error & DOC add log

* FIX add test coverage for bootstrap check

* DOC optimize error message

* FIX resolve test error

* ENH restrict sample bootstrap

* ENH optimize bootstrap conditions
PSSF23 committed Nov 25, 2021
1 parent c040ab1 commit f96ce58
Showing 3 changed files with 48 additions and 9 deletions.
10 changes: 10 additions & 0 deletions doc/whats_new/v1.1.rst
@@ -117,6 +117,16 @@ Changelog
   (which behaves like `'arbitrary-variance'`) to `'unit-variance'` in version 1.3.
   :pr:`19490` by :user:`Facundo Ferrin <fferrin>` and :user:`Julien Jerphanion <jjerphan>`

+:mod:`sklearn.ensemble`
+.......................
+
+- |Fix| :class:`ensemble.RandomForestClassifier`,
+  :class:`ensemble.RandomForestRegressor`,
+  :class:`ensemble.ExtraTreesClassifier`, :class:`ensemble.ExtraTreesRegressor`,
+  and :class:`ensemble.RandomTreesEmbedding` now raise a ``ValueError`` when
+  ``bootstrap=False`` and ``max_samples`` is not ``None``.
+  :pr:`21295` by :user:`Haoyin Xu <PSSF23>`.
+
 :mod:`sklearn.impute`
 .....................
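
To make the changelog entry above concrete, here is a minimal sketch of the new validation behaviour (assuming a scikit-learn build that includes this change; the dataset and estimator choices are illustrative only): combining `max_samples` with `bootstrap=False` now raises a `ValueError`, while keeping bootstrapping on or leaving `max_samples=None` still fits normally.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=100, random_state=0)

# max_samples only controls the size of the bootstrap sample drawn for each
# tree, so setting it with bootstrap=False is now rejected instead of being
# silently ignored.
forest = RandomForestClassifier(bootstrap=False, max_samples=0.5)
try:
    forest.fit(X, y)
except ValueError as exc:
    print(exc)

# Either of these combinations remains valid.
RandomForestClassifier(bootstrap=True, max_samples=0.5).fit(X, y)
RandomForestClassifier(bootstrap=False, max_samples=None).fit(X, y)
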
16 changes: 12 additions & 4 deletions sklearn/ensemble/_forest.py
@@ -375,10 +375,18 @@ def fit(self, X, y, sample_weight=None):
         else:
             sample_weight = expanded_class_weight

-        # Get bootstrap sample size
-        n_samples_bootstrap = _get_n_samples_bootstrap(
-            n_samples=X.shape[0], max_samples=self.max_samples
-        )
+        if not self.bootstrap and self.max_samples is not None:
+            raise ValueError(
+                "`max_samples` cannot be set if `bootstrap=False`. "
+                "Either switch to `bootstrap=True` or set "
+                "`max_samples=None`."
+            )
+        elif self.bootstrap:
+            n_samples_bootstrap = _get_n_samples_bootstrap(
+                n_samples=X.shape[0], max_samples=self.max_samples
+            )
+        else:
+            n_samples_bootstrap = None

         # Check parameters
         self._validate_estimator()
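
When `bootstrap=True`, the `n_samples_bootstrap` value computed above is the number of rows drawn with replacement for each tree. The helper below is a rough, hypothetical sketch of how a `max_samples` setting maps to that size; it is not scikit-learn's private `_get_n_samples_bootstrap`, which additionally validates the type and range of `max_samples`.

def n_samples_bootstrap_sketch(n_samples, max_samples):
    # Illustration only: None means the full training-set size, an int is an
    # absolute count, and a float is treated as a fraction of the rows.
    if max_samples is None:
        return n_samples
    if isinstance(max_samples, int):
        return max_samples
    return max(round(n_samples * max_samples), 1)

print(n_samples_bootstrap_sketch(1000, None))  # 1000
print(n_samples_bootstrap_sketch(1000, 200))   # 200
print(n_samples_bootstrap_sketch(1000, 0.25))  # 250
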
31 changes: 26 additions & 5 deletions sklearn/ensemble/tests/test_forest.py
@@ -1613,6 +1613,19 @@ def test_forest_degenerate_feature_importances():
     assert_array_equal(gbr.feature_importances_, np.zeros(10, dtype=np.float64))


+@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS)
+def test_max_samples_bootstrap(name):
+    # Check invalid `max_samples` values
+    est = FOREST_CLASSIFIERS_REGRESSORS[name](bootstrap=False, max_samples=0.5)
+    err_msg = (
+        r"`max_samples` cannot be set if `bootstrap=False`. "
+        r"Either switch to `bootstrap=True` or set "
+        r"`max_samples=None`."
+    )
+    with pytest.raises(ValueError, match=err_msg):
+        est.fit(X, y)
+
+
 @pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS)
 @pytest.mark.parametrize(
     "max_samples, exc_type, exc_msg",
@@ -1660,7 +1673,7 @@ def test_forest_degenerate_feature_importances():
 )
 def test_max_samples_exceptions(name, max_samples, exc_type, exc_msg):
     # Check invalid `max_samples` values
-    est = FOREST_CLASSIFIERS_REGRESSORS[name](max_samples=max_samples)
+    est = FOREST_CLASSIFIERS_REGRESSORS[name](bootstrap=True, max_samples=max_samples)
     with pytest.raises(exc_type, match=exc_msg):
         est.fit(X, y)

@@ -1671,10 +1684,14 @@ def test_max_samples_boundary_regressors(name):
         X_reg, y_reg, train_size=0.7, test_size=0.3, random_state=0
     )

-    ms_1_model = FOREST_REGRESSORS[name](max_samples=1.0, random_state=0)
+    ms_1_model = FOREST_REGRESSORS[name](
+        bootstrap=True, max_samples=1.0, random_state=0
+    )
     ms_1_predict = ms_1_model.fit(X_train, y_train).predict(X_test)

-    ms_None_model = FOREST_REGRESSORS[name](max_samples=None, random_state=0)
+    ms_None_model = FOREST_REGRESSORS[name](
+        bootstrap=True, max_samples=None, random_state=0
+    )
     ms_None_predict = ms_None_model.fit(X_train, y_train).predict(X_test)

     ms_1_ms = mean_squared_error(ms_1_predict, y_test)
@@ -1689,10 +1706,14 @@ def test_max_samples_boundary_classifiers(name):
         X_large, y_large, random_state=0, stratify=y_large
     )

-    ms_1_model = FOREST_CLASSIFIERS[name](max_samples=1.0, random_state=0)
+    ms_1_model = FOREST_CLASSIFIERS[name](
+        bootstrap=True, max_samples=1.0, random_state=0
+    )
     ms_1_proba = ms_1_model.fit(X_train, y_train).predict_proba(X_test)

-    ms_None_model = FOREST_CLASSIFIERS[name](max_samples=None, random_state=0)
+    ms_None_model = FOREST_CLASSIFIERS[name](
+        bootstrap=True, max_samples=None, random_state=0
+    )
     ms_None_proba = ms_None_model.fit(X_train, y_train).predict_proba(X_test)

     np.testing.assert_allclose(ms_1_proba, ms_None_proba)
