[MRG+1] ENH Add bootstrap sample size limit to forest ensembles #14682

Merged: 31 commits, Sep 20, 2019
Changes shown are from 21 of the 31 commits.

Commits
9b66811
Add max_samples bootstrap size kwarg
notmatthancock Aug 18, 2019
b845709
Refactor unit tests
notmatthancock Aug 18, 2019
6db3384
Add one more assert check in index helper test
notmatthancock Aug 18, 2019
2507d5b
Move validation and bootstrap size get to helper
notmatthancock Aug 18, 2019
4fa10a9
Compute bootstrap size just once
notmatthancock Aug 18, 2019
3eb299c
n_bootstrap_samples -> n_samples_bootstrap; update docstring
notmatthancock Aug 20, 2019
081b7b7
Refactor exception tests for max_samples
notmatthancock Aug 20, 2019
cf8b0cb
n_bootstrap_samples -> n_samples_bootstrap in signatures
notmatthancock Aug 20, 2019
49a949a
Add max_samples kwarg to RandomForestRegressor
notmatthancock Aug 20, 2019
0e2b386
Revert doc default
notmatthancock Aug 20, 2019
ef5efe1
Move n_samples_bootstrap out of loop
notmatthancock Aug 21, 2019
07865fd
Merge branch 'master' into feature/rf-subsample
notmatthancock Aug 21, 2019
5bc7796
Add max_samples to RandomTreesEmbedding
notmatthancock Aug 21, 2019
75ce338
Change docstring style for default
notmatthancock Aug 27, 2019
37104d3
Update grammar in docstring
notmatthancock Aug 27, 2019
93e1840
Refactor conditional structures
notmatthancock Aug 27, 2019
005d6ca
Add version added tag; change call style
notmatthancock Aug 27, 2019
eac5ad6
Remove docstring from unit test
notmatthancock Aug 27, 2019
d9de541
Add exception message checks to unit test
notmatthancock Aug 27, 2019
93e5799
Refactor toy data unit test
notmatthancock Aug 27, 2019
9cdd5c2
Add node count check unit test
notmatthancock Aug 27, 2019
c42db3b
Merge remote-tracking branch 'origin/master' into pr/notmatthancock/1…
glemaitre Sep 9, 2019
9d72d36
Limit unit test to RandomForest*
notmatthancock Sep 10, 2019
1622ad6
Merge branch 'feature/rf-subsample' of github.com:notmatthancock/scik…
notmatthancock Sep 10, 2019
c95cecf
Add whats new entry
notmatthancock Sep 11, 2019
fa2be88
Merge branch 'master' into feature/rf-subsample
notmatthancock Sep 11, 2019
4548191
Include bootstrap condition in docstring comments
notmatthancock Sep 12, 2019
58f005c
Rename forest_class -> ForestClass
notmatthancock Sep 12, 2019
7869f87
Escape all the special chars
notmatthancock Sep 13, 2019
e8535d4
Remove extraneous test
notmatthancock Sep 14, 2019
5b72dcc
comments adrin
glemaitre Sep 20, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
161 changes: 137 additions & 24 deletions sklearn/ensemble/forest.py
@@ -40,6 +40,7 @@ class calls the ``fit`` method of each sub-estimator on random samples
# License: BSD 3 clause


import numbers
from warnings import catch_warnings, simplefilter, warn
import threading

@@ -72,17 +73,56 @@ class calls the ``fit`` method of each sub-estimator on random samples
MAX_INT = np.iinfo(np.int32).max


def _generate_sample_indices(random_state, n_samples):
def _get_n_samples_bootstrap(n_samples, max_samples):
"""Get the number of samples in a bootstrap sample.

Parameters
----------
n_samples : int
Number of samples in the dataset.
max_samples : int or float
The maximum number of samples to draw from the total available:
- if float, this indicates a fraction of the total;
- if int, this indicates the exact number of samples;
- if None, this indicates the total number of samples.

Returns
-------
n_samples_bootstrap : int
The total number of samples to draw for the bootstrap sample.
"""
if max_samples is None:
return n_samples

if isinstance(max_samples, numbers.Integral):

Review comment (Member):
I think we should be consistent w.r.t. how we treat these fractions. For instance, in OPTICS we have:

    def _validate_size(size, n_samples, param_name):
        if size <= 0 or (size != int(size) and size > 1):
            raise ValueError('%s must be a positive integer '
                             'or a float between 0 and 1. Got %r' %
                             (param_name, size))
        elif size > n_samples:
            raise ValueError('%s must be no greater than the'
                             ' number of samples (%d). Got %d' %
                             (param_name, n_samples, size))

and there 1 always means 100% of the data, at least in OPTICS. Do we have similar semantics in other places?

Review comment (Member):
With PCA, n_components=1 means 1 component, while n_components < 1 means a fraction.

Review comment (Member):
Excluding 1 from the float range avoids issues with float comparison as well.
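
To make the competing conventions concrete, here is an illustrative sketch of what `_get_n_samples_bootstrap`, as defined in this diff, returns (the values are hypothetical):

    _get_n_samples_bootstrap(n_samples=100, max_samples=None)  # -> 100, use all samples
    _get_n_samples_bootstrap(n_samples=100, max_samples=10)    # -> 10, exact count
    _get_n_samples_bootstrap(n_samples=100, max_samples=0.5)   # -> 50, fraction of the data
    _get_n_samples_bootstrap(n_samples=100, max_samples=1)     # -> 1, the int 1 means one sample...
    _get_n_samples_bootstrap(n_samples=100, max_samples=1.0)   # ...while the float 1.0 raises ValueError

So unlike OPTICS, an integer 1 here means one sample, not 100% of the data.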

if not (1 <= max_samples <= n_samples):
msg = "`max_samples` must be in range 1 to {} but got value {}"
raise ValueError(msg.format(n_samples, max_samples))
return max_samples

if isinstance(max_samples, numbers.Real):
if not (0 < max_samples < 1):
msg = "`max_samples` must be in range (0, 1) but got value {}"
raise ValueError(msg.format(max_samples))
return int(round(n_samples * max_samples))

msg = "`max_samples` should be int or float, but got type '{}'"
raise TypeError(msg.format(type(max_samples)))


def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap):
"""Private function used to _parallel_build_trees function."""

random_instance = check_random_state(random_state)
sample_indices = random_instance.randint(0, n_samples, n_samples)
sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap)

return sample_indices


def _generate_unsampled_indices(random_state, n_samples):
def _generate_unsampled_indices(random_state, n_samples, n_samples_bootstrap):
"""Private function used to forest._set_oob_score function."""
sample_indices = _generate_sample_indices(random_state, n_samples)
sample_indices = _generate_sample_indices(random_state, n_samples,
n_samples_bootstrap)
sample_counts = np.bincount(sample_indices, minlength=n_samples)
unsampled_mask = sample_counts == 0
indices_range = np.arange(n_samples)
@@ -92,7 +132,8 @@ def _generate_unsampled_indices(random_state, n_samples):


def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
verbose=0, class_weight=None):
verbose=0, class_weight=None,
n_samples_bootstrap=None):
"""Private function used to fit a single tree in parallel."""
if verbose > 1:
print("building tree %d of %d" % (tree_idx + 1, n_trees))
@@ -104,7 +145,8 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
else:
curr_sample_weight = sample_weight.copy()

indices = _generate_sample_indices(tree.random_state, n_samples)
indices = _generate_sample_indices(tree.random_state, n_samples,
n_samples_bootstrap)
sample_counts = np.bincount(indices, minlength=n_samples)
curr_sample_weight *= sample_counts

@@ -140,7 +182,8 @@ def __init__(self,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None):
class_weight=None,
max_samples=None):
super().__init__(
base_estimator=base_estimator,
n_estimators=n_estimators,
@@ -153,6 +196,7 @@ def __init__(self,
self.verbose = verbose
self.warm_start = warm_start
self.class_weight = class_weight
self.max_samples = max_samples

def apply(self, X):
"""Apply trees in the forest to X, return leaf indices.
@@ -277,6 +321,12 @@ def fit(self, X, y, sample_weight=None):
else:
sample_weight = expanded_class_weight

# Get bootstrap sample size
n_samples_bootstrap = _get_n_samples_bootstrap(
n_samples=X.shape[0],
max_samples=self.max_samples
)

# Check parameters
self._validate_estimator()

@@ -320,7 +370,8 @@ def fit(self, X, y, sample_weight=None):
**_joblib_parallel_args(prefer='threads'))(
delayed(_parallel_build_trees)(
t, self, X, y, sample_weight, i, len(trees),
verbose=self.verbose, class_weight=self.class_weight)
verbose=self.verbose, class_weight=self.class_weight,
n_samples_bootstrap=n_samples_bootstrap)
for i, t in enumerate(trees))

# Collect newly grown trees
@@ -410,7 +461,8 @@ def __init__(self,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None):
class_weight=None,
max_samples=None):
super().__init__(
base_estimator,
n_estimators=n_estimators,
@@ -421,7 +473,8 @@ def __init__(self,
random_state=random_state,
verbose=verbose,
warm_start=warm_start,
class_weight=class_weight)
class_weight=class_weight,
max_samples=max_samples)

def _set_oob_score(self, X, y):
"""Compute out-of-bag score"""
@@ -435,9 +488,13 @@ def _set_oob_score(self, X, y):
predictions = [np.zeros((n_samples, n_classes_[k]))
for k in range(self.n_outputs_)]

n_samples_bootstrap = _get_n_samples_bootstrap(
n_samples, self.max_samples
)

for estimator in self.estimators_:
unsampled_indices = _generate_unsampled_indices(
estimator.random_state, n_samples)
estimator.random_state, n_samples, n_samples_bootstrap)
p_estimator = estimator.predict_proba(X[unsampled_indices, :],
check_input=False)

@@ -650,7 +707,8 @@ def __init__(self,
n_jobs=None,
random_state=None,
verbose=0,
warm_start=False):
warm_start=False,
max_samples=None):
super().__init__(
base_estimator,
n_estimators=n_estimators,
@@ -660,7 +718,8 @@ def __init__(self,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose,
warm_start=warm_start)
warm_start=warm_start,
max_samples=max_samples)

def predict(self, X):
"""Predict regression target for X.
@@ -713,9 +772,13 @@ def _set_oob_score(self, X, y):
predictions = np.zeros((n_samples, self.n_outputs_))
n_predictions = np.zeros((n_samples, self.n_outputs_))

n_samples_bootstrap = _get_n_samples_bootstrap(
n_samples, self.max_samples
)

for estimator in self.estimators_:
unsampled_indices = _generate_unsampled_indices(
estimator.random_state, n_samples)
estimator.random_state, n_samples, n_samples_bootstrap)
p_estimator = estimator.predict(
X[unsampled_indices, :], check_input=False)

@@ -923,6 +986,14 @@ class RandomForestClassifier(ForestClassifier):

.. versionadded:: 0.22

max_samples : int or float, default=None
The number of samples to draw from X to train each base estimator.
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples.

.. versionadded:: 0.22

Attributes
----------
base_estimator_ : DecisionTreeClassifier
@@ -1016,7 +1087,8 @@ def __init__(self,
verbose=0,
warm_start=False,
class_weight=None,
ccp_alpha=0.0):
ccp_alpha=0.0,
max_samples=None):
super().__init__(
base_estimator=DecisionTreeClassifier(),
n_estimators=n_estimators,
@@ -1031,7 +1103,8 @@ def __init__(self,
random_state=random_state,
verbose=verbose,
warm_start=warm_start,
class_weight=class_weight)
class_weight=class_weight,
max_samples=max_samples)

self.criterion = criterion
self.max_depth = max_depth
@@ -1198,6 +1271,14 @@ class RandomForestRegressor(ForestRegressor):

.. versionadded:: 0.22

max_samples : int or float, default=None
The number of samples to draw from X to train each base estimator.
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples.

.. versionadded:: 0.22

Attributes
----------
base_estimator_ : DecisionTreeRegressor
@@ -1285,7 +1366,8 @@ def __init__(self,
random_state=None,
verbose=0,
warm_start=False,
ccp_alpha=0.0):
ccp_alpha=0.0,
max_samples=None):
super().__init__(
base_estimator=DecisionTreeRegressor(),
n_estimators=n_estimators,
@@ -1299,7 +1381,8 @@ def __init__(self,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose,
warm_start=warm_start)
warm_start=warm_start,
max_samples=max_samples)

self.criterion = criterion
self.max_depth = max_depth
@@ -1484,6 +1567,14 @@ class ExtraTreesClassifier(ForestClassifier):

.. versionadded:: 0.22

max_samples : int or float, default=None
The number of samples to draw from X to train each base estimator.
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples.

Review comment (Member):
I'm okay with the behavior of the float, but we need to document it here, i.e. explicitly say that if float, it must be in (0, 1).
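
A docstring amendment along those lines might read (suggested wording, not part of the diff at this point):

    - If float, then draw `max_samples * X.shape[0]` samples. Thus,
      `max_samples` should be in the interval `(0, 1)`.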


.. versionadded:: 0.22

Attributes
----------
base_estimator_ : ExtraTreeClassifier
@@ -1557,7 +1648,8 @@ def __init__(self,
verbose=0,
warm_start=False,
class_weight=None,
ccp_alpha=0.0):
ccp_alpha=0.0,
max_samples=None):
super().__init__(
base_estimator=ExtraTreeClassifier(),
n_estimators=n_estimators,
@@ -1572,7 +1664,8 @@ def __init__(self,
random_state=random_state,
verbose=verbose,
warm_start=warm_start,
class_weight=class_weight)
class_weight=class_weight,
max_samples=max_samples)

self.criterion = criterion
self.max_depth = max_depth
@@ -1736,6 +1829,14 @@ class ExtraTreesRegressor(ForestRegressor):

.. versionadded:: 0.22

max_samples : int or float, default=None
The number of samples to draw from X to train each base estimator.
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples.

.. versionadded:: 0.22

Attributes
----------
base_estimator_ : ExtraTreeRegressor
@@ -1796,7 +1897,8 @@ def __init__(self,
random_state=None,
verbose=0,
warm_start=False,
ccp_alpha=0.0):
ccp_alpha=0.0,
max_samples=None):
super().__init__(
base_estimator=ExtraTreeRegressor(),
n_estimators=n_estimators,
@@ -1810,7 +1912,8 @@ def __init__(self,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose,
warm_start=warm_start)
warm_start=warm_start,
max_samples=max_samples)

self.criterion = criterion
self.max_depth = max_depth
@@ -1951,6 +2054,14 @@ class RandomTreesEmbedding(BaseForest):

.. versionadded:: 0.22

max_samples : int or float, default=None
The number of samples to draw from X to train each base estimator.
- If None (default), then draw `X.shape[0]` samples.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples.

.. versionadded:: 0.22

Attributes
----------
estimators_ : list of DecisionTreeClassifier
@@ -1983,7 +2094,8 @@ def __init__(self,
random_state=None,
verbose=0,
warm_start=False,
ccp_alpha=0.0):
ccp_alpha=0.0,
max_samples=None):
super().__init__(
base_estimator=ExtraTreeRegressor(),
n_estimators=n_estimators,
@@ -1997,7 +2109,8 @@ def __init__(self,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose,
warm_start=warm_start)
warm_start=warm_start,
max_samples=max_samples)

self.max_depth = max_depth
self.min_samples_split = min_samples_split
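
For reference, a minimal usage sketch of the `max_samples` parameter added by this PR (assumes a scikit-learn build that includes this change, i.e. 0.22 or later; data and values are illustrative):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=1000, random_state=0)

    # With max_samples=0.5, each tree's bootstrap sample contains
    # int(round(0.5 * 1000)) = 500 draws instead of the full 1000.
    clf = RandomForestClassifier(n_estimators=100, bootstrap=True,
                                 max_samples=0.5, random_state=0)
    clf.fit(X, y)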