
[MRG] ENH: Change default n_estimators to 100 for random forest (#11542)


#### Reference Issues/PRs
Fixes #11128.

#### What does this implement/fix? Explain your changes.
Issues a deprecation warning (`FutureWarning`) when the default `n_estimators` value is used in any of the forest estimators (`RandomForestClassifier`, `RandomForestRegressor`, `ExtraTreesClassifier`, `ExtraTreesRegressor`, and `RandomTreesEmbedding`). A test is added for the warning message raised when the default is used.
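
For illustration, a minimal sketch of the behavior after this change (assumes scikit-learn 0.20 with this patch applied; `make_classification` is just a convenient data source):

```python
import warnings

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=100, random_state=0)

# Relying on the default n_estimators now raises a FutureWarning at fit time.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    RandomForestClassifier().fit(X, y)
print(caught[0].message)
# The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.

# Passing n_estimators explicitly silences the warning.
RandomForestClassifier(n_estimators=100).fit(X, y)
```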

#### Any other comments?
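
For reviewers: the change uses the string sentinel `'warn'` as the temporary default and resolves it at `fit` time. A minimal standalone sketch of the pattern (`Estimator` here is illustrative, not a scikit-learn class):

```python
import warnings


class Estimator:
    """Illustrative sketch only -- not a scikit-learn class."""

    def __init__(self, n_estimators='warn'):
        # 'warn' is a sentinel meaning "the user left the default".
        self.n_estimators = n_estimators

    def fit(self, X, y):
        if self.n_estimators == 'warn':
            warnings.warn("The default value of n_estimators will change "
                          "from 10 in version 0.20 to 100 in 0.22.",
                          FutureWarning)
            self.n_estimators = 10  # keep the old default until 0.22
        # ... actual fitting would happen here ...
        return self
```

Warning in `fit` rather than in `__init__` keeps `__init__` a plain parameter store (a scikit-learn convention), so `get_params`/`set_params` and `clone` are unaffected.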


annaayzenshtat authored and amueller committed Jul 17, 2018
1 parent a496491 commit 2242c59fc890455bd121e4a03375c5632f31ef93
@@ -22,7 +22,6 @@ Highlights
We have tried to improve our support for common data-science use-cases
including missing values, categorical variables, heterogeneous data, and
features/targets with unusual distributions.
Missing values in features, represented by NaNs, are now accepted in
column-wise preprocessing such as scalers. Each feature is fitted disregarding
NaNs, and data containing NaNs can be transformed. The new :mod:`impute`
@@ -734,6 +733,15 @@ Datasets
API changes summary
-------------------
Classifiers and regressors
+ - The default value of the ``n_estimators`` parameter of
+   :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`,
+   :class:`ensemble.ExtraTreesClassifier`, :class:`ensemble.ExtraTreesRegressor`,
+   and :class:`ensemble.RandomTreesEmbedding` will change from 10 in version 0.20
+   to 100 in 0.22. A FutureWarning is raised when the default value is used.
+   :issue:`11542` by :user:`Anna Ayzenshtat <annaayzenshtat>`.
Linear, kernelized and related models
- Deprecate ``random_state`` parameter in :class:`svm.OneClassSVM` as the
@@ -285,7 +285,7 @@ def plot_benchmark_throughput(throughputs, configuration):
'complexity_label': 'non-zero coefficients',
'complexity_computer': lambda clf: np.count_nonzero(clf.coef_)},
{'name': 'RandomForest',
- 'instance': RandomForestRegressor(),
+ 'instance': RandomForestRegressor(n_estimators=100),
'complexity_label': 'estimators',
'complexity_computer': lambda clf: clf.n_estimators},
{'name': 'SVR',
@@ -45,15 +45,18 @@
# error trajectory during training.
ensemble_clfs = [
("RandomForestClassifier, max_features='sqrt'",
- RandomForestClassifier(warm_start=True, oob_score=True,
+ RandomForestClassifier(n_estimators=100,
+ warm_start=True, oob_score=True,
max_features="sqrt",
random_state=RANDOM_STATE)),
("RandomForestClassifier, max_features='log2'",
- RandomForestClassifier(warm_start=True, max_features='log2',
+ RandomForestClassifier(n_estimators=100,
+ warm_start=True, max_features='log2',
oob_score=True,
random_state=RANDOM_STATE)),
("RandomForestClassifier, max_features=None",
- RandomForestClassifier(warm_start=True, max_features=None,
+ RandomForestClassifier(n_estimators=100,
+ warm_start=True, max_features=None,
oob_score=True,
random_state=RANDOM_STATE))
]
@@ -43,11 +43,13 @@
X, y, train_size=400, test_size=200, random_state=4)
max_depth = 30
- regr_multirf = MultiOutputRegressor(RandomForestRegressor(max_depth=max_depth,
+ regr_multirf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100,
+ max_depth=max_depth,
random_state=0))
regr_multirf.fit(X_train, y_train)
- regr_rf = RandomForestRegressor(max_depth=max_depth, random_state=2)
+ regr_rf = RandomForestRegressor(n_estimators=100, max_depth=max_depth,
+ random_state=2)
regr_rf.fit(X_train, y_train)
# Predict on new data
@@ -30,7 +30,7 @@
from sklearn.ensemble import VotingClassifier
clf1 = LogisticRegression(random_state=123)
- clf2 = RandomForestClassifier(random_state=123)
+ clf2 = RandomForestClassifier(n_estimators=100, random_state=123)
clf3 = GaussianNB()
X = np.array([[-1.0, -1.0], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])
@@ -135,7 +135,7 @@ class BaseForest(six.with_metaclass(ABCMeta, BaseEnsemble)):
@abstractmethod
def __init__(self,
base_estimator,
- n_estimators=10,
+ n_estimators=100,
estimator_params=tuple(),
bootstrap=False,
oob_score=False,
@@ -242,6 +242,12 @@ def fit(self, X, y, sample_weight=None):
-------
self : object
"""
+ if self.n_estimators == 'warn':
+     warnings.warn("The default value of n_estimators will change from "
+                   "10 in version 0.20 to 100 in 0.22.", FutureWarning)
+     self.n_estimators = 10
# Validate or convert input data
X = check_array(X, accept_sparse="csc", dtype=DTYPE)
y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
@@ -400,7 +406,7 @@ class ForestClassifier(six.with_metaclass(ABCMeta, BaseForest,
@abstractmethod
def __init__(self,
base_estimator,
- n_estimators=10,
+ n_estimators=100,
estimator_params=tuple(),
bootstrap=False,
oob_score=False,
@@ -409,7 +415,6 @@ def __init__(self,
verbose=0,
warm_start=False,
class_weight=None):
super(ForestClassifier, self).__init__(
base_estimator,
n_estimators=n_estimators,
@@ -640,7 +645,7 @@ class ForestRegressor(six.with_metaclass(ABCMeta, BaseForest, RegressorMixin)):
@abstractmethod
def __init__(self,
base_estimator,
- n_estimators=10,
+ n_estimators=100,
estimator_params=tuple(),
bootstrap=False,
oob_score=False,
@@ -760,6 +765,10 @@ class RandomForestClassifier(ForestClassifier):
n_estimators : integer, optional (default=10)
The number of trees in the forest.
+ .. versionchanged:: 0.20
+    The default value of ``n_estimators`` will change from 10 in
+    version 0.20 to 100 in version 0.22.
criterion : string, optional (default="gini")
The function to measure the quality of a split. Supported criteria are
"gini" for the Gini impurity and "entropy" for the information gain.
@@ -973,7 +982,7 @@ class labels (multi-output problem).
DecisionTreeClassifier, ExtraTreesClassifier
"""
def __init__(self,
- n_estimators=10,
+ n_estimators='warn',
criterion="gini",
max_depth=None,
min_samples_split=2,
@@ -1034,6 +1043,10 @@ class RandomForestRegressor(ForestRegressor):
n_estimators : integer, optional (default=10)
The number of trees in the forest.
+ .. versionchanged:: 0.20
+    The default value of ``n_estimators`` will change from 10 in
+    version 0.20 to 100 in version 0.22.
criterion : string, optional (default="mse")
The function to measure the quality of a split. Supported criteria
are "mse" for the mean squared error, which is equal to variance
@@ -1213,7 +1226,7 @@ class RandomForestRegressor(ForestRegressor):
DecisionTreeRegressor, ExtraTreesRegressor
"""
def __init__(self,
- n_estimators=10,
+ n_estimators='warn',
criterion="mse",
max_depth=None,
min_samples_split=2,
@@ -1270,6 +1283,10 @@ class ExtraTreesClassifier(ForestClassifier):
n_estimators : integer, optional (default=10)
The number of trees in the forest.
+ .. versionchanged:: 0.20
+    The default value of ``n_estimators`` will change from 10 in
+    version 0.20 to 100 in version 0.22.
criterion : string, optional (default="gini")
The function to measure the quality of a split. Supported criteria are
"gini" for the Gini impurity and "entropy" for the information gain.
@@ -1456,7 +1473,7 @@ class labels (multi-output problem).
splits.
"""
def __init__(self,
- n_estimators=10,
+ n_estimators='warn',
criterion="gini",
max_depth=None,
min_samples_split=2,
@@ -1515,6 +1532,10 @@ class ExtraTreesRegressor(ForestRegressor):
n_estimators : integer, optional (default=10)
The number of trees in the forest.
+ .. versionchanged:: 0.20
+    The default value of ``n_estimators`` will change from 10 in
+    version 0.20 to 100 in version 0.22.
criterion : string, optional (default="mse")
The function to measure the quality of a split. Supported criteria
are "mse" for the mean squared error, which is equal to variance
@@ -1668,7 +1689,7 @@ class ExtraTreesRegressor(ForestRegressor):
RandomForestRegressor: Ensemble regressor using trees with optimal splits.
"""
def __init__(self,
- n_estimators=10,
+ n_estimators='warn',
criterion="mse",
max_depth=None,
min_samples_split=2,
@@ -1730,6 +1751,10 @@ class RandomTreesEmbedding(BaseForest):
n_estimators : integer, optional (default=10)
Number of trees in the forest.
+ .. versionchanged:: 0.20
+    The default value of ``n_estimators`` will change from 10 in
+    version 0.20 to 100 in version 0.22.
max_depth : integer, optional (default=5)
The maximum depth of each tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
@@ -1832,7 +1857,7 @@ class RandomTreesEmbedding(BaseForest):
"""
def __init__(self,
- n_estimators=10,
+ n_estimators='warn',
max_depth=5,
min_samples_split=2,
min_samples_leaf=1,
@@ -31,6 +31,7 @@
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_warns
from sklearn.utils.testing import assert_warns_message
+ from sklearn.utils.testing import assert_no_warnings
from sklearn.utils.testing import ignore_warnings
from sklearn import datasets
@@ -186,6 +187,7 @@ def check_regressor_attributes(name):
assert_false(hasattr(r, "n_classes_"))
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
@pytest.mark.parametrize('name', FOREST_REGRESSORS)
def test_regressor_attributes(name):
check_regressor_attributes(name)
@@ -432,6 +434,7 @@ def check_oob_score_raise_error(name):
bootstrap=False).fit, X, y)
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
@pytest.mark.parametrize('name', FOREST_ESTIMATORS)
def test_oob_score_raise_error(name):
check_oob_score_raise_error(name)
@@ -489,6 +492,7 @@ def check_pickle(name, X, y):
assert_equal(score, score2)
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS)
def test_pickle(name):
if name in FOREST_CLASSIFIERS:
@@ -526,6 +530,7 @@ def check_multioutput(name):
assert_equal(log_proba[1].shape, (4, 4))
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS)
def test_multioutput(name):
check_multioutput(name)
@@ -549,6 +554,7 @@ def check_classes_shape(name):
assert_array_equal(clf.classes_, [[-1, 1], [-2, 2]])
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
@pytest.mark.parametrize('name', FOREST_CLASSIFIERS)
def test_classes_shape(name):
check_classes_shape(name)
@@ -738,6 +744,7 @@ def check_min_samples_split(name):
"Failed with {0}".format(name))
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
@pytest.mark.parametrize('name', FOREST_ESTIMATORS)
def test_min_samples_split(name):
check_min_samples_split(name)
@@ -775,6 +782,7 @@ def check_min_samples_leaf(name):
"Failed with {0}".format(name))
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
@pytest.mark.parametrize('name', FOREST_ESTIMATORS)
def test_min_samples_leaf(name):
check_min_samples_leaf(name)
@@ -842,6 +850,7 @@ def check_sparse_input(name, X, X_sparse, y):
dense.fit_transform(X).toarray())
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
@pytest.mark.parametrize('name', FOREST_ESTIMATORS)
@pytest.mark.parametrize('sparse_matrix',
(csr_matrix, csc_matrix, coo_matrix))
@@ -899,6 +908,7 @@ def check_memory_layout(name, dtype):
assert_array_almost_equal(est.fit(X, y).predict(X), y)
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS)
@pytest.mark.parametrize('dtype', (np.float64, np.float32))
def test_memory_layout(name, dtype):
@@ -977,6 +987,7 @@ def check_class_weights(name):
clf.fit(iris.data, iris.target, sample_weight=sample_weight)
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
@pytest.mark.parametrize('name', FOREST_CLASSIFIERS)
def test_class_weights(name):
check_class_weights(name)
@@ -996,6 +1007,7 @@ def check_class_weight_balanced_and_bootstrap_multi_output(name):
clf.fit(X, _y)
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
@pytest.mark.parametrize('name', FOREST_CLASSIFIERS)
def test_class_weight_balanced_and_bootstrap_multi_output(name):
check_class_weight_balanced_and_bootstrap_multi_output(name)
@@ -1026,6 +1038,7 @@ def check_class_weight_errors(name):
assert_raises(ValueError, clf.fit, X, _y)
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
@pytest.mark.parametrize('name', FOREST_CLASSIFIERS)
def test_class_weight_errors(name):
check_class_weight_errors(name)
@@ -1163,6 +1176,7 @@ def test_warm_start_oob(name):
check_warm_start_oob(name)
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
def test_dtype_convert(n_classes=15):
classifier = RandomForestClassifier(random_state=0, bootstrap=False)
@@ -1201,6 +1215,7 @@ def test_decision_path(name):
check_decision_path(name)
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
def test_min_impurity_split():
# Test if min_impurity_split of base estimators is set
# Regression test for #8006
@@ -1216,6 +1231,7 @@ def test_min_impurity_split():
assert_equal(tree.min_impurity_split, 0.1)
+ @pytest.mark.filterwarnings('ignore:The default value of n_estimators')
def test_min_impurity_decrease():
X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
all_estimators = [RandomForestClassifier, RandomForestRegressor,
@@ -1228,3 +1244,21 @@ def test_min_impurity_decrease():
# Simply check if the parameter is passed on correctly. Tree tests
# will suffice for the actual working of this param
assert_equal(tree.min_impurity_decrease, 0.1)
+ @pytest.mark.parametrize('forest',
+                          [RandomForestClassifier, RandomForestRegressor,
+                           ExtraTreesClassifier, ExtraTreesRegressor,
+                           RandomTreesEmbedding])
+ def test_nestimators_future_warning(forest):
+     # FIXME: to be removed 0.22
+     # When n_estimators default value is used
+     msg_future = ("The default value of n_estimators will change from "
+                   "10 in version 0.20 to 100 in 0.22.")
+     est = forest()
+     est = assert_warns_message(FutureWarning, msg_future, est.fit, X, y)
+
+     # When n_estimators is a valid value not equal to the default
+     est = forest(n_estimators=100)
+     est = assert_no_warnings(est.fit, X, y)