From 8f715dd29de2d0a20f94da38a4d9257d52e62b50 Mon Sep 17 00:00:00 2001 From: yelite Date: Thu, 10 Jul 2014 14:48:20 +0800 Subject: [PATCH] In tree, min_samples_split and min_samples_leaf now accept float number as percentage. --- sklearn/tree/tree.py | 51 +++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index ce2b0c3f4126a..eff1609312985 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -244,8 +244,17 @@ def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True, else: min_weight_leaf = 0. + if isinstance(self.min_samples_leaf, float): + min_samples_leaf = int(np.ceil(self.min_samples_leaf * n_samples)) + else: + min_samples_leaf = self.min_samples_leaf + # Set min_samples_split sensibly - min_samples_split = max(self.min_samples_split, + if isinstance(self.min_samples_split, float): + min_samples_split = int(np.ceil(self.min_samples_split * n_samples)) + else: + min_samples_split = self.min_samples_split + min_samples_split = max(min_samples_split, 2 * self.min_samples_leaf) # Build tree @@ -261,7 +270,7 @@ def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True, if not isinstance(self.splitter, Splitter): splitter = SPLITTERS[self.splitter](criterion, self.max_features_, - self.min_samples_leaf, + min_samples_leaf, min_weight_leaf, random_state) @@ -270,12 +279,12 @@ def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True, # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: builder = DepthFirstTreeBuilder(splitter, min_samples_split, - self.min_samples_leaf, + min_samples_leaf, min_weight_leaf, max_depth) else: builder = BestFirstTreeBuilder(splitter, min_samples_split, - self.min_samples_leaf, + min_samples_leaf, min_weight_leaf, max_depth, max_leaf_nodes) @@ -402,11 +411,19 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin): min_samples_split 
samples. Ignored if ``max_samples_leaf`` is not None. - min_samples_split : int, optional (default=2) - The minimum number of samples required to split an internal node. + min_samples_split : int, float, optional (default=2) + The minimum number of samples required to split an internal node: + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` is the minimum + number of samples for each split. - min_samples_leaf : int, optional (default=1) - The minimum number of samples required to be at a leaf node. + min_samples_leaf : int, float, optional (default=1) + The minimum number of samples required to be at a leaf node: + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` is the minimum + number of samples for each leaf node. min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the input samples required to be at a @@ -625,11 +642,19 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): min_samples_split samples. Ignored if ``max_samples_leaf`` is not None. - min_samples_split : int, optional (default=2) - The minimum number of samples required to split an internal node. - - min_samples_leaf : int, optional (default=1) - The minimum number of samples required to be at a leaf node. + min_samples_split : int, float, optional (default=2) + The minimum number of samples required to split an internal node: + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` is the minimum + number of samples for each split. + + min_samples_leaf : int, float, optional (default=1) + The minimum number of samples required to be at a leaf node: + - If int, then consider `min_samples_leaf` as the minimum number. 
+ - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` is the minimum + number of samples for each leaf node. min_weight_fraction_leaf : float, optional (default=0.) The minimum weighted fraction of the input samples required to be at a