From 7ca3c47d8f61ab886948a41492781b4d22111cc7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Aug 2020 11:42:25 -0400 Subject: [PATCH 1/3] remove duplicated call --- sklearn/tree/_criterion.pxd | 4 +++- sklearn/tree/_criterion.pyx | 47 ++++++++++++------------------------- sklearn/tree/_splitter.pyx | 16 ++++++++----- 3 files changed, 28 insertions(+), 39 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index e4a7e15ce16c1..2332cb598daef 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -62,7 +62,9 @@ cdef class Criterion: cdef void children_impurity(self, double* impurity_left, double* impurity_right) nogil cdef void node_value(self, double* dest) nogil - cdef double impurity_improvement(self, double impurity) nogil + cdef double impurity_improvement(self, double impurity_parent, + double impurity_left, + double impurity_right) nogil cdef double proxy_impurity_improvement(self) nogil cdef class ClassificationCriterion(Criterion): diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index d11f67854731e..0ef2a59f1b0f1 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -171,7 +171,9 @@ cdef class Criterion: return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) - cdef double impurity_improvement(self, double impurity) nogil: + cdef double impurity_improvement(self, double impurity_parent, + double impurity_left, + double impurity_right) nogil: """Compute the improvement in impurity This method computes the improvement in impurity when a split occurs. @@ -186,24 +188,25 @@ cdef class Criterion: Parameters ---------- - impurity : double - The initial impurity of the node before the split + impurity_parent : double + The initial impurity of the parent node before the split + + impurity_parent : double + The impurity of the left child + + impurity_parent : double + The impurity of the right child Return ------ double : improvement in impurity after the split occurs """ - cdef double impurity_left - cdef double impurity_right - - self.children_impurity(&impurity_left, &impurity_right) - return ((self.weighted_n_node_samples / self.weighted_n_samples) * - (impurity - (self.weighted_n_right / - self.weighted_n_node_samples * impurity_right) - - (self.weighted_n_left / - self.weighted_n_node_samples * impurity_left))) + (impurity_parent - (self.weighted_n_right / + self.weighted_n_node_samples * impurity_right) + - (self.weighted_n_left / + self.weighted_n_node_samples * impurity_left))) cdef class ClassificationCriterion(Criterion): @@ -1305,23 +1308,3 @@ cdef class FriedmanMSE(MSE): self.weighted_n_left * total_sum_right) return diff * diff / (self.weighted_n_left * self.weighted_n_right) - - cdef double impurity_improvement(self, double impurity) nogil: - cdef double* sum_left = self.sum_left - cdef double* sum_right = self.sum_right - - cdef double total_sum_left = 0.0 - cdef double total_sum_right = 0.0 - - cdef SIZE_t k - cdef double diff = 0.0 - - for k in range(self.n_outputs): - total_sum_left += sum_left[k] - total_sum_right += sum_right[k] - - diff = (self.weighted_n_right * total_sum_left - - self.weighted_n_left * total_sum_right) / self.n_outputs - - return (diff * diff / (self.weighted_n_left * self.weighted_n_right * - self.weighted_n_node_samples)) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 15b69487990d8..d7b5e81c7b8f7 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -434,9 +434,10 @@ cdef class BestSplitter(BaseDenseSplitter): self.criterion.reset() self.criterion.update(best.pos) - best.improvement = self.criterion.impurity_improvement(impurity) self.criterion.children_impurity(&best.impurity_left, &best.impurity_right) + best.improvement = self.criterion.impurity_improvement( + impurity, best.impurity_left, best.impurity_right) # Respect invariant for constant features: the original order of # element in features[:n_known_constants] must be preserved for sibling @@ -745,9 +746,10 @@ cdef class RandomSplitter(BaseDenseSplitter): self.criterion.reset() self.criterion.update(best.pos) - best.improvement = self.criterion.impurity_improvement(impurity) self.criterion.children_impurity(&best.impurity_left, &best.impurity_right) + best.improvement = self.criterion.impurity_improvement( + impurity, best.impurity_left, best.impurity_right) # Respect invariant for constant features: the original order of # element in features[:n_known_constants] must be preserved for sibling @@ -1293,9 +1295,10 @@ cdef class BestSparseSplitter(BaseSparseSplitter): self.criterion.reset() self.criterion.update(best.pos) - best.improvement = self.criterion.impurity_improvement(impurity) self.criterion.children_impurity(&best.impurity_left, &best.impurity_right) + best.improvement = self.criterion.impurity_improvement( + impurity, best.impurity_left, best.impurity_right) # Respect invariant for constant features: the original order of # element in features[:n_known_constants] must be preserved for sibling @@ -1504,10 +1507,10 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement - current.improvement = self.criterion.impurity_improvement(impurity) - self.criterion.children_impurity(¤t.impurity_left, ¤t.impurity_right) + current.improvement = self.criterion.impurity_improvement( + impurity, current.impurity_left, current.impurity_right) best = current # Reorganize into samples[start:best.pos] + samples[best.pos:end] @@ -1521,9 +1524,10 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): self.criterion.reset() self.criterion.update(best.pos) - best.improvement = self.criterion.impurity_improvement(impurity) self.criterion.children_impurity(&best.impurity_left, &best.impurity_right) + best.improvement = self.criterion.impurity_improvement( + impurity, best.impurity_left, best.impurity_right) # Respect invariant for constant features: the original order of # element in features[:n_known_constants] must be preserved for sibling From f5a75814f7154e525d66ca641de051f16a2d25e2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 19 Aug 2020 15:23:22 -0400 Subject: [PATCH 2/3] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Juan Carlos Alfaro JimĂ©nez --- sklearn/tree/_criterion.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 0ef2a59f1b0f1..b0b3eb84ca3b1 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -191,10 +191,10 @@ cdef class Criterion: impurity_parent : double The initial impurity of the parent node before the split - impurity_parent : double + impurity_left : double The impurity of the left child - impurity_parent : double + impurity_right : double The impurity of the right child Return From cf33df74b705c6ab58735bbfe75ea70882ae4e5e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 20 Aug 2020 09:30:26 -0400 Subject: [PATCH 3/3] putback friedman mse overrided method --- sklearn/tree/_criterion.pyx | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 0ef2a59f1b0f1..314fcac283dbc 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1308,3 +1308,25 @@ cdef class FriedmanMSE(MSE): self.weighted_n_left * total_sum_right) return diff * diff / (self.weighted_n_left * self.weighted_n_right) + + cdef double impurity_improvement(self, double impurity_parent, double + impurity_left, double impurity_right) nogil: + # Note: none of the arguments are used here + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + + cdef double total_sum_left = 0.0 + cdef double total_sum_right = 0.0 + + cdef SIZE_t k + cdef double diff = 0.0 + + for k in range(self.n_outputs): + total_sum_left += sum_left[k] + total_sum_right += sum_right[k] + + diff = (self.weighted_n_right * total_sum_left - + self.weighted_n_left * total_sum_right) / self.n_outputs + + return (diff * diff / (self.weighted_n_left * self.weighted_n_right * + self.weighted_n_node_samples))