[MRG] Adds Minimal Cost-Complexity Pruning to Decision Trees #12887

Merged · 94 commits, merged Aug 20, 2019 · changes shown from 15 commits

Commits
a5f295a
ENH: Adds cost complexity pruning
thomasjpfan Dec 28, 2018
9569e9f
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Dec 28, 2018
1a554f6
DOC: Update
thomasjpfan Dec 28, 2018
84dbc05
DOC: Adds comments to algorithm
thomasjpfan Dec 28, 2018
5e10962
RFC: Small
thomasjpfan Dec 28, 2018
745cd18
RFC: Moves some logic to cython
thomasjpfan Dec 28, 2018
c1cd149
DOC: More comments
thomasjpfan Dec 28, 2018
90c294e
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Dec 28, 2018
5c36185
DOC: Removes unused parameter
thomasjpfan Dec 29, 2018
4b277b9
DOC: Rewords
thomasjpfan Dec 29, 2018
b83b135
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Dec 29, 2018
ffece26
ENH: Adds support for extra trees
thomasjpfan Dec 29, 2018
b2e2a52
DOC: Updates whats_new
thomasjpfan Dec 29, 2018
e95829f
RFC: Makes prune_tree public
thomasjpfan Dec 29, 2018
c313151
RFC: Less diffs
thomasjpfan Dec 29, 2018
fd5be88
RFC: Moves prune_tree closer to the end of fit
thomasjpfan Jan 1, 2019
2e348db
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Jan 1, 2019
75709a0
BUG: Fix
thomasjpfan Jan 1, 2019
efe9793
BUG: Fix
thomasjpfan Jan 1, 2019
568eb04
RFC: Addresses code review
thomasjpfan Jan 29, 2019
eb28d50
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Jan 29, 2019
847e1f0
RFC: Minimize diffs
thomasjpfan Jan 29, 2019
fa9c83c
RFC: Uses memoryviews
thomasjpfan Jan 29, 2019
81c776e
RFC: Deterministic ordering
thomasjpfan Jan 29, 2019
0d85747
ENH: Returns tree with greatest CCP less than alpha
thomasjpfan Jan 30, 2019
57963d5
RFC: Rename alpha to ccp_alpha
thomasjpfan Jan 31, 2019
25910e0
DOC: Uses ccp_alpha
thomasjpfan Jan 31, 2019
e2cd686
ENH: Users cython for pruning
thomasjpfan Feb 4, 2019
a43972a
ENH: Adds ccp_alpha to forest
thomasjpfan Feb 5, 2019
39dbccd
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Feb 5, 2019
1a347f8
BUG: Fixes doctest
thomasjpfan Feb 5, 2019
43a656b
ENH: Releases gil
thomasjpfan Feb 5, 2019
e59b662
BUG: Fix
thomasjpfan Feb 5, 2019
6465355
RFC Address comments
thomasjpfan Feb 7, 2019
bcfbfc3
STY Flake8
thomasjpfan Feb 7, 2019
b17433c
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Feb 7, 2019
71d0513
DOC adds raw for math
thomasjpfan Feb 7, 2019
7ee455e
RFC Address comments
thomasjpfan Feb 8, 2019
2e62490
DOC Adds pruning to user guide
thomasjpfan Feb 8, 2019
bba792d
DOC English
thomasjpfan Feb 8, 2019
ded8552
DOC Adds forests to whats_new
thomasjpfan Feb 8, 2019
3623657
ENH Adds pruning to gradient boosting
thomasjpfan Feb 8, 2019
2a3b554
DOC Fixes whats_new
thomasjpfan Feb 11, 2019
b0d76fc
DOC Show plt at the end
thomasjpfan Feb 12, 2019
97229ec
RFC Removes unneeded code
thomasjpfan Feb 13, 2019
013ca9e
STY pep257
thomasjpfan Feb 15, 2019
791077d
TST Adds prune all leaves test
thomasjpfan Feb 16, 2019
0fa13ed
RFC Address comments
thomasjpfan Feb 16, 2019
af54d21
DOC Adds more details
thomasjpfan Feb 27, 2019
88f0011
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Feb 28, 2019
ccd47d1
CLN Address comments
thomasjpfan Mar 12, 2019
ec1b9fc
DOC Fix
thomasjpfan Mar 12, 2019
4a4b2ac
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Apr 18, 2019
8132d2d
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Apr 26, 2019
3e5486d
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Apr 26, 2019
188ccb8
ENH Adds cost complexity pruning path
thomasjpfan Apr 26, 2019
218311f
DOC Adds docstring
thomasjpfan Apr 26, 2019
b8a2769
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan May 6, 2019
697a383
DOC Move whats_new
thomasjpfan May 6, 2019
2de7dfd
ENH Adds impurity tracking to pruning
thomasjpfan May 6, 2019
7452f1f
DOC New example using path function
thomasjpfan May 7, 2019
a199ce8
DOC Adjust titles
thomasjpfan May 7, 2019
dc6b6fd
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan May 7, 2019
45b5cdc
ENH Returns a bunch when calcuating path
thomasjpfan May 20, 2019
abf41ca
BUG Uses bunch in tests
thomasjpfan May 21, 2019
8cc77ca
DOC Adds more details in example
thomasjpfan May 21, 2019
971f85a
CLN Adds more comments
thomasjpfan May 21, 2019
e81f2a3
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan May 21, 2019
bc956ca
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan May 31, 2019
7f620a8
DOC Removes last node in all plots
thomasjpfan Jun 3, 2019
d610101
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Jun 3, 2019
b9247fc
DOC Adjust layout
thomasjpfan Jun 4, 2019
cc5f1a9
CLN Address comments
thomasjpfan Jun 6, 2019
5e2ace3
CLN Adds error message to MemoryError
thomasjpfan Jun 17, 2019
5b50196
CLN Adds alpha dependency of t
thomasjpfan Jun 17, 2019
f612457
DOC Update wording
thomasjpfan Jun 17, 2019
86fdbc6
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Jul 17, 2019
dda0f5e
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Jul 30, 2019
40bab1a
CLN Remove file
thomasjpfan Jul 30, 2019
0a06e46
CLN Address comments
thomasjpfan Jul 30, 2019
9bf7d83
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Jul 30, 2019
31e7816
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Aug 16, 2019
7994897
CLN Address NicolasHug's comments
thomasjpfan Aug 16, 2019
2a42e0c
CLN Refactors tests to use pruning_path
thomasjpfan Aug 16, 2019
e8e3967
TST Adds single node tree test
thomasjpfan Aug 16, 2019
17b4112
STY flake8
thomasjpfan Aug 16, 2019
1a8f07e
TST Adds test on impurities from path
thomasjpfan Aug 16, 2019
17d3888
DOC Adds words
thomasjpfan Aug 16, 2019
9b01fc8
DOC Adds words
thomasjpfan Aug 16, 2019
073fd00
Merge remote-tracking branch 'upstream/master' into ccp_prune_tree
thomasjpfan Aug 16, 2019
1774b8c
DOC Better words
thomasjpfan Aug 16, 2019
73cdf1e
DOC Adds docstring to ccp_pruning_path
thomasjpfan Aug 16, 2019
a688f60
DOC Uses new standrad
thomasjpfan Aug 16, 2019
82f3aa1
CLN Address joels comments
thomasjpfan Aug 18, 2019
5 changes: 5 additions & 0 deletions doc/whats_new/v0.21.rst
@@ -192,6 +192,11 @@ Support for Python 3.4 and below has been officially dropped.
  and :class:`tree.ExtraTreeRegressor`.
  :issue:`12300` by :user:`Adrin Jalali <adrinjalali>`.

- |Feature| Adds minimal cost complexity pruning to
  :class:`tree.DecisionTreeClassifier`, :class:`tree.DecisionTreeRegressor`,
  :class:`tree.ExtraTreeClassifier`, and :class:`tree.ExtraTreeRegressor`.
  :issue:`6557` by :user:`Thomas Fan <thomasjpfan>`.
Member:

Suggested change
-  :issue:`6557` by :user:`Thomas Fan <thomasjpfan>`.
+  :issue:`12887` by :user:`Thomas Fan <thomasjpfan>`.

This should be the PR number.

- |Fix| Fixed an issue with :class:`tree.BaseDecisionTree`
  and consequently all estimators based
  on it, including :class:`tree.DecisionTreeClassifier`,
67 changes: 67 additions & 0 deletions examples/tree/plot_cost_complexity_pruning.py
@@ -0,0 +1,67 @@
r"""
Member:

I like this example!

A few remarks:

  • you don't need a raw string (remove r in r""")
  • there are still typos https://github.com/scikit-learn/scikit-learn/pull/12887/files#r244643624
  • you need to add print(__doc__) before the imports
  • imports should be at the top (there's a matplotlib import in the middle of the file)
  • fig.show() immediately closes the window, I think you'll need to use plt.show instead
  • maybe we can have both plots on the same fig with sharex=True?

Member Author:

I tend toward optimizing my examples for HTML output. This usually means keeping the description close to the plotting code and plotting one thing at a time.

Given that, I am okay with combining the two plots.

NicolasHug (Member), Feb 7, 2019:

Ok, makes sense.

========================================================
Post pruning decision trees with cost complexity pruning
========================================================

In this example, decision tree classifiers are trained with a post pruning
technique called minimal cost complexity pruning. This technique is
parameterized by the complexity parameter, :math:`\alpha`. Greater values of
:math:`\alpha` will prune more of the tree, thus creating a smaller trees.
"""

###############################################################################
# Train decision tree classifiers
# -------------------------------
# Train 40 decision tree classifiers with :math:`\alpha` from 0.00 to
# 0.40.

Member:

It ranges from 0 to 0.04 in your example (np.linspace(0, 0.04, 40))
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

alphas = np.linspace(0, 0.04, 40)
clfs = []
for alpha in alphas:
    clf = DecisionTreeClassifier(random_state=0, alpha=alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)

###############################################################################
# Plot training and test scores vs alpha
# --------------------------------------
# Calcuate and plot the the training scores and test accuracy scores

Member:

Just a few typos (I'll document myself on tree pruning and will try to provide a more in-depth review later)

  • Calcuate
  • the the
  • above "a smaller trees"

also I think you should avoid the `:math:` notation in comments

Member Author:

The :math: notation is currently used in other examples such as: https://github.com/scikit-learn/scikit-learn/blob/master/examples/svm/plot_svm_scale_c.py. Are we discouraging the usage of :math: in our examples?

Member:

It's OK in the docstrings since it will be rendered like regular rst by sphinx, but in the comments it is not necessary.

Member:

Ooh ok I didn't know it worked like that, sorry

# for our classifiers. With :math:`\alpha` equal to 0.0, the decision tree is
# overfitting with a 1.0 training accuracy score. As the decision tree is
# pruned the testing accuracy score increases up to a point and then decreases.
import matplotlib.pyplot as plt

train_scores = []
test_scores = []
for clf in clfs:
    train_scores.append(clf.score(X_train, y_train))
    test_scores.append(clf.score(X_test, y_test))

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(alphas, train_scores, label="train")
ax.plot(alphas, test_scores, label="test")
ax.legend()
fig.show()

###############################################################################
# Plot total number of nodes vs alpha
# -----------------------------------
# Plot the total number of nodes for our classifiers. As :math:`\alpha`
# increases, the number of nodes decreases.
node_counts = [clf.tree_.node_count for clf in clfs]
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("number of nodes")
ax.set_title("Number of nodes vs alpha")
ax.plot(alphas, node_counts)
fig.show()
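Following up on the reviewer's sharex=True suggestion above, a sketch of what the combined figure could look like (assuming the alphas, train_scores, test_scores, and node_counts computed earlier in this example; not part of the diff):

# Sketch of the suggested combined figure; assumes alphas, train_scores,
# test_scores and node_counts from the cells above.
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)

ax1.plot(alphas, train_scores, label="train")
ax1.plot(alphas, test_scores, label="test")
ax1.set_ylabel("accuracy")
ax1.set_title("Accuracy and number of nodes vs alpha")
ax1.legend()

ax2.plot(alphas, node_counts)
ax2.set_ylabel("number of nodes")
ax2.set_xlabel("alpha")  # x axis is shared, so only label the bottom panel

plt.show()  # plt.show keeps the figure open, unlike fig.show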
10 changes: 10 additions & 0 deletions sklearn/tree/_tree.pxd
@@ -103,3 +103,13 @@ cdef class TreeBuilder:
                np.ndarray sample_weight=*,
                np.ndarray X_idx_sorted=*)
    cdef _check_input(self, object X, np.ndarray y, np.ndarray sample_weight)


# =============================================================================
# Build Pruned Tree
# =============================================================================

cpdef build_pruned_tree(
        Tree tree,
        Tree orig_tree,
        np.ndarray[np.npy_uint8, ndim=1] leaves_in_subtree)
98 changes: 98 additions & 0 deletions sklearn/tree/_tree.pyx
@@ -1132,3 +1132,101 @@ cdef class Tree:
        Py_INCREF(self)
        arr.base = <PyObject*> self
        return arr

# =============================================================================
# Build Pruned Tree
# =============================================================================

cpdef build_pruned_tree(
        Tree tree,
        Tree orig_tree,
        np.ndarray[np.npy_uint8, ndim=1] leaves_in_subtree):
"""Builds a pruned tree.

Builds a pruned tree from the original tree. The values and nodes from the
original tree are copied into the pruned tree.

    Parameters
    ----------
    tree : Tree
        Location to place the pruned tree
    orig_tree : Tree
        Original tree
    leaves_in_subtree : numpy.ndarray, dtype=np.npy_uint8
        Original node ids that are leaves in pruned tree
    """

    cdef SIZE_t capacity = np.sum(leaves_in_subtree)
    tree._resize(capacity)

    cdef SIZE_t orig_node_id
    cdef SIZE_t new_node_id
    cdef SIZE_t depth
    cdef SIZE_t parent
    cdef bint is_left
    cdef bint is_leaf

    # value_stride for original tree and new tree are the same
    cdef SIZE_t value_stride = orig_tree.value_stride
    cdef SIZE_t max_depth_seen = -1
    cdef int rc = 0
    cdef Node* node
    cdef double* orig_value_ptr
    cdef double* new_value_ptr

    # Only uses the start, depth, parent, and is_left variables
    cdef Stack stack = Stack(INITIAL_STACK_SIZE)
    cdef StackRecord stack_record

    with nogil:
        # push root node onto stack
        rc = stack.push(0, 0, 0, _TREE_UNDEFINED, 0, 0.0, 0)
        if rc == -1:
            with gil:
                raise MemoryError()

        while not stack.is_empty():
            stack.pop(&stack_record)

            orig_node_id = stack_record.start
            depth = stack_record.depth
            parent = stack_record.parent
            is_left = stack_record.is_left

            is_leaf = leaves_in_subtree[orig_node_id]
            node = &orig_tree.nodes[orig_node_id]

            new_node_id = tree._add_node(
                parent, is_left, is_leaf, node.feature, node.threshold,
                node.impurity, node.n_node_samples,
                node.weighted_n_node_samples)

            if new_node_id == <SIZE_t>(-1):
                rc = -1
                break

            # copy value from original tree to new tree
            orig_value_ptr = orig_tree.value + value_stride * orig_node_id
            new_value_ptr = tree.value + value_stride * new_node_id
            memcpy(new_value_ptr, orig_value_ptr, sizeof(double) * value_stride)

            if not is_leaf:
                # Push right child on stack
                rc = stack.push(
                    node.right_child, 0, depth + 1, new_node_id, 0, 0.0, 0)
                if rc == -1:
                    break

                # push left child on stack
                rc = stack.push(
                    node.left_child, 0, depth + 1, new_node_id, 1, 0.0, 0)
                if rc == -1:
                    break

            if depth > max_depth_seen:
                max_depth_seen = depth

    if rc >= 0:
        tree.max_depth = max_depth_seen
    if rc == -1:
        raise MemoryError()
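For intuition, a rough pure-Python sketch of the same stack-based copy (the function and argument names here are hypothetical stand-ins; the Cython function above is the actual implementation):

# Pure-Python sketch of the traversal above, illustration only.
def build_pruned_tree_py(orig_children_left, orig_children_right,
                         leaves_in_subtree):
    """Return the original node ids kept by the pruned-tree copy, in the
    order the Cython code would add them with tree._add_node."""
    kept = []
    stack = [0]  # start from the root, node id 0
    while stack:
        node_id = stack.pop()
        kept.append(node_id)  # mirrors copying the node and its value array
        if not leaves_in_subtree[node_id]:
            # Interior node of the pruned subtree: descend into both children.
            # Pushing the right child first matches the Cython push order, so
            # the left subtree is processed first.
            stack.append(orig_children_right[node_id])
            stack.append(orig_children_left[node_id])
    return kept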
68 changes: 68 additions & 0 deletions sklearn/tree/tests/test_tree.py
@@ -1828,3 +1828,71 @@ def test_empty_leaf_infinite_threshold():
    infinite_threshold = np.where(~np.isfinite(tree.tree_.threshold))[0]
    assert len(infinite_threshold) == 0
    assert len(empty_leaf) == 0


@pytest.mark.parametrize("criterion", CLF_CRITERIONS)
@pytest.mark.parametrize(
    "dataset", set(DATASETS.keys()) - {"reg_small", "boston"})
@pytest.mark.parametrize(
    "tree_cls", [DecisionTreeClassifier, ExtraTreeClassifier])
def test_prune_tree_clf_are_subtrees(criterion, dataset, tree_cls):
    dataset = DATASETS[dataset]
    X, y = dataset["X"], dataset["y"]
    assert_pruning_creates_subtree(tree_cls, X, y)


@pytest.mark.parametrize("criterion", REG_CRITERIONS)
@pytest.mark.parametrize("dataset", DATASETS.keys())
@pytest.mark.parametrize(
    "tree_cls", [DecisionTreeRegressor, ExtraTreeRegressor])
def test_prune_tree_reg_are_subtrees(criterion, dataset, tree_cls):
    dataset = DATASETS[dataset]
    X, y = dataset["X"], dataset["y"]
    assert_pruning_creates_subtree(tree_cls, X, y)


def assert_pruning_creates_subtree(estimator_cls, X, y):
    estimators = []
    for alpha in np.linspace(0.0, 0.2, 11):
        est = estimator_cls(
            max_leaf_nodes=20, alpha=alpha, random_state=0).fit(X, y)
        estimators.append(est)

    for prev_est, next_est in zip(estimators[:-1], estimators[1:]):

Member:

Suggested change
-    for prev_est, next_est in zip(estimators[:-1], estimators[1:]):
+    for prev_est, next_est in zip(estimators, estimators[1:]):

Just a nitpick: zip only iterates until the shortest iterable is exhausted

        assert_is_subtree(prev_est.tree_, next_est.tree_)


def assert_is_subtree(tree, subtree):
    assert tree.node_count >= subtree.node_count
    assert tree.max_depth >= subtree.max_depth

    tree_c_left = tree.children_left
    tree_c_right = tree.children_right
    subtree_c_left = subtree.children_left
    subtree_c_right = subtree.children_right

    stack = [(0, 0)]
    while len(stack) > 0:

Member:

Suggested change
-    while len(stack) > 0:
+    while stack:

        tree_n_idx, subtree_n_idx = stack.pop()

Member:

I would propose tree_node_idx and subtree_node_idx to be more explicit (unless it makes the indentation look worse)

        assert_array_almost_equal(
            tree.value[tree_n_idx], subtree.value[subtree_n_idx])
        assert_almost_equal(
            tree.impurity[tree_n_idx], subtree.impurity[subtree_n_idx])
        assert_almost_equal(
            tree.n_node_samples[tree_n_idx],
            subtree.n_node_samples[subtree_n_idx])
        assert_almost_equal(
            tree.weighted_n_node_samples[tree_n_idx],
            subtree.weighted_n_node_samples[subtree_n_idx])

        if (subtree_c_left[subtree_n_idx] == subtree_c_right[subtree_n_idx]):
            # is a leaf
            assert_almost_equal(-2, subtree.threshold[subtree_n_idx])

Member:

Use TREE_UNDEFINED instead of -2

        else:
            # not a leaf
            assert_almost_equal(
                tree.threshold[tree_n_idx], subtree.threshold[subtree_n_idx])
            stack.append(
                (tree_c_left[tree_n_idx], subtree_c_left[subtree_n_idx]))
            stack.append(
                (tree_c_right[tree_n_idx], subtree_c_right[subtree_n_idx]))
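For context on where the API landed, per the commit log above ("RFC: Rename alpha to ccp_alpha", "ENH Adds cost complexity pruning path", "ENH Returns a bunch when calcuating path"), a sketch of the merged interface:

# Sketch of the merged interface; parameter and helper names follow the
# commits listed above (ccp_alpha, cost_complexity_pruning_path).
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier

X, y = load_breast_cancer(return_X_y=True)

# The path helper returns a Bunch with the effective alphas of the pruning
# path and the corresponding total leaf impurities.
clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(X, y)
print(path.ccp_alphas, path.impurities)

# Refitting with one of the effective alphas yields the pruned subtree.
pruned = DecisionTreeClassifier(random_state=0,
                                ccp_alpha=path.ccp_alphas[-2])
pruned.fit(X, y)
print(pruned.tree_.node_count)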