#### 剪枝过程 优化结构化损失：
$C_\alpha(T)=C(T)+\alpha|T|$

$C(T)=\sum_{t=1}^{|T|}N_tH_t(T) = \sum_{t=1}^{|T|}N_t\left[-\sum_{k=1}^{K}\frac{N_{tk}}{N_t}\log_2 \frac{N_{tk}}{N_t}\right]$

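其中 $|T|$ 为树 $T$ 的叶子结点个数，$N_t$ 为叶子结点 $t$ 上的样本数，$N_{tk}$ 为其中属于第 $k$ 类的样本数，$\alpha \ge 0$ 控制对模型复杂度的惩罚强度。$C(T)$ 即所有叶子结点上熵的加权和。剪枝时自下而上尝试把同一父结点下的叶子结点合并，比较合并前后的损失 $C_\alpha$ 大小，如果合并后损失更小，则这些叶子结点向上一层合并为一个叶子结点。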


In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits,load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# load dataset
iris = load_iris()
# Truncate the float measurements to integers so every feature takes only a
# handful of distinct values. The cast is done with an explicit astype(int):
# lossy float -> int conversion through the DataFrame constructor's dtype=
# argument is pandas-version dependent, astype(int) always truncates.
df = pd.DataFrame(iris.data).astype(int)
df['label'] = iris.target

In [3]:
# Preview the first rows of the frame (four integer feature columns + 'label').
df.head()

Unnamed: 0,0,1,2,3,label
0,5,3,1,0,0
1,4,3,1,0,0
2,4,3,1,0,0
3,4,3,1,0,0
4,5,3,1,0,0


In [4]:
class TreeNode:
    """One node of a multi-way decision tree.

    Attributes:
        childs: dict mapping a value of ``feature_name`` to the sub-tree
            (another ``TreeNode``) reached for that value.
        feature_name: the feature this node splits on.
        label: the label stored on this node.
        is_leaf: whether this node is a leaf.
        instances: the samples stored on this node; kept on leaves so that a
            later pruning pass can compute their entropy.
    """

    def __init__(self, feature_name=None, label=None, is_leaf=False, instances=None):
        # Prediction payload.
        self.label = label
        self.is_leaf = is_leaf
        self.instances = instances

        # Split description: the chosen feature and one child per feature value.
        self.feature_name = feature_name
        self.childs = dict()

    def add_child(self, val, node):
        """Attach ``node`` as the sub-tree for feature value ``val``."""
        self.childs.update({val: node})

In [5]:
class DecisionTree:
    """Multi-way decision tree (ID3 / C4.5) over discrete-valued features.

    Training data is a pandas DataFrame whose class column is named
    ``'label'``; every other column is used as a feature and each distinct
    value of the chosen feature gets its own child node.

    Args:
        method: ``"id3"`` (information gain) or ``"c4.5"`` (gain ratio).
        epsilon: pre-pruning threshold; a node whose best gain is below it
            becomes a leaf.
        alpha: stored on the instance only; no method of this class reads it
            (it is available to callers that post-prune the fitted tree).

    Raises:
        ValueError: if ``method`` is not one of the two supported names.
    """

    def __init__(self, method="id3", epsilon=0.1, alpha=1.0):
        if method not in ("id3", "c4.5"):
            raise ValueError("Invalid Decision Tree method!")

        self.alpha = alpha
        self.method = method
        self.epsilon = epsilon

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent value of the ``labels`` Series."""
        return labels.value_counts().idxmax()

    def entropy(self, series):
        """Return the empirical entropy (base 2) of the values in ``series``."""
        probs = series.value_counts(normalize=True).to_numpy()
        # Guard against zero-probability entries (e.g. unused categories),
        # for which p*log2(p) would evaluate to NaN instead of 0.
        probs = probs[probs > 0]
        return -np.sum(probs * np.log2(probs))

    def conditional_entropy(self, data):
        """Return H(label | feature) for a two-column frame.

        ``data`` holds the feature in its first column and the class in the
        ``'label'`` column. The result is the |D_i|/|D|-weighted sum of the
        label entropy inside each feature-value group.
        """
        # Group by raw values so the grouping never depends on index alignment.
        keys = data.iloc[:, 0].to_numpy()
        groups = [group for _, group in data['label'].groupby(keys)]
        total = sum(len(group) for group in groups)
        if total == 0:
            return 0.0
        return sum(len(group) / total * self.entropy(group) for group in groups)

    def info_gain(self, data):
        """Return the split score of the feature in the first column of ``data``.

        For ``"id3"`` this is the information gain H(D) - H(D|A); for
        ``"c4.5"`` it is the gain ratio, i.e. the gain divided by the entropy
        of the feature itself.
        """
        H_D = self.entropy(data['label'])
        H_D_given_X = self.conditional_entropy(data)
        g_D_A = H_D - H_D_given_X
        if self.method == "c4.5":
            H_A_D = self.entropy(data.iloc[:, 0])
            # A constant feature has zero split information; it cannot separate
            # anything, so score it 0 instead of dividing by zero (NaN).
            if H_A_D <= 0:
                return 0.0
            g_D_A /= H_A_D
        return g_D_A

    def calc_info_gain(self, data, features=None):
        """Return ``(best_feature, best_score)`` among ``features``.

        ``features`` defaults to every column of ``data`` except ``'label'``.
        When no feature has a strictly positive score the result is
        ``(None, 0)``.
        """
        if features is None:
            features = [col for col in data.columns if col != 'label']

        max_feature, max_gain = None, 0
        for feature_name in features:
            gain = self.info_gain(data[[feature_name, 'label']])
            if gain > max_gain:
                max_feature = feature_name
                max_gain = gain
        return max_feature, max_gain

    def create_tree(self, data, features):
        """Recursively build and return the (sub-)tree for ``data``.

        ``features`` is the collection of feature names still usable on this
        path; it is never modified, so sibling branches (and the caller) keep
        their own full view of the available features.
        """
        labels = data['label']

        # base case: no other features
        if len(features) == 0:
            return TreeNode(label=self._majority_label(labels), is_leaf=True, instances=data)

        # base case: one class
        if labels.nunique() == 1:
            return TreeNode(label=labels.iloc[0], is_leaf=True, instances=data)

        # calc info gain by features
        feature_name, max_gain = self.calc_info_gain(data, features=features)

        # pre prune; also stop when no feature is informative at all, which
        # would otherwise try to split on feature_name=None when epsilon <= 0
        if feature_name is None or max_gain < self.epsilon:
            return TreeNode(label=self._majority_label(labels), is_leaf=True, instances=data)

        # Internal nodes remember the majority class so predict() has a
        # fallback for feature values that never occurred during training.
        node = TreeNode(feature_name=feature_name, label=self._majority_label(labels), is_leaf=False)

        # Recurse: partition the data by feature value and build one sub-tree
        # per value. A fresh set is built here on purpose: removing the feature
        # in place would also strip it from the sibling branches.
        remaining = set(features) - {feature_name}
        for val in data[feature_name].unique():
            sub_data = data.loc[data[feature_name] == val]
            node.add_child(val, self.create_tree(sub_data, remaining))

        return node

    def fit(self, data):
        """Fit the tree on ``data`` (feature columns plus a ``'label'`` column)."""
        # Every method reads the class from the 'label' column, so the feature
        # set is "all columns but 'label'" rather than "all but the last one".
        self.columns = set(data.columns) - {'label'}
        self.tree = self.create_tree(data, self.columns)

    def predict(self, data):
        """Return the list of predicted labels, one per row of ``data``.

        A row that reaches an internal node with a feature value unseen during
        training is assigned that node's majority label.

        Raises:
            KeyError: for an unseen feature value at a node carrying no label.
        """
        res = []
        for _, instance in data.iterrows():
            node = self.tree
            while not node.is_leaf:
                val = instance[node.feature_name]
                child = node.childs.get(val)
                if child is None:
                    if node.label is None:
                        raise KeyError(val)
                    break
                node = child
            res.append(node.label)
        return res

    def printTree(self, node, layer=0):
        """Print the tree below ``node``: leaf children first, then sub-trees."""
        print("{}layer: {} feature:{}".format('\t'*layer, layer, node.feature_name))
        for key, val in node.childs.items():
            if val.is_leaf:
                print("{}Leaf Node,val={}".format('\t'*layer, key))
        for key, val in node.childs.items():
            if not val.is_leaf:
                print("{}partition node,feature:{}".format('\t'*layer, val.feature_name))
                self.printTree(val, layer+1)

#### 树结构的打印，当把epsilon调低了之后在第二层生成了两个子树
- 预剪枝：通过控制epsilon来限制不生成复杂度较高的子树
- 后剪枝：通过结构损失最小化来合并过于复杂的子树



In [6]:
import time

# Hold out 70% of the rows for evaluation; the remaining 30% is the training set.
train_set, test_set = train_test_split(df, test_size=0.7, random_state=6)
test_X = test_set.iloc[:, :-1]
test_Y = test_set['label']

# Time fitting plus prediction of an ID3 tree with a high pre-pruning
# threshold (epsilon=1).
stime = time.time()
model = DecisionTree(method='id3', epsilon=1, alpha=1)
model.fit(train_set)
preds = model.predict(test_X)
etime = time.time()

print("ACC score:{},cost time [{}]s".format(accuracy_score(test_Y, preds), etime - stime))
model.printTree(model.tree)

ACC score:0.9619047619047619,cost time [0.1690974235534668]s
layer: 0 feature:2
Leaf Node,val=1
Leaf Node,val=6
Leaf Node,val=4
Leaf Node,val=5
Leaf Node,val=3


In [7]:
import time

# Same experiment with a much lower pre-pruning threshold (epsilon=0.01):
# time fitting plus prediction, then report accuracy and the tree layout.
stime = time.time()
model = DecisionTree(method='id3', epsilon=0.01, alpha=1)
model.fit(train_set)
preds = model.predict(test_X)
etime = time.time()

print("ACC score:{},cost time [{}]s".format(accuracy_score(test_Y, preds), etime - stime))
model.printTree(model.tree)

ACC score:0.8571428571428571,cost time [0.21031856536865234]s
layer: 0 feature:2
Leaf Node,val=1
Leaf Node,val=6
Leaf Node,val=3
partition node,feature:0
	layer: 1 feature:0
	Leaf Node,val=5
	Leaf Node,val=6
	Leaf Node,val=7
	Leaf Node,val=4
partition node,feature:3
	layer: 1 feature:3
	Leaf Node,val=2
	Leaf Node,val=1


#### 对上面epsilon=0.01的模型进行剪枝

In [8]:
def entropy(series):
    """Return the empirical entropy (base 2) of the values in ``series``."""
    # Number of occurrences of each distinct value, as a plain array.
    class_sizes = series.groupby(series).count().to_numpy()
    # Relative frequencies p_k, then H = -sum_k p_k * log2(p_k).
    probs = class_sizes / class_sizes.sum()
    return -(probs * np.log2(probs)).sum()


def _leaf_cost(labels):
    """Return N_t * H_t for one leaf: -sum_k N_tk * log2(N_tk / N_t)."""
    counts = labels.value_counts().to_numpy(dtype=float)
    counts = counts[counts > 0]
    if counts.size == 0:
        return 0.0
    return float(-(counts * np.log2(counts / counts.sum())).sum())


def prune(root, alpha):
    """Post-prune the tree below ``root`` in place, bottom-up.

    The structural loss is C_alpha(T) = C(T) + alpha * |T|, where C(T) is the
    sum of N_t * H_t over the leaves and |T| is the number of leaves. Whenever
    all children of a node are leaves, the loss of keeping them
    (sum of their costs + alpha * number_of_children) is compared with the
    loss of collapsing them into a single leaf (cost of the merged samples +
    alpha). The node is collapsed when that does not increase the loss.

    Args:
        root: node exposing ``childs``, ``is_leaf``, ``label`` and
            ``instances``; leaf ``instances`` are frames with a ``'label'``
            column.
        alpha: penalty per leaf.

    Returns:
        None; ``root`` and its descendants are modified in place.
    """
    if not root.childs:
        return None

    # Prune the sub-trees first so that freshly collapsed children can in turn
    # make this node eligible.
    for child in root.childs.values():
        if not child.is_leaf:
            prune(child, alpha)

    # Only a node whose children are all leaves can be collapsed.
    children = list(root.childs.values())
    if not all(child.is_leaf for child in children):
        return None

    merged = pd.concat([child.instances for child in children])
    expanded_loss = sum(_leaf_cost(child.instances['label']) for child in children)
    expanded_loss += alpha * len(children)
    # A collapsed node is exactly one leaf, hence a single +alpha penalty.
    collapsed_loss = _leaf_cost(merged['label']) + alpha

    if collapsed_loss <= expanded_loss:
        root.is_leaf = True
        root.childs = {}
        root.label = merged['label'].value_counts().idxmax()
        root.instances = merged
    return None

#### layer 2 子树被剪掉

In [9]:
# Post-prune the fitted tree in place using the model's own alpha, then show
# the resulting structure.
prune(model.tree, model.alpha)
model.printTree(model.tree)

# Re-evaluate on the held-out rows; only the prediction step is timed here.
stime = time.time()
preds = model.predict(test_X)
etime = time.time()

print("ACC score:{},cost time [{}]s".format(accuracy_score(test_Y, preds), etime - stime))

layer: 0 feature:2
Leaf Node,val=1
Leaf Node,val=6
Leaf Node,val=4
Leaf Node,val=5
Leaf Node,val=3
ACC score:0.9619047619047619,cost time [0.01197052001953125]s
