#### 数据集对标签的经验熵
$H(D)=-\sum_{k=1}^{K}\frac{|C_k|}{|D|}log_2 \frac{|C_k|}{|D|}$

#### 条件熵
即先根据特征A的取值将数据集$D$划分为子数据集$D_i$，然后计算每个子数据集中标签的熵$H(D_i)$，最后对这些熵加权求和，权重为子数据集$D_i$在数据集$D$中所占的比例$\frac{|D_i|}{|D|}$；$C_{ik}$为子数据集$D_i$中属于第$k$类标签（$k=1,\dots,K$）的样本集合
$H(D|A)= \sum_{i=1}^n \frac{|D_i|}{|D|}H(D_i) = -\sum_{i=1}^{n}\frac{|D_i|}{|D|}\sum_{k=1}^{K}\frac{|C_{ik}|}{|D_i|}log_2 \frac{|C_{ik}|}{|D_i|}$

#### 信息增益$g(D,A)$： 
即未获得特征A相关信息时的经验熵$H(D)$减去已知特征A后的条件熵$H(D|A)$：$g(D,A)=H(D)-H(D|A)$

#### 信息增益比：
$g_R(D,A) = \frac{g(D,A)}{H_A(D)}$
其中$H_A(D)=-\sum_{i=1}^{n}\frac{|D_i|}{|D|}log_2 \frac{|D_i|}{|D|}$为数据集$D$关于特征A取值的熵，与标签无关，用来修正信息增益偏向选择取值较多的特征的问题。

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# load dataset
def create_data():
    """Build the toy categorical dataset used throughout this notebook.

    Returns
    -------
    tuple
        ``(rows, column_names)`` where ``rows`` is a list of 15 five-item
        lists (four feature values followed by the class value) and
        ``column_names`` is the list of the five column names, the last one
        being ``'label'``.
    """
    column_names = ['年龄', '有工作', '有自己的房子', '信贷情况', 'label']

    # One tuple per sample, in the same order as column_names.
    samples = (
        ('青年', '否', '否', '一般', '否'),
        ('青年', '否', '否', '好', '否'),
        ('青年', '是', '否', '好', '是'),
        ('青年', '是', '是', '一般', '是'),
        ('青年', '否', '否', '一般', '否'),
        ('中年', '否', '否', '一般', '否'),
        ('中年', '否', '否', '好', '否'),
        ('中年', '是', '是', '好', '是'),
        ('中年', '否', '是', '非常好', '是'),
        ('中年', '否', '是', '非常好', '是'),
        ('老年', '否', '是', '非常好', '是'),
        ('老年', '否', '是', '好', '是'),
        ('老年', '是', '否', '好', '是'),
        ('老年', '是', '否', '非常好', '是'),
        ('老年', '否', '否', '一般', '否'),
    )

    # Callers receive plain lists: the dataset and the name of each column.
    rows = [list(sample) for sample in samples]
    return rows, column_names

# Materialise the toy dataset as a DataFrame; column names come from create_data().
dataset, cols = create_data()
df = pd.DataFrame(data=dataset, columns=cols)

In [3]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,label
0,青年,否,否,一般,否
1,青年,否,否,好,否
2,青年,是,否,好,是
3,青年,是,是,一般,是
4,青年,否,否,一般,否


In [4]:
class TreeNode:
    """A single node of the decision tree.

    Parameters
    ----------
    feature_name : optional
        Feature this node splits on.
    label : optional
        Label carried by the node.
    is_leaf : bool
        Whether the node is a leaf.
    instances : optional
        Samples attached to the node; per the original author's note they are
        kept on leaves so a later pruning step can compute entropies.
    """

    def __init__(self, feature_name=None, label=None, is_leaf=False, instances=None):
        # Maps each distinct value of the split feature to the matching subtree.
        self.childs = dict()
        # Feature chosen for the split at this node.
        self.feature_name = feature_name

        self.label = label
        # True when this node is a leaf.
        self.is_leaf = is_leaf
        # Samples stored on the node (intended for pruning-time entropy computation).
        self.instances = instances

    def add_child(self, val, node):
        """Register ``node`` as the subtree reached when the split feature equals ``val``."""
        self.childs.update({val: node})

In [6]:
class DecisionTree:
    """Decision tree classifier over categorical features (ID3 or C4.5 splitting).

    Training data is a DataFrame whose class column is named ``'label'``;
    every other column is used as a categorical feature.

    Parameters
    ----------
    method : str
        ``"id3"`` scores features by information gain, ``"c4.5"`` by
        information gain ratio. Any other value raises ``ValueError``.
    epsilon : float
        Pre-pruning threshold: a node becomes a leaf when the best score
        found for it is below ``epsilon``.
    alpha : float
        Stored as ``self.alpha``; no method of this class reads it.
    """

    def __init__(self, method="id3",epsilon=0.1,alpha=1.0):
        if method not in ("id3", "c4.5"):
            raise ValueError("Invalid Decision Tree method!")

        self.alpha = alpha
        self.method = method
        self.epsilon = epsilon

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent value of ``labels`` (value_counts sorts by frequency)."""
        return labels.value_counts().index[0]

    def entropy(self,series):
        """Return the base-2 Shannon entropy of the value distribution of ``series``."""
        probs = series.value_counts(normalize=True).to_numpy()
        # Zero-probability entries (e.g. unobserved categories) would give
        # 0 * log2(0) = NaN; they contribute nothing to the entropy.
        probs = probs[probs > 0]
        return -np.sum(probs*np.log2(probs))

    def conditional_entropy(self,data):
        """Return H(label | feature) for a frame whose first column is the feature.

        ``data`` must hold the feature in column 0 and the class in a column
        named ``'label'``.
        """
        feature = data.iloc[:,0]

        # Weight of each feature value: |D_i| / |D|.
        feature_value_weight = feature.value_counts(normalize=True)

        # Label entropy inside each subset that shares one feature value.
        H_D_i = data['label'].groupby(feature).apply(self.entropy)

        # Both series are indexed by feature value, so the product aligns on it.
        return np.sum(feature_value_weight * H_D_i)

    def info_gain(self,data):
        """Score the feature in column 0 of ``data`` against the ``'label'`` column.

        Returns the information gain for ``method="id3"`` and the information
        gain ratio for ``method="c4.5"``.
        """
        H_D = self.entropy(data['label'])
        H_D_given_X = self.conditional_entropy(data)
        g_D_A = H_D - H_D_given_X
        if self.method == "c4.5":
            H_A_D = self.entropy(data.iloc[:,0])
            # A feature with a single value has zero entropy and zero gain;
            # report a ratio of 0 instead of dividing 0 by 0 (NaN).
            if H_A_D <= 0:
                return 0.0
            g_D_A /= H_A_D
        return g_D_A

    def calc_info_gain(self,data,features=None):
        """Return ``(best_feature, best_score)`` among ``features``.

        ``features`` defaults to every column of ``data`` except ``'label'``.
        ``best_feature`` is ``None`` when no feature scores strictly above 0.
        Each candidate's score is printed.
        """
        if features is None:
            features = [name for name in data.columns if name != 'label']

        max_feature,max_gain = None,0
        for feature_name in features:
            gain = self.info_gain(data[[feature_name,'label']])
            print("gain:{}  feture:{}".format(gain,feature_name))
            if gain > max_gain:
                max_feature = feature_name
                max_gain = gain
        return max_feature,max_gain

    def create_tree(self,data,features):
        """Recursively build and return the (sub)tree for ``data``.

        Parameters
        ----------
        data : DataFrame
            Samples reaching this node, including the ``'label'`` column.
        features : iterable
            Names of the features still available for splitting. The
            collection passed in is never modified.
        """
        labels = data['label']
        # Work on a private copy so that neither the caller's collection nor
        # the feature lists of sibling branches are affected by this node.
        features = list(features)

        # base case: no other features
        if len(features) == 0:
            return TreeNode(label=self._majority_label(labels),is_leaf=True,instances=data)

        # base case: one class
        if len(np.unique(labels)) == 1:
            return TreeNode(label=labels.iloc[0],is_leaf=True,instances=data)

        print("New layer construction ...")
        # calc info gain by features
        feature_name,max_gain = self.calc_info_gain(data,features=features)

        # pre prune; also covers the case where no feature has a positive score
        if feature_name is None or max_gain < self.epsilon:
            return TreeNode(label=self._majority_label(labels),is_leaf=True,instances=data)

        node = TreeNode(feature_name=feature_name,is_leaf=False)
        # Recurse: partition the data by the chosen feature's values and build
        # one subtree per value. Each branch gets the same remaining features.
        remaining = [name for name in features if name != feature_name]
        for val in data[feature_name].unique():
            sub_data = data.loc[data[feature_name]==val]
            node.add_child(val,self.create_tree(sub_data,remaining))

        return node

    def fit(self,data):
        """Build the tree from ``data`` and store it in ``self.tree``.

        Every column except ``'label'`` is used as a feature; their names are
        kept in ``self.columns``.

        Raises
        ------
        ValueError
            If ``data`` has no rows.
        """
        if len(data) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset.")

        # Keep column order for the search so ties are broken deterministically.
        feature_names = [name for name in data.columns if name != 'label']
        self.columns = set(feature_names)
        self.tree = self.create_tree(data,feature_names)

    def predict(self,data):
        """Return the predicted label for the first row of ``data``.

        Each comparison made while walking down the tree is printed.

        Raises
        ------
        KeyError
            If the row holds a feature value that has no branch in the tree.
        """
        node = self.tree
        while not node.is_leaf:
            # Positional access: the row's index label does not have to be 0.
            val = data[node.feature_name].iloc[0]
            print("compared feature:{}\ninstance value:{}".format(node.feature_name,val))
            if val not in node.childs:
                raise KeyError("value {!r} of feature {!r} was not seen during training"
                               .format(val,node.feature_name))
            node = node.childs[val]
        return node.label

In [7]:
# Train an ID3 tree on the toy frame; a node becomes a leaf when its best gain is below 0.05.
model = DecisionTree(method='id3', epsilon=0.05, alpha=1)
model.fit(df)

New layer construction ...
gain:0.4199730940219749  feture:有自己的房子
gain:0.36298956253708536  feture:信贷情况
gain:0.32365019815155627  feture:有工作
gain:0.08300749985576883  feture:年龄
New layer construction ...
gain:0.47385138961004514  feture:信贷情况
gain:0.9182958340544896  feture:有工作
gain:0.2516291673878229  feture:年龄


In [8]:
# Inspect the learned splits: the root feature, then the feature used under the root's '否' branch.
print("Layer 1 feature name:", model.tree.feature_name)
print("Layer 2 feature name:", model.tree.childs['否'].feature_name)

Layer 1 feature name: 有自己的房子
Layer 2 feature name: 有工作


In [9]:
# One unlabeled sample, built in a single constructor call rather than by
# enlarging an empty frame through .loc.
test = pd.DataFrame(
    [['老年', '否', '否', '一般']],
    columns=[u'年龄', u'有工作', u'有自己的房子', u'信贷情况'],
)

model.predict(test)

compared feature:有自己的房子
instance value:否
compared feature:有工作
instance value:否


'否'