#### 特征评价方式：gini index
$Gini_p = \sum_{k=1}^{K}p_k(1-p_k) = 1 - \sum_{k=1}^{K}p_k^2$

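具体地，对每一个特征 $j$，遍历其所有可能的取值 $s$，按 $x_j = s$ 与 $x_j \neq s$ 把数据集 $D$ 划分为 $D_1$、$D_2$ 两个区域，并选择使加权基尼指数 $\frac{|D_1|}{|D|}Gini(D_1) + \frac{|D_2|}{|D|}Gini(D_2)$ 最小的 $(j, s)$ 作为切分特征和切分点。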

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# load dataset
def create_data():
    """Build the small categorical toy dataset used throughout the notebook.

    Returns:
        tuple: ``(samples, column_names)`` where ``samples`` is a list of 15
        rows (each a list of 5 strings: four feature values followed by the
        class label) and ``column_names`` is the list of the 5 column names,
        the last one being ``'label'``.
    """
    # Columns: age, has a job, owns a house, credit standing, class label.
    column_names = ['年龄', '有工作', '有自己的房子', '信贷情况', 'label']

    # One sample per line, fields separated by whitespace, in column order.
    raw_table = """
        青年 否 否 一般 否
        青年 否 否 好 否
        青年 是 否 好 是
        青年 是 是 一般 是
        青年 否 否 一般 否
        中年 否 否 一般 否
        中年 否 否 好 否
        中年 是 是 好 是
        中年 否 是 非常好 是
        中年 否 是 非常好 是
        老年 否 是 非常好 是
        老年 否 是 好 是
        老年 是 否 好 是
        老年 是 否 非常好 是
        老年 否 否 一般 否
    """
    samples = [row.split() for row in raw_table.strip().splitlines()]

    # Return the samples together with the name of every column.
    return samples, column_names

# Materialise the toy samples as a DataFrame; the last column ('label') is the target.
dataset, cols = create_data()
df = pd.DataFrame(data=dataset, columns=cols)

In [3]:
# Preview the first rows to check that the columns were loaded as expected.
df.head()

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,label
0,青年,否,否,一般,否
1,青年,否,否,好,否
2,青年,是,否,好,是
3,青年,是,是,一般,是
4,青年,否,否,一般,否


In [6]:
# Labels of the samples whose first feature column equals '青年' (one candidate split region).
region_1 = df.loc[df.iloc[:, 0] == '青年', 'label']

In [14]:
# Per-class proportions inside the region: the p_k terms of the Gini formula.
region_1.groupby(region_1).count().div(len(region_1)).values

array([ 0.6,  0.4])

In [15]:
class TreeNode:
    """A node of the decision tree: either an internal split node or a leaf.

    Args:
        feature_name: name of the feature this node splits on (internal nodes).
        seperate_val: value of ``feature_name`` used as the split point.
        label: class label stored on the node (set for leaves).
        is_leaf: whether the node is a leaf.
        instances: samples attached to the node (set for leaves).
    """

    def __init__(self, feature_name=None, seperate_val=-1, label=None, is_leaf=False, instances=None):

        self.feature_name = feature_name  # feature selected at this node
        self.seperate_val = seperate_val  # split point for that feature

        # Left / right subtrees of a non-leaf node.
        self.left = None
        self.right = None

        # Value-keyed children filled in by add_child(). It must exist before
        # add_child() is called, otherwise that method raises AttributeError.
        self.childs = {}

        self.label = label
        self.is_leaf = is_leaf      # whether this node is a leaf
        self.instances = instances  # samples kept on a leaf so a later pruning step can compute entropy

    def add_child(self, val, node):
        """Register ``node`` as the child reached through value ``val``."""
        self.childs[val] = node

In [18]:
class DecisionTree:
    """Binary CART-style classification tree for categorical features.

    Every internal node tests ``feature == seperate_val``: matching rows go to
    the left child, all remaining rows go to the right child. Candidate splits
    are ranked by their size-weighted Gini index. The training DataFrame must
    contain a column named ``'label'``.
    """

    def __init__(self):
        print("Init CART model")

    @staticmethod
    def _gini(labels):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of a label Series (0.0 when empty)."""
        if len(labels) == 0:
            return 0.0
        probs = (labels.groupby(labels).count() / len(labels)).values
        return 1 - np.sum(probs ** 2)

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent value of a non-empty label Series."""
        return labels.value_counts().sort_values(ascending=False).index[0]

    def GINI_index(self, data, val):
        """Weighted Gini index of splitting ``data`` on ``feature == val``.

        Args:
            data: DataFrame whose FIRST column is the candidate feature and
                which also has a ``'label'`` column.
            val: feature value that defines region 1 (``== val``); every other
                row falls into region 2.

        Returns:
            The Gini impurity of both regions, each weighted by its share of
            the rows. Returns 0.0 when ``data`` has no rows.
        """
        total = len(data)
        if total == 0:
            return 0.0

        in_region_1 = data.iloc[:, 0] == val
        region_1 = data.loc[in_region_1, 'label']
        region_2 = data.loc[~in_region_1, 'label']

        # Impurity of each region, weighted by the fraction of rows it holds.
        gini_loss = (len(region_1) / total * self._gini(region_1)
                     + len(region_2) / total * self._gini(region_2))

        return gini_loss

    def calc_loss(self, data, features=None):
        """Pick the (feature, value) split with the smallest weighted Gini index.

        Args:
            data: DataFrame with the feature columns and a ``'label'`` column.
            features: iterable of candidate feature names; defaults to every
                column of ``data`` except ``'label'``.

        Returns:
            ``(feature_name, seperate_val)`` of the best split. Ties keep the
            first candidate met. ``(None, -1)`` is returned when no candidate
            separates the rows into two non-empty regions.
        """
        if features is None:
            features = [col for col in data.columns if col != 'label']

        min_feature, min_gini_loss = None, float('inf')
        best_seperate_val = -1
        for feature_name in features:
            for seperate_val in data[feature_name].unique():
                in_region_1 = data[feature_name] == seperate_val
                # A split that leaves one side empty cannot shrink the data;
                # choosing it would make create_tree recurse forever.
                if in_region_1.all() or not in_region_1.any():
                    continue
                gini_loss = self.GINI_index(data[[feature_name, 'label']], seperate_val)
                if gini_loss < min_gini_loss:
                    min_feature = feature_name
                    best_seperate_val = seperate_val
                    min_gini_loss = gini_loss
        return min_feature, best_seperate_val

    def create_tree(self, data, features):
        """Recursively grow the tree for ``data`` and return its root node.

        Args:
            data: DataFrame with the feature columns and a ``'label'`` column.
            features: iterable of feature names that may be used for splits.

        Returns:
            A ``TreeNode``: a leaf when there are no features, when only one
            class is left, or when no split separates the rows; otherwise an
            internal node with ``left`` (``feature == seperate_val``) and
            ``right`` (all other rows) subtrees.

        Raises:
            ValueError: if ``data`` has no rows.
        """
        if len(data) == 0:
            raise ValueError("cannot build a decision tree from an empty dataset")

        labels = data['label']

        # base case: no other features
        if len(features) == 0:
            return TreeNode(label=self._majority_label(labels), is_leaf=True, instances=data)

        # base case: one class
        if len(np.unique(labels)) == 1:
            return TreeNode(label=list(labels)[0], is_leaf=True, instances=data)

        # choose the split with the smallest weighted Gini index
        feature_name, seperate_val = self.calc_loss(data, features=features)

        # base case: rows with different labels share identical feature values,
        # so nothing can separate them; fall back to the majority class
        if feature_name is None:
            return TreeNode(label=self._majority_label(labels), is_leaf=True, instances=data)

        node = TreeNode(feature_name=feature_name, seperate_val=seperate_val, is_leaf=False)

        # Partition with the same equality test GINI_index used to score the
        # split. Ordering comparisons (<=, >) on categorical strings would put
        # rows in regions other than the ones that were evaluated.
        in_region_1 = data[feature_name] == seperate_val
        node.left = self.create_tree(data.loc[in_region_1], features)
        node.right = self.create_tree(data.loc[~in_region_1], features)

        return node

    def fit(self, data):
        """Train on ``data``; every column but the last is treated as a feature.

        Sets ``self.columns`` (set of feature names) and ``self.tree`` (root node).
        """
        # Iterate features in column order: iterating a set of strings has an
        # order that can change between interpreter runs, which would make
        # tie-breaking between equally good splits non-reproducible.
        feature_names = list(data.columns[:-1])
        self.columns = set(feature_names)
        self.tree = self.create_tree(data, feature_names)

    def printTree(self, node, layer=0):
        """Print the subtree rooted at ``node``, indenting one tab per layer."""
        indent = '\t' * layer

        # A leaf has no children to describe (e.g. a tree that is a single leaf).
        if node.is_leaf:
            print("{}Leaf Node,val={}".format(indent, node.label))
            return

        print("{}layer: {}, feature:{}, seperate value:{}".format(indent, layer, node.feature_name, node.seperate_val))

        for child in (node.left, node.right):
            if child.is_leaf:
                print("{}Leaf Node,val={}".format(indent, child.label))
            else:
                print("{}partition node,feature:{}".format(indent, child.feature_name))
                self.printTree(child, layer + 1)

In [19]:
model = DecisionTree()
model.fit(df)

Init CART model


In [20]:
model.printTree(model.tree)

layer: 0, feature:有自己的房子, seperate value:否
partition node,feature:有工作
	layer: 1, feature:有工作, seperate value:否
	Leaf Node,val=否
	Leaf Node,val=是
Leaf Node,val=是
