In [3]:
from sklearn import tree
import numpy as np

色泽	根蒂	敲声	纹理	脐部	触感	好瓜（y）
0	0	0	0	0	0	1
1	0	0	0	0	0	1
2	0	0	0	0	0	1
0	1	1	0	1	1	0
1	1	1	1	1	1	0
2	1	1	1	1	0	0
0	2	2	2	2	1	0
1	2	2	1	2	0	0
2	2	2	2	2	1	0

In [28]:
class Node:
    '''
    A single node of the binary decision tree.

    Internal nodes are created with a split (``feature_index``, ``feature_val``)
    and two children (``left``, ``right``); leaf nodes are created with only
    ``value`` set. Every field defaults to ``None``.
    '''

    def __init__(self, feature_index = None, feature_val=None, left=None, right=None,value=None):
        # Split description: column tested and the threshold it is compared to.
        self.feature_index, self.feature_val = feature_index, feature_val
        # Sub-trees followed when the test ``<= feature_val`` passes / fails.
        self.left, self.right = left, right
        # Label stored on a leaf; stays None on internal nodes.
        self.value = value



class DecisionTree(object):
    '''
    Binary decision-tree classifier using threshold splits ``x[feature] <= value``.

    Parameters
    ----------
    criterion : str
        Split quality measure: ``'gini'`` (weighted Gini index, minimised) or
        ``'gain'`` (plain information gain based on entropy, maximised).
    max_deepth : int or None
        Maximum depth of the tree; ``None`` means the depth is not limited.
    min_sample_split : int or None
        Minimum number of samples a node needs to be split; ``None`` means 2.
    root : Node or None
        Optional pre-built tree; it is replaced by ``fit``.
    '''

    def __init__(self, criterion = 'gini', max_deepth = None, min_sample_split = None, root = None):
        self.root = root
        self.criterion = criterion
        self.max_deepth = max_deepth
        # Honour the caller's value; 2 is only the fallback when nothing is given.
        self.min_sample_split = 2 if min_sample_split is None else min_sample_split

    @staticmethod
    def _partition_labels(X, y, feature_index, feature_val):
        '''Return ``(y_left, y_right)`` for the split ``X[:, feature_index] <= feature_val``.'''
        column = X[:, feature_index]
        return y[column <= feature_val], y[column > feature_val]

    @staticmethod
    def _gini_value(y_subset):
        '''Gini value ``1 - sum(p_k^2)`` of a label array; an empty array counts as pure (0.0).'''
        if len(y_subset) == 0:
            return 0.0
        _, counts = np.unique(y_subset, return_counts = True)
        p_k = counts / len(y_subset)
        return 1 - np.sum(p_k ** 2)

    @staticmethod
    def _entropy_value(y_subset):
        '''Shannon entropy (base 2) of a label array; an empty array counts as pure (0.0).'''
        if len(y_subset) == 0:
            return 0.0
        _, counts = np.unique(y_subset, return_counts = True)
        p_k = counts / len(y_subset)
        # Every count is >= 1, so p_k > 0 and the logarithm is always defined.
        return np.sum(p_k * np.log2(1.0 / p_k))

    def _caculate_gini(self, X, y, feature_index, feature_val):
        '''
        Gini index of the split ``X[:, feature_index] <= feature_val``.

        The Gini value of each side is ``1 - sum(p_k^2)`` over its labels; the
        Gini index is the sum of both sides' Gini values weighted by the share
        of samples that falls on each side. Returns 0.0 when ``y`` is empty.
        '''
        if len(y) == 0:
            return 0.0
        y_left, y_right = self._partition_labels(X, y, feature_index, feature_val)

        # Weight each side by its share of the samples.
        left_weight = len(y_left) / len(y)
        right_weight = len(y_right) / len(y)
        return left_weight * self._gini_value(y_left) + right_weight * self._gini_value(y_right)

    def _caculate_gain(self, X, y, feature_index, feature_val):
        '''
        Information gain of the split ``X[:, feature_index] <= feature_val``.

        Computed as the entropy of ``y`` minus the sample-weighted entropy of
        the two sides (plain information gain, not normalised into a gain
        ratio). Returns 0.0 when ``y`` is empty.
        '''
        if len(y) == 0:
            return 0.0
        y_left, y_right = self._partition_labels(X, y, feature_index, feature_val)

        left_weight = len(y_left) / len(y)
        right_weight = len(y_right) / len(y)
        children_entropy = (left_weight * self._entropy_value(y_left)
                            + right_weight * self._entropy_value(y_right))
        return self._entropy_value(y) - children_entropy

    def _split_node(self, X, y, criterion = 'gini'):
        '''
        Search every feature / threshold pair for the best split of ``X``.

        Returns a triple ``(best_criterion, best_feature_index, best_feature_val)``.
        Only thresholds that leave samples on both sides are considered; when no
        such threshold exists the triple is ``(inf, None, None)`` for ``'gini'``
        or ``(-inf, None, None)`` for ``'gain'``.

        Raises ``ValueError`` for an unknown ``criterion``.
        '''
        if criterion == 'gini':
            score_split, minimise = self._caculate_gini, True
        elif criterion == 'gain':
            score_split, minimise = self._caculate_gain, False
        else:
            raise ValueError("criterion must be 'gini' or 'gain', got %r" % (criterion,))

        best_criterion = float('inf') if minimise else -float('inf')
        best_feature_index = None
        best_feature_val = None

        _, n_features = X.shape

        for feature_index in range(n_features):
            # np.unique returns sorted values; the largest one would send every
            # sample to the left and leave the right side empty, so skip it.
            feature_vals = np.unique(X[:, feature_index])
            for feature_val in feature_vals[:-1]:
                score = score_split(X, y, feature_index, feature_val)

                # Strict comparison keeps the first of several equally good splits.
                improved = score < best_criterion if minimise else score > best_criterion
                if improved:
                    best_criterion = score
                    best_feature_index = feature_index
                    best_feature_val = feature_val

        return best_criterion, best_feature_index, best_feature_val

    def _most_common_label(self, y):
        '''
        Return the most frequent label in ``y`` (the smallest one on ties).

        Raises ``ValueError`` when ``y`` is empty.
        '''
        classes, counts = np.unique(y, return_counts = True)
        if len(classes) == 0:
            raise ValueError('cannot pick a label from an empty label array')
        return classes[np.argmax(counts)]

    def _build_tree(self, X, y, depth = 0):
        '''
        Recursively build the tree from feature matrix ``X`` and label vector ``y``.

        A leaf storing the most common label is returned when the labels are
        pure, the depth limit is reached, the node has fewer samples than
        ``min_sample_split``, or no threshold separates the samples.
        '''
        # Rows are samples, columns are features.
        n_samples, _ = X.shape
        n_labels = len(np.unique(y))

        # max_deepth=None means "no depth limit", so only compare when it is set.
        depth_exhausted = self.max_deepth is not None and depth >= self.max_deepth
        if n_labels == 1 or depth_exhausted or n_samples < self.min_sample_split:
            return Node(value=self._most_common_label(y))

        # Pick the split according to the criterion chosen at construction time.
        _, feature_index, feature_val = self._split_node(X, y, self.criterion)
        if feature_index is None:
            # All remaining samples share the same feature values: nothing to split on.
            return Node(value=self._most_common_label(y))

        # Boolean masks selecting the samples of each side.
        left_idx = X[:, feature_index] <= feature_val
        right_idx = X[:, feature_index] > feature_val

        # Recurse into both subsets.
        left = self._build_tree(X[left_idx, :], y[left_idx], depth + 1)
        right = self._build_tree(X[right_idx, :], y[right_idx], depth + 1)

        return Node(feature_index=feature_index, feature_val=feature_val, left=left, right=right)

    def _traverse_tree(self, x, node):
        '''
        Walk from ``node`` down to a leaf for sample ``x`` and return the leaf's label.
        '''
        # A node is a leaf exactly when it carries a value.
        while node.value is None:
            if x[node.feature_index] <= node.feature_val:
                node = node.left
            else:
                node = node.right
        return node.value

    def fit(self, X, y):
        '''
        Build the tree from ``X`` (2-D, samples x features) and ``y`` (1-D labels)
        and store it in ``self.root``.

        Raises ``ValueError`` when the shapes are inconsistent or there are no samples.
        '''
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError('X must be 2-dimensional (samples x features)')
        if y.ndim != 1 or len(y) != X.shape[0]:
            raise ValueError('y must be 1-dimensional with one label per row of X')
        if X.shape[0] == 0:
            raise ValueError('cannot fit a tree on an empty dataset')
        self.root = self._build_tree(X, y)  # keep the finished tree in root

    def predict(self, X):
        '''
        Return a list with the predicted label of every sample in ``X``.

        Raises ``AttributeError`` when the tree has not been fitted yet.
        '''
        if self.root is None:
            raise AttributeError('DecisionTree has no tree yet; call fit(X, y) first')
        return [self._traverse_tree(x, self.root) for x in X]
        

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Fixed seed so the train/test split (and therefore the score) is reproducible.
SEED = 42

# Feature matrix X. Columns: color, stem, knock sound, texture, navel, touch.
X = np.array([
    [0, 0, 0, 0, 0, 0],  # green, curled, dull, clear, sunken, hard-smooth
    [1, 0, 0, 0, 0, 0],  # black, curled, dull, clear, sunken, hard-smooth
    [2, 0, 0, 0, 0, 0],  # pale, curled, dull, clear, sunken, hard-smooth
    [0, 1, 1, 0, 1, 1],  # green, slightly curled, muffled, clear, slightly sunken, soft-sticky
    [1, 1, 1, 1, 1, 1],  # black, slightly curled, muffled, slightly blurry, slightly sunken, soft-sticky
    [2, 1, 1, 1, 1, 0],  # pale, slightly curled, muffled, slightly blurry, slightly sunken, hard-smooth
    [0, 2, 2, 2, 2, 1],  # green, stiff, crisp, blurry, flat, soft-sticky
    [1, 2, 2, 1, 2, 0],  # black, stiff, crisp, slightly blurry, flat, hard-smooth
    [2, 2, 2, 2, 2, 1]   # pale, stiff, crisp, blurry, flat, soft-sticky
])

# Label vector y
y = np.array([1, 1, 1, 0, 0, 0, 0, 0, 0])  # 1: good melon, 0: bad melon

# With only 9 rows an unstratified split can leave the training set without a
# single good melon, so keep the class ratio on both sides and seed the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED, stratify=y
)

clf = DecisionTree(max_deepth=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print(acc)

1.0
