#### 特征评价方式：平方误差最小化
$L = \min \sum_{k=1}^{K}\left(y_k-f(x_k)\right)^2$

具体地，对每一个特征 $j$，遍历其所有可取的切分值 $s$，选取使下式最小的 $(j,s)$ 对作为划分：

$\min_{j,s}\left\{ \min_{c_1}\sum_{x_i\in R_1(j,s)}(y_i-c_1)^2+\min_{c_2}\sum_{x_i\in R_2(j,s)}(y_i-c_2)^2 \right\}$

其中 $R_1(j,s)=\{x \mid x^{(j)} \le s\}$ 与 $R_2(j,s)=\{x \mid x^{(j)} > s\}$ 是根据特征 $j$ 和切分点 $s$ 对输入空间的划分；使内层平方误差最小的 $c_1$ 和 $c_2$ 分别是两个区域内样本标签的均值，并作为该区域的输出值。

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# load dataset
def create_data():
    """Build the toy one-feature regression dataset used in this notebook.

    Returns:
        A ``(rows, column_names)`` tuple. ``rows`` is a list of ``[x, label]``
        pairs where ``x`` is the integer 1..10 and ``label`` is the regression
        target; ``column_names`` is ``['x', 'label']``.
    """
    targets = [4.5, 4.75, 4.91, 5.34, 5.80, 7.05, 7.90, 8.23, 8.70, 9.0]
    # The feature value is simply the 1-based position of each target.
    rows = [[x, y] for x, y in enumerate(targets, start=1)]
    column_names = ['x', 'label']
    return rows, column_names

# Materialise the toy data as a DataFrame whose columns are named by `cols`.
dataset, cols = create_data()
df = pd.DataFrame(data=dataset, columns=cols)

In [3]:
# Preview the first rows of the toy dataset.
df.head()

Unnamed: 0,x,label
0,1,4.5
1,2,4.75
2,3,4.91
3,4,5.34
4,5,5.8


In [4]:
# Labels of the rows whose first-column value is at most 3
# (i.e. the left region of a candidate split at 3).
region_1 = df.loc[df.iloc[:, 0] <= 3, 'label']

In [5]:
# Display the labels that fall in the region selected by `x <= 3`.
region_1

0    4.50
1    4.75
2    4.91
Name: label, dtype: float64

In [6]:
class TreeNode:
    """A node of a binary regression tree.

    Internal nodes carry the splitting feature and threshold and link to a
    ``left`` and a ``right`` subtree; leaf nodes carry the predicted value in
    ``label``.
    """

    def __init__(self, feature_name=None, seperate_val=-1, label=None, is_leaf=False, instances=None):
        """Create a node.

        Args:
            feature_name: Name of the feature this node splits on.
            seperate_val: Threshold of the split on ``feature_name``.
            label: Output value stored on the node.
            is_leaf: Whether the node is a leaf.
            instances: Samples that reached the node; kept on leaves so that
                later pruning can evaluate them.
        """
        self.feature_name = feature_name  # feature chosen for this split
        self.seperate_val = seperate_val  # threshold of the split

        # Subtrees of a non-leaf node.
        self.left = None
        self.right = None

        # Generic child mapping filled by add_child(). It must exist before
        # add_child() is called, otherwise that method raises AttributeError.
        self.childs = {}

        self.label = label
        self.is_leaf = is_leaf      # whether this node is a leaf
        self.instances = instances  # samples stored on the leaf for later pruning

    def add_child(self, val, node):
        """Register ``node`` as the child reached through the key ``val``."""
        self.childs[val] = node

In [7]:
class DecisionTree:
    """Minimal CART regression tree grown by least-squares binary splits.

    The training frame must contain a target column named ``'label'``;
    ``fit`` treats every column except the last one as a feature.
    """

    def __init__(self):
        print("Init CART model")

    def MSE_loss(self, data, val):
        """Return the squared-error loss of splitting ``data`` at ``val``.

        Args:
            data: Frame whose first column is the candidate feature and which
                has a ``'label'`` column.
            val: Threshold; rows with feature ``<= val`` form region 1 and
                rows with feature ``> val`` form region 2.

        Returns:
            The sum over both regions of the squared deviations of the labels
            from their region mean (a sum, not an average, despite the name).
            An empty region contributes 0.
        """
        feature = data.iloc[:, 0]
        region_1 = data.loc[feature <= val]['label']
        region_2 = data.loc[feature > val]['label']

        loss_1 = ((region_1 - region_1.mean()) ** 2).sum()
        loss_2 = ((region_2 - region_2.mean()) ** 2).sum()
        return loss_1 + loss_2

    def calc_loss(self, data, features=None):
        """Find the ``(feature, threshold)`` pair with the smallest split loss.

        Args:
            data: Frame containing the feature columns and ``'label'``.
            features: Iterable of candidate feature names. When ``None``,
                every column other than ``'label'`` is considered.

        Returns:
            ``(feature_name, seperate_val)`` of the best split, or
            ``(None, -1)`` when no threshold separates the rows into two
            non-empty regions.
        """
        if features is None:
            features = [col for col in data.columns if col != 'label']

        min_feature, best_seperate_val = None, -1
        # Start from infinity so that arbitrarily large losses still yield a split.
        min_mse_loss = float('inf')

        for feature_name in features:
            column = data[feature_name]
            for seperate_val in column.unique():
                # A threshold that leaves one side empty (the column maximum,
                # or NaN) is not a real split; choosing it would make the
                # tree builder recurse on the same rows forever.
                left_size = (column <= seperate_val).sum()
                right_size = (column > seperate_val).sum()
                if left_size == 0 or right_size == 0:
                    continue

                mse_loss = self.MSE_loss(data[[feature_name, 'label']], seperate_val)
                if mse_loss < min_mse_loss:
                    min_feature = feature_name
                    best_seperate_val = seperate_val
                    min_mse_loss = mse_loss
        return min_feature, best_seperate_val

    def create_tree(self, data, features):
        """Recursively grow the tree for ``data`` and return its root node.

        Args:
            data: Frame with the feature columns and ``'label'``.
            features: Candidate feature names; passed unchanged to every
                recursive call.

        Returns:
            A ``TreeNode``. Leaves store the mean label of their rows and the
            rows themselves in ``instances``.
        """
        labels = data['label']

        # Base cases: nothing to split on, or at most one distinct target value.
        if len(features) == 0 or len(np.unique(labels)) <= 1:
            return TreeNode(label=labels.mean(), is_leaf=True, instances=data)

        # Pick the split with the smallest squared-error loss.
        feature_name, seperate_val = self.calc_loss(data, features=features)

        # No threshold separates the rows (e.g. all feature values identical):
        # stop here instead of recursing on an unchanged partition.
        if feature_name is None:
            return TreeNode(label=labels.mean(), is_leaf=True, instances=data)

        node = TreeNode(feature_name=feature_name, seperate_val=seperate_val, is_leaf=False)

        region1 = data.loc[data[feature_name] <= seperate_val]
        node.left = self.create_tree(region1, features)

        region2 = data.loc[data[feature_name] > seperate_val]
        node.right = self.create_tree(region2, features)

        return node

    def fit(self, data):
        """Build the tree from ``data`` and store it in ``self.tree``.

        Every column except the last one is used as a feature; the frame must
        also contain a ``'label'`` column holding the target.
        """
        self.columns = set(data.iloc[:, :-1].columns)
        # Hand the features over in column order: iterating the set would make
        # tie-breaking between equally good features depend on hash order.
        ordered_features = list(data.columns[:-1])
        self.tree = self.create_tree(data, ordered_features)

    def printTree(self, node, layer=0):
        """Print the subtree rooted at ``node``, indenting one tab per level.

        Args:
            node: Root of the subtree to print.
            layer: Depth of ``node``; controls indentation.
        """
        indent = '\t' * layer

        # A leaf has no children to descend into (happens when the whole
        # tree is a single leaf).
        if node.is_leaf:
            print("{}Leaf Node,val={}".format(indent, node.label))
            return

        print("{}layer: {}, feature:{}, seperate value:{}".format(indent, layer, node.feature_name, node.seperate_val))

        for child in (node.left, node.right):
            if child.is_leaf:
                print("{}Leaf Node,val={}".format(indent, child.label))
            else:
                print("{}partition node,feature:{}".format(indent, child.feature_name))
                self.printTree(child, layer + 1)

In [8]:
# Instantiate the tree (the constructor prints a banner) and fit it on the toy data.
model = DecisionTree()
model.fit(df)

Init CART model


In [9]:
# Dump the fitted tree; each depth level is indented by one tab.
model.printTree(model.tree)

layer: 0, feature:x, seperate value:5
partition node,feature:x
	layer: 1, feature:x, seperate value:3
	partition node,feature:x
		layer: 2, feature:x, seperate value:1
		Leaf Node,val=4.5
		partition node,feature:x
			layer: 3, feature:x, seperate value:2
			Leaf Node,val=4.75
			Leaf Node,val=4.91
	partition node,feature:x
		layer: 2, feature:x, seperate value:4
		Leaf Node,val=5.34
		Leaf Node,val=5.8
partition node,feature:x
	layer: 1, feature:x, seperate value:7
	partition node,feature:x
		layer: 2, feature:x, seperate value:6
		Leaf Node,val=7.05
		Leaf Node,val=7.9
	partition node,feature:x
		layer: 2, feature:x, seperate value:8
		Leaf Node,val=8.23
		partition node,feature:x
			layer: 3, feature:x, seperate value:9
			Leaf Node,val=8.7
			Leaf Node,val=9.0
