In [4]:
import numpy as np
import pandas as pd

class CustomDecisionTree:
    """Binary decision tree whose candidate split features depend on the depth.

    At depth ``d`` only the feature indices listed in
    ``features_per_level[d % len(features_per_level)]`` are considered.
    Splits are chosen by information gain (Shannon entropy over the distinct
    values of ``y``) and every leaf stores ``np.mean`` of the targets that
    reached it.

    The fitted tree is kept in ``self.tree`` as nested dicts with the keys
    ``"feature"``, ``"threshold"``, ``"left"`` and ``"right"``; a leaf is a
    plain number instead of a dict.
    """

    def __init__(self, max_depth=3, min_samples_split=2):
        """Store the stopping criteria; the tree itself is built by ``fit``.

        Args:
            max_depth: Depth at which a node is always turned into a leaf.
            min_samples_split: Nodes with fewer samples than this become leaves.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y, features_per_level):
        """Build the tree from the training data and store it in ``self.tree``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of targets, one per row of ``X``.
            features_per_level: Sequence of feature-index lists; entry
                ``depth % len(features_per_level)`` is used at each depth.
        """
        # Accept lists / DataFrames as well as ndarrays: the tree code relies
        # on NumPy-style ``X[:, j]`` and boolean-mask indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        self.tree = self._build_tree(X, y, features_per_level, depth=0)

    def _build_tree(self, X, y, features_per_level, depth):
        """Recursively build the subtree for the samples ``X``/``y`` at ``depth``.

        Returns:
            A dict describing an internal node, or the mean of ``y`` for a leaf.
        """
        if (
            depth == self.max_depth
            or len(X) < self.min_samples_split
            # A pure node cannot be improved by splitting; every descendant
            # leaf would carry the same mean anyway.
            or np.unique(y).size <= 1
        ):
            return np.mean(y)

        feature_indices = features_per_level[depth % len(features_per_level)]
        best_feature, best_threshold = self._find_best_split(X, y, feature_indices)

        if best_feature is None:
            return np.mean(y)

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        # Defensive guard: never recurse into an empty child, whose mean
        # would be NaN.
        if not left_indices.any() or not right_indices.any():
            return np.mean(y)

        left_tree = self._build_tree(X[left_indices], y[left_indices], features_per_level, depth + 1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], features_per_level, depth + 1)

        return {"feature": best_feature, "threshold": best_threshold, "left": left_tree, "right": right_tree}

    def _find_best_split(self, X, y, feature_indices):
        """Return the ``(feature, threshold)`` pair with the highest information gain.

        Only thresholds that leave samples on both sides are considered.
        Ties are resolved in favour of the first candidate encountered.
        Returns ``(None, None)`` when no such threshold exists.
        """
        best_feature = None
        best_threshold = None
        best_gain = -np.inf

        for feature in feature_indices:
            # np.unique returns sorted values.  The largest one is dropped
            # because ``<= max`` would send every row to the left child and
            # leave the right child empty.
            thresholds = np.unique(X[:, feature])[:-1]
            for threshold in thresholds:
                gain = self._information_gain(X, y, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _information_gain(self, X, y, feature, threshold):
        """Entropy reduction obtained by splitting on ``X[:, feature] <= threshold``.

        Returns 0 when either side of the split would be empty.
        """
        y_left = y[X[:, feature] <= threshold]
        y_right = y[X[:, feature] > threshold]

        if len(y_left) == 0 or len(y_right) == 0:
            return 0

        p_left = len(y_left) / len(y)
        p_right = len(y_right) / len(y)

        return self._entropy(y) - (p_left * self._entropy(y_left) + p_right * self._entropy(y_right))

    def _entropy(self, y):
        """Shannon entropy (in bits) of the distinct values found in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        return -np.sum(probabilities * np.log2(probabilities))

    def predict(self, X):
        """Return an array with the leaf value reached by each row of ``X``."""
        return np.array([self._predict_one(x, self.tree) for x in np.asarray(X)])

    def _predict_one(self, x, tree):
        """Walk ``tree`` for the single sample ``x`` and return the leaf value."""
        node = tree
        while isinstance(node, dict):
            if x[node["feature"]] <= node["threshold"]:
                node = node["left"]
            else:
                node = node["right"]
        return node

    def to_dot(self, feature_names=None):
        """Render the fitted tree as a Graphviz DOT string.

        Args:
            feature_names: Optional sequence (list, NumPy array, pandas Index,
                ...) mapping feature indices to display names.  When omitted
                or empty, features are labelled ``X<index>``.
        """
        dot_representation = ["digraph Tree {"]
        self._to_dot_helper(self.tree, dot_representation, feature_names)
        dot_representation.append("}")
        return "\n".join(dot_representation)

    def _to_dot_helper(self, tree, dot_representation, feature_names, node_id=0, parent_id=None, label=None):
        """Append the DOT lines for ``tree`` (and its children) to ``dot_representation``."""
        is_internal = isinstance(tree, dict)

        if is_internal:
            # ``is not None`` + ``len`` instead of plain truthiness: the truth
            # value of a multi-element NumPy array / pandas Index is ambiguous
            # and raises ValueError.
            if feature_names is not None and len(feature_names) > 0:
                feature = feature_names[tree["feature"]]
            else:
                feature = f"X{tree['feature']}"
            threshold = tree["threshold"]
            node_label = f'{feature} <= {threshold:.2f}'
            dot_representation.append(f'node{node_id} [label="{node_label}"];')
        else:
            node_label = f"leaf: {tree:.2f}"
            dot_representation.append(f'node{node_id} [label="{node_label}", shape=box];')

        if parent_id is not None:
            dot_representation.append(f'node{parent_id} -> node{node_id} [label="{label}"];')

        if is_internal:
            # Children of node i get ids 2i+1 and 2i+2, so ids never collide.
            self._to_dot_helper(tree["left"], dot_representation, feature_names, node_id * 2 + 1, node_id, "True")
            self._to_dot_helper(tree["right"], dot_representation, feature_names, node_id * 2 + 2, node_id, "False")


In [2]:

# Example data: three features and a binary target
data = dict(
    feature1=[1, 2, 3, 4, 5],
    feature2=[5, 4, 3, 2, 1],
    feature3=[2, 3, 4, 5, 6],
    target=[0, 1, 0, 1, 0],
)
df = pd.DataFrame(data)

# Feature matrix (every column except the target) and target vector
X = df.loc[:, df.columns != 'target'].to_numpy()
y = df.loc[:, 'target'].to_numpy()

# Feature indices the tree may split on at each level
features_per_level = [[0, 2], [1, 2], [0, 1]]

# Initialise and train the model
clf = CustomDecisionTree(min_samples_split=2, max_depth=3)
clf.fit(X, y, features_per_level)

# Predict on the training samples
predictions = clf.predict(X)
print(predictions)


[0.  1.  0.5 0.5 0. ]


In [7]:
import numpy as np
import pandas as pd

class CustomDecisionTree:
    """Binary decision tree whose candidate split features depend on the depth.

    At depth ``d`` only the feature indices listed in
    ``features_per_level[d % len(features_per_level)]`` are considered.
    Splits are chosen by information gain (Shannon entropy over the distinct
    values of ``y``) and every leaf stores ``np.mean`` of the targets that
    reached it.

    The fitted tree is kept in ``self.tree`` as nested dicts with the keys
    ``"feature"``, ``"threshold"``, ``"left"`` and ``"right"``; a leaf is a
    plain number instead of a dict.
    """

    def __init__(self, max_depth=3, min_samples_split=2):
        """Store the stopping criteria; the tree itself is built by ``fit``.

        Args:
            max_depth: Depth at which a node is always turned into a leaf.
            min_samples_split: Nodes with fewer samples than this become leaves.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y, features_per_level):
        """Build the tree from the training data and store it in ``self.tree``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of targets, one per row of ``X``.
            features_per_level: Sequence of feature-index lists; entry
                ``depth % len(features_per_level)`` is used at each depth.
        """
        # Accept lists / DataFrames as well as ndarrays: the tree code relies
        # on NumPy-style ``X[:, j]`` and boolean-mask indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        self.tree = self._build_tree(X, y, features_per_level, depth=0)

    def _build_tree(self, X, y, features_per_level, depth):
        """Recursively build the subtree for the samples ``X``/``y`` at ``depth``.

        Returns:
            A dict describing an internal node, or the mean of ``y`` for a leaf.
        """
        if (
            depth == self.max_depth
            or len(X) < self.min_samples_split
            # A pure node cannot be improved by splitting; every descendant
            # leaf would carry the same mean anyway.
            or np.unique(y).size <= 1
        ):
            return np.mean(y)

        feature_indices = features_per_level[depth % len(features_per_level)]
        best_feature, best_threshold = self._find_best_split(X, y, feature_indices)

        if best_feature is None:
            return np.mean(y)

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        # Defensive guard: never recurse into an empty child, whose mean
        # would be NaN.
        if not left_indices.any() or not right_indices.any():
            return np.mean(y)

        left_tree = self._build_tree(X[left_indices], y[left_indices], features_per_level, depth + 1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], features_per_level, depth + 1)

        return {"feature": best_feature, "threshold": best_threshold, "left": left_tree, "right": right_tree}

    def _find_best_split(self, X, y, feature_indices):
        """Return the ``(feature, threshold)`` pair with the highest information gain.

        Only thresholds that leave samples on both sides are considered.
        Ties are resolved in favour of the first candidate encountered.
        Returns ``(None, None)`` when no such threshold exists.
        """
        best_feature = None
        best_threshold = None
        best_gain = -np.inf

        for feature in feature_indices:
            # np.unique returns sorted values.  The largest one is dropped
            # because ``<= max`` would send every row to the left child and
            # leave the right child empty.
            thresholds = np.unique(X[:, feature])[:-1]
            for threshold in thresholds:
                gain = self._information_gain(X, y, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _information_gain(self, X, y, feature, threshold):
        """Entropy reduction obtained by splitting on ``X[:, feature] <= threshold``.

        Returns 0 when either side of the split would be empty.
        """
        y_left = y[X[:, feature] <= threshold]
        y_right = y[X[:, feature] > threshold]

        if len(y_left) == 0 or len(y_right) == 0:
            return 0

        p_left = len(y_left) / len(y)
        p_right = len(y_right) / len(y)

        return self._entropy(y) - (p_left * self._entropy(y_left) + p_right * self._entropy(y_right))

    def _entropy(self, y):
        """Shannon entropy (in bits) of the distinct values found in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        return -np.sum(probabilities * np.log2(probabilities))

    def predict(self, X):
        """Return an array with the leaf value reached by each row of ``X``."""
        return np.array([self._predict_one(x, self.tree) for x in np.asarray(X)])

    def _predict_one(self, x, tree):
        """Walk ``tree`` for the single sample ``x`` and return the leaf value."""
        node = tree
        while isinstance(node, dict):
            if x[node["feature"]] <= node["threshold"]:
                node = node["left"]
            else:
                node = node["right"]
        return node

    def to_dot(self, feature_names=None):
        """Render the fitted tree as a Graphviz DOT string.

        Args:
            feature_names: Optional sequence (list, NumPy array, pandas Index,
                ...) mapping feature indices to display names.  When omitted
                or empty, features are labelled ``X<index>``.
        """
        dot_representation = ["digraph Tree {"]
        self._to_dot_helper(self.tree, dot_representation, feature_names)
        dot_representation.append("}")
        return "\n".join(dot_representation)

    def _to_dot_helper(self, tree, dot_representation, feature_names, node_id=0, parent_id=None, label=None):
        """Append the DOT lines for ``tree`` (and its children) to ``dot_representation``."""
        is_internal = isinstance(tree, dict)

        if is_internal:
            # An empty name sequence falls back to the generic ``X<index>``
            # label instead of failing on the index lookup.
            if feature_names is not None and len(feature_names) > 0:
                feature = feature_names[tree["feature"]]
            else:
                feature = f"X{tree['feature']}"
            threshold = tree["threshold"]
            node_label = f'{feature} <= {threshold:.2f}'
            dot_representation.append(f'node{node_id} [label="{node_label}"];')
        else:
            node_label = f"leaf: {tree:.2f}"
            dot_representation.append(f'node{node_id} [label="{node_label}", shape=box];')

        if parent_id is not None:
            dot_representation.append(f'node{parent_id} -> node{node_id} [label="{label}"];')

        if is_internal:
            # Children of node i get ids 2i+1 and 2i+2, so ids never collide.
            self._to_dot_helper(tree["left"], dot_representation, feature_names, node_id * 2 + 1, node_id, "True")
            self._to_dot_helper(tree["right"], dot_representation, feature_names, node_id * 2 + 2, node_id, "False")

# Example data: three features and a binary target
data = dict(
    feature1=[1, 2, 3, 4, 5],
    feature2=[5, 4, 3, 2, 1],
    feature3=[2, 3, 4, 5, 6],
    target=[0, 1, 0, 1, 0],
)
df = pd.DataFrame(data)

# Feature matrix (every column except the target) and target vector
X = df.loc[:, df.columns != 'target'].to_numpy()
y = df.loc[:, 'target'].to_numpy()

# Feature indices the tree may split on at each level
features_per_level = [[0, 2], [1, 2], [0, 1]]

# Initialise and train the model
clf = CustomDecisionTree(min_samples_split=2, max_depth=3)
clf.fit(X, y, features_per_level)

# Convert the decision tree to DOT format, labelling nodes with the column names
dot_representation = clf.to_dot(
    feature_names=[column for column in df.columns if column != 'target']
)
print(dot_representation)


digraph Tree {
node0 [label="feature1 <= 1.00"];
node1 [label="leaf: 0.00", shape=box];
node0 -> node1 [label="True"];
node2 [label="feature2 <= 1.00"];
node0 -> node2 [label="False"];
node5 [label="leaf: 0.00", shape=box];
node2 -> node5 [label="True"];
node6 [label="feature1 <= 2.00"];
node2 -> node6 [label="False"];
node13 [label="leaf: 1.00", shape=box];
node6 -> node13 [label="True"];
node14 [label="leaf: 0.50", shape=box];
node6 -> node14 [label="False"];
}
