In [1]:
import numpy as np


class Decision_Tree_Regression(object):
    def __init__(self, max_depth=5, sample_split_thres=2):
        self.sample_split_thres = sample_split_thres
        self.max_depth = max_depth
        self.root = None

    def fit(self, X, y):
        self.root = self.__construct(X, y)

    def __construct(self, X, y, depth=0):
        sample_num = np.shape(X)[0]
        if sample_num >= self.sample_split_thres and depth <= self.max_depth:
            best_idx, best_threshold = self._best_criteria(X, y)
            left_idx, right_idx = self._split(X[:, best_idx], best_threshold)
            left = self.__construct(X[left_idx, :], y[left_idx], depth + 1)
            right = self.__construct(X[right_idx, :], y[right_idx], depth + 1)
            return Node(best_idx, best_threshold, left, right)
        leaf_value = self._leaf_value(y)
        return Node(value=leaf_value)

    def _best_criteria(self, X, y):
        num_features = np.shape(X)[1]
        best_idx, best_threshold = None, None
        min_impurity = float('inf')
        for idx in range(num_features):
            thresholds = np.unique(X[:, idx])
            for threshold in thresholds:
                impurity = self._impurity(y, threshold, X[:, idx])
                if impurity <= min_impurity:
                    min_impurity = impurity
                    best_idx = idx
                    best_threshold = threshold
        return best_idx, best_threshold

    def _impurity(self, y, threshold, feature):
        # indices of the left and right subtree
        left_idx, right_idx = self._split(feature, threshold)
        if len(left_idx) == 0 or len(right_idx) == 0:
            return float('inf')
        left = y[left_idx]
        right = y[right_idx]
        p = len(left) / len(y)  # proportion of the left subtree
        impurity = p * self._gini(left) + (1 - p) * self._gini(right)
        return impurity

    def _gini(self, y):
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        return 1 - np.sum(np.square(p))  # gini impurity

    def _split(self, feature, threshold):
        # indices of the left subtree
        left_idx = np.argwhere(feature <= threshold).flatten()
        # indices of the right subtree
        right_idx = np.argwhere(feature > threshold).flatten()
        return left_idx, right_idx

    def _leaf_value(self, y):
        leaf_value = np.mean(y)  # mean of the target values
        return leaf_value  # return the mean of the target values

    def predict(self, X):
        # traverse the tree for each sample
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.value is not None:  # leaf node
            return node.value  # return the value of the leaf node
        feature_value = x[node.idx]  # get the value of the feature
        if feature_value <= node.threshold:  # go left
            # traverse the left subtree
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)  # traverse the right subtree


class Node(object):
    def __init__(self, idx=None, threshold=None, left=None, right=None, value=None):
        self.idx = idx
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def __repr__(self):
        return "%d" % self.idx
        # "Node(idx={}, threshold={}, value={})".format(self.idx, self.threshold, self.value)

    def __str__(self):
        return self.__repr__()

# Path: Decision_Tree_Regression.ipynb


def mean_squared_error(y_true, y_pred):
    return np.mean(np.square(y_true - y_pred))


In [2]:
import numpy as np
path = 'data/'
X_train = np.load(path + 'X_train.npy')
y_train = np.load(path + 'y_train.npy')
X_val = np.load(path + 'X_val.npy')
y_val = np.load(path + 'y_val.npy')
X_test = np.load(path + 'X_test.npy')
y_test = np.load(path + 'y_test.npy')

In [7]:
min_error = float('inf')
best_depth = None
best_net = None
best_split = None

for i in range(10, 13):
    for j in range(2, 10):
        net = Decision_Tree_Regression(max_depth=i, sample_split_thres=j)
        net.fit(X_train, y_train)
        y_val_hat = net.predict(X_val)
        error = mean_squared_error(y_val, y_val_hat)
        print('depth: {}, split: {}, error: {}'.format(i, j, error))
        if error < min_error:
            min_error = error
            best_net = net
            best_depth = i
            best_split = j

depth: 10, split: 2, error: 56.78696963411451
depth: 10, split: 3, error: 56.79988706577163
depth: 10, split: 4, error: 56.30567087769273
depth: 10, split: 5, error: 56.26798248379765
depth: 10, split: 6, error: 56.31875922801334
depth: 10, split: 7, error: 56.52551989496126
depth: 10, split: 8, error: 56.50387365186174
depth: 10, split: 9, error: 56.42948998183488
depth: 11, split: 2, error: 57.04300139394967
depth: 11, split: 3, error: 56.98894406607251
depth: 11, split: 4, error: 56.1317550632788
depth: 11, split: 5, error: 56.04419688826381
depth: 11, split: 6, error: 55.947680315078784
depth: 11, split: 7, error: 56.19645383998316
depth: 11, split: 8, error: 56.153669585505305
depth: 11, split: 9, error: 56.08066101954237
depth: 12, split: 2, error: 57.13901215974242
depth: 12, split: 3, error: 57.0993010983182
depth: 12, split: 4, error: 56.12516216489861
depth: 12, split: 5, error: 55.77690033684333
depth: 12, split: 6, error: 55.64173109781663
depth: 12, split: 7, error: 55.868