In [1]:
import math
import numpy as np

In [73]:
def var(cnt, s1, s2):
    assert cnt != 0
    return s2 / cnt - (s1 / cnt) ** 2

def cov(x, y):
    return np.mean(x * y) - np.mean(x) * np.mean(y)

def corr(x, y):
    if np.var(y) == 0:
        return 0
    if np.var(x) == 0:
        return 0
    return cov(x, y) / math.sqrt(np.var(x) * np.var(y))

def generate_data(size, d):
    x = []
    x.append(np.random.normal(3, 1, size))
    x.append(np.random.normal(-2, 1, size))
    x.append(x[0] + 2 * x[1])
    x.append((x[1] + 2)**2)
    x.append(np.random.binomial(n=1, p=0.8, size=size))
    for _ in range(d - 5):
        x.append(np.random.normal(0, 0.1, size))

    def compute_y(x):
        y = 4 - 3 * x[0] * x[0] + x[2] - 0.01 * x[3] + x[1] * x[4] + np.random.normal(0, 0.1, len(x[0]))
        return y

    y = compute_y(x)
    
    return x, y

class DecisionTree():   

    def __init__(self, x, y, d, max_depth, min_sample_size):
        self.x = x
        self.y = y
        
        self.max_depth = max_depth
        self.min_sample_size = min_sample_size
        self.depth = d 
        self.child = len(self.x[0]) <= self.min_sample_size or self.depth == self.max_depth
        
        self.ind = -1
        self.threshold = 0
        
        self.result = np.mean(y)
        self.left, self.right = None, None

        if not self.child:
            self.split()
    
    def find_best_feature(self):
        all_corr = [abs(corr(xi, self.y)) for xi in self.x]
        return np.argmax(all_corr)
    
    def find_threshold_split(self, ind):
        indices = self.x[ind].argsort()
        
        for i in range(len(self.x)):
            self.x[i] = self.x[i][indices]
        self.y = self.y[indices]
        
        threshold = -1
        mn = float('inf')

        for i in range(len(self.y) - 1):
            fltr_left = self.x[ind] <= self.x[ind][i]
            fltr_right = self.x[ind] > self.x[ind][i]
            
            var_left = np.var(self.y[fltr_left])
            var_right = np.var(self.y[fltr_right])
            
            left = np.sum(fltr_left)
            right = np.sum(fltr_right)

            err_left = left / len(self.y) * var_left
            err_right = right / len(self.y) * var_right
            
            err = err_left + err_right
            
            if err < mn:
                threshold, mn = self.x[ind][i], err
        return threshold
    
     
    def split(self):
        self.ind = self.find_best_feature()
        self.threshold = self.find_threshold_split(self.ind)

        fltr_left = self.x[self.ind] <= self.threshold
        fltr_right = self.x[self.ind] > self.threshold
        
       # print('Left: ', np.sum(fltr_left), ' Right: ', np.sum(fltr_right))
        
        x_left = []
        x_right = []
        
        for i in range(len(self.x)):
            x_left.append(self.x[i][fltr_left])
            x_right.append(self.x[i][fltr_right])
            
        y_left = self.y[fltr_left]
        y_right = self.y[fltr_right]
        
        self.left = DecisionTree(x_left, y_left, self.depth + 1, self.max_depth, self.min_sample_size)
        self.right = DecisionTree(x_right, y_right, self.depth + 1, self.max_depth, self.min_sample_size)

    @staticmethod
    def predict(node, arr):
        if node.child:
            return node.result
        if arr[node.ind] <= node.threshold:
            return DecisionTree.predict(node.left, arr)
        else:
            return DecisionTree.predict(node.right, arr)
        
def compute_mse(x, y, dt):
    err = 0
    for i in range(len(y)):
        yp = DecisionTree.predict(dt, x[i])
        err += (abs(yp - y[i]) ** 2)
    err = err / len(y)
    return err


In [68]:
x, y = generate_data(100, 6)

In [69]:
x, y = generate_data(100, 6)
dt = DecisionTree(x, y, 1, 15, 4)

Left:  64  Right:  36
Left:  31  Right:  33
Left:  14  Right:  17
Left:  6  Right:  8
Left:  4  Right:  2
Left:  1  Right:  3
Left:  1  Right:  2
Left:  1  Right:  1
Left:  1  Right:  1
Left:  3  Right:  5
Left:  1  Right:  2
Left:  1  Right:  1
Left:  4  Right:  1
Left:  1  Right:  3
Left:  2  Right:  1
Left:  1  Right:  1
Left:  14  Right:  3
Left:  8  Right:  6
Left:  2  Right:  6
Left:  1  Right:  1
Left:  4  Right:  2
Left:  1  Right:  3
Left:  1  Right:  2
Left:  1  Right:  1
Left:  1  Right:  1
Left:  5  Right:  1
Left:  4  Right:  1
Left:  1  Right:  3
Left:  2  Right:  1
Left:  1  Right:  1
Left:  2  Right:  1
Left:  1  Right:  1
Left:  22  Right:  11
Left:  4  Right:  18
Left:  3  Right:  1
Left:  1  Right:  2
Left:  1  Right:  1
Left:  12  Right:  6
Left:  9  Right:  3
Left:  8  Right:  1
Left:  1  Right:  7
Left:  3  Right:  4
Left:  1  Right:  2
Left:  1  Right:  1
Left:  3  Right:  1
Left:  2  Right:  1
Left:  1  Right:  1
Left:  1  Right:  2
Left:  1  Right:  1
Left:  4 

In [70]:
def transpose(temp):
    temp = np.array(temp)
    return temp.T

In [77]:
x, y = generate_data(10000, 6)
xt = transpose(x)

dt = DecisionTree(x, y, 1, 15, 5)
loss = compute_mse(xt, y, dt)
print(loss)

TypeError: __init__() takes 4 positional arguments but 6 were given

In [71]:
x

[array([0.16486488, 0.92655235, 1.04194967, 1.18237879, 1.21923543,
        1.29473032, 1.43085787, 1.54212323, 1.61482107, 1.62430239,
        1.69041383, 1.79036196, 1.8236142 , 1.86668696, 2.03785984,
        2.11009567, 2.15310784, 2.1650782 , 2.20679968, 2.21321725,
        2.26436462, 2.35052939, 2.40213784, 2.41472197, 2.4184325 ,
        2.43034813, 2.46335908, 2.47679267, 2.50937762, 2.53203618,
        2.54263417, 2.64936148, 2.66356465, 2.70094109, 2.71693201,
        2.74110053, 2.75522951, 2.75921653, 2.77380308, 2.80817998,
        2.81368483, 2.83484676, 2.83626147, 2.86319001, 2.87183509,
        2.88300176, 2.89357346, 2.89525073, 2.92948453, 2.94871132,
        2.96848322, 3.00565539, 3.02217715, 3.05443515, 3.09719506,
        3.12275154, 3.20053642, 3.21314577, 3.24853504, 3.24994092,
        3.27816862, 3.33865007, 3.38033472, 3.38657387, 3.44451824,
        3.49474281, 3.58241465, 3.68826892, 3.69033338, 3.73495174,
        3.75170679, 3.76341729, 3.7749634 , 3.80

In [62]:
y[0]

-57.26979355503823

In [63]:
xt[0]

array([-0.01529896, -1.52818252, -3.07166401,  0.22261173,  1.        ,
        0.21872561])

In [64]:
DecisionTree.predict(dt, xt[0])

-0.4727112489968075

In [65]:
y[0]

-57.26979355503823

In [40]:
compute_mse(xt, y, dt)

14947.190370601269