## Init: Import packages

In [1]:
# general
import numpy as np

## Solution 1: Splitting criteria

### a) 

- See the pen-and-paper solution in sol_trees.pdf

- Solution with Python:

In [2]:
# mean squared error of the values in y around their own mean
def mse(y):
    """Return the mean squared deviation of ``y`` from its mean.

    This is the empirical risk of predicting the mean of ``y`` for every
    observation under squared-error loss.

    Parameters
    ----------
    y : array-like
        Numeric target values. Lists and tuples are accepted as well as
        NumPy arrays.

    Returns
    -------
    float
        ``mean((y - mean(y)) ** 2)``. For an empty input NumPy's mean of an
        empty array applies, i.e. the result is ``nan``.
    """
    # coerce once so plain Python sequences work, not only ndarrays
    y = np.asarray(y)
    residuals = y - y.mean()
    return np.mean(residuals ** 2)

In [3]:
# evaluate every candidate split of one feature and return the best threshold with its empirical risk
def find_best_split(x_train, y_train, verbose=True):
    """Find the single threshold on ``x_train`` with the lowest weighted MSE.

    Candidate thresholds are the midpoints between consecutive distinct
    feature values. For each candidate ``t`` the observations are divided
    into a left node (``x < t``) and a right node (all others), and the risk
    of the split is the node-size-weighted sum of the two node MSEs, computed
    with the notebook's ``mse`` function.

    Parameters
    ----------
    x_train : array-like
        Feature values, one per observation.
    y_train : array-like
        Target values aligned with ``x_train``.
    verbose : bool, default True
        If true, print the risk of every candidate split and the best
        threshold. Pass ``False`` to run silently.

    Returns
    -------
    dict
        ``{'threshold': best_threshold, 'empirical_risk': min_risk}``.
        When ``x_train`` has fewer than two distinct values there is no
        candidate split and the result is
        ``{'threshold': None, 'empirical_risk': np.inf}``.
    """
    # coerce so that plain lists/tuples support the boolean masking below
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    best_threshold = None
    min_risk = np.inf

    # np.unique returns the distinct values already sorted in ascending order
    unique_sorted_x = np.unique(x_train)
    # threshold with the biggest margin: the midpoint between neighbouring values
    thresholds = unique_sorted_x[1:] - (unique_sorted_x[1:] - unique_sorted_x[:-1]) / 2

    for t in thresholds:
        left_mask = x_train < t  # True for observations that fall into the left node
        y_left, y_right = y_train[left_mask], y_train[~left_mask]  # ~ selects the remaining ones
        weight_left = len(y_left) / len(y_train)  # share of observations in the left node
        t_mse = weight_left * mse(y_left) + (1 - weight_left) * mse(y_right)  # empirical risk of split t
        if verbose:
            print("split at %.2f: empirical risk = %.2f" % (t, t_mse))  # track the risk of each split

        # strict comparison: on ties the smallest threshold is kept
        if t_mse < min_risk:
            min_risk = t_mse
            best_threshold = t

    if verbose:
        print("best split at ", best_threshold)
    return {'threshold': best_threshold, 'empirical_risk': min_risk}


In [4]:
# toy data for the exercise: five observations of one feature (x) and the target (y)
x = np.array([1, 2, 7, 10, 20])
y = np.array([1, 1, 0.5, 10, 11])

In [5]:
# search all candidate thresholds on the raw feature; the returned dict is shown as the cell output
find_best_split(x, y)

split at 1.50: empirical risk = 19.14
split at 4.50: empirical risk = 13.43
split at 8.50: empirical risk = 0.13
split at 15.00: empirical risk = 12.64
best split at  8.5


{'threshold': 8.5, 'empirical_risk': 0.13333333333333333}

In [6]:
# repeat the search on the log-transformed feature: log is strictly increasing, so the
# ordering of the observations (and thus the possible left/right partitions) is unchanged
find_best_split(np.log(x), y)

split at 0.35: empirical risk = 19.14
split at 1.32: empirical risk = 13.43
split at 2.12: empirical risk = 0.13
split at 2.65: empirical risk = 12.64
best split at  2.1242476210246797


{'threshold': 2.1242476210246797, 'empirical_risk': 0.13333333333333333}

### b)

See sol_trees.pdf

## Solution 2:  Impurity reduction

See sol_trees.pdf