In [25]:
import pandas as pd
import numpy as np

In [26]:
data = pd.read_csv('Boston.csv', index_col=0)
data.head()
#y = medv

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [27]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, var_red=None, value=None):
        '''
            feature_index: Defines conditions
            threshold: Defines conditions 
            left: Accesses left child
            right: Accesses right child
            info_gain: Stores informatino gain by split of particular decision node
            value: Majority class of leaf note; only requisite value for leaf node
        '''
        #Decision node
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.var_red = var_red

        #Leaf node
        self.value = value

In [34]:
class DecisionTreeRegressor():
    def __init__(self, min_samples_split=2, max_depth=2):
        '''
            min_samples_split:
            max_depth:
        '''
        #Init root
        self.root = None

        #Stop conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth


    def split(self, dataset, feature_index, threshold):
        '''
        Splits dataset, feature index, and theshold value and splits into left and right children
        '''

        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])

        return dataset_left, dataset_right


    def variance_reduction(self, parent, l_child, r_child):
        '''
        Computes and returns variance reduction
            - The lower the variance the more pure
            - The higher the variance reduction the more pure
        '''

        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        reduction = np.var(parent) - (weight_l * np.var(l_child) + weight_r * np.var(r_child))
        
        return reduction


    def calculate_leaf_value(self, y):
        '''
        Computes and returns mean of samples in lead
        '''

        val = np.mean(y)

        return val
        

    def get_best_split(self, dataset, n_features):
        '''
        Returns dictionary defining the current optimal split.
        '''

        best_split = {}
        max_var_red = -float("inf") #want to maximize information gain, so we start with lowest possible number

        #Traverse through all features
        for feature_index in range(n_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)

            #Traverse through all unique values we've encountered in the dataset
            for threshold in possible_thresholds:
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)

                #Ensure chilren exist and compute information gain with entropy or gini
                if len(dataset_left) > 0 and len(dataset_right) > 0:
                    y, left_y, right_y = dataset[:,-1], dataset_left[:,-1], dataset_right[:,-1]
                    curr_var_red = self.variance_reduction(y, left_y, right_y)

                    #If the current information gain is greater than max information gain, update best_split
                    if curr_var_red > max_var_red:
                        best_split['feature_index'] = feature_index
                        best_split['threshold'] = threshold
                        best_split['dataset_left'] = dataset_left
                        best_split['dataset_right'] = dataset_right
                        best_split['var_red'] = curr_var_red
                        max_var_red = curr_var_red

        return best_split


    def build_tree(self, dataset, curr_depth=0):
        '''
        Recursively build decision tree
        '''

        #Split features and targets
        X, y = dataset[:,:-1], dataset[:,-1]
        n_samples, n_features = np.shape(X)

        #Recursively split until stop conditions are met
        if n_samples >= self.min_samples_split and curr_depth <= self.max_depth:
            best_split = self.get_best_split(dataset, n_features)

            #Ensure information game > 0; if 0 then we'd split a node that's already pure (consists of one class)
            if best_split['var_red'] > 0:

                #Recurse to create left and right subtrees
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)

                #Return decision node
                return Node(best_split['feature_index'], best_split['threshold'], left_subtree, right_subtree, best_split['var_red'])

        #Compute and return leaf node
        leaf_value = self.calculate_leaf_value(y)
        return Node(value=leaf_value)


    def print_tree(self, tree=None, indent=' '):
        '''
        Prints the tree for visualization
        '''

        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print(f'X_{tree.feature_index} <= {tree.threshold} ? {tree.var_red}')
            print('%sleft' % (indent), end="")
            self.print_tree(tree.left, indent+indent)
            print('%sright' % (indent), end="")
            self.print_tree(tree.right, indent+indent)


    def fit(self, X, y):

        dataset = np.concatenate((X,y), axis=1)
        self.root = self.build_tree(dataset)


    def make_prediction(self, X, tree):

        if tree.value is not None: 
            return tree.value
            
        feature_val = X[tree.feature_index]

        if feature_val <= tree.threshold:
            return self.make_prediction(X, tree.left)
        else:
            return self.make_prediction(X, tree.right)


    def predict(self, X):
        predictions = [self.make_prediction(x, self.root) for x in X]
        return predictions

In [35]:
from sklearn.model_selection import train_test_split

X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values.reshape(-1,1)

#80% train / 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [36]:
regressor = DecisionTreeRegressor(min_samples_split=3, max_depth=3)
regressor.fit(X_train, y_train)
regressor.print_tree()

X_5 <= 6.824 ? 40.362680129441976
 leftX_12 <= 14.98 ? 16.655606370959667
  leftX_7 <= 1.3567 ? 9.251481799266267
    leftX_0 <= 9.2323 ? 91.576875
        left50.0
        right27.9
    rightX_5 <= 6.481 ? 4.831322602560439
        left21.255414012738857
        right26.537777777777773
  rightX_0 <= 5.44114 ? 7.5644554103251025
    leftX_4 <= 0.524 ? 3.2548294551168624
        left20.36875
        right16.137254901960784
    rightX_0 <= 24.3938 ? 2.7138327721661035
        left12.27272727272727
        right7.325
 rightX_5 <= 7.42 ? 40.02791753641263
  leftX_0 <= 6.53876 ? 17.747972931973777
    leftX_7 <= 1.8773 ? 8.593650375997191
        left45.65
        right32.04102564102565
    right12.7
  rightX_10 <= 18.0 ? 23.234062260869567
    leftX_11 <= 395.52 ? 4.756954271311546
        left46.99047619047619
        right39.25
    right28.55


In [37]:
from sklearn.metrics import mean_squared_error

y_pred = regressor.predict(X_test)
mean_squared_error(y_test, y_pred)

18.93894476142473

In [40]:
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(min_samples_split=3, max_depth=3)
regressor.fit(X_train, y_train)

In [43]:
y_pred = regressor.predict(X_test)

mean_squared_error(y_test, y_pred)

22.555074456229438