### Decision Tree Regressor Exercise

* Exercise based on [this article](https://levelup.gitconnected.com/building-a-decision-tree-from-scratch-in-python-machine-learning-from-scratch-part-ii-6e2e56265b19)

In [1]:
# create class for decision tree regressor. This makes use of the Node class, which will be created later

class DecisionTreeRegressor:
    """Regression tree front end: grows a tree of ``Node`` objects and predicts with it."""

    def fit(self, X, y, min_leaf=5):
        """Grow the tree on the training data and return ``self``.

        Parameters
        ----------
        X : feature table, handed to ``Node`` unchanged.
        y : one target value per row of ``X``.
        min_leaf : smallest number of rows allowed on either side of a split.
        """
        # The root node receives the row positions 0..len(y)-1. Indexing a
        # pandas Series with an integer array looks rows up by index *label*,
        # so a target that kept a filtered or shuffled index would select the
        # wrong rows (or raise KeyError). Resetting the index makes label and
        # position coincide; objects without ``reset_index`` pass through as is.
        if hasattr(y, "reset_index"):
            y = y.reset_index(drop=True)
        self.dtree = Node(X, y, np.arange(len(y)), min_leaf)
        return self

    def predict(self, X):
        """Return the tree's prediction for every row of ``X``.

        ``X`` may be a pandas DataFrame (its ``.values`` are used, as before)
        or any 2-D array-like such as a numpy array or a list of rows.
        Must be called after ``fit``, which creates ``self.dtree``.
        """
        rows = X.values if hasattr(X, "iloc") else np.asarray(X)
        return self.dtree.predict(rows)
    
# create class for node. this class represents one decision point in the tree, and also holds data it splits
    
class Node:
    """One decision point of the regression tree; also holds the rows it splits.

    Constructing a node immediately searches for the best split of its rows
    and, if one exists, recursively builds the child nodes ``self.ls`` (rows
    with value <= split) and ``self.rs`` (rows with value > split).

    Parameters
    ----------
    X : feature table, a pandas DataFrame or a 2-D numpy array.
    y : targets, a pandas Series or a 1-D numpy array. Rows are always
        selected by position, so the index labels of a Series do not matter.
    idx : numpy integer array of the row positions that belong to this node.
    min_leaf : smallest number of rows allowed on either side of a split.

    Note: split scores call ``.std()`` on the selected targets, so the exact
    score follows the container's default (pandas Series: sample standard
    deviation, ddof=1; numpy array: population standard deviation, ddof=0).
    """

    def __init__(self, X, y, idx, min_leaf=5):
        self.X, self.y, self.idx, self.min_leaf = X, y, idx, min_leaf  # idx: row positions at this node
        self.row_cnt = len(idx)  # total row count at the node
        self.col_cnt = X.shape[1]  # total column count at the node
        # prediction at this node: the mean target of all rows at this node
        self.val = np.mean(self._take(y, idx))
        # best (lowest) split score found so far; it stays infinite when no
        # valid split exists, which is what marks the node as a leaf
        self.score = float('Inf')

        # search for the best split and build the children; this recurses
        # through the child constructors until leaf nodes are reached
        self.find_varsplit()

    @staticmethod
    def _take(values, positions):
        """Select entries of ``values`` by integer position."""
        # ``series[int_array]`` is a label lookup on an integer index, which
        # picks the wrong rows (or raises KeyError) unless the labels happen
        # to be 0..n-1. ``.iloc`` is positional regardless of the labels.
        if hasattr(values, "iloc"):
            return values.iloc[positions]
        return values[positions]

    @property
    def _features(self):
        """Feature table as a numpy array, whether X is a DataFrame or an array."""
        return self.X.values if hasattr(self.X, "iloc") else self.X

    def find_varsplit(self):
        """Pick the best (column, value) split and build the two child nodes."""
        for c in range(self.col_cnt):
            # each call may lower self.score and record a better split
            self.find_better_split(c)

        if self.is_leaf: return  # no valid split: stopping condition for the recursion
        X = self.split_col  # values of the chosen column for the rows at this node

        ls = np.nonzero(X <= self.split)[0]  # positions (within idx) of rows <= the split value
        rs = np.nonzero(X > self.split)[0]  # positions (within idx) of rows > the split value

        self.ls = Node(self.X, self.y, self.idx[ls], self.min_leaf)  # left side at the node split
        self.rs = Node(self.X, self.y, self.idx[rs], self.min_leaf)  # right side at the node split

    @property
    def split_col(self):
        """Values of the chosen split column for this node's rows (only set once a split was found)."""
        return self._features[self.idx, self.var_idx]

    # check if leaf node is reached
    @property
    def is_leaf(self):
        """True when no valid split was found, i.e. the score is still infinite."""
        return self.score == float('Inf')

    def find_better_split(self, var_idx):
        """Try every value of column ``var_idx`` as a threshold; keep it if it beats the best score so far."""
        X = self._features[self.idx, var_idx]

        # A repeated value produces the same two groups and the same score as
        # its first occurrence, so it can never be strictly better; skip it.
        tried = set()
        for threshold in X:
            if threshold in tried: continue
            tried.add(threshold)

            # split the rows into 2 groups (left & right) at this value
            ls = X <= threshold
            rs = X > threshold
            # skip this candidate if either group is smaller than the minimum leaf size
            if rs.sum() < self.min_leaf or ls.sum() < self.min_leaf: continue

            curr_score = self.find_score(ls, rs)

            # lower is better; strict comparison keeps the first of equally good splits
            if curr_score < self.score:
                self.var_idx = var_idx  # best column so far
                self.score = curr_score  # best score so far
                self.split = threshold  # best split value so far

    def find_score(self, ls, rs):
        """Score a split: standard deviation of each group weighted by its row count, summed.

        ``ls`` and ``rs`` are boolean masks over this node's rows.
        """
        y = self._take(self.y, self.idx)
        ls_std = y[ls].std()
        rs_std = y[rs].std()

        # count-weighted sum of the two group stds (not divided by the total count)
        return ls_std * ls.sum() + rs_std * rs.sum()

    def predict(self, X):
        """Predict every row of ``X`` (an iterable of rows indexable by column position)."""
        return np.array([self.predict_row(xi) for xi in X])

    def predict_row(self, xi):
        """Walk one row down the tree and return the value of the leaf it lands in."""
        if self.is_leaf: return self.val
        node = self.ls if xi[self.var_idx] <= self.split else self.rs
        return node.predict_row(xi)

In [2]:
# read the training table first, then the test table, from the working directory
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [3]:
# target: the sale price of every training row
y_train = train['SalePrice']

# predictors: the same three columns from both tables; missing test values become 0
X_train = train[['OverallQual', 'GrLivArea', 'GarageCars']]
X_test = test[['OverallQual', 'GrLivArea', 'GarageCars']].fillna(0)

In [4]:
# fit the hand-written tree on the training data, then predict the test rows
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)
preds = regressor.predict(X_test)

In [5]:
# spread of the hand-written tree's predictions
np.std(preds)

74384.50428979043

In [6]:
from sklearn.tree import DecisionTreeRegressor as skdtr

# reference model: scikit-learn's tree with the same minimum leaf size
sk_tree = skdtr(min_samples_leaf=5).fit(X_train, y_train)

# spread of the reference predictions, for comparison with the value above
preds_sk = sk_tree.predict(X_test)
preds_sk.std()

73779.71919388912