# Decision Tree Regression From Scratch

## Import Section

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sks
from sklearn.datasets import load_boston
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,median_absolute_error,mean_squared_error,mean_absolute_percentage_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
import joblib
import sys
import math 
import json
import copy
import operator



Lets load the data

In [2]:
boston = load_boston()
print(boston.data.shape)

(506, 13)


In [3]:
print(boston.feature_names)

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [4]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [5]:
bos = pd.DataFrame(boston.data)
print(bos.head())

        0     1     2    3      4      5     6       7    8      9     10  \
0  0.00632  18.0  2.31  0.0  0.538  6.575  65.2  4.0900  1.0  296.0  15.3   
1  0.02731   0.0  7.07  0.0  0.469  6.421  78.9  4.9671  2.0  242.0  17.8   
2  0.02729   0.0  7.07  0.0  0.469  7.185  61.1  4.9671  2.0  242.0  17.8   
3  0.03237   0.0  2.18  0.0  0.458  6.998  45.8  6.0622  3.0  222.0  18.7   
4  0.06905   0.0  2.18  0.0  0.458  7.147  54.2  6.0622  3.0  222.0  18.7   

       11    12  
0  396.90  4.98  
1  396.90  9.14  
2  392.83  4.03  
3  394.63  2.94  
4  396.90  5.33  


In [6]:
bos['PRICE'] = boston.target

X = bos.drop('PRICE', axis = 1)
y = bos['PRICE']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(404, 13)
(102, 13)
(404,)
(102,)


In [8]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,15.02340,0.0,18.10,0.0,0.6140,5.304,97.3,2.1007,24.0,666.0,20.2,349.48,24.91
1,0.62739,0.0,8.14,0.0,0.5380,5.834,56.5,4.4986,4.0,307.0,21.0,395.62,8.47
2,0.03466,35.0,6.06,0.0,0.4379,6.031,23.3,6.6407,1.0,304.0,16.9,362.25,7.83
3,7.05042,0.0,18.10,0.0,0.6140,6.103,85.1,2.0218,24.0,666.0,20.2,2.52,23.29
4,0.72580,0.0,8.14,0.0,0.5380,5.727,69.5,3.7965,4.0,307.0,21.0,390.95,11.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.17120,0.0,8.56,0.0,0.5200,5.836,91.9,2.2110,5.0,384.0,20.9,395.67,18.66
400,0.29916,20.0,6.96,0.0,0.4640,5.856,42.1,4.4290,3.0,223.0,18.6,388.65,13.00
401,0.01501,80.0,2.01,0.0,0.4350,6.635,29.7,8.3440,4.0,280.0,17.0,390.94,5.99
402,11.16040,0.0,18.10,0.0,0.7400,6.629,94.6,2.1247,24.0,666.0,20.2,109.85,23.27


In [9]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# lets convert them data frames an just to see how tree is constructed

In [10]:
X_train = pd.DataFrame(data = X_train, columns = boston.feature_names)
X_test = pd.DataFrame(data = X_test, columns = boston.feature_names)

In [11]:
eps = np.finfo(float).eps

# Now lets write Decision Tree Regressor from Scratch

In [19]:
class DecisionTreeRegressor:

	def __init__(self, max_depth = None, min_sample_leaf = 3):

		self.depth = 0 #Depth of the tree
		self.max_depth = max_depth	#Maximum depth of the tree
		self.min_sample_leaf = min_sample_leaf	#Minimum number of samples for each node
		self.coefficient_of_variation = 10 	#Stopping Criterion
		self.features = list
		self.num_feats = int 
		self.train_size = int 

	def fit(self, X_dt, y_dt):
		self.features = list(X_dt.columns)
		self.train_size = X_dt.shape[0]
		self.num_feats = X_dt.shape[1]
		df = X_dt.copy()
		df['target'] = y_dt.copy()
		#print(df['target'].isnull().sum())
		#Builds Decision Tree
		self.tree = self._build_tree(df)

		print("\nDecision Tree(depth = {}) ".format(self.depth))
		print("********** TREE ************")
		filehandler = open('tree.txt', 'w')
		filehandler.write(str(self.tree))
		filehandler.close()
		filehandler = open("tree.txt", "r")
		treetoview = json.loads(filehandler.read().replace("'",'"'))
		print(json.dumps(treetoview, indent = 3))
		filehandler.close()

	def _build_tree(self, df, tree = None):

		"""
			Args:
				df: current number of rows available for splitting(decision making)
		"""

		#Get feature with minimum score
		feature, cutoff = self._find_best_split(df)
		#print('Best Split feature {} and its cutoff : {} '.format(feature,cutoff))
		if cutoff is None:
			return tree

		#Initialization of tree
		if tree is None:
			tree = {}
			tree[feature] = {}

		#Left Child
		new_df = self._split_rows(df, feature, cutoff, operator.le)
		
		target_coef_of_var = self._coef_ov(new_df['target'])

		self.depth += 1

		if(target_coef_of_var < self.coefficient_of_variation or len(new_df) <= self.min_sample_leaf): #pure group
			#print(" target_coef_of_var < self.coefficient_of_variation or len(new_df) <= self.min_sample_leaf and mean is {}".format (new_df['target'].mean()))
			#print(new_df['target'])
			tree[feature]['<=' + str(cutoff)] = new_df['target'].mean()
		else:
			if self.max_depth is not None and self.depth >= self.max_depth:
				#print(" self.max_depth is not None and self.depth >= self.max_depth and mean is {}".format (new_df['target'].mean()))
				#print(new_df['target'])
				tree[feature]['<=' + str(cutoff)] = new_df['target'].mean()
			else:
				tree[feature]['<=' + str(cutoff)] = self._build_tree(new_df)


		#Right Child
		new_df = self._split_rows(df, feature, cutoff, operator.gt)

		target_coef_of_var = self._coef_ov(new_df['target'])

		if(target_coef_of_var < self.coefficient_of_variation or len(new_df) <= self.min_sample_leaf): #pure group
			#print(" target_coef_of_var < self.coefficient_of_variation or len(new_df) <= self.min_sample_leaf and mean is {}".format (new_df['target'].mean()))
			#print(new_df['target'])
			tree[feature]['>' + str(cutoff)] = new_df['target'].mean()
		else:
			if self.max_depth is not None and self.depth >= self.max_depth:
				#print(" self.max_depth is not None and self.depth >= self.max_depth and mean is {}".format (new_df['target'].mean()))
				#print(new_df['target'])
				tree[feature]['>' + str(cutoff)] = new_df['target'].mean()
			else:
				tree[feature]['>' + str(cutoff)] = self._build_tree(new_df)

		return tree

	def _coef_ov(self, y):

		""" calculates coefficient of variation:
		    COV = (Mean of y / Standard Deviation of y) * 100
		"""
		if(y.std() == 0):
			return 0
		coef_of_var = (y.mean()/y.std()) * 100

		return coef_of_var

	def _split_rows(self, df, feature, feat_val, operation ):

		""" split rows based on given criterion """

		return df[operation(df[feature], feat_val)].reset_index(drop = True)

	def _find_best_split(self, df):

		"""
			Finds the column to split on first.
		"""

		best_feature = str
		cutoff = None
		best_score = float('inf')


		for feature in list(df.columns[:-1]):

			score, threshold = self._find_feature_split(feature, df)

			if score < best_score:
				best_feature = feature
				best_score = score
				cutoff = threshold
		
		return best_feature, cutoff 

	def _find_feature_split(self, feature, df):

		best_score = float('inf')
		cutoff = float

		for val in df[feature]:
			left_child = df[feature][df[feature] <= val] 
			right_child = df[feature][df[feature] > val]

			if(len(left_child) > 0 and len(right_child) > 0):
				score = self._find_score(df, left_child, right_child)

				if score < best_score:
					best_score = score
					cutoff = val

		return best_score, cutoff


	def _find_score(self, df, lhs, rhs):

		y = df['target']

		lhs_std = y.iloc[lhs.index].std()
		rhs_std = y.iloc[rhs.index].std()

		if(np.isnan(lhs_std)):
			lhs_std = 0
		if(np.isnan(rhs_std)):
			rhs_std = 0

		return lhs_std * lhs.sum() + rhs_std * rhs.sum()

	def _predict_target(self, feature_lookup, x, tree):

		for node in tree.keys():
			val = x[node]
			if type(val) == str:
				tree = tree[node][val]
			else:
				cutoff = str(list(tree[node].keys())[0]).split('<=')[1]

				if(val <= float(cutoff)):	#Left Child
					tree = tree[node]['<='+cutoff]
				else:						#Right Child
					tree = tree[node]['>'+cutoff]

			prediction = str

			if type(tree) is dict:
				prediction = self._predict_target(feature_lookup, x, tree)
			else:
				predicton = tree 
				return predicton

		return prediction   


	def predict(self, X):

		results = []
		feature_lookup = {key: i for i, key in enumerate(list(X.columns))}
		
		for index in range(len(X)):

			results.append(self._predict_target(feature_lookup, X.iloc[index], self.tree))

		return np.array(results)

In [None]:
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)


In [21]:
print("\nTrain RMSE : {}".format(mean_squared_error(y_train, dt_reg.predict(X_train))))
print("\nTest RMSE: {}".format(mean_squared_error(y_test, dt_reg.predict(X_test))))


Train RMSE : 20.47033415841584

Test RMSE: 28.8308251633987


In [None]:
dt_reg = DecisionTreeRegressor(max_depth = 100,min_sample_leaf = 40 )
dt_reg.fit(X_train, y_train)


In [23]:
print("\nTrain RMSE : {}".format(mean_squared_error(y_train, dt_reg.predict(X_train))))
print("\nTest RMSE: {}".format(mean_squared_error(y_test, dt_reg.predict(X_test))))


Train RMSE : 27.30759897445947

Test RMSE: 23.899976468147138
