# Q1: Regression Decision Tree Construction

### Group Members: Pranav Mehrotra (20CS10085) and Saransh Sharma (20CS30065)

#### Import Required Libraries. To install Seaborn type in command pip install seaborn in the terminal. 
#### To run a cell press ctr + enter and press shift + enter to run a cell and move to next cell

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import copy
#copy.copy() makes a dependent copy wherein change in one is reflected in the other copy
#copy.deepcopy() makes an independent copy, wherein change in one is not reflected in the other copy

#### Read the CSV file in the from of a dataframe


In [3]:
data = pd.read_csv("Train_B_Tree.csv")

#### Primary Analysis of the data read. 

#### Check for duplicate data. Duplicate data doesn't help in training and so needs to be dropped.

In [4]:
data[data.duplicated()==True]

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
77,425.0,106.3,0.0,153.5,16.5,852.1,887.1,3,33.4
80,425.0,106.3,0.0,153.5,16.5,852.1,887.1,3,33.4
86,362.6,189.0,0.0,164.9,11.6,944.7,755.8,3,35.3
88,362.6,189.0,0.0,164.9,11.6,944.7,755.8,3,35.3
91,362.6,189.0,0.0,164.9,11.6,944.7,755.8,3,35.3
100,425.0,106.3,0.0,153.5,16.5,852.1,887.1,7,49.2
103,425.0,106.3,0.0,153.5,16.5,852.1,887.1,7,49.2
109,362.6,189.0,0.0,164.9,11.6,944.7,755.8,7,55.9
111,362.6,189.0,0.0,164.9,11.6,944.7,755.8,7,55.9
123,425.0,106.3,0.0,153.5,16.5,852.1,887.1,28,60.29


In [5]:
data = data.drop_duplicates(keep='first')

In [6]:
data.shape

(1005, 9)

#### Dataframe Data contains the data read from csv

In [7]:
data.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [8]:
data.shape

(1005, 9)

#### The model is basically a tree containing nodes and edges. There exist two types of nodes in the tree. Leaf nodes and decision nodes. Leaf nodes are the nodes which would be helpful in case of predicting (outputting the final value) while decision nodes will represent set of conditions that would help us to make a decision about the predicted value.

In [9]:
class Node():
    def __init__(self, attribute=None, threshold=None, child_left=None, child_right=None, variance_red=None, leaf_value=None):
        
        # data members corresponding to decision nodes
        self.attribute = attribute
        self.threshold = threshold
        self.child_left = child_left
        self.child_right = child_right
        self.variance_red = variance_red
        
        #data member corresponding to a leaf node
        self.leaf_value = leaf_value

##### Kindly note: We have used the same defination of node for both the types of node. A decision node would have leaf_value = None while a leaf_node would have a numerical leaf_value. This difference would help us to differentiate between a leaf node and a decision node.

#### Class defination of a regression tree which will encapsulate all the functions and operation needed to construct a regression tree

In [145]:
class RegressionTree():
    def __init__(self, minimum_samples=2, max_depth=2): #constructor that will take two parameters
 
        self.root = None
        self.minimum_samples = minimum_samples #min number of samples that should be available for further splitting
        self.max_depth = max_depth #max- depth the tree is allowed to grow
        #these two parameters act as stopping conditions for the tree
        
    def variance_reduction(self, parent, left_branch, right_branch): #to find the reduction in variance
        
        fraction_left = len(left_branch) / len(parent) #fraction of original data in the left branch
        fraction_right = len(right_branch) / len(parent) #fraction of original data in right branch
        reduction_variance = np.var(parent) - (fraction_left * np.var(left_branch) + fraction_right * np.var(right_branch))
        #variance reduction is defined as variance of original data - weighted sum of variance of branches
        return reduction_variance
    
    def split_left_right(self, dataset, index, threshold): #to split the data in two branches depending upon attribute denoted by index and threshold
        
        left_dataset = np.array([x for x in dataset if x[index]<=threshold]) #left dataset contains all datapoints whose value of the specified attribute is less than or equal to threshold
        right_dataset = np.array([x for x in dataset if x[index]>threshold]) #right dataset contains all datapoints whose value of the specified attribute is more than threshold
        return left_dataset, right_dataset #return the two partitions
    
    def cal_leaf_node(self, y):#to calculate the value of a leaf node simple calculate mean of all the datapoints's y value at that node 
        
        leaf_val = np.mean(y)
        return leaf_val
                
    def get_best_feature(self, dataset, number_datapoints, number_attributes): # to get the feature and threshold with maximum variance reduction
        
        #initialise best_feature dictionary
        best_feature = {}
        best_feature["attribute"] = None
        best_feature["threshold"] = 0
        best_feature["dataset_left"] = None
        best_feature["dataset_right"] = None
        best_feature["variance_reduced"] = 0
        
        maximum_variance_reduction = -float("inf") #initialise the maximum variance reduction varaiable which will be sed to keep track of current maximum
        
        for features in range(number_attributes): #iterate over all features
            values = dataset[:, features] #extract the feature column
            unique_sorted_values = np.unique(values) #find sorted and unique values
            #possible threshold would be decided by taking mean of adjacent entries
            threshold_array = np.array([(unique_sorted_values[i]+unique_sorted_values[i+1])/2 for i in range(0,len(unique_sorted_values)-1)])

            for threshold in threshold_array: #iterate over all possible threshold values
                dataset_left, dataset_right = self.split_left_right(dataset, features, threshold) #split the data according to the feature and threshold
                
                if len(dataset_left)>0 and len(dataset_right)>0: #if two partitions are created
                    
                    dataset_y, dataset_left_y, dataset_right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]#extract target variable columns
                    
                    variance_reduced = self.variance_reduction(dataset_y, dataset_left_y, dataset_right_y)#calculate the reduction in variance caused by this split
                    if variance_reduced > maximum_variance_reduction:#if the variance reduction caused is more than the current maxima
                        #update the feature dictionary and store all relevant details
                        best_feature["attribute"] = features
                        best_feature["threshold"] = threshold
                        best_feature["dataset_left"] = dataset_left
                        best_feature["dataset_right"] = dataset_right
                        best_feature["variance_reduced"] = variance_reduced
                        maximum_variance_reduction = variance_reduced # update the current maxima and continue iterating over all possible combinations 
                        
        return best_feature # return the maximum variance reducing feature dictionary
    
    def construct_tree(self, dataset, current_depth=0): #function to construct tree
       
        X, y = dataset[:,:-1], dataset[:,-1] #extract feature matrix and target variable vector from dataset
        number_datapoints, number_attributes = np.shape(X) 
        current_best_feature = {} #to keep a track of the best splitting attribute for current node 
        
        if number_datapoints >= self.minimum_samples and current_depth <= self.max_depth: #if the stopping conditions are not yet reached
            current_best_feature = self.get_best_feature(dataset, number_datapoints, number_attributes) #get the best splitting attribute for the node
            if current_best_feature["variance_reduced"]>0: #if the variance reduction is positive that is the data has been splitted in 2 fractions 
                subtree_left = self.construct_tree(current_best_feature["dataset_left"], current_depth+1) #call construct tree recursively for left subtree
                subtree_right = self.construct_tree(current_best_feature["dataset_right"], current_depth+1)#call construct tree recursively for rigjt subtree
                return Node(current_best_feature["attribute"], current_best_feature["threshold"],subtree_left, subtree_right, current_best_feature["variance_reduced"])
                #return a node with left subtree as left child and right subtree as right child
        
        #in case the depth is exhausted or we are left with datapoint less than minimum_samples at a node we make that node a leaf node
        leaf_value = self.cal_leaf_node(y)#calculate the laef value 
        return Node(leaf_value = leaf_value)#return the leaf node
    
    
    def print_decision_tree(self,columns,decision_tree=None,indent=" "):
        
        

        if decision_tree.leaf_value is not None: #if decision_tree points to a leaf node simply print the value
            print("Leaf: ",round(decision_tree.leaf_value,3))

        else:#if decision tree points to a decision node
            #print the node splitting details
            print(columns[decision_tree.attribute], "==>", round(decision_tree.threshold,3), "(", round(decision_tree.variance_red,3),")")
            
            #print the left subtree by recursive calling the function and indentation increasing at every depth
            print("%sLeft: " % (indent), end="")
            self.print_decision_tree(columns, decision_tree.child_left, indent+indent)
            
            #print the right subtree by recursive calling the function and indentation increasing at every depth
            print("%sRight: " % (indent), end="")
            self.print_decision_tree(columns, decision_tree.child_right, indent+indent)
    
    def fit_model(self, X, y): #train a model to fit X and y
        
        dataset = np.concatenate((X, y), axis=1)#concatenate X and y to create the dataset
        self.root = self.construct_tree(dataset)#train the tree and store the final returned node in root
        
    def predict(self, data, decision_tree=None):#to predict target variable for a datapoint x
        print(decision_tree)
        #basic algo is to traverse the graph depending upon splitting feature and threshold values

        if decision_tree.leaf_value!=None: #if you have reached a leaf node simply return the value of the leaf
            return decision_tree.leaf_value
        
        attribute_value = data[decision_tree.attribute]#else extract the value at splitting attrribute column in x  
        if attribute_value <= decision_tree.threshold: # check if the value is less than or equal to threshold
            return self.predict(data, decision_tree.child_left)#traverse to the left subtree 
        else:
            return self.predict(data, decision_tree.child_right)#else traverse to the right subtree
        

    def post_pruning(self,decision_tree,dataset,error):
        print(type(decision_tree))
        X = dataset[:,:-1]
        y = dataset[:,-1]
        tree1 = decision_tree
        if tree1.leaf_value is not None:
            return tree1
        
        if tree1.leaf_value is None: #if the node is a decision node
            tree1.leaf_value = self.cal_leaf_node(y)#assign the corresponding leaf value
            y_pred = [self.predict(x,tree1) for x in X]#make predictions on the new tree
            
            #base condition
            if (mean_error(y_pred,y,X.shape[0])) < error:#if the tree is succesful in reducing the error
                error = mean_error(y_pred,y,X.shape[0])
                tree1.child_left = None
                tree1.child_right = None
                tree1.attribute = None
                tree1.variance_reduction=None
                return tree1#return the root which now has the particular node converted to leaf node
            
            #recursive defination
            else: 
                #in case truncating the branch doesn't help
                tree1.leaf_value=None 
            
                #left = [row for row in dataset if row[tree1.attribute]<=tree1.threshold]#access the left subtree 
                #right = [row for row in dataset if row[tree1.attribute]>tree1.threshold]#access the right subtree
                left,right = self.split_left_right(dataset,tree1.attribute,tree1.threshold)
                if tree1.child_left.leaf_value is None and len(left)>0 and tree1.child_left is not None:
                    left_new = self.post_pruning(tree1.child_left,left,error)#prune the left subtree recursively
                    if left_new is not None:
                        tree1.child_left = left_new
                
                if tree1.child_right.leaf_value is None and len(right)>0 and tree1.child_right is not None:
                    right_new = self.post_pruning(tree1.child_right,right,error)#prune the right subtree recursively
                    if right_new is not None:
                        tree1.child_right = right_new
                return tree1#create a node with the pruned left subtree and pruned right subtree as left child and right child respectively 
    

#### Error function that will help us in pruning

In [146]:
def mean_error(y_pred, y_actual, n): # to calculate root mean square error of the predictions
    
    sum=0
    for i in range(n): #iterate over all n datapoints
        sum = sum+(y_pred[i]-y_actual[i])**2 #add to sum the square of the difference between prediction and actual label
    
    sum = sum/n #take mean of the sum
    sum = np.sqrt(sum) #take square root of the error
    return sum

#### To select the maximum efficient data split we randomly split the data in 10 sample with 70-30 split and select the distribution that gives us minimum error.

In [147]:
d = data.sample(frac = 1,random_state=42) #returns a randomly jumbles data
div = int(0.7 * d.shape[0])#calculate 70 percent of the number of input datapoints
d_train, d_test = d.iloc[:div,:], d.iloc[div:,:]#split the data into test and train
d_train_x = d_train.iloc[:,:-1].values#set training data featutre matrix
d_train_y = d_train.iloc[:,-1].values.reshape(-1,1)#set training data output label
d_test_x = d_test.iloc[:,:-1].values#set test data feature matrix
d_test_y = d_test.iloc[:,-1].values.reshape(-1,1)#set test data output label

dataset_train = d_train#save the current training dataset
dataset_test = d_test#save the current test set


In [148]:
columns = data.iloc[:,:-1].columns #extract the columns of the training data
columns

Index(['cement', 'slag', 'flyash', 'water', 'superplasticizer',
       'coarseaggregate', 'fineaggregate', 'age'],
      dtype='object')

In [149]:
data_train_x = dataset_train.iloc[:,:-1].values #extract training data feature matrix after best splitting found
data_train_y = dataset_train.iloc[:,-1].values.reshape(-1,1) #extract training data target label vector after best splitting found
data_test_x = dataset_test.iloc[:,:-1].values #extract test data feature matrix after best splitting found
data_test_y = dataset_test.iloc[:,-1].values.reshape(-1,1) #extract test data target label vector after best splitting found

#### We can clearly see the optimal depth of the tree should be around 9 but our present tree has depth 20 which leads to overfitting. The train error has reduced significantly but the tree fails to generalize well on unseen data. Thus, Post-pruning is required.

In [150]:
train = [] #to store training errors
test = [] #to store test error

for i in range(10,12):
    regress_tree = RegressionTree(minimum_samples=3, max_depth=i)
    regress_tree.fit_model(data_train_x,data_train_y)#train a tree of heights 3 to 20
    
    y_pred_train = [regress_tree.predict(x,regress_tree.root) for x in data_train_x] #calculate training error
    train.append(mean_error(y_pred_train,data_train_y,data_train_x.shape[0]))
    
    y_pred_test = [regress_tree.predict(x,regress_tree.root) for x in data_test_x] #calculate test error
    test.append(mean_error(y_pred_test,data_test_y,data_test_x.shape[0]))



<__main__.Node object at 0x0000029F89E01EB0>
<__main__.Node object at 0x0000029F89E01FD0>
<__main__.Node object at 0x0000029F89A003D0>
<__main__.Node object at 0x0000029F89C8DFA0>
<__main__.Node object at 0x0000029F89C8D970>
<__main__.Node object at 0x0000029F89C8D4F0>
<__main__.Node object at 0x0000029F8B65A5E0>
<__main__.Node object at 0x0000029F8B65A250>
<__main__.Node object at 0x0000029F8B65A370>
<__main__.Node object at 0x0000029F8B65AAF0>
<__main__.Node object at 0x0000029F8B65AA00>
<__main__.Node object at 0x0000029F8B65AA30>
<__main__.Node object at 0x0000029F89E01EB0>
<__main__.Node object at 0x0000029F89E01FD0>
<__main__.Node object at 0x0000029F89A003D0>
<__main__.Node object at 0x0000029F89C8DFA0>
<__main__.Node object at 0x0000029F89C8D970>
<__main__.Node object at 0x0000029F89C8D4F0>
<__main__.Node object at 0x0000029F8B65A5E0>
<__main__.Node object at 0x0000029F8B65A250>
<__main__.Node object at 0x0000029F8B65A370>
<__main__.Node object at 0x0000029F8B65AAF0>
<__main__.

<__main__.Node object at 0x0000029F8B4BA940>
<__main__.Node object at 0x0000029F8B4BA4C0>
<__main__.Node object at 0x0000029F8BB83040>
<__main__.Node object at 0x0000029F8B65AEB0>
<__main__.Node object at 0x0000029F8B65A0D0>
<__main__.Node object at 0x0000029F89C8D580>
<__main__.Node object at 0x0000029F89C8DD00>
<__main__.Node object at 0x0000029F89C8D970>
<__main__.Node object at 0x0000029F89C8DFA0>
<__main__.Node object at 0x0000029F89C8DBB0>
<__main__.Node object at 0x0000029F89C8D250>
<__main__.Node object at 0x0000029F89C8D220>
<__main__.Node object at 0x0000029F89C8D0D0>
<__main__.Node object at 0x0000029F8B4BA940>
<__main__.Node object at 0x0000029F8B4BA4C0>
<__main__.Node object at 0x0000029F8BB83040>
<__main__.Node object at 0x0000029F8B65AEB0>
<__main__.Node object at 0x0000029F8B65A0D0>
<__main__.Node object at 0x0000029F89C8D580>
<__main__.Node object at 0x0000029F89C8DD00>
<__main__.Node object at 0x0000029F89C8D970>
<__main__.Node object at 0x0000029F89C8DFA0>
<__main__.

In [151]:
regress_tree = RegressionTree(minimum_samples=3, max_depth=20)
regress_tree.fit_model(data_train_x,data_train_y)#train a tree of heights 20
        
y_original = [regress_tree.predict(x,regress_tree.root) for x in data_test_x] #calculate test error
mean_error(y_original,data_test_y,data_test_x.shape[0])


<__main__.Node object at 0x0000029F8B9F30D0>
<__main__.Node object at 0x0000029F8B9F30A0>
<__main__.Node object at 0x0000029F8B9F3070>
<__main__.Node object at 0x0000029F89CDE490>
<__main__.Node object at 0x0000029F89CDE460>
<__main__.Node object at 0x0000029F89CDE430>
<__main__.Node object at 0x0000029F89CDE400>
<__main__.Node object at 0x0000029F89CDE370>
<__main__.Node object at 0x0000029F8B9F30D0>
<__main__.Node object at 0x0000029F8B65AF40>
<__main__.Node object at 0x0000029F89A146D0>
<__main__.Node object at 0x0000029F89A14970>
<__main__.Node object at 0x0000029F89A14520>
<__main__.Node object at 0x0000029F89A149D0>
<__main__.Node object at 0x0000029F89A14F70>
<__main__.Node object at 0x0000029F89A141C0>
<__main__.Node object at 0x0000029F89A14FA0>
<__main__.Node object at 0x0000029F89A14BE0>
<__main__.Node object at 0x0000029F8B9F30D0>
<__main__.Node object at 0x0000029F8B65AF40>
<__main__.Node object at 0x0000029F89A146D0>
<__main__.Node object at 0x0000029F89CC11F0>
<__main__.

array([6.21118875])

In [152]:
tree = copy.deepcopy(regress_tree.root)
X = dataset_test.iloc[:,:-1].values
y = dataset_test.iloc[:,-1].values.reshape(-1,1)
dataset = np.concatenate((X, y), axis=1)

pruned = regress_tree.post_pruning(tree,dataset,test[-1])


print("Error before pruning: ",mean_error(y_original,data_test_y,data_test_x.shape[0]))



<class '__main__.Node'>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000

In [153]:
y_pred_test = [regress_tree.predict(data = x,decision_tree=pruned) for x in data_test_x] 

print("Error after pruning: ",mean_error(y_pred_test,data_test_y,data_test_x.shape[0]))

<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B444250>
<__main__.Node object at 0x0000029F8B4444F0>
<__main__.Node object at 0x0000029F8BB8F1F0>
<__main__.Node object at 0x0000029F8B8AEEE0>
<__main__.Node object at 0x0000029F8B5DE370>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B444B20>
<__main__.Node object at 0x0000029F8B444550>
<__main__.Node object at 0x0000029F8B444280>
<__main__.Node object at 0x0000029F8B16A1F0>
<__main__.Node object at 0x0000029F8ACC4520>
<__main__.Node object at 0x0000029F8BB8F130>
<__main__.Node object at 0x0000029F8BB8F670>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B444B20>
<__main__.Node object at 0x0000029F8B444550>
<__main__.Node object at 0x0000029F8B444FA0>
<__main__.Node object at 0x0000029F8B444070>
<__main__.Node object at 0x0000029F8B16A2B0>
<__main__.Node object at 0x0000029F8B9F38E0>
<__main__.Node object at 0x0000029F8B444250>
<__main__.

In [154]:
regress_tree.print_decision_tree(columns,pruned)

age ==> 21.0 ( 68.325 )
 Left: superplasticizer ==> 8.35 ( 53.063 )
  Left: cement ==> 389.0 ( 27.776 )
    Left: age ==> 5.0 ( 11.956 )
        Left: Leaf:  12.75
        Right: superplasticizer ==> 2.9 ( 8.398 )
                Left: Leaf:  16.485
                Right: Leaf:  26.898
    Right: water ==> 179.35 ( 71.057 )
        Left: Leaf:  55.135
        Right: age ==> 2.0 ( 22.248 )
                Left: Leaf:  12.64
                Right: water ==> 219.0 ( 6.298 )
                                Left: age ==> 5.0 ( 3.335 )
                                                                Left: Leaf:  20.9
                                                                Right: Leaf:  40.242
                                Right: Leaf:  36.84
  Right: age ==> 5.0 ( 42.158 )
    Left: flyash ==> 86.5 ( 33.048 )
        Left: cement ==> 457.5 ( 12.416 )
                Left: superplasticizer ==> 16.2 ( 7.401 )
                                Left: water ==> 163.45 ( 6.398 )
           