In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
import scipy.stats as stats

Creating class for Decision Tree nodes

In [2]:
class Node:
    def __init__(self, attribute=" ", popular=" ", values=[], children=[]):
        self.attribute = attribute #defines which attribute is used in the node
        self.popular_parent = popular #defines popular class of parent node
        self.values = values[:] #defines which values are used to split the child node
        self.children = children[:] #defines child nodes
    
    
    #function to help pprint decide how to print the tree
    def __repr__(self, level=0):
        ret = "\t"*level+repr('Attribute = '+ self.attribute + ' Child Values = ' + str(self.values))+"\n"
        for child in self.children:
            if child is not None:
                ret += child.__repr__(level+1)
        return ret

Creating class for Decision Tree

In [3]:
class DTree:
    def __init__(self, node_split=0, prune=0, alpha=0.05):
        #using float eps to avoid operations by 0 issues
        self.eps = np.finfo(float).eps
        #initialize root of tree
        self.root = None
        
        self.node_split = node_split
        #set limits to node_split values
        valid = {0, 1} # 0 for entropy and 1 for gini index
        if node_split not in valid:
            raise ValueError("node_split must be one of %r." % valid)
        
        self.prune = prune
        #set limits to pruning values
        valid = {0, 1} # 0 for no pruning and 1 for pruning
        if prune not in valid:
            raise ValueError("prune must be one of %r." % valid)
        
        self.alpha = alpha #prob level to reject null hypothesis for pruning
        
        
    
    #function to find entropy
    def find_entropy(self, df):
        target = df.keys()[-1]
        values = df[target].unique()
        entropy = 0
        
        #calculating entropy
        for value in values: 
            prob = df[target].value_counts()[value]/len(df[target])
            entropy += -prob*np.log2(prob)
        return abs(entropy)
    
    #function to find entropy of a particular attribute
    def find_entropy_attribute(self, df, attribute):
        target = df.keys()[-1]
        values = df[target].unique()
        variables = df[attribute].unique()
        weighted_entropy = 0
        
        #calculating weighted entropy
        for variable in variables:
            variable_entropy = 0
            # calculating entropy
            for value in values:
                num = len(df[attribute][df[attribute]==variable][df[target] == value])
                den = len(df[attribute][df[attribute]==variable])
                prob = num/(den)
                variable_entropy += -prob*np.log2(prob+self.eps) #using float eps to avoid operations by 0 issues
            prob2 = den/len(df)
            weighted_entropy += -prob2*variable_entropy
        return abs(weighted_entropy)
    
    #function to find gini of a particular attribute
    def find_gini_attribute(self, df, attribute):
        values = df[attribute].unique()
        gini = 0
        
        #calculating prob squares part of gini
        for value in values: 
            prob = df[attribute].value_counts()[value]/len(df[attribute])
            gini += prob**2
        
        #calculating gini
        gini = 1 - gini
        return gini
    
    #function to find best attribute
    def find_winner(self, df):
        metric = []
        
        # for entropy and information gain
        if self.node_split == 0:
            # calculating and storing entropy based information gain for every attribute
            for key in df.keys()[:-1]:
                metric.append(self.find_entropy(df) - self.find_entropy_attribute(df,key))
            
            
        # for gini
        if self.node_split == 1:
            # calculating and storing gini index based for every attribute
            for key in df.keys()[:-1]:
                metric.append(self.find_gini_attribute(df,key))
            
        
        #returning best attribute
        return df.keys()[:-1][np.argmax(metric)]                
        
    
    #function to get a subset of the dataframe 
    def get_subtable(self, df, attribute, value):
        df = df[df[attribute] == value].reset_index(drop=True) #parting df with attribute
        df = df.drop(attribute, axis=1) #dropping attribute column
        return df
        
    #Driver code to build and prune tree
    def fit(self, df):
        #building tree
        self.root = self.build_tree(df)
        #checking prune condition and pruning
        if self.prune == 1:
            self.prune_tree([], self.root, self.root, df)
        
    
    #function to build the tree
    def build_tree(self, df, popular_parent=-1):
        
        #creating new node
        node = Node()        
        node.popular_parent = popular_parent
        
        target = df.keys()[-1]
        values = df[target].unique()
        #getting most popular target value
        popular = df[target].value_counts().index[0] 
        
        #contition when all instances have the same target
        if len(values) == 1:
            node.attribute = target
            node.popular_parent = popular_parent
            node.values.append(values[0])
            node.children = [None]
            return node
        
        #condtition when features list is empty
        if len(df.columns) == 1: #len = 1 as only target will be left
            node.attribute = target
            node.popular_parent = popular_parent
            node.values.append(popular)
            node.children = [None]
            return node
        
        #condition when df is empty
        if len(df.index) == 0:
            node.attribute = target
            node.popular_parent = popular_parent
            node.values.append(popular_parent)
            node.children = [None]
            return node
        
        #finding best attribute
        node_attr = self.find_winner(df)
        
        #finding values of attribute column
        att_val = np.unique(df[node_attr])
        
        #setting the node attribute to winning attribute 
        node.attribute = node_attr
        
        
        for value in att_val:
            subtable = self.get_subtable(df, node_attr, value)
            
            #recursive call to build tree
            child_node = self.build_tree(subtable, popular)
            
            #setting node values
            node.values.append(value)
            node.children.append(child_node)
            
        return node
    
    #function to check output of a row 
    def run_tree(self, df):
        #condition to check number of rows
        if len(df.index) > 1:
            raise ValueError("Function expects only one row")
        
        ans = -1
        current_node = self.root #keeps track of node being processed
        known_nodes = [] #stack of known nodes
        
        #parsing tree      
        while True:
            #check for columns
            if current_node.attribute not in df.columns:
                raise ValueError("Unexpected column encountered")
                break
                
            #check if we have reached leaf node and we have a prediction
            if current_node.attribute == df.keys()[-1]:
                ans = current_node.values[0]
                break
            
            #check if we have reached leaf node but we dont't have a prediction
            if current_node.attribute != df.keys()[-1] and (len(current_node.values) == 0 or len(current_node.children) == 0 or (len(current_node.children) ==1 and current_node.children[0] is None)):
                ans = current_node.popular_parent
                break
            
            #check if current node has a split suitable for our df
            if df.iloc[0][current_node.attribute] in current_node.values:
                #finding index of next node to pursue
                split_value = df.iloc[0][current_node.attribute]
                split_index = current_node.values.index(split_value)
                current_node = current_node.children[split_index]
            
            #check if current node does not have a split suitable for our df
            if df.iloc[0][current_node.attribute] not in current_node.values:
                ans = current_node.popular_parent
                break
      
        return ans
    
    #function to perform the chi2 test and delete node if required
    def chi2test(self, node, parent_node, df):
        target = df.keys()[-1]
        values = df[target].unique()
        
        p = len(df[target][df[target] == 'M'])
        n = len(df[target][df[target] == 'B'])

        target = df.keys()[-1]
        values = df[target].unique()
        variables = df[node.attribute].unique()
        delta = 0
        
        #calculate delta
        for variable in variables:
            pk = len(df[node.attribute][df[node.attribute]==variable][df[target] == 'M'])
            nk = len(df[node.attribute][df[node.attribute]==variable][df[target] == 'B'])
                                                                      
            pkhat = p*(pk+nk)/(p+n)
            nkhat = n*(pk+nk)/(p+n)
            
            delta += (((pk-pkhat)**2)/pkhat) + (((nk-nkhat)**2)/nkhat)

        
        
        #checking if h0 is accepted and if yes, deleting node
        if delta < (stats.chi2.ppf(1-self.alpha, df=(len(values)-1)))*100:
            index = parent_node.children.index(node)
            #delete node and replace it with children nodes
            parent_node.children = parent_node.children[:index] + node.children + parent_node.children[index+1:]
            parent_node.values = parent_node.values[:index] + node.values + parent_node.values[index+1:]
            
                
    
    #function to prune tree
    def prune_tree(self, visited, node, parent_node, df):
        if node is not None:
            if node not in visited:
                if node != self.root:
                    self.chi2test(node, parent_node, df)
                visited.insert(0, node)
                for child in node.children:
                    self.prune_tree(visited, child, node, df)
    
    #function to get metrics on model performance
    def metrics(self, df, baseline=False):
        #Defining all 4 cases
        tp, fp, tn, fn = 0, 0, 0, 0 
        
        acc, err, pre, rec = 0, 0, 0, 0
        
        #predicting for all rows
        if baseline:
            #predicting for baseline case
            target = df.keys()[-1]
            value = df[target].value_counts().index[0] #getting most popular value
            pred = [value for x in df.index] #putting the most popular value for every row
        else:
            #predicting for specific tree
            pred = [self.run_tree(df.iloc[[x]]) for x in df.index] 
        
        #counting each case
        for x in range(0,len(pred)):
            if pred[x] == 'M' and df.iloc[x]['Diagnosis'] == 'M':
                tp += 1
            elif pred[x] == 'M' and df.iloc[x]['Diagnosis'] == 'B':
                fn += 1
            elif pred[x] == 'B' and df.iloc[x]['Diagnosis'] == 'M':
                fp += 1
            elif pred[x] == 'B' and df.iloc[x]['Diagnosis'] == 'B':
                tn += 1
            
        
        acc = (tp + tn)/(tp + tn + fp + fn + self.eps) #adding eps to avoid operations by zero 
        err = (fp + fn)/(tp + tn + fp + fn + self.eps) #adding eps to avoid operations by zero
        pre = (tp)/(tp + fp + self.eps) #adding eps to avoid operations by zero
        rec = (tp)/(tp+fn + self.eps) #adding eps to avoid operations by zero
        
        return acc, err, pre, rec
               
    
            

Get the data

In [4]:
data_path = "ps1//Final_data//"

In [5]:
train = pd.read_csv(data_path+'wdbc_train.csv')
train.head()

Unnamed: 0,Radius,Texture,Perimeter,Area,Smoothness,Compactness,Concavity,ConcavePoints,Symmetry,FractalDimension,...,worstTexture,worstPerimeter,worstArea,worstSmoothness,worstCompactness,worstConcavity,worstConcavePoints,worstSymmetry,worstFractalDimension,Diagnosis
0,l4,l5,l4,l4,l5,l4,l4,l5,l4,l4,...,l5,l4,l4,l5,l4,l4,l4,l4,l4,M
1,l2,l3,l2,l2,l3,l3,l3,l3,l5,l4,...,l3,l2,l3,l3,l3,l3,l3,l4,l4,B
2,l4,l3,l4,l4,l3,l3,l3,l3,l2,l2,...,l3,l3,l3,l3,l3,l3,l3,l3,l3,B
3,l5,l4,l5,l5,l4,l4,l5,l5,l4,l3,...,l4,l5,l4,l3,l3,l4,l5,l3,l3,M
4,l3,l6,l3,l3,l2,l2,l2,l2,l3,l3,...,l6,l3,l3,l2,l2,l2,l2,l3,l3,B


In [6]:
dev = pd.read_csv(data_path+'wdbc_dev.csv')
dev.head()

Unnamed: 0,Radius,Texture,Perimeter,Area,Smoothness,Compactness,Concavity,ConcavePoints,Symmetry,FractalDimension,...,worstTexture,worstPerimeter,worstArea,worstSmoothness,worstCompactness,worstConcavity,worstConcavePoints,worstSymmetry,worstFractalDimension,Diagnosis
0,l3,l3,l3,l3,l3,l3,l3,l3,l2,l4,...,l4,l3,l3,l3,l3,l3,l3,l3,l3,B
1,l3,l2,l3,l3,l4,l3,l3,l3,l3,l3,...,l2,l3,l3,l3,l3,l2,l3,l3,l3,B
2,l2,l4,l2,l2,l3,l2,l3,l2,l3,l3,...,l3,l2,l3,l3,l2,l3,l2,l3,l3,B
3,l3,l4,l3,l3,l4,l3,l3,l3,l4,l4,...,l4,l3,l3,l4,l4,l4,l3,l4,l4,B
4,l3,l2,l3,l3,l3,l3,l3,l3,l3,l3,...,l2,l3,l3,l3,l3,l3,l3,l3,l4,B


In [7]:
test = pd.read_csv(data_path+'wdbc_test.csv')
test.head()

Unnamed: 0,Radius,Texture,Perimeter,Area,Smoothness,Compactness,Concavity,ConcavePoints,Symmetry,FractalDimension,...,worstTexture,worstPerimeter,worstArea,worstSmoothness,worstCompactness,worstConcavity,worstConcavePoints,worstSymmetry,worstFractalDimension,Diagnosis
0,l3,l3,l3,l3,l4,l3,l3,l3,l4,l3,...,l3,l3,l3,l4,l3,l3,l3,l4,l3,B
1,l4,l2,l4,l3,l4,l4,l3,l4,l4,l4,...,l2,l3,l3,l3,l3,l3,l4,l3,l3,B
2,l3,l3,l3,l3,l3,l3,l3,l3,l3,l3,...,l4,l3,l3,l3,l3,l3,l3,l4,l3,B
3,l3,l2,l3,l3,l3,l3,l3,l3,l1,l3,...,l2,l3,l3,l3,l3,l3,l3,l2,l3,B
4,l3,l3,l3,l3,l4,l4,l3,l3,l4,l4,...,l3,l3,l3,l3,l4,l3,l3,l4,l3,B


Creating Trees

In [8]:
Entropy_Tree = DTree(0, 0) #Creating a tree using entropy
Entropy_Tree_pruned = DTree(0, 1) #Creating a tree using entropy with pruning
Gini_Tree = DTree(1, 0) #Creating a tree using Gini
Gini_Tree_pruned = DTree(1, 1) #Creating a tree using Gini with pruning

Building Trees

In [9]:
Entropy_Tree.fit(train)

In [10]:
Entropy_Tree_pruned.fit(train)

In [11]:
Gini_Tree.fit(train)

In [12]:
Gini_Tree_pruned.fit(train)

Visualizing Trees

In [13]:
pprint(Entropy_Tree.root)

"Attribute = ConcavePoints Child Values = ['l2', 'l3', 'l4', 'l5', 'l6']"
	"Attribute = Diagnosis Child Values = ['B']"
	"Attribute = worstArea Child Values = ['l2', 'l3', 'l4']"
		"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = seArea Child Values = ['l3', 'l4', 'l5']"
			"Attribute = worstTexture Child Values = ['l1', 'l2', 'l3', 'l4', 'l5', 'l6']"
				"Attribute = Diagnosis Child Values = ['B']"
				"Attribute = Diagnosis Child Values = ['B']"
				"Attribute = Diagnosis Child Values = ['B']"
				"Attribute = seSmoothness Child Values = ['l2', 'l3', 'l4', 'l5', 'l6']"
					"Attribute = Smoothness Child Values = ['l2', 'l3']"
						"Attribute = Texture Child Values = ['l3', 'l4']"
							"Attribute = Diagnosis Child Values = ['B']"
							"Attribute = Diagnosis Child Values = ['M']"
						"Attribute = Diagnosis Child Values = ['B']"
					"Attribute = Diagnosis Child Values = ['B']"
					"Attribute = Diagnosis Child Values = ['B']"
					"Attribute = Diagnosis Child Values =

In [14]:
pprint(Entropy_Tree_pruned.root)

"Attribute = ConcavePoints Child Values = ['B', 'l2', 'l3', 'l4', 'l1', 'l2', 'l3', 'l4', 'l5', 'l6', 'M', 'M']"
	"Attribute = Diagnosis Child Values = ['B']"
	"Attribute = seArea Child Values = ['l1', 'l2', 'l3', 'l4', 'l5', 'l6', 'l3', 'l4', 'l5', 'l6', 'M']"
		"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = seSmoothness Child Values = ['l2', 'l3', 'B', 'B', 'B', 'B']"
			"Attribute = Texture Child Values = ['B', 'M']"
			"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = worstSmoothness Child Values = ['B', 'B', 'l3', 'l4', 'B']"
			"Attribute = Diagnosis Child Values = ['M']"
			"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = Diagnosis Child Values = ['M']"
		"Attribute = Diagnosis Child Values = ['B']"
	"Attribute = w

In [15]:
pprint(Gini_Tree.root)

"Attribute = worstSmoothness Child Values = ['l1', 'l2', 'l3', 'l4', 'l5', 'l6']"
	"Attribute = Diagnosis Child Values = ['B']"
	"Attribute = Texture Child Values = ['l2', 'l3', 'l4', 'l5', 'l6']"
		"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = seTexture Child Values = ['l2', 'l3', 'l4', 'l5', 'l6']"
			"Attribute = Diagnosis Child Values = ['B']"
			"Attribute = Diagnosis Child Values = ['B']"
			"Attribute = Concavity Child Values = ['l2', 'l3', 'l4']"
				"Attribute = Diagnosis Child Values = ['B']"
				"Attribute = Diagnosis Child Values = ['B']"
				"Attribute = Radius Child Values = ['l3', 'l5']"
					"Attribute = Diagnosis Child Values = ['B']"
					"Attribute = Diagnosis Child Values = ['M']"
			"Attribute = Diagnosis Child Values = ['B']"
			"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = Symmetry Child Values = ['l2', 'l3', 'l4', 'l5', 'l6']"
			"Attribute = Diagnosis Child Values = ['B']"
			"Attribute = seConcavity Child Values = ['l2', 'l3', 'l4',

In [16]:
pprint(Gini_Tree_pruned.root)

"Attribute = worstSmoothness Child Values = ['B', 'l2', 'l3', 'l4', 'l5', 'l6', 'l2', 'l3', 'l4', 'l5', 'l6', 'l2', 'l3', 'l4', 'l5', 'l6', 'l2', 'l3', 'l4', 'l5', 'l6', 'l2', 'l3', 'l4', 'l5', 'l6']"
	"Attribute = Diagnosis Child Values = ['B']"
	"Attribute = seTexture Child Values = ['B', 'B', 'l2', 'l3', 'l4', 'B', 'B']"
		"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = Radius Child Values = ['B', 'M']"
	"Attribute = Symmetry Child Values = ['B', 'l2', 'l3', 'l4', 'l6', 'l2', 'l3', 'l4', 'l5', 'M', 'B']"
		"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = Smoothness Child Values = ['M', 'B']"
		"Attribute = Radius Child Values = ['B', 'M']"
		"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = Diagnosis Child Values = ['M']"
		"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = Diagnosis Child Values = ['B']"
		"Attribute = Diagnosis Child Values = ['B']"
	"Attribute = seTexture Child Values = ['B',

Evaluating Trees

In [17]:
acc_entropy_train, err_entropy_train, pre_entropy_train, rec_entropy_train = Entropy_Tree.metrics(train)
acc_entropy_dev, err_entropy_dev, pre_entropy_dev, rec_entropy_dev = Entropy_Tree.metrics(dev)
acc_entropy_test, err_entropy_test, pre_entropy_test, rec_entropy_test = Entropy_Tree.metrics(test)

print("Metrics of Entropy tree")
print(f'Train dataset => Accuracy = {acc_entropy_train:.4f}, error = {err_entropy_train:.4f}, Precision = {pre_entropy_train:.4f}, Recall = {rec_entropy_train:.4f}')
print(f'Dev dataset => Accuracy = {acc_entropy_dev:.4f}, error = {err_entropy_dev:.4f}, Precision = {pre_entropy_dev:.4f}, Recall = {rec_entropy_dev:.4f}')
print(f'Test dataset => Accuracy = {acc_entropy_test:.4f}, error = {err_entropy_test:.4f}, Precision = {pre_entropy_test:.4f}, Recall = {rec_entropy_test:.4f}')

Metrics of Entropy tree
Train dataset => Accuracy = 1.0000, error = 0.0000, Precision = 1.0000, Recall = 1.0000
Dev dataset => Accuracy = 0.9912, error = 0.0088, Precision = 1.0000, Recall = 0.9773
Test dataset => Accuracy = 0.9561, error = 0.0439, Precision = 0.9762, Recall = 0.9111


In [18]:
acc_entropy_trainpr, err_entropy_trainpr, pre_entropy_trainpr, rec_entropy_trainpr = Entropy_Tree_pruned.metrics(train)
acc_entropy_devpr, err_entropy_devpr, pre_entropy_devpr, rec_entropy_devpr = Entropy_Tree_pruned.metrics(dev)
acc_entropy_testpr, err_entropy_testpr, pre_entropy_testpr, rec_entropy_testpr = Entropy_Tree_pruned.metrics(test)

print("Metrics of Pruned Entropy tree")
print(f'Train dataset => Accuracy = {acc_entropy_trainpr:.4f}, error = {err_entropy_trainpr:.4f}, Precision = {pre_entropy_trainpr:.4f}, Recall = {rec_entropy_trainpr:.4f}')
print(f'Dev dataset => Accuracy = {acc_entropy_devpr:.4f}, error = {err_entropy_devpr:.4f}, Precision = {pre_entropy_devpr:.4f}, Recall = {rec_entropy_devpr:.4f}')
print(f'Test dataset => Accuracy = {acc_entropy_testpr:.4f}, error = {err_entropy_testpr:.4f}, Precision = {pre_entropy_testpr:.4f}, Recall = {rec_entropy_testpr:.4f}')

Metrics of Pruned Entropy tree
Train dataset => Accuracy = 0.8886, error = 0.1114, Precision = 0.7638, Recall = 0.9238
Dev dataset => Accuracy = 0.9298, error = 0.0702, Precision = 0.8605, Recall = 0.9487
Test dataset => Accuracy = 0.8772, error = 0.1228, Precision = 0.7619, Recall = 0.8889


In [19]:
acc_gini_train, err_gini_train, pre_gini_train, rec_gini_train = Gini_Tree.metrics(train)
acc_gini_dev, err_gini_dev, pre_gini_dev, rec_gini_dev = Gini_Tree.metrics(dev)
acc_gini_test, err_gini_test, pre_gini_test, rec_gini_test = Gini_Tree.metrics(test)

print("Metrics of Gini tree")
print(f'Train dataset => Accuracy = {acc_gini_train:.4f}, error = {err_gini_train:.4f}, Precision = {pre_gini_train:.4f}, Recall = {rec_gini_train:.4f}')
print(f'Dev dataset => Accuracy = {acc_gini_dev:.4f}, error = {err_gini_dev:.4f}, Precision = {pre_gini_dev:.4f}, Recall = {rec_gini_dev:.4f}')
print(f'Test dataset => Accuracy = {acc_gini_test:.4f}, error = {err_gini_test:.4f}, Precision = {pre_gini_test:.4f}, Recall = {rec_gini_test:.4f}')

Metrics of Gini tree
Train dataset => Accuracy = 1.0000, error = 0.0000, Precision = 1.0000, Recall = 1.0000
Dev dataset => Accuracy = 0.9737, error = 0.0263, Precision = 0.9302, Recall = 1.0000
Test dataset => Accuracy = 0.9298, error = 0.0702, Precision = 0.8810, Recall = 0.9250


In [20]:
acc_gini_trainpr, err_gini_trainpr, pre_gini_trainpr, rec_gini_trainpr = Gini_Tree_pruned.metrics(train)
acc_gini_devpr, err_gini_devpr, pre_gini_devpr, rec_gini_devpr = Gini_Tree_pruned.metrics(dev)
acc_gini_testpr, err_gini_testpr, pre_gini_testpr, rec_gini_testpr = Gini_Tree_pruned.metrics(test)

print("Metrics of Pruned Gini tree")
print(f'Train dataset => Accuracy = {acc_gini_trainpr:.4f}, error = {err_gini_trainpr:.4f}, Precision = {pre_gini_trainpr:.4f}, Recall = {rec_gini_trainpr:.4f}')
print(f'Dev dataset => Accuracy = {acc_gini_devpr:.4f}, error = {err_gini_devpr:.4f}, Precision = {pre_gini_devpr:.4f}, Recall = {rec_gini_devpr:.4f}')
print(f'Test dataset => Accuracy = {acc_gini_testpr:.4f}, error = {err_gini_testpr:.4f}, Precision = {pre_gini_testpr:.4f}, Recall = {rec_gini_testpr:.4f}')

Metrics of Pruned Gini tree
Train dataset => Accuracy = 0.6490, error = 0.3510, Precision = 0.0630, Recall = 1.0000
Dev dataset => Accuracy = 0.6195, error = 0.3805, Precision = 0.0000, Recall = 0.0000
Test dataset => Accuracy = 0.6429, error = 0.3571, Precision = 0.0476, Recall = 1.0000


In [21]:
acc_entropy_train, err_entropy_train, pre_entropy_train, rec_entropy_train = Entropy_Tree.metrics(train, baseline=True)
acc_entropy_dev, err_entropy_dev, pre_entropy_dev, rec_entropy_dev = Entropy_Tree.metrics(dev, baseline=True)
acc_entropy_test, err_entropy_test, pre_entropy_test, rec_entropy_test = Entropy_Tree.metrics(test, baseline=True)

print("Metrics of Baseline tree")
print(f'Train dataset => Accuracy = {acc_entropy_train:.4f}, error = {err_entropy_train:.4f}, Precision = {pre_entropy_train:.4f}, Recall = {rec_entropy_train:.4f}')
print(f'Dev dataset => Accuracy = {acc_entropy_dev:.4f}, error = {err_entropy_dev:.4f}, Precision = {pre_entropy_dev:.4f}, Recall = {rec_entropy_dev:.4f}')
print(f'Test dataset => Accuracy = {acc_entropy_test:.4f}, error = {err_entropy_test:.4f}, Precision = {pre_entropy_test:.4f}, Recall = {rec_entropy_test:.4f}')

Metrics of Baseline tree
Train dataset => Accuracy = 0.6276, error = 0.3724, Precision = 0.0000, Recall = 0.0000
Dev dataset => Accuracy = 0.6228, error = 0.3772, Precision = 0.0000, Recall = 0.0000
Test dataset => Accuracy = 0.6316, error = 0.3684, Precision = 0.0000, Recall = 0.0000
