Import Libraries

In [33]:
import numpy as np
import pandas as pd
import copy
import json

from sklearn.model_selection import train_test_split
from functools import reduce
from operator import getitem

Read Data

In [34]:
file_path = './HW1-Dataset.csv'

In [35]:
def read_file(pth):
    return pd.read_csv(pth)

In [36]:
# read data as pandas data frame
org_data = read_file(file_path)

Preproccess 

In [37]:
# check missing values
org_data.isnull().values.any()

False

In [38]:
# replace values
procceessed_data = org_data.replace({'Taste': { 0 : "Bad" , 1 : "Good"} , 'Odor': { 0 : "Bad" , 1 : "Good"} ,
                                    'Fat ': { 0 : "Low" , 1 : "High"} , 'Turbidity': { 0 : "Low" , 1 : "High"}})

In [39]:
#check data after preproccess
procceessed_data.head()

Unnamed: 0,pH,Temprature,Taste,Odor,Fat,Turbidity,Colour,Grade (target)
0,6.6,35,Good,Bad,High,Low,254,high
1,6.6,36,Bad,Good,Low,High,253,high
2,8.5,70,Good,Good,High,High,246,low
3,9.5,34,Good,Good,Low,High,255,low
4,6.6,37,Bad,Bad,Low,Low,255,medium


Split Train , Test and validation set

In [40]:
#split to train and test data

train , test = train_test_split(procceessed_data , test_size=0.1 , random_state=1 , shuffle= True )

In [41]:
# create train and validation to find best parameters

t_train , t_valid = train_test_split(train , test_size=0.1 , random_state=1 , shuffle= True )

t_train = t_train.to_numpy()
t_valid = t_valid.to_numpy()

In [42]:
# convert to numpy values
train = train.to_numpy()
test  = test.to_numpy()

# save columns name
columns = org_data.columns.to_list()

Implement Decision Tree

In [43]:
class DecisionTree:
    def __init__(self,train_data,test_data,features,criterion,pruning=None,max_depth=5,min_data = 10,post_peruning_perc=0.1):
        
        #check measure Parameter
        if(criterion != "entropy" and criterion != "gini"):
            raise "invalid parameter for criterion"
        
        #check pruning parameter
        if(pruning != "pre" and pruning != "post" and pruning != None):
            raise "invalid parameter for pruning"
        
        if( post_peruning_perc > 1 or post_peruning_perc < 0):
            raise "post_peruning_perc must be between 0 and 1"

        # class variables
        self.train_data = copy.deepcopy(train_data) # train set
        self.test_data  = copy.deepcopy(test_data) # test set
        self.max_depth  = max_depth # maximum depth in pre_pruning
        self.min_data   = min_data  # minimum branch data remaining in pre pruning
        self.features = copy.deepcopy(features) # data features ( columns )
        self.classes = list(np.unique(train_data[:,-1])) # classes ( targets )
        self.pruning = 1 if(pruning == "pre") else 2 if(pruning == "post") else None # set pruning mode
        self.tree = {} # final result ( tree )
        self.confusion_matrix = np.zeros((len(self.classes) , len(self.classes)+1)) # confusion matrix of test data ; +1 for not seen branch
        self.branching_criterion = self.Entropy if(criterion == "entropy") else self.Gini_index # set measure function
        self.post_pruning_number = post_peruning_perc * self.train_data.shape[0] # minimum number of data should be pruned in post pruning mode
        self.rules = [] # extracted rules

        # check type of features value
        temp = train_data[0]
        for i in range(len(temp)):
            if(type(temp[i]) != str and type(temp[i]) != int and type(temp[i]) != float):
                raise f"invalid type for {self.features[i]}"
            
            
    def Entropy(self,data):
        #calculate entropy of this branch ( data )

        _ , count = np.unique(data[:,-1],return_counts=True) # count number of data per each final class
        count = count / np.sum(count) # probability of each class in this branch 
        entropy = np.multiply(count,np.log2(count)) # entropy of each class in this branch
        total_entropy = -np.sum(entropy) # entropy of branch
        return total_entropy


    def Gini_index(self,data):
        #calculate gini index of this branch ( data )

        _ , count = np.unique(data[:,-1],return_counts=True)
        count = count / np.sum(count) # probability of each class in this branch
        p = np.power(count,2)
        gini = 1 - np.sum(p)
        return gini


    def train(self):
        columns = [i for i in range(self.train_data.shape[1]-1)] # array of remaining columns ; in the begining : all columns
        self.create_tree(self.train_data,[],self.pruning==1,columns) # start bbuild tree
        
        # for post pruning
        if(self.pruning == 2):
            self.post_pruning()
    

    def create_tree(self,data,rule,prepruning,columns):
        """
        function parameters :
        data : current branch data
        rule : current path in tree => [key1 , value1 , key2 , value2 , ...]
        prepruning : flag of pre_pruning
        columns : remaining feature for branching

        creat_tree works recursively.
        branching stop condition : prepruning conditions or pure branch

        """

        #   create a copy of remaning features
        column = copy.deepcopy(columns)
        # number of data in this branch
        data_size = data.shape[0]

        # branch is pure
        if(data_size <= 1 or len(np.unique(data[:,-1])) == 1):
            
            # pure branch
            res = {"res" : { "type" : "resault"  , "result type" : "pure" , "class" : data[0,-1] , "count" : int(data_size)} }

            # add to tree
            self.add_to_tree(rule,res)
            
            return True

        if(prepruning):
            # check pre-pruning conditions : maximum tree depth or minimum data for classify
            if(len(columns) < data.shape[1] - 1 - self.max_depth or data_size < self.min_data):
                
                # additional information about this leaf
                res = { "type" : "resault"  , "result type" : "percentage"}
                tmp = {}
                
                # find percentage for each class in this branch
                vals , count = np.unique(data[:,-1] , return_counts=True)

                for i in range(len(vals)):
                    tmp[vals[i]] = {
                        "class" : vals[i] ,
                        "count" : count[i] ,
                        "percentage" : round(count[i]/np.sum(count)*100)
                    }
                res["res"] = tmp

                # add to tree
                self.add_to_tree(rule,res)   
                return True
            
        if(len(column) == 0):
            # not any more feature

            # calc result for each classes

            # additional information about this leaf
            res = { "type" : "resault"  , "result type" : "percentage"}
            tmp = {}
            
            # find percentage for each class in this branch
            vals , count = np.unique(data[:,-1] , return_counts=True)

            for i in range(len(vals)):
                tmp[vals[i]] = {
                    "class" : vals[i] ,
                    "count" : count[i] ,
                    "percentage" : round(count[i]/np.sum(count)*100)
                }
            res["res"] = tmp

            # add to tree
            self.add_to_tree(rule,res)
            return True
        
        # continue branching

        # local variable; find best feature to split data

        best_measure = 10000 # best measure value 
        best_index   = -1 # best feature index
        branch_value_index = -1 # if best feature numeric  = > should save value of split point

        # check diffrente columns for best result
        for i in column:
            # feature type
            column_type = type(data[0,i])
            # sort data according to this feature
            sorted_data = data[data[:,i].argsort()]

            # for nominal data
            if(column_type == str):
                # split data to sub branchs
                splited_data = np.split(sorted_data[:,:], np.unique(sorted_data[:,i], return_index=True)[1][0:])[1:]

                branching_measure = 0 # measure value calculated for split according to this feature

                # calc measure for each sub branch ; then calculate weighted value of measure
                for branch in splited_data:
                    if(len(branch) > 0):
                        try:
                            branching_measure += (branch.shape[0] / data_size) * self.branching_criterion(branch)
                        except:
                            pass
                # is better feature for branching ?
                if(branching_measure < best_measure):
                    best_measure = branching_measure
                    best_index   = i
            
            # for numeric data
            else:
                # calculate measure for each possible splited branches
                for j in range(1,data_size-1):

                    # split data to 2 branch
                    branch1 = sorted_data[:j,:]
                    branch2 = sorted_data[j:,:]

                    # calculate branching measur
                    branching_measure = ((j/data_size) * self.branching_criterion(branch1)) + ((len(branch2)/data_size) * self.branching_criterion(branch2))

                    # is better feature for branching ?
                    if(branching_measure < best_measure):
                        best_measure = branching_measure
                        best_index   = i
                        branch_value_index = j

        # if there is a feature that split data
        if(best_index != -1):
            
            # type of feature values
            column_type = type(data[0,best_index])
            # sort data according to this feature
            sorted_data = data[data[:,best_index].argsort()]

            # for naminal features
            if(column_type == str):
                # split data to branches according to best feature
                splited_data = np.split(sorted_data[:,:], np.unique(sorted_data[:,best_index], return_index=True)[1][0:])[1:]

                # for nominal features we should remove this feature from search domain
                column.remove(best_index)

                # calculate tree for each branches
                for branch in splited_data:
                    # check branch hasdata
                    if(len(branch) > 0):
                        val = branch[0,best_index]
                        
                        # add selected feature to rules
                        next_rule = copy.deepcopy(rule)
                        next_rule.append(self.features[best_index])
                        next_rule.append(val)

                        self.create_tree(branch,next_rule,prepruning,column)
                return True

            # for numeric features
            else:
                # split data to 2 branches according to best value of this feature
                branch1 = sorted_data[:branch_value_index,:]
                branch2 = sorted_data[branch_value_index:,:]

                # calculate split value
                val = (sorted_data[branch_value_index,best_index] + sorted_data[branch_value_index-1,best_index] ) / 2

                # check branch has data

                if(len(branch1) > 0):

                    # add this feature to rules
                    next_rule = copy.deepcopy(rule)
                    next_rule.append(self.features[best_index])
                    next_rule.append(f"{val} >")

                    self.create_tree(branch1,next_rule,prepruning,column)

                if(len(branch2)>0):

                    # add this feature to rules
                    next_rule = copy.deepcopy(rule)
                    next_rule.append(self.features[best_index])
                    next_rule.append(f"{val} <")

                    
                    self.create_tree(branch2,next_rule,prepruning,column)
                return True
            
        # if there is not any feature that split data 
        else:
            # can't find branching feature
            # add current state as leaf
            # calc result for each classes

            # additional information about this leaf
            res = {     "type" : "resault"  , "result type" : "percentage"}
            tmp = {}
            
            # find percentage for each class in this branch
            vals , count = np.unique(data[:,-1] , return_counts=True)

            for i in range(len(vals)):
                tmp[vals[i]] = {
                    "class" : vals[i] ,
                    "count" : int(count[i]) ,
                    "percentage" : round(count[i]/np.sum(count)*100)
                }
            res["res"] = tmp

            # add to tree
            self.add_to_tree(rule,res)
            return True
    
    # add leafs to tree
    def add_to_tree(self,rule,res):
        # recursively create dictionary until tree contain remaining path
        pth = res
        while(True):
            # try add leaf to tree
            try:
                reduce(getitem,rule[:-1],self.tree)[rule[-1]] = pth
                break
            # pathnot built ; 1 step goes back
            except:
                pth = {rule[-1] : pth}
                rule.pop()

    # post pruning of tree
    def post_pruning(self):

        # number of pruned data
        n = 0

        # make rules empty
        self.rules = []

        # extract rules from tree
        self.extract_rules(copy.deepcopy(self.tree),[])
        rules = self.rules
        # sort rules by number of rule data
        num = [np.sum(x[-1]) if (type(x[-1]) == list) else x[-1] for x in rules]
        rules = [x for _,x in sorted(zip(num,rules))]

        # prun to reach correct number of pruned data
        while (n < self.post_pruning_number):

            # number of data in each class after pruned
            count = [0 for i in range(len(self.classes))]

            # get rule with minimum number of data
            prun = rules[0]
            # extract rule path
            rl = prun[:-3]

            # find other rules in this path
            prun_arr = []
            for r in rules:
                if(r[:len(rl)] == rl):
                    prun_arr.append(r)
                    
            # remove rules of this path and count number of data in each class
            for r in prun_arr:
                rules.remove(r)
                if(type(r[-2]) != list):
                    count[self.classes.index(r[-2])] += int(r[-1])
                else:
                    for i in range(len(r[-1])):
                        count[i] += int(r[-1][i])
            
            # add pruned number to n
            n += np.sum(count)

            # build new rule
            rl.append("res")
            rl.append(self.classes)
            rl.append(count)
            rules.append(rl)

            # sort rules for next prun step
            num = [np.sum(x[-1]) if (type(x[-1]) == list) else x[-1] for x in rules]
            rules = [x for _,x in sorted(zip(num,rules))]

        # clear tree 
        self.tree = {}
        for r in rules:
            # path of rule
            pth = r[:-3]

            # pure rule
            if(type(r[-2]) != list):
                res = {"res" : { "type" : "resault"  , "result type" : "pure" , "class" : r[-2] , "count" : int(r[-1])} }
            # pure rule           
            elif(np.sum(r[-1]) == np.max(r[-1])):
                idx = np.argmax(r[-1])
                res = {"res" : { "type" : "resault"  , "result type" : "pure" , "class" : r[-2][idx] , "count" : int(np.max(r[-1]))} }
            # percentage rule
            else:
                res = {     "type" : "resault"  , "result type" : "percentage"}
                tmp = {}
                for i in range(len(r[-2])):
                    c = r[-2][i]
                    count = r[-1][i]
                    tmp[c] = {
                        "class" : c,
                        "count" : int(count),
                        "percentage" : round(count / np.sum(r[-1])),
                    }
                res["res"] = tmp
            self.add_to_tree(pth,res)
    
    # extract rules from tree
    def extract_rules(self,tree,rule):
        key = list(tree.keys())[0]
        # leaf 
        if(key == "res" or key == "type"):
            # pure leaf
            if(key == "res"):

                res = tree["res"]
                rule.append(key)
                rule.append(res["class"])
                rule.append(res["count"])
                self.rules.append(rule)

            # percentage rule
            else:
                res = tree["res"]
                keys = list(res.keys())
                for key in keys:
                    final_rule = copy.deepcopy(rule)
                    final_rule.append("res")
                    final_rule.append(key)
                    final_rule.append(res[key]["count"])
                    self.rules.append(final_rule)
                    
        # middle node        
        else :
            tree = tree[key]
            vals = list(tree.keys())
            for val in vals:
                t = tree[val]
                next_rule = copy.deepcopy(rule)
                next_rule.append(key)
                next_rule.append(val)
                self.extract_rules(t,next_rule)


    # calculate evaluation value for this tree and print all of them
    def evaluate_results(self):
        # true positive
        TP = 0
        # all data
        all_data = np.sum(self.confusion_matrix)

        # add correct estimated value to TP
        for i in range(len(self.classes)):
            TP += self.confusion_matrix[i,i]

        # calculate precision , recall and f1 score of each classes
        precisions = []
        recalls    = []
        f1 = []
        for i in range(len(self.classes)):
            precisions.append(self.confusion_matrix[i][i] / np.sum(self.confusion_matrix[i,:]))
            recalls.append(self.confusion_matrix[i][i] / np.sum(self.confusion_matrix[:,i]))
            f1.append((2*precisions[-1]*recalls[-1])/(precisions[-1]+recalls[-1]))
        # micro precision , recall , f1 , accuracy
        micro_measure = TP / all_data
        # macro precision
        macro_precision = np.sum(precisions) / len(precisions)
        # macro recall
        macro_recall = np.sum(recalls) / len(recalls)
        # macro f1
        macro_f1 = (2*macro_precision*macro_recall)/(macro_precision+macro_recall)
        # macro accuracy
        macro_accuracy = TP / all_data

        # print confiusion matrix
        print("\t",end="")
        for i in range(len(self.classes)):
            print(f"{self.classes[i]}\t",end="")
        print("not seen")
        for i in range(len(self.classes)):
            print(f"{self.classes[i]}\t",end="")
            for j in range(len(self.classes)+1):
                print(f"{round(self.confusion_matrix[i][j],2)}\t",end="")
            print("")
        print("____________________________________________")
            

        # print each class precision , recall and f1_score
        for i in range(len(self.classes)):
            print(f"{self.classes[i]}\t( precision = {precisions[i]} , recall = {recalls[i]} , f1_score = {f1[i]} )" , )

        # print micro scope evaluation measures
        print(f"\nmicro precision = {micro_measure}")
        print(f"micro recall    = {micro_measure}")
        print(f"micro f1_score  = {micro_measure}")
        print(f"micro accuracy  = {micro_measure}\n")

        # print macro scope evaluation measure
        print(f"macro precision = {macro_precision}")
        print(f"macro recall    = {macro_recall}")
        print(f"macro f1_score  = {macro_f1}")
        print(f"macro accuracy  = {macro_accuracy}")
        
    # test data with calculated tree
    def test(self):
        # traverse tree per test data
        for data in self.test_data:

            # copy tree in local variable
            tree = self.tree
            # target class of this test data
            d_class = data[-1]
            # index of target class 
            c_idx = self.classes.index(d_class)

            # traverse tree
            # flag : check traverse is over or not
            flag = 1
            while(flag):
                # res => leaf 
                # if tree contain res key => we have a leaf
                if(list(tree.keys()).count("res") != 0):
                    # get leaf values
                    tree = tree["res"]
                    
                    # for impure leafs
                    if(list(tree.keys()).count("class") == 0):
                        # classes in leaf
                        classes = list(tree.keys())
                        # data belong to these classes sharedly ; add percentage of each class to confiusion matrix
                        for c in classes:
                            perc = tree[c]["percentage"] / 100 
                            idx = self.classes.index(c)
                            self.confusion_matrix[idx][c_idx] += perc
                    # for pure leafs
                    else :
                        c = tree["class"]
                        idx = self.classes.index(c)
                        self.confusion_matrix[idx][c_idx] += 1
                    
                    # traverse is over
                    flag = 0
                
                # need more traverse
                else:
                    # print(tree)
                    # get branching feature
                    feature = list(tree.keys())[0]
                    # traverse tree
                    tree = tree[feature]
                    # index of branching feature
                    idx = self.features.index(feature)
                    # value of current test data in branching feature
                    val = data[idx]
                    # type of feature value
                    data_type = type(val)
                    # for nominal features
                    if(data_type == str):
                        try :
                            # in train proccess this path is seen
                            tree = tree[val]
                        except:             
                            # in train procces this path is unssen           
                            self.confusion_matrix[-1][c_idx]+= 1
                            break

                    # for numeric features
                    else:

                        keys = list(tree.keys())
                        # value of branching
                        split_val = float(keys[0].split(' ')[0])
                        # check test data value is bigger than or less/equal to split val
                        char = "<" if(val > split_val) else ">"
                        for key in keys:
                            # find branching path
                            if(key.find(char) != -1):
                                tree = tree[key]
                                break
    
    # save tree as json file
    def save_as_json(self,pth):
        with open(pth, 'w') as fp:
            json.dump(self.tree, fp , default=str)

In [44]:
dt1 = DecisionTree(t_train,t_valid,columns,"gini")
dt1.train()
dt1.test()
dt1.save_as_json("gini_complete.json")
dt1.evaluate_results()

	high	low	medium	not seen
high	15.0	0.0	0.0	0.0	
low	0.0	47.0	0.0	0.0	
medium	4.0	0.0	30.0	0.0	
____________________________________________
high	( precision = 1.0 , recall = 0.7894736842105263 , f1_score = 0.8823529411764706 )
low	( precision = 1.0 , recall = 1.0 , f1_score = 1.0 )
medium	( precision = 0.8823529411764706 , recall = 1.0 , f1_score = 0.9375 )

micro precision = 0.9583333333333334
micro recall    = 0.9583333333333334
micro f1_score  = 0.9583333333333334
micro accuracy  = 0.9583333333333334

macro precision = 0.9607843137254902
macro recall    = 0.9298245614035089
macro f1_score  = 0.9450509461426493
macro accuracy  = 0.9583333333333334


In [45]:
dt2 = DecisionTree(t_train,t_valid,columns,"entropy")
dt2.train()
dt2.test()
dt2.save_as_json("entropy_complete.json")
dt2.evaluate_results()

	high	low	medium	not seen
high	19.0	0.0	2.0	0.0	
low	0.0	47.0	0.0	0.0	
medium	0.0	0.0	28.0	0.0	
____________________________________________
high	( precision = 0.9047619047619048 , recall = 1.0 , f1_score = 0.9500000000000001 )
low	( precision = 1.0 , recall = 1.0 , f1_score = 1.0 )
medium	( precision = 1.0 , recall = 0.9333333333333333 , f1_score = 0.9655172413793104 )

micro precision = 0.9791666666666666
micro recall    = 0.9791666666666666
micro f1_score  = 0.9791666666666666
micro accuracy  = 0.9791666666666666

macro precision = 0.9682539682539683
macro recall    = 0.9777777777777779
macro f1_score  = 0.9729925684248686
macro accuracy  = 0.9791666666666666


In [46]:
dt3 = DecisionTree(t_train,t_valid,columns,"gini",pruning="pre",max_depth=4 , min_data=60)
dt3.train()
dt3.test()
dt3.save_as_json("gini_prepruned_4_60.json")
dt3.evaluate_results()

	high	low	medium	not seen
high	14.98	0.0	0.36	0.0	
low	1.42	47.0	0.41	0.0	
medium	2.6	0.0	29.23	0.0	
____________________________________________
high	( precision = 0.9765319426336376 , recall = 0.7884210526315788 , f1_score = 0.8724519510774605 )
low	( precision = 0.962523039115298 , recall = 1.0 , f1_score = 0.9809036836063862 )
medium	( precision = 0.9183160540370721 , recall = 0.9743333333333334 , f1_score = 0.945495714054666 )

micro precision = 0.9501041666666667
micro recall    = 0.9501041666666667
micro f1_score  = 0.9501041666666667
micro accuracy  = 0.9501041666666667

macro precision = 0.9524570119286692
macro recall    = 0.9209181286549707
macro f1_score  = 0.9364220865836711
macro accuracy  = 0.9501041666666667


In [47]:
dt4 = DecisionTree(t_train,t_valid,columns,"entropy",pruning="pre",max_depth=4 , min_data=60)
dt4.train()
dt4.test()
dt4.save_as_json("entropy_prepruned_4_60.json")
dt4.evaluate_results()

	high	low	medium	not seen
high	17.37	0.0	2.61	0.0	
low	0.51	47.0	0.18	0.0	
medium	1.12	0.0	27.21	0.0	
____________________________________________
high	( precision = 0.8693693693693694 , recall = 0.9142105263157894 , f1_score = 0.8912262698819907 )
low	( precision = 0.9855315579786119 , recall = 1.0 , f1_score = 0.9927130636814869 )
medium	( precision = 0.9604659371690787 , recall = 0.907 , f1_score = 0.9329675981484656 )

micro precision = 0.9539583333333335
micro recall    = 0.9539583333333335
micro f1_score  = 0.9539583333333335
micro accuracy  = 0.9539583333333335

macro precision = 0.9384556215056866
macro recall    = 0.9404035087719298
macro f1_score  = 0.9394285554130813
macro accuracy  = 0.9539583333333335


In [48]:
dt5 = DecisionTree(t_train,t_valid,columns,"gini",pruning="pre",max_depth=6 , min_data=40)
dt5.train()
dt5.test()
dt5.save_as_json("gini_prepruned_6_40.json")
dt5.evaluate_results()

	high	low	medium	not seen
high	15.22	0.0	0.36	0.0	
low	1.42	47.0	0.41	0.0	
medium	2.36	0.0	29.23	0.0	
____________________________________________
high	( precision = 0.9768934531450578 , recall = 0.8010526315789473 , f1_score = 0.8802776171197223 )
low	( precision = 0.962523039115298 , recall = 1.0 , f1_score = 0.9809036836063862 )
medium	( precision = 0.9252928141817031 , recall = 0.9743333333333334 , f1_score = 0.9491800616983277 )

micro precision = 0.9526041666666667
micro recall    = 0.9526041666666667
micro f1_score  = 0.9526041666666667
micro accuracy  = 0.9526041666666667

macro precision = 0.954903102147353
macro recall    = 0.9251286549707602
macro f1_score  = 0.939780106556452
macro accuracy  = 0.9526041666666667


In [49]:
dt6 = DecisionTree(t_train,t_valid,columns,"entropy",pruning="pre",max_depth=6 , min_data=40)
dt6.train()
dt6.test()
dt6.save_as_json("entropy_prepruned_6_40.json")
dt6.evaluate_results()

	high	low	medium	not seen
high	17.87	0.0	3.63	0.0	
low	0.51	47.0	0.18	0.0	
medium	0.62	0.0	26.19	0.0	
____________________________________________
high	( precision = 0.8311627906976744 , recall = 0.9405263157894735 , f1_score = 0.8824691358024692 )
low	( precision = 0.9855315579786119 , recall = 1.0 , f1_score = 0.9927130636814869 )
medium	( precision = 0.9768743006340918 , recall = 0.873 , f1_score = 0.9220207709910229 )

micro precision = 0.9485416666666667
micro recall    = 0.9485416666666667
micro f1_score  = 0.9485416666666667
micro accuracy  = 0.9485416666666667

macro precision = 0.931189549770126
macro recall    = 0.9378421052631577
macro f1_score  = 0.9345039880984949
macro accuracy  = 0.9485416666666667


In [50]:
dt7 = DecisionTree(t_train,t_valid,columns,"gini",pruning="pre",max_depth=8 , min_data=20)
dt7.train()
dt7.test()
dt7.save_as_json("gini_prepruned_8_20.json")
dt7.evaluate_results()

	high	low	medium	not seen
high	14.5	0.0	0.0	0.0	
low	0.82	47.0	0.08	0.0	
medium	3.68	0.0	29.92	0.0	
____________________________________________
high	( precision = 1.0 , recall = 0.7631578947368421 , f1_score = 0.8656716417910448 )
low	( precision = 0.9812108559498957 , recall = 1.0 , f1_score = 0.9905163329820864 )
medium	( precision = 0.8904761904761905 , recall = 0.9973333333333334 , f1_score = 0.9408805031446541 )

micro precision = 0.9522916666666666
micro recall    = 0.9522916666666666
micro f1_score  = 0.9522916666666666
micro accuracy  = 0.9522916666666666

macro precision = 0.9572290154753621
macro recall    = 0.9201637426900584
macro f1_score  = 0.938330490154921
macro accuracy  = 0.9522916666666666


In [51]:
dt8 = DecisionTree(t_train,t_valid,columns,"entropy",pruning="pre",max_depth=8 , min_data=20)
dt8.train()
dt8.test()
dt8.save_as_json("entropy_prepruned_8_20.json")
dt8.evaluate_results()

	high	low	medium	not seen
high	18.49	0.0	2.0	0.0	
low	0.51	47.0	0.0	0.0	
medium	0.0	0.0	28.0	0.0	
____________________________________________
high	( precision = 0.9023914104441191 , recall = 0.973157894736842 , f1_score = 0.9364396049632818 )
low	( precision = 0.9892654178067776 , recall = 1.0 , f1_score = 0.9946037456353825 )
medium	( precision = 1.0 , recall = 0.9333333333333333 , f1_score = 0.9655172413793104 )

micro precision = 0.9738541666666666
micro recall    = 0.9738541666666666
micro f1_score  = 0.9738541666666666
micro accuracy  = 0.9738541666666666

macro precision = 0.9638856094169655
macro recall    = 0.968830409356725
macro f1_score  = 0.9663516838205937
macro accuracy  = 0.9738541666666666


In [52]:
dt9 = DecisionTree(t_train,t_valid,columns,"gini",pruning="post",post_peruning_perc=0.05)
dt9.train()
dt9.save_as_json("gini_post_5.json")
dt9.test()
dt9.evaluate_results()

	high	low	medium	not seen
high	15.0	0.0	0.0	0.0	
low	0.0	47.0	0.0	0.0	
medium	4.0	0.0	30.0	0.0	
____________________________________________
high	( precision = 1.0 , recall = 0.7894736842105263 , f1_score = 0.8823529411764706 )
low	( precision = 1.0 , recall = 1.0 , f1_score = 1.0 )
medium	( precision = 0.8823529411764706 , recall = 1.0 , f1_score = 0.9375 )

micro precision = 0.9583333333333334
micro recall    = 0.9583333333333334
micro f1_score  = 0.9583333333333334
micro accuracy  = 0.9583333333333334

macro precision = 0.9607843137254902
macro recall    = 0.9298245614035089
macro f1_score  = 0.9450509461426493
macro accuracy  = 0.9583333333333334


In [53]:
dt10 = DecisionTree(t_train,t_valid,columns,"entropy",pruning="post",post_peruning_perc=0.05)
dt10.train()
dt10.test()
dt10.save_as_json("entropy_post_5.json")
dt10.evaluate_results()

	high	low	medium	not seen
high	19.0	0.0	2.0	0.0	
low	0.0	47.0	0.0	0.0	
medium	0.0	0.0	28.0	0.0	
____________________________________________
high	( precision = 0.9047619047619048 , recall = 1.0 , f1_score = 0.9500000000000001 )
low	( precision = 1.0 , recall = 1.0 , f1_score = 1.0 )
medium	( precision = 1.0 , recall = 0.9333333333333333 , f1_score = 0.9655172413793104 )

micro precision = 0.9791666666666666
micro recall    = 0.9791666666666666
micro f1_score  = 0.9791666666666666
micro accuracy  = 0.9791666666666666

macro precision = 0.9682539682539683
macro recall    = 0.9777777777777779
macro f1_score  = 0.9729925684248686
macro accuracy  = 0.9791666666666666


In [54]:
dt11 = DecisionTree(t_train,t_valid,columns,"gini",pruning="post",post_peruning_perc=0.1)
dt11.train()
dt11.test()
dt11.save_as_json("gini_post_10.json")
dt11.evaluate_results()

	high	low	medium	not seen
high	15.0	0.0	0.0	0.0	
low	0.0	47.0	0.0	0.0	
medium	4.0	0.0	30.0	0.0	
____________________________________________
high	( precision = 1.0 , recall = 0.7894736842105263 , f1_score = 0.8823529411764706 )
low	( precision = 1.0 , recall = 1.0 , f1_score = 1.0 )
medium	( precision = 0.8823529411764706 , recall = 1.0 , f1_score = 0.9375 )

micro precision = 0.9583333333333334
micro recall    = 0.9583333333333334
micro f1_score  = 0.9583333333333334
micro accuracy  = 0.9583333333333334

macro precision = 0.9607843137254902
macro recall    = 0.9298245614035089
macro f1_score  = 0.9450509461426493
macro accuracy  = 0.9583333333333334


In [55]:
dt12 = DecisionTree(t_train,t_valid,columns,"entropy",pruning="post",post_peruning_perc=0.1)
dt12.train()
dt12.test()
dt12.save_as_json("entropy_post_10.json")
dt12.evaluate_results()

	high	low	medium	not seen
high	19.0	0.0	2.0	0.0	
low	0.0	47.0	0.0	0.0	
medium	0.0	0.0	28.0	0.0	
____________________________________________
high	( precision = 0.9047619047619048 , recall = 1.0 , f1_score = 0.9500000000000001 )
low	( precision = 1.0 , recall = 1.0 , f1_score = 1.0 )
medium	( precision = 1.0 , recall = 0.9333333333333333 , f1_score = 0.9655172413793104 )

micro precision = 0.9791666666666666
micro recall    = 0.9791666666666666
micro f1_score  = 0.9791666666666666
micro accuracy  = 0.9791666666666666

macro precision = 0.9682539682539683
macro recall    = 0.9777777777777779
macro f1_score  = 0.9729925684248686
macro accuracy  = 0.9791666666666666


In [56]:
dt13 = DecisionTree(t_train,t_valid,columns,"gini",pruning="post",post_peruning_perc=0.2)
dt13.train()
dt13.test()
dt13.save_as_json("gini_post_20.json")
dt13.evaluate_results()

	high	low	medium	not seen
high	15.0	0.0	0.0	0.0	
low	0.0	47.0	0.0	0.0	
medium	4.0	0.0	30.0	0.0	
____________________________________________
high	( precision = 1.0 , recall = 0.7894736842105263 , f1_score = 0.8823529411764706 )
low	( precision = 1.0 , recall = 1.0 , f1_score = 1.0 )
medium	( precision = 0.8823529411764706 , recall = 1.0 , f1_score = 0.9375 )

micro precision = 0.9583333333333334
micro recall    = 0.9583333333333334
micro f1_score  = 0.9583333333333334
micro accuracy  = 0.9583333333333334

macro precision = 0.9607843137254902
macro recall    = 0.9298245614035089
macro f1_score  = 0.9450509461426493
macro accuracy  = 0.9583333333333334


In [57]:
dt14 = DecisionTree(t_train,t_valid,columns,"entropy",pruning="post",post_peruning_perc=0.2)
dt14.train()
dt14.test()
dt14.save_as_json("entropy_post_20.json")
dt14.evaluate_results()

	high	low	medium	not seen
high	19.0	0.0	2.0	0.0	
low	0.0	47.0	0.0	0.0	
medium	0.0	0.0	28.0	0.0	
____________________________________________
high	( precision = 0.9047619047619048 , recall = 1.0 , f1_score = 0.9500000000000001 )
low	( precision = 1.0 , recall = 1.0 , f1_score = 1.0 )
medium	( precision = 1.0 , recall = 0.9333333333333333 , f1_score = 0.9655172413793104 )

micro precision = 0.9791666666666666
micro recall    = 0.9791666666666666
micro f1_score  = 0.9791666666666666
micro accuracy  = 0.9791666666666666

macro precision = 0.9682539682539683
macro recall    = 0.9777777777777779
macro f1_score  = 0.9729925684248686
macro accuracy  = 0.9791666666666666


In [62]:
dt15 = DecisionTree(train,test,columns,"entropy")
dt15.train()
dt15.test()
dt15.save_as_json("final.json")
dt15.evaluate_results()

	high	low	medium	not seen
high	26.0	0.0	0.0	0.0	
low	0.0	43.0	0.0	0.0	
medium	0.0	0.0	37.0	0.0	
____________________________________________
high	( precision = 1.0 , recall = 1.0 , f1_score = 1.0 )
low	( precision = 1.0 , recall = 1.0 , f1_score = 1.0 )
medium	( precision = 1.0 , recall = 1.0 , f1_score = 1.0 )

micro precision = 1.0
micro recall    = 1.0
micro f1_score  = 1.0
micro accuracy  = 1.0

macro precision = 1.0
macro recall    = 1.0
macro f1_score  = 1.0
macro accuracy  = 1.0


In [70]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [63]:
d = org_data.replace({"Grade (target)" : {"high" : 2 , "medium" : 1 , "low" : 0}})

In [64]:
codded_data = pd.get_dummies(d).to_numpy()

In [66]:
codded_data[0]

array([  6.6,  35. ,   1. ,   0. ,   1. ,   0. , 254. ,   2. ])

In [74]:
X_train, X_test, y_train, y_test = train_test_split(codded_data[:,:-1],codded_data[:,-1] ,train_size= 0.9 , random_state=1)

In [75]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train,)

In [76]:
pred = clf.predict(X_test)

In [77]:
acc = accuracy_score(y_test, pred)

In [78]:
acc

1.0