In [1]:
import numpy as np
import pandas as pd
eps = np.finfo(float).eps
from numpy import log2 as log

In [2]:
dataset = pd.read_csv('iris.csv')

In [3]:
dataset.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
dataset = dataset.drop(["Id"], axis = 1)

In [5]:
dataset.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
specie = {'Iris-setosa': 1,'Iris-versicolor': 2, 'Iris-virginica': 3} 
  
dataset.Species = [specie[item] for item in dataset.Species]

In [7]:
dataset.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1


In [8]:
column=['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm','Species']
df = pd.DataFrame(dataset,columns = column)

In [9]:
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1


In [1]:
class DecisionTree:
    
    def entropy(df):
        class_ = df.keys()[-1]
        entropy = 0
        values = df[class_].unique()
        for value in values:
            probability_value = df[class_].value_counts()[value]/len(df[class_])
            entropy += -probability_value*np.log2(probability_value)
        return entropy

#CLASS ATTRIBUTE ENTROPY
    def attribute_entropy(df,attribute):
        Class = df.keys()[-1]   
        target_variables = df[Class].unique()
        variables = df[attribute].unique()
        entropy2 = 0
        for variable in variables:
            entropy = 0
            for target_variable in target_variables:
                frequencey_of_class = len(df[attribute][df[attribute]==variable][df[Class] ==target_variable])
                node = len(df[attribute][df[attribute]==variable])
                fraction = frequencey_of_class/(node+eps)
                entropy += -fraction*log(fraction+eps)
            fraction2 = node/len(df)
            entropy2 += -fraction2*entropy
        return abs(entropy2)
    
    
#INFO GAIN
    def info_gain(df):
        Entropy_att = []
        IG = []
        for key in df.keys()[:-1]:
            i_gain = DecisionTree.entropy(df)-DecisionTree.attribute_entropy(df,key)
            print(key + ": ", i_gain)
            IG.append(i_gain)
        return df.keys()[:-1][np.argmax(IG)]


#GINI
    def gini(df):
        class_ = df.keys()[-1]
        gini = 0
        values = df[class_].unique()
        for value in values:
            probility = df[class_].value_counts()[value]/len(df[class_])
            squared = np.square(probility)
            gini = 1 - squared
        return gini


#GINI INDEX    
    def gini_index(df,attribute):
        Class = df.keys()[-1]   
        target_variables = df[Class].unique()
        variables = df[attribute].unique()
        gini2 = 0
        for variable in variables:
            gini = 0
            for target_variable in target_variables:
                frequencey_of_class = len(df[attribute][df[attribute]==variable][df[Class] ==target_variable])
                node = len(df[attribute][df[attribute]==variable])
                probability = frequencey_of_class/(node+eps)
                squared = np.square(probability)
                gini += 1-(squared+eps)
            probility2 = node/len(df)
            gini2 += -probility2*gini
        return abs(gini2)
      

    
#BUILDING TREE    
    def build_tree(df,tree=None): 
        node = DecisionTree.info_gain(df)
        attribute_value = np.unique(df[node])
    
        if tree is None:                    
            tree={}
            tree[node] = {}
    
        for value in attribute_value:
            subtable = DecisionTree.get_subtable(df,node,value)
            cl_value,counts = np.unique(subtable['Species'],return_counts=True)                        
        
            if len(counts)==1:#Checking purity of subset
                tree[node][value] = cl_value[0]                                                    
            else:        
                tree[node][value] = DecisionTree.build_tree(subtable)
                   
        return tree

    def get_subtable(df, node,value):
        return df[df[node] == value].reset_index(drop=True)

In [75]:
def main(): 
    #Entropy
    entropy = DecisionTree.entropy(df)
    print("ENTROPY")
    print("-----------")
    print("Entropy: ", entropy)
    
    print("***********************************************")
    #Attribute entropy
    columns = list(df) 
    print("ATRRIBUTE ENTROPY")
    print("-----------")
    for i in columns: 
        entropy = DecisionTree.attribute_entropy(df,i)
        print(i + ": ", entropy)
    
    print("***********************************************")
    #Info Gain
    print("INFO GAIN")
    print("-----------")
    DecisionTree.info_gain(df)
    
    print("***********************************************")
    #Gini
    print("GINI")
    print("-----------")
    gini = DecisionTree.gini(df)
    print("Gini: ", entropy)
    
    print("***********************************************")
    #Gini Index
    columns = list(df) 
    print("GINI INDEX")
    print("-----------")
    for i in columns: 
        gini_index = DecisionTree.gini_index(df,i)
        print(i + ": ", gini_index)
        
    print("***********************************************")
    #Building tree
    print("BUILDING TREE")
    print("-----------")
    tree = DecisionTree.build_tree(df)
    import pprint
    pprint.pprint(tree)

if __name__ == '__main__': 
    main() 

ENTROPY
-----------
Entropy:  1.584962500721156
***********************************************
ATRRIBUTE ENTROPY
-----------
SepalLengthCm:  0.7080248798300978
SepalWidthCm:  1.0740925365975489
PetalLengthCm:  0.1386459770753558
PetalWidthCm:  0.14906466204571406
Species:  3.203426503814917e-16
***********************************************
INFO GAIN
-----------
SepalLengthCm:  0.8769376208910583
SepalWidthCm:  0.5108699641236072
PetalLengthCm:  1.4463165236458002
PetalWidthCm:  1.435897838675442
***********************************************
GINI
-----------
Gini:  3.203426503814917e-16
***********************************************
GINI INDEX
-----------
SepalLengthCm:  2.319407407407406
SepalWidthCm:  2.4703011803011803
PetalLengthCm:  2.0626666666666664
PetalWidthCm:  2.0627777777777774
Species:  1.9999999999999991
***********************************************
BUILDING TREE
-----------
SepalLengthCm:  0.8769376208910583
SepalWidthCm:  0.5108699641236072
PetalLengthCm:  1.4463