In [148]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
from anytree import Node,RenderTree
from anytree.exporter import DotExporter

In [149]:
def entropy(count):
    """Return the Shannon entropy, in bits, of a vector of class counts.

    Parameters
    ----------
    count : array-like of non-negative numbers
        Number of samples observed for each class.

    Returns
    -------
    numpy.float64
        ``sum(p * -log2(p))`` with ``p = count / sum(count)``. Classes whose
        count is zero are ignored (the usual ``0 * log2(0) = 0`` convention);
        an empty or all-zero vector yields ``0.0``.
    """
    counts = np.asarray(count, dtype=float)
    # A zero count would contribute 0 * -inf = nan and poison the whole sum.
    counts = counts[counts > 0]
    prob = counts / counts.sum()
    return (prob * -np.log2(prob)).sum()

In [140]:
"""This function is used to get the mid value of each feature data on the basis of mid value, split the data into two array
and calculate the information gain and gain ratio.
"""
def split_data(x,train_data,f,infn):
    """Score every candidate threshold of one feature by its gain ratio.

    The candidate thresholds are the midpoints between consecutive entries of
    ``x``. For each midpoint the rows of ``train_data`` are divided into those
    whose column ``f`` is below the midpoint and those at or above it; the
    information gain relative to ``infn`` and the gain ratio are computed from
    the class counts on each side.

    Parameters
    ----------
    x : sequence of numbers
        Values of column ``f`` used to build the midpoints (``gain_func``
        passes the sorted distinct values of the column).
    train_data : 2-D array-like
        One row per sample; the last column holds the class label.
    f : int
        Index of the column being evaluated.
    infn : float
        Entropy of the node before splitting.

    Returns
    -------
    list
        One ``[gain_ratio, f, threshold]`` entry per usable threshold, in the
        order of ``x``. Empty when ``x`` has fewer than two entries.
    """
    data = np.asarray(train_data)
    values = data[:, f]
    labels = data[:, -1]
    D = len(data)
    list_inf = []
    for i in range(len(x) - 1):
        x1 = (x[i] + x[i + 1]) / 2
        _, y1_count = np.unique(labels[values < x1], return_counts=True)
        _, y2_count = np.unique(labels[values >= x1], return_counts=True)
        tot1 = y1_count.sum()
        tot2 = y2_count.sum()
        if tot1 == 0 or tot2 == 0:
            # The midpoint did not separate anything (e.g. it rounded onto one
            # of its neighbours); split info would be undefined, so skip it.
            continue
        w1 = tot1 / D
        w2 = tot2 / D
        infF = w1 * entropy(y1_count) + w2 * entropy(y2_count)   # weighted child entropy
        inform_gain = infn - infF
        split_info = -(w1 * np.log2(w1) + w2 * np.log2(w2))
        gain = inform_gain / split_info                          # gain ratio
        list_inf.append([gain, f, x1])
    return list_inf
  

In [150]:
"""This function is used to calculate the parent node entropy and select the maximum gain ratio of each feature"""
def gain_func(train_data,f):
    """Return the best split of feature ``f`` for the given node.

    Parameters
    ----------
    train_data : 2-D numpy array
        Samples of the node; the last column holds the class label.
    f : int
        Index of the feature column to evaluate.

    Returns
    -------
    list
        ``[gain_ratio, f, threshold]`` of the highest-scoring candidate
        returned by ``split_data`` (ties are resolved by ordinary list
        comparison), or ``[]`` when ``split_data`` produced no candidate.
    """
    _, y_count = np.unique(train_data[:, -1], return_counts=True)
    infn = entropy(y_count)                       # entropy of the parent node
    x1 = np.unique(train_data[:, f])              # sorted distinct values of the feature
    split = split_data(x1, train_data, f, infn)
    if not split:
        # No candidate threshold; say so explicitly instead of relying on a
        # bare ``except`` that would also hide unrelated errors.
        return []
    return max(split)
          

In [151]:
def split_node(train_data,f,mid):
    """Partition the rows of a node around a threshold on one feature.

    Parameters
    ----------
    train_data : 2-D array-like
        Samples of the parent node, one row per sample.
    f : int
        Index of the feature column to compare.
    mid : float
        Threshold value.

    Returns
    -------
    tuple of numpy.ndarray
        ``(below, above)``: the rows with ``row[f] < mid`` and the rows with
        ``row[f] >= mid``. Both keep the column count of ``train_data`` even
        when a side is empty, so callers can still index ``[:, -1]``.
    """
    data = np.asarray(train_data)
    column = data[:, f]
    return data[column < mid], data[column >= mid]

In [153]:
def decision_tree(train_data,feature,counter=-1,parent=None,export_path="iris_tree.png"):
    """Recursively grow a gain-ratio decision tree, printing and drawing it.

    For the given node the class counts and entropy are computed. If the node
    holds a single class, or no feature offers a usable threshold, it becomes
    a leaf. Otherwise ``gain_func`` is evaluated for every feature, the split
    with the highest gain ratio is chosen, the rows are divided with
    ``split_node`` and the function recurses into the left (below threshold)
    and right (at or above threshold) children. Every node is added to an
    ``anytree`` tree and its statistics are printed.

    Parameters
    ----------
    train_data : 2-D numpy array
        Samples of the node; the last column holds the class label.
    feature : sequence of str
        Feature names, indexed by column position.
    counter : int, optional
        Depth of the parent node; the node itself is reported at
        ``counter + 1``. The default ``-1`` makes the first node level 0.
    parent : anytree.Node or None, optional
        Tree node to attach this node to; ``None`` for the first node.
    export_path : str or None, optional
        Where to write the picture of the tree rooted at this node once it is
        complete. The recursive calls pass ``None`` so the picture is rendered
        once per top-level call rather than once per node.

    Returns
    -------
    None
    """
    labels = train_data[:, -1]
    count = np.unique(labels, return_counts=True)
    counter += 1
    ent = entropy(count[1])
    # Per-class sample counts shown in the node label (class labels 0, 1, 2).
    # Plain ints keep the label free of ``np.int64(..)`` wrappers on NumPy 2.
    show_list = [int((labels == c).sum()) for c in (0, 1, 2)]
    sample = str((count[1]).sum())

    # Best [gain_ratio, feature_index, threshold] over all features; stays
    # empty for a pure node or when no feature has two distinct values.
    s_gain = []
    if len(count[0]) > 1:
        candidates = [gain_func(train_data, f) for f in range(len(feature))]
        candidates = [c for c in candidates if len(c) > 0]
        if candidates:
            s_gain = max(candidates)

    if len(s_gain) == 0:
        # Leaf node: attach it to the tree and report it.
        child = Node('Entropy = '+str(ent)+' Sample = '+sample+' Value ='+str(show_list), parent=parent)
        print('Level', counter)
        for i in range(len(count[0])):
            print('Count of '+ str(int(count[0][i])) + ' = '+str(count[1][i]))
        print('Current Entropy is = '+str(ent))
        print('Reached Leaf Node')
        print()
    else:
        split_name = str(feature[s_gain[1]])
        if parent is None:
            # The first node shows the unrounded threshold.
            label = split_name+ '<'+ str(s_gain[2]) + ' Entropy ='+ str(round(ent,3))+' Sample ='+ sample +' Value='+str(show_list)
        else:
            label = split_name+'<'+str(round(s_gain[2],3))+' Entropy = '+str(round(ent,3))+' Sample ='+sample+' Value='+str(show_list)
        child = Node(label, parent=parent)

        print('Level', counter)
        for i in range(len(count[0])):
            print('Count of ',int(count[0][i]) , '= ',str(count[1][i]))
        print('Current Entropy is = '+str(ent))
        print('Splitting on feature '+ split_name +' with gain ratio '+str(s_gain[0]))
        print()

        final_split = split_node(train_data, s_gain[1], s_gain[2])
        decision_tree(final_split[0], feature, counter, child, export_path=None)   # left child
        decision_tree(final_split[1], feature, counter, child, export_path=None)   # right child

    if export_path is not None:
        # Draw the finished tree once. Node ids are made unique because several
        # nodes can carry the same text (e.g. two identical leaves) and would
        # otherwise be merged into one graph node; the text goes in the label.
        DotExporter(
            child,
            nodenamefunc=lambda node: hex(id(node)),
            nodeattrfunc=lambda node: 'label="%s", fixedsize=False, width=3, height=1, shape=rectangle'
            % str(node.name).replace('"', '\\"'),
            edgeattrfunc=lambda src, dst: "style=bold",
        ).to_picture(export_path)
    
# Load the iris data set: feature matrix X and class labels Y.
df=datasets.load_iris()
X=df.data
Y=df.target
print()
# Fixed random_state so the train/test partition (and the tree) is reproducible.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,random_state=1)
feature=df.feature_names

sh=X_train.shape
train_data=np.insert(X_train,sh[1],Y_train,axis=1)          # Append Y_train as the last column of X_train
decision_tree(train_data,feature)                          # Build, print and draw the tree
    
        

Level 0
Count of  0 =  37
Count of  1 =  34
Count of  2 =  41
Current Entropy is = 1.5807197138422102
Splitting on feature petal width (cm) with gain ratio 0.9999999999999999

Level 1
Count of 0 = 37
Current Entropy is = 0.0
Reached Leaf Node

Level 1
Count of  1 =  34
Count of  2 =  41
Current Entropy is = 0.993707106604508
Splitting on feature petal width (cm) with gain ratio 0.6610420198933152

Level 2
Count of  1 =  33
Count of  2 =  4
Current Entropy is = 0.49418293484978865
Splitting on feature petal length (cm) with gain ratio 0.6941833044972409

Level 3
Count of 1 = 32
Current Entropy is = 0.0
Reached Leaf Node

Level 3
Count of  1 =  1
Count of  2 =  4
Current Entropy is = 0.7219280948873623
Splitting on feature petal width (cm) with gain ratio 0.33155970728682876

Level 4
Count of 2 = 3
Current Entropy is = 0.0
Reached Leaf Node

Level 4
Count of  1 =  1
Count of  2 =  1
Current Entropy is = 1.0
Splitting on feature petal length (cm) with gain ratio 1.0

Level 5
Count of 1 = 