In [53]:
from sklearn import datasets
import pandas as pd
from math import log2

In [54]:
# Load the Iris dataset from scikit-learn; later cells read `iris.data` and `iris.target`.
iris = datasets.load_iris()

In [55]:
# Wrap the raw measurements in a DataFrame with short column names
# (presumably sepal length/width and petal length/width -- confirm against iris.feature_names).
df = pd.DataFrame(iris.data, columns=["sl", "sw", "pl", "pw"])

In [56]:
#Function to find the label for a value
#if MIN_Value <= val < (MIN_Value + Mean_Value)/2 then it is assigned label a
#if (MIN_Value + Mean_Value)/2 <= val < Mean_Value then it is assigned label b
#if Mean_Value <= val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <= val <= MAX_Value then it is assigned label d

def label(val, *boundaries):
    """Map a numeric value to one of the bin labels 'a', 'b', 'c' or 'd'.

    The first three entries of ``boundaries`` are treated as ascending upper
    bounds (exclusive) of bins 'a', 'b' and 'c'; anything that is not below
    the third boundary falls into 'd'.

    Parameters:
        val: the value to classify.
        *boundaries: the cut points; entries are read lazily, one per
            comparison, so only as many as are needed are accessed.

    Returns:
        The single-character bin label.
    """
    # Walk the bins from lowest to highest and stop at the first one that fits.
    for position, letter in enumerate('abc'):
        if val < boundaries[position]:
            return letter
    return 'd'

#Function to convert continuous data into labelled data
#There are 4 labels - a, b, c, d
def toLabel(df, old_feature_name):
    """Discretise one numeric column of ``df`` into the labels 'a'..'d'.

    The cut points are the midpoint between the column minimum and mean,
    the mean itself, and the midpoint between the column maximum and mean.

    Parameters:
        df: DataFrame holding the column.
        old_feature_name: name of the column to discretise.

    Returns:
        A Series of labels produced by applying ``label`` to every value.
    """
    column = df[old_feature_name]
    low, centre, high = column.min(), column.mean(), column.max()
    cut_points = ((low + centre)/2, centre, (high + centre)/2)
    return column.apply(label, args=cut_points)

In [57]:
#Convert all columns to labelled data
# One labelled column per original measurement column, named <column>_labeled.
for feature in ('sl', 'sw', 'pl', 'pw'):
    df[feature + '_labeled'] = toLabel(df, feature)
df

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a
...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,c,b,c,d
146,6.3,2.5,5.0,1.9,c,a,c,d
147,6.5,3.0,5.2,2.0,c,b,c,d
148,6.2,3.4,5.4,2.3,c,c,d,d


In [58]:
# Remove the original numeric columns in place; only the *_labeled columns are kept.
df.drop(columns=['sl', 'sw', 'pl', 'pw'], inplace=True)

In [59]:
# Distinct labels present in the sl_labeled column
set(df['sl_labeled'].unique())

{'a', 'b', 'c', 'd'}

In [60]:
# Display the frame after the numeric columns were dropped
df

Unnamed: 0,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,b,c,a,a
1,a,b,a,a
2,a,c,a,a
3,a,c,a,a
4,a,c,a,a
...,...,...,...,...
145,c,b,c,d
146,c,a,c,d
147,c,b,c,d
148,c,c,d,d


In [61]:
def calculate_entropy(Y):
    """Return the base-2 Shannon entropy of the class labels in column 0 of ``Y``.

    Parameters:
        Y: DataFrame whose column ``0`` holds one class label per row.

    Returns:
        ``-sum(p * log2(p))`` over the distinct labels, where ``p`` is the
        fraction of rows carrying that label. The integer ``0`` is returned
        when ``Y`` has no rows.
    """
    labels = Y[0]
    n_rows = len(Y)

    result = 0
    for cls in set(labels):
        # Fraction of rows that belong to this class
        p = (labels == cls).sum() / n_rows
        result -= p * log2(p)

    return result

In [62]:
class Tree:
    """A node of the decision tree filled in by ``build_tree``.

    Every node starts out empty; ``build_tree`` populates the fields and
    ``print_tree`` reads them.
    """

    def __init__(self):
        # Distinct class labels at this node and, in the same order, their row counts
        self.target_values = []
        self.target_values_rows = []
        # Entropy of the class labels at this node (None until computed)
        self.current_entropy = None
        # Feature chosen for the split and its gain ratio (None when no split was made)
        self.best_feature = None
        self.gain_ratio = None
        # One child node per value of the split feature
        self.children = []
        # True once the node is found to contain a single class
        self.leaf_node = False
    

In [63]:
def print_tree(root,level):
    """Print the subtree rooted at ``root`` in pre-order.

    For each node this prints its level, the row count of every class, and
    its entropy. A leaf node is then reported as such; a node that was split
    reports the chosen feature and gain ratio and recurses into its children
    with ``level + 1``. A node that is neither (no split was chosen) prints
    nothing further.

    Parameters:
        root: node exposing ``target_values``, ``target_values_rows``,
            ``current_entropy``, ``leaf_node``, ``best_feature``,
            ``gain_ratio`` and ``children``.
        level: depth of ``root``, printed in the header line.
    """
    print("Level",level)

    # target_values and target_values_rows are parallel lists
    for idx, target in enumerate(root.target_values):
        print("Count of",target,"=",root.target_values_rows[idx])

    print("Current Entropy  is =",root.current_entropy)

    if root.leaf_node:
        print("Reached leaf Node")
    elif root.best_feature is not None:
        print("Splitting on feature",root.best_feature,"with gain ratio",root.gain_ratio)

        for child in root.children:
            # Blank line separates consecutive nodes in the listing
            print()
            print_tree(child,level+1)

In [70]:
def build_tree(df, y, unused_features,root):
    """Recursively grow a decision tree below ``root`` using the gain ratio.

    The node's class distribution and entropy are recorded first. Growth stops
    when the node holds a single class (marked as a leaf), when no features
    remain, or when no remaining feature yields a positive gain ratio.
    Otherwise the feature with the highest gain ratio is stored on the node
    and one child is built per value of that feature, in sorted order.

    Parameters:
        df: DataFrame of categorical feature columns for the rows at this node.
        y: DataFrame with the class labels in column ``0``; boolean masks
            computed from ``df`` are applied to it, so it must share ``df``'s
            index.
        unused_features: names of the columns of ``df`` still available for
            splitting. The caller's sequence is not modified.
        root: ``Tree`` node to populate in place.

    Returns:
        None. Results are written to ``root`` and its descendants.
    """
    total_rows=y.shape[0]
    current_entropy=calculate_entropy(y)

    # Record how many rows of each class reach this node.
    for Class in set(y[0]):
        root.target_values.append(Class)
        root.target_values_rows.append((y[0]==Class).sum())

    root.current_entropy=current_entropy

    # A single class means the node is pure: nothing left to split.
    if(len(root.target_values_rows)==1):
        root.leaf_node=True
        return

    if(len(unused_features)==0):
        return

    # None (not "") marks "no feature chosen", so an empty column name stays usable.
    best_feature=None
    best_gain_ratio=0

    for f in unused_features:

        split_index=0   # split information: entropy of the partition sizes
        entropy=0       # weighted entropy of the labels after the split

        for value in set(df[f]):
            rows=(df[f]==value)
            probability=rows.sum()/total_rows

            entropy+=probability*calculate_entropy(y[rows])
            split_index+=-probability*log2(probability)

        # Zero split information means the feature takes a single value here
        # (or there are no rows), so it cannot partition the node. Skip it
        # rather than divide by zero.
        if(split_index==0):
            continue

        gain_ratio=(current_entropy-entropy)/split_index

        # Strict '>' keeps the first feature seen when gain ratios tie.
        if(gain_ratio>best_gain_ratio):
            best_gain_ratio=gain_ratio
            best_feature=f

    if(best_feature is None):
        return

    root.gain_ratio=best_gain_ratio
    root.best_feature=best_feature

    # Copy so sibling subtrees each start from the same feature list.
    new_unused_features=list(unused_features)
    new_unused_features.remove(best_feature)

    for value in sorted(set(df[best_feature])):
        rows=(df[best_feature]==value)

        child=Tree()
        root.children.append(child)
        build_tree(df[rows],y[rows],new_unused_features,child)
        
    


In [71]:
# Class labels as a one-column DataFrame (column 0), the layout build_tree indexes via y[0].
y = pd.DataFrame(iris.target)
# Use the DataFrame's own column order. Going through set() would order the
# features by string hash, which varies between interpreter runs and could
# change which feature wins a gain-ratio tie.
unused_features = list(df.columns)
root=Tree()
build_tree(df, y, unused_features,root)
print_tree(root,0)

Level 0
Count of 0 = 50
Count of 1 = 50
Count of 2 = 50
Current Entropy  is = 1.584962500721156
Splitting on feature pw_labeled with gain ratio 0.6996382036222091

Level 1
Count of 0 = 50
Current Entropy  is = 0.0
Reached leaf Node

Level 1
Count of 1 = 10
Current Entropy  is = 0.0
Reached leaf Node

Level 1
Count of 1 = 40
Count of 2 = 16
Current Entropy  is = 0.863120568566631
Splitting on feature pl_labeled with gain ratio 0.4334099495621067

Level 2
Count of 1 = 1
Current Entropy  is = 0.0
Reached leaf Node

Level 2
Count of 1 = 39
Count of 2 = 8
Current Entropy  is = 0.6581912658132185
Splitting on feature sl_labeled with gain ratio 0.12674503775809332

Level 3
Count of 2 = 1
Current Entropy  is = 0.0
Reached leaf Node

Level 3
Count of 1 = 14
Current Entropy  is = 0.0
Reached leaf Node

Level 3
Count of 1 = 23
Count of 2 = 7
Current Entropy  is = 0.783776947484701
Splitting on feature sw_labeled with gain ratio 0.07092036405148876

Level 4
Count of 1 = 3
Count of 2 = 1
Current En