In [None]:
#modules that are required in this notebook

from sklearn import datasets
import pandas as pd
from math import log2

In [None]:
# Load the iris dataset via sklearn.datasets; later cells read the feature
# matrix from iris.data and the class targets from iris.target.
iris = datasets.load_iris()

In [None]:
# Wrap the raw feature matrix in a DataFrame with short column names
# (presumably sepal/petal length and width -- confirm against iris.feature_names).
df = pd.DataFrame(iris.data, columns=["sl", "sw", "pl", "pw"])

In [None]:
#Function to find the label for a value, given three increasing boundaries
#if MIN_Value <= val < (MIN_Value + Mean_Value)/2 then it is assigned label a
#if (MIN_Value + Mean_Value)/2 <= val < Mean_Value then it is assigned label b
#if Mean_Value <= val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <= val <= MAX_Value then it is assigned label d

def label(val, *boundaries):
    """Map a numeric value to one of the bucket labels 'a', 'b', 'c' or 'd'.

    The first three entries of ``boundaries`` are tested in order; the value
    gets the letter of the first boundary it is strictly below, and 'd' when
    it is below none of them.

    Parameters:
        val: the value to classify.
        *boundaries: upper bounds (exclusive) of buckets 'a', 'b' and 'c'.

    Returns:
        str: 'a', 'b', 'c' or 'd'.
    """
    # Boundaries are indexed lazily, so only the ones actually reached are read.
    for position, letter in enumerate('abc'):
        if val < boundaries[position]:
            return letter
    return 'd'

#Function to convert a continuous feature into labelled (categorical) data
#There are 4 labels - a, b, c, d
def toLabel(df, old_feature_name):
    """Discretise one column of ``df`` into the labels 'a'-'d'.

    The three cut points are derived from the column itself: the midpoint
    between its minimum and mean, the mean, and the midpoint between its
    maximum and mean. Each value is then labelled with ``label``.

    Parameters:
        df: DataFrame holding the column.
        old_feature_name: name of the column to convert.

    Returns:
        A Series of labels, one per row of ``df[old_feature_name]``.
    """
    column = df[old_feature_name]
    low, mid, high = column.min(), column.mean(), column.max()

    # (first, second, third) boundaries passed through to label()
    cut_points = ((low + mid) / 2, mid, (high + mid) / 2)
    return column.apply(label, args=cut_points)

In [None]:
# Add a '<name>_labeled' column holding the bucket label for every raw feature
for feature in ('sl', 'sw', 'pl', 'pw'):
    df[feature + '_labeled'] = toLabel(df, feature)
df

In [None]:
# Keep only the labelled columns. The frame is rebound instead of mutated in
# place, and errors='ignore' makes the cell safe to re-run once the raw
# columns are already gone (the in-place drop raised KeyError on a second run).
df = df.drop(columns=['sl', 'sw', 'pl', 'pw'], errors='ignore')

In [None]:
# Sanity check: show the distinct labels that were assigned to the 'sl' column
set(df['sl_labeled'])

In [None]:
# Display the frame after the raw numeric columns have been dropped
df

In [None]:
# function -: calculates entropy of the data 

def calculate_entropy(Y):
    """Return the entropy (in bits) of the class values stored in ``Y[0]``.

    Parameters:
        Y: DataFrame whose column labelled 0 holds the class of each row.

    Returns:
        The sum over distinct classes of ``-p * log2(p)``, where ``p`` is the
        fraction of rows belonging to that class; 0 when ``Y`` has no rows.
    """
    targets = Y[0]
    row_count = len(Y)

    result = 0
    for cls in set(targets):
        # Fraction of rows that belong to this class
        share = (targets == cls).sum() / row_count
        result -= share * log2(share)

    return result

In [None]:
# Tree class-:
#   current_entropy - stores the current entropy of Node          
#   children - stores children of the Node
#   target_values- stores a list of target classes
#   target_values_rows - stores number of rows of target classes
#   leaf_node - stores true if node is leaf node
#   gain_ratio - stores gain ratio of feature at which we are splitting
#   best_feature - stores feature at which dataset is splitting

class Tree:
    """One node of the decision tree.

    A freshly created node is empty; its attributes are filled in while the
    tree is being built.
    """

    def __init__(self):
        # Entropy of the target classes at this node
        self.current_entropy = None
        # Child nodes, one per branch of the split made at this node
        self.children = list()
        # Target classes present at this node and, in parallel, their row counts
        self.target_values = list()
        self.target_values_rows = list()
        # True once the node is known to be a leaf
        self.leaf_node = False
        # Feature this node splits on and the gain ratio of that split
        self.gain_ratio = None
        self.best_feature = None

In [None]:
def print_tree(root, level=0):
    """Print a node and, recursively, all of its descendants.

    For each node the level, the count of every target class and the current
    entropy are printed. A leaf additionally prints "Reached leaf Node"; a node
    that was split prints the chosen feature and its gain ratio before its
    children are printed one level deeper.

    Parameters:
        root: node to print; must expose target_values, target_values_rows,
            current_entropy, leaf_node, best_feature, gain_ratio and children.
        level: depth printed for ``root`` (defaults to 0 for the tree root).

    Returns:
        None. Output goes to stdout.
    """
    print("Level", level)

    # target_values and target_values_rows are parallel lists
    for target, count in zip(root.target_values, root.target_values_rows):
        print("Count of", target, "=", count)

    print("Current Entropy  is =", root.current_entropy)

    if root.leaf_node:
        print("Reached leaf Node")
        return

    # A non-leaf node without a split feature has nothing further to report
    if root.best_feature is None:
        return

    print("Splitting on feature", root.best_feature, "with gain ratio", root.gain_ratio)

    for child in root.children:
        print()
        print_tree(child, level + 1)

In [None]:
def build_tree(df, y, unused_features,root):
    """Recursively grow a decision tree below ``root`` using the gain ratio.

    The node is first annotated with its class counts and entropy. If it is
    pure it is marked as a leaf. Otherwise the unused feature with the highest
    positive gain ratio is chosen, recorded on the node, and one child is
    built for every distinct value of that feature.

    Parameters:
        df: DataFrame of categorical feature columns for the rows at this node.
        y: DataFrame whose column 0 holds the class of each row. Boolean masks
            computed from ``df`` are applied to ``y``, so both are assumed to
            share the same index -- confirm at the call site.
        unused_features: iterable of column names still available for splitting;
            it is not modified.
        root: node object filled in place (see the Tree class).

    Returns:
        None. The tree is built by mutating ``root`` and its descendants.
    """
    total_rows = y.shape[0]
    current_entropy = calculate_entropy(y)

    # Record how many rows of each target class reached this node
    for Class in set(y[0]):
        root.target_values.append(Class)
        root.target_values_rows.append((y[0] == Class).sum())

    root.current_entropy = current_entropy

    # Case: only one class is left, so the node is a leaf
    if len(root.target_values_rows) == 1:
        root.leaf_node = True
        return

    # Case: no features are available to split upon
    if len(unused_features) == 0:
        return

    best_feature = None
    best_gain_ratio = 0

    # Find the feature with the highest gain ratio
    # (information gain divided by the split information of the feature)
    for f in unused_features:

        entropy = 0
        split_index = 0

        # unique() yields values in order of first appearance, so the float
        # sums below are accumulated in a reproducible order (a set of strings
        # iterates in an order that changes with the interpreter's hash seed).
        for value in df[f].unique():
            rows = (df[f] == value)
            probability = rows.sum() / total_rows

            entropy += probability * calculate_entropy(y[rows])
            split_index += -probability * log2(probability)

        # Zero split information means the feature takes a single value here
        # (or there are no rows): it cannot separate anything, and dividing by
        # it would give 0/0 (NaN with a RuntimeWarning, or ZeroDivisionError).
        if split_index == 0:
            continue

        gain_ratio = (current_entropy - entropy) / split_index

        # Strict '>' keeps the earliest feature when gain ratios tie
        if gain_ratio > best_gain_ratio:
            best_gain_ratio = gain_ratio
            best_feature = f

    # Case: no feature improves on the current node
    if best_feature is None:
        return

    root.gain_ratio = best_gain_ratio
    root.best_feature = best_feature

    # Copy so sibling branches do not see each other's removals
    new_unused_features = list(unused_features)
    new_unused_features.remove(best_feature)

    # Recursively split the dataset on best_feature, children in sorted value order
    for value in sorted(set(df[best_feature])):
        rows = (df[best_feature] == value)

        child = Tree()
        root.children.append(child)
        build_tree(df[rows], y[rows], new_unused_features, child)
        
    


In [None]:
# Target classes as a DataFrame; build_tree reads the classes from column 0
y = pd.DataFrame(iris.target)

# Candidate features in column order. Going through a set here would make the
# order depend on the interpreter's per-session string hash seed, so ties
# between equally good features could be broken differently on every kernel
# restart and the printed tree would not be reproducible.
unused_features = list(df.columns)

# creating root node of tree
root = Tree()

# building the Tree
build_tree(df, y, unused_features, root)

# printing the tree
print_tree(root, 0)