### Importing libraries

In [2]:
from sklearn import datasets
import pandas as pd
import numpy as np



### Loading Data 

In [3]:
iris = datasets.load_iris()

In [4]:
df = pd.DataFrame(iris.data)
df.columns = ["sl", "sw", 'pl', 'pw']

In [5]:
y = pd.DataFrame(iris.target)
y.columns = ["label"]

In [6]:
df.head()

Unnamed: 0,sl,sw,pl,pw
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


## Defining different functions needed

### Entropy function

In [7]:
#calculating entropy
#the y passed here is the y that is particular to that label of that feature

def entropy(y):
    #gives the set of classes present in the passed y 
    classes = set(y.values.flatten())
    value = 0
    # calculating entropy by iterating over for loop
    for i in classes:
        p = len(y[y[y.columns[0]]==i])/len(y)
        value -= (p*(np.log2(p)))
    return value
    

### Calculating Labels

In [8]:
#gives us the list of all the possible values on which split can be performed on the feature
def labels(df,s_f):
    k = set(df[s_f])
    k=sorted(k)
    l = []
    for i in range(len(k)-1):
        v =(k[i]+k[i+1])/2
        l.append(v)
    return l


### Finding out the gain ratio

In [9]:
#defining gain function to calculate gain
#s_f = selected_feature
def gain_ratio(df,y,s_f):
    entropy_before_split = entropy(y)
    #initializing
    list_gain_ratio = -100000
    #setting initial value
    value=0
    
    #iterating over each label to find the label that has max gain ratio
    #which is to be selected for the split of feature
    for j in labels(df,s_f):
        entropy_after_split = 0
        split_info = 0
        #label_set is the values that new_y_node can take
        label_set = [(y[df[s_f]<j]),(y[df[s_f]>=j])]
        for i in label_set:
            new_y_node = i
            weights = len(new_y_node)/len(y)
            #putting a check on weights
            if weights!=0 :
                entropy_after_split += weights*entropy(new_y_node)
                split_info -= weights*(np.log(weights))
        gain = entropy_before_split - entropy_after_split
        gain_ratio = gain/split_info
        #condition to get max gain_ratio and corresponding label on which split occured
        if gain_ratio>list_gain_ratio :
            list_gain_ratio=gain_ratio
            value=j

    return list_gain_ratio,value

## Build Tree Function that prints all the nodes of Decison Tree

In [11]:
def build_tree(df, y, unused_features,level):
    #base case
    # 1. unused is empty
    # 2. y contains only one distinct value
    if len(unused_features)==0 or set(y.values.flatten())==1:
        #printing the level of the tree
        print("Level ",level)
        print('This is leaf node')
        # printing the constituents classes
        for i in set(y.values.flatten()):
            print(f"the count of class {i} = {len(y[y[y.columns[0]]==i])}")
        #printing entropy
        print("Current entropy is ",entropy(y) )
        print()
        return
    #defining gain_list and label list 
    gain_list = []
    j1 = []
    # iterating over all the features to choose the best feature for split
    for f in unused_features:
        new_gain,j = gain_ratio(df,y,f)
        gain_list.append(new_gain)
        j1.append(j)
    #selecting the best feature i.e with max gain ratio
    # and the corresponding value i.e label of feature on which split is done
    s_f = (list(unused_features))[gain_list.index(max(gain_list))]
    value = j1[gain_list.index(max(gain_list))]
    
    
    #printing level
    print("level is",level)
    #printing count of class variables 
    for i in set(y.values.flatten()):
            print(f"the count of class {i} = {len(y[y[y.columns[0]]==i])}")
    #printing entropy
    print("Current entropy is ",entropy(y) )
    #printing gain ratio and selected feature
    print(f"Splitting on feature {s_f} with gain ratio {max(gain_list)}")
    print()
    # removing best feature from unused features
    unused_features.remove(s_f)
    
    # ASSIGNING NEW VALUES TO TRAIN AND TEST DATA
    
    new_y1 = y[df[s_f]>value]
    y1 = new_y1
    new_df1 = df[df[s_f]>value]
    df1 = new_df1
    
    
    new_y2 = y[df[s_f]<value]
    y2 = new_y2
    new_df2 = df[df[s_f]<value]
    df2 = new_df2
    
    # CALLING RECURSION 2 TIMES AS IT IS A BINARY SPLIT
    build_tree(df1,y1,unused_features,level+1)
    build_tree(df2,y2,unused_features,level+1)
    
    
    
    

# The decison Tree

In [12]:
y = pd.DataFrame(iris.target)
unused_features = set(df.columns)

build_tree(df, y, unused_features,level=0)


level is 0
the count of class 0 = 50
the count of class 1 = 50
the count of class 2 = 50
Current entropy is  1.584962500721156
Splitting on feature pl with gain ratio 1.4426950408889634

level is 1
the count of class 1 = 50
the count of class 2 = 50
Current entropy is  1.0
Splitting on feature pw with gain ratio 1.0003139564545946

level is 2
the count of class 1 = 1
the count of class 2 = 45
Current entropy is  0.15109697051711368
Splitting on feature sl with gain ratio 0.1431777747140423

level is 3
the count of class 2 = 39
Current entropy is  0.0
Splitting on feature sw with gain ratio 0.0

Level  4
This is leaf node
the count of class 2 = 37
Current entropy is  0.0

Level  4
This is leaf node
the count of class 2 = 2
Current entropy is  0.0

Level  3
This is leaf node
the count of class 1 = 1
the count of class 2 = 6
Current entropy is  0.5916727785823275

Level  2
This is leaf node
the count of class 1 = 49
the count of class 2 = 5
Current entropy is  0.44506485705083865

Level  