In [26]:
from sklearn import datasets
import pandas as pd
import math

In [6]:
iris = datasets.load_iris()

In [7]:
df = pd.DataFrame(iris.data)
df.columns = ["sl", "sw", 'pl', 'pw']

In [8]:
#Function to find label for a value
#if MIN_Value <=val < (m + Mean_Value) / 2 then it is assigned label a
#if (m + Mean_Value) <=val < Mean_Value then it is assigned label b
#if (Mean_Value) <=val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <=val <= MAX_Value  then it is assigned label d

def label(val, *boundaries):
    if (val < boundaries[0]):
        return 'a'
    elif (val < boundaries[1]):
        return 'b'
    elif (val < boundaries[2]):
        return 'c'
    else:
        return 'd'

#Function to convert a continuous data into labelled data
#There are 4 lables  - a, b, c, d
def toLabel(df, old_feature_name):
    second = df[old_feature_name].mean()
    minimum = df[old_feature_name].min()
    first = (minimum + second)/2
    maximum = df[old_feature_name].max()
    third = (maximum + second)/2
    return df[old_feature_name].apply(label, args= (first, second, third))

In [9]:
#Convert all columns to labelled data
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')
df

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a
...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,c,b,c,d
146,6.3,2.5,5.0,1.9,c,a,c,d
147,6.5,3.0,5.2,2.0,c,b,c,d
148,6.2,3.4,5.4,2.3,c,c,d,d


In [10]:
df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)

In [11]:
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [219]:
def build_tree(df, y, unused_features,level):
    #base case
    
    print('Level',level)
    prev_entropy=entropy(df,y)
    count_hash=y[0].value_counts()
    for val in count_hash.index:
        print('Count of',val,'=',count_hash[val])
    print('current entropy is ',prev_entropy)
    
    # 1. unused is empty
    if len(unused_features)==0:
        print('reached leaf node')
        print()
        return
    
    # 2. y contains only one distinct value
    if len(set(y[0]))==1:
        print('reached leaf node')
        print()
        return 
    
    

    best_feature = ""
    
    max_gain_ratio=-10
    for f in unused_features:
        possible_values = set(df[f])
        
        ent=0
        split_info=0
        
        # loop over possible values : val
        for val in possible_values:
            
        # find subset of df & y with f == val
            df_subset = df[df[f]==val]
            y_subset = y[df[f]==val]
            
        # find number of mistakes in this subset 
        # if we predict the most common y as the output
        # find sum of all these mistakes
            ent=ent+(len(df_subset)/len(df))*entropy(df_subset,y_subset)
            
            split_info=split_info+(-1)*(len(df_subset)/len(df))*math.log(len(df_subset)/len(df),2)
            
        
        
        info_gain=prev_entropy-ent
        
        
        if split_info!=0:
            gain_ratio=info_gain/split_info
        else:
            gain_ratio=info_gain
        
        # update best feature so that that particular feature
        # makes least number of mistakes
        if(max_gain_ratio<gain_ratio):
            max_gain_ratio=gain_ratio
            best_feature=f
            
    # here you should know the best feature
    # print it out
    print("Splitting on feature", best_feature,'with Gain Ratio')
    print(max_gain_ratio)
    print()
    
    # remove best feature from unused features
    unused_features.discard(best_feature)
    
    # loop over possible values of best feature
    for val in set(df[best_feature]):
        # call build tree recursively
        build_tree(df[df[best_feature]==val],y[df[best_feature]==val],unused_features,level+1)


In [209]:
def entropy(df,y):
    ent = 0
    
    for val in set(y[0]):
        df_length=len(df)
        y_length=len(y[y[0]==val])
        ent=ent+(-1)*(len(y[y[0]==val])/len(df))*(math.log(len(y[y[0]==val])/len(df),2))
    
    
    return ent

In [220]:
y = pd.DataFrame(iris.target)
unused_features = set(df.columns)
build_tree(df, y, unused_features,0)

Level 0
Count of 2 = 50
Count of 1 = 50
Count of 0 = 50
current entropy is  1.584962500721156
Splitting on feature pw_labeled with Gain Ratio
0.699638203622209

Level 1
Count of 2 = 34
current entropy is  0.0
reached leaf node

Level 1
Count of 0 = 50
current entropy is  0.0
reached leaf node

Level 1
Count of 1 = 40
Count of 2 = 16
current entropy is  0.863120568566631
Splitting on feature pl_labeled with Gain Ratio
0.4334099495621066

Level 2
Count of 2 = 8
current entropy is  0.0
reached leaf node

Level 2
Count of 1 = 1
current entropy is  0.0
reached leaf node

Level 2
Count of 1 = 39
Count of 2 = 8
current entropy is  0.6581912658132185
Splitting on feature sl_labeled with Gain Ratio
0.12674503775809332

Level 3
Count of 1 = 23
Count of 2 = 7
current entropy is  0.783776947484701
Splitting on feature sw_labeled with Gain Ratio
0.07092036405148876

Level 4
Count of 1 = 14
Count of 2 = 6
current entropy is  0.8812908992306927
reached leaf node

Level 4
Count of 1 = 6
current entrop