In [3]:
from sklearn import datasets
import pandas as pd
import numpy as np

In [76]:
eps = np.finfo(float).eps

In [4]:
iris = datasets.load_iris()

In [5]:
df = pd.DataFrame(iris.data)
df.columns = ["sl", "sw", 'pl', 'pw']

In [6]:
#Function to find label for a value
#if MIN_Value <=val < (m + Mean_Value) / 2 then it is assigned label a
#if (m + Mean_Value) <=val < Mean_Value then it is assigned label b
#if (Mean_Value) <=val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <=val <= MAX_Value  then it is assigned label d

def label(val, *boundaries):
    if (val < boundaries[0]):
        return 'a'
    elif (val < boundaries[1]):
        return 'b'
    elif (val < boundaries[2]):
        return 'c'
    else:
        return 'd'

#Function to convert a continuous data into labelled data
#There are 4 lables  - a, b, c, d
def toLabel(df, old_feature_name):
    second = df[old_feature_name].mean()
    minimum = df[old_feature_name].min()
    first = (minimum + second)/2
    maximum = df[old_feature_name].max()
    third = (maximum + second)/2
    return df[old_feature_name].apply(label, args= (first, second, third))

In [7]:
#Convert all columns to labelled data
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')
df

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a
5,5.4,3.9,1.7,0.4,b,d,a,a
6,4.6,3.4,1.4,0.3,a,c,a,a
7,5.0,3.4,1.5,0.2,a,c,a,a
8,4.4,2.9,1.4,0.2,a,b,a,a
9,4.9,3.1,1.5,0.1,a,c,a,a


In [8]:
df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)

In [9]:
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [10]:
y=pd.DataFrame(iris.target)
y.columns=["Class"]
y[df.sl_labeled == 'a']

3

In [11]:
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [12]:
(df.sl_labeled == 'a').sum()
rows = (y.Class == 0)
len(df[y.Class == 0])

50

In [15]:
df.sl_labeled.unique().tolist()

['b', 'a', 'd', 'c']

In [29]:
def unique_attribute(sf):
    features = df.columns
    for f in features:
        if f == sf:
            print(df[f].unique().tolist())

In [31]:
for f in set(df.columns):
    unique_attribute(f)

['b', 'a', 'd', 'c']
['c', 'b', 'd', 'a']
['a', 'c', 'b', 'd']
['a', 'c', 'b', 'd']


In [120]:
set(iris.target)

{0, 1, 2}

In [126]:
def print_tree(df,y,level,gain = 0 ,split_feature="",feat_finished = False,leaf = False):
    #Here the print_tree function will print the level,entropy,gain and other values
    #Leaf node reached
    ent = 0.0
    print("Level:",level)
    classes = set(iris.target) #Set containing the class(0,1,2)
        
    for cl in classes:
        count_cl = (y.Class == cl).sum() #Calculating the count for each class 0,1,2
        print("Count of Class:" ,cl ,"=",count_cl) #printing the count of each class for that node
    
    if leaf == True: #Case with leaf node reached
        ent=0.0
        print("Current Entropy =",ent)
        print("Reached Leaf Node")
    elif leaf == False and feat_finished == False: #Case with neither leaf node and features to split upon remaining
        ent = entropy(df,y)
        print("Current Entropy =",ent)
        print("Splitting on feature ",str(split_feature),"with gain =",gain)
    else: #Case when all features are finished
        ent = entropy(df,y)
        print("Current Entropy =",ent)
    
    print("")


In [127]:
def entropy(df,y):
    cvalues = set(y["Class"]) #possible values of classes
    clist = [] #empty list for storing probabilities
    total = df.shape[0] #the number of rows for the particular 
    
    for cl in cvalues:
        p = len(df[y.Class == cl])/total #calculating the probability for each class,Eg- for a1 class
        clist.append(p)
    
    entropy = 0.0 #initially zero
    for pr in clist:
        entropy += (-1.0*(pr+eps)*np.log(pr+eps)) #calculating the entropy by iterating over all distinct class values
        #eps is added to avoid log(0) error while calculation
        
    return entropy

In [128]:
def calc_gain(df,y,sf):
    #feature_values = df[sf].unique().tolist() #Distinct values for the feature
    for f in df.columns:
        if sf == f:
            feature_values = df[sf].unique().tolist()
            #print(feature_values)
            
    frac_list = [] #storing the the fractions for each value for calculating weighted average
    entropy_list = [] #storing the entropy for each 
    
    total = df.shape[0]
    
    for val in feature_values:
        val_y_rows = df[df[sf] == val] #selecting rows with specific value for the feature
        e_val = entropy(val_y_rows,y) #entropy for the particular value
        entropy_list.append(e_val) #Adding the entropy for particular attribute to list
        frac = (val_y_rows.shape[0])/total #fraction for weighted average 
        frac_list.append(frac) #adding the fraction for calculating weighted average of that attribute
    
    weighted_entropy = 0.0
    for i in range(0,len(frac_list)):
        weighted_entropy += frac_list[i]*entropy_list[i] #calculating the entropy according to weighted average
    
    initial_entropy = entropy(df,y)
    gain = initial_entropy - weighted_entropy
    return gain

In [129]:
def build_tree(df, y, unused_features,i):
    #base case
    # 1. unused is empty
    if(len(unused_features) == 0):
        print_tree(df,y,feat_finished = True,level=i)
        return
    # 2. y contains only one distinct value
    if(len(set(y["Class"])) == 1):
        print_tree(df,y,leaf=True,level = i)
        return
    
    max_gain = 0
    best_feature = 0
    for f in unused_features:
        gain = calc_gain(df,y,f)
        
        if gain >= max_gain:
            max_gain = gain
            best_feature = f
        # loop over possible values : val find subset of df & y with f == val # find number of mistakes in this subset 
        # if we predict the most common y as the output # find sum of all these mistakes
        # update best feature so that that particular feature # makes least number of mistakes
        
    # here you should know the best feature
    print("Best Feature: ", best_feature)
    #printing out the tree level and other information
    print_tree(df,y,gain=max_gain,split_feature=best_feature,level = i)
    
    unused_features.remove(best_feature) # remove best feature from unused features
    unique_labels_f = df[best_feature].unique().tolist() #Distinct labels for the best feature
    for label in unique_labels_f: # loop over possible values of best feature
        print(label)
        label_rows = df[df[best_feature] == label] #Particular label rows only
        y_label = y[df[best_feature] == label] #Particular y rows with label
        build_tree(label_rows,y_label,unused_features,i+1) # call build tree recursively
        

In [130]:
unused_features = set(df.columns)
build_tree(df, y, unused_features,0)

  import sys


Best Feature:  pw_labeled
Level: 0
Count of Class: 0 = 50
Count of Class: 1 = 50
Count of Class: 2 = 50
Current Entropy = 1.0986122886681098
Splitting on feature  pw_labeled with gain = 0.8752583089296009

a
Level: 1
Count of Class: 0 = 50
Count of Class: 1 = 0
Count of Class: 2 = 0
Current Entropy = 0.0
Reached Leaf Node

c
Best Feature:  pl_labeled
Level: 1
Count of Class: 0 = 0
Count of Class: 1 = 40
Count of Class: 2 = 16
Current Entropy = 0.5982695885852571
Splitting on feature  pl_labeled with gain = 0.21536778951600843

c
Best Feature:  sl_labeled
Level: 2
Count of Class: 0 = 0
Count of Class: 1 = 39
Count of Class: 2 = 8
Current Entropy = 0.45622342016761397
Splitting on feature  sl_labeled with gain = 0.10945355973980214

d
Level: 3
Count of Class: 0 = 0
Count of Class: 1 = 2
Count of Class: 2 = 0
Current Entropy = 0.0
Reached Leaf Node

c
Best Feature:  sw_labeled
Level: 3
Count of Class: 0 = 0
Count of Class: 1 = 23
Count of Class: 2 = 7
Current Entropy = 0.5432727813369007


In [119]:
type(unused_features)
unused_features

set()