# Loading data 

In [1]:
import pandas as pd

# Column-oriented toy data: four categorical attributes plus the 'play' label.
# Note that 'windy' holds the strings 'True'/'False', not booleans.
data = dict(
    outlook=[
        'Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Rainy', 'Overcast',
        'Sunny', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy',
    ],
    temp=[
        'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
        'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
    ],
    humidity=[
        'High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal',
        'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High',
    ],
    windy=[
        'False', 'True', 'False', 'False', 'False', 'True', 'True',
        'False', 'False', 'False', 'True', 'True', 'False', 'True',
    ],
    play=[
        'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
        'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
    ],
)

# One row per observation; column order follows the insertion order of `data`.
df = pd.DataFrame(data)
df

Unnamed: 0,outlook,temp,humidity,windy,play
0,Sunny,Hot,High,False,No
1,Sunny,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Rainy,Mild,High,False,Yes
4,Rainy,Cool,Normal,False,Yes
5,Rainy,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Sunny,Mild,High,False,No
8,Sunny,Cool,Normal,False,Yes
9,Rainy,Mild,Normal,False,Yes


# Calculating entropy of target class 'play'

In [2]:
import math

def entropy_target(frame=None, target='play'):
    """Return the binary entropy (in bits) of a 'Yes'/'No' target column.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table to read the target from. Defaults to the notebook-level ``df``
        so the original zero-argument call keeps working.
    target : str, optional
        Name of the target column. Defaults to ``'play'``.

    Returns
    -------
    float
        ``-(p/t)*log2(p/t) - (n/t)*log2(n/t)`` where ``p``/``n`` are the
        counts of 'Yes'/'No' and ``t`` is the number of non-null targets.
        Returns ``0.0`` when either class is absent.
    """
    table = df if frame is None else frame
    labels = table[target]

    p = int((labels == 'Yes').sum())
    n = int((labels == 'No').sum())
    total = int(labels.count())

    # A column with only one class (or no rows) carries no uncertainty;
    # returning early also avoids math.log2(0), which raises ValueError.
    if p == 0 or n == 0:
        return 0.0

    return -((p/total)*math.log2(p/total)) - ((n/total)*math.log2(n/total))



# Entropy of the target column over the whole table, i.e. before any split.
E_play = entropy_target()
E_play


0.9402859586706311

# Entropy of the target after splitting on each attribute

### E(play, outlook)= P(Sunny)*E(Sunny) +  P(Overcast)*E(Overcast) +  P(Rainy)*E(Rainy)

### E(play, temp)= P(Hot)*E(Hot) +  P(Cool)*E(Cool) +  P(Mild)*E(Mild)

### E(play, humidity)= P(High)*E(High) +  P(Normal)*E(Normal)

### E(play, windy)= P(True)*E(True) +  P(False)*E(False)

In [3]:
def _count(feature,value):
    """Return the number of rows of the notebook-level ``df`` where ``df[feature] == value``."""
    matches = df[feature] == value
    return df.loc[matches, feature].count()

def probability(feature,value):
    """Return the fraction of non-null ``df[feature]`` entries equal to ``value``."""
    matching = _count(feature, value)
    non_null = df[feature].count()
    return matching / non_null

def entropy_attribute(attribute, value, frame=None, target='play'):
    """Return the binary entropy (in bits) of the target within one attribute value.

    Only the rows where ``attribute == value`` are considered.

    Parameters
    ----------
    attribute : str
        Column to filter on.
    value : object
        Value of ``attribute`` that selects the subset.
    frame : pandas.DataFrame, optional
        Table to read from. Defaults to the notebook-level ``df`` so existing
        two-argument calls keep working.
    target : str, optional
        Name of the 'Yes'/'No' target column. Defaults to ``'play'``.

    Returns
    -------
    float
        Entropy of the target in the subset; ``0.0`` when the subset contains
        no 'Yes' or no 'No' rows (including when no row matches).
    """
    table = df if frame is None else frame

    # Filter once and reuse the subset instead of re-filtering per count.
    subset = table.loc[table[attribute] == value, target]
    p = int((subset == 'Yes').sum())
    n = int((subset == 'No').sum())
    total = len(subset)

    # Pure subsets have zero entropy; this guard also keeps log2(0) from raising.
    if p == 0 or n == 0:
        return 0.0

    return -((p/total)*math.log2(p/total)) - ((n/total)*math.log2(n/total))

def weighted_entropy(attribute):
    """Return the entropy of the target after splitting the notebook-level ``df`` on ``attribute``.

    Computes ``sum(P(v) * E(v))`` over every distinct non-null value ``v``
    found in ``df[attribute]``, so no category has to be spelled out by hand
    (and none can be forgotten).
    """
    return sum(
        probability(attribute, value) * entropy_attribute(attribute, value)
        for value in df[attribute].dropna().unique()
    )


E_outlook = weighted_entropy('outlook')
E_temp = weighted_entropy('temp')
E_humidity = weighted_entropy('humidity')
E_windy = weighted_entropy('windy')

print(E_outlook,E_temp,E_humidity,E_windy)

0.6935361388961918 0.9110633930116763 0.7884504573082896 0.8921589282623617


# Calculating information gain of each attribute

### Gain(play, outlook) = E(play) – E(play, outlook)

### Gain(play, temp) = E(play) – E(play, temp)

### Gain(play, humidity) = E(play) – E(play, humidity)

### Gain(play, windy) = E(play) – E(play, windy)

In [4]:
# Information gain = entropy before the split minus entropy after it.
gain_outlook, gain_temp, gain_humidity, gain_windy = (
    E_play - after_split
    for after_split in (E_outlook, E_temp, E_humidity, E_windy)
)

# The mapping goes gain value -> attribute name.
# NOTE: because the gain is the key, two attributes with exactly equal gain
# would collapse into a single entry (the later one wins).
gain = dict(zip(
    (gain_outlook, gain_temp, gain_humidity, gain_windy),
    ('outlook', 'temp', 'humidity', 'windy'),
))
gain

{0.24674981977443933: 'outlook',
 0.02922256565895487: 'temp',
 0.15183550136234159: 'humidity',
 0.04812703040826949: 'windy'}

# The attribute with the largest information gain is used for the split.

In [5]:
# `gain` is keyed by gain value, so max() over its keys selects the largest gain
# and the lookup yields the corresponding attribute name.
print("Splitting attribute at first level is : ",gain[max(gain)])

Splliting attribute at first level is :  outlook
