In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy dataset of 14 observations, written one observation per row and then
# transposed into a column-name -> list-of-values mapping.
# Note: 'windy' holds the strings 'false'/'true', not Python booleans.
_columns = ("outlook", "temp", "humidity", "windy", "play")
_records = [
    ("sunny",    "hot",  "high",   "false", "no"),
    ("sunny",    "hot",  "high",   "true",  "no"),
    ("overcast", "hot",  "high",   "false", "yes"),
    ("rainy",    "mild", "high",   "false", "yes"),
    ("rainy",    "cool", "normal", "false", "yes"),
    ("rainy",    "cool", "normal", "true",  "no"),
    ("overcast", "cool", "normal", "true",  "yes"),
    ("sunny",    "mild", "high",   "false", "no"),
    ("sunny",    "cool", "normal", "false", "yes"),
    ("rainy",    "mild", "normal", "false", "yes"),
    ("sunny",    "mild", "normal", "true",  "yes"),
    ("overcast", "mild", "high",   "true",  "yes"),
    ("overcast", "hot",  "normal", "false", "yes"),
    ("rainy",    "mild", "high",   "true",  "no"),
]
data = {name: list(values) for name, values in zip(_columns, zip(*_records))}

In [3]:
# Build the working DataFrame from the column -> values mapping in `data`.
df = pd.DataFrame(data)

In [4]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [5]:
# Class distribution of the target column.
# Uses the Series method: the top-level pd.value_counts() is deprecated.
df['play'].value_counts()

yes    9
no     5
Name: play, dtype: int64

In [8]:
# Count of the most frequent class (value_counts sorts in descending order).
# .iloc makes the positional lookup explicit: the result is indexed by the
# string labels, so a bare [0] relies on the deprecated positional fallback.
df['play'].value_counts().iloc[0]

9

In [9]:
# Distinct labels of the target column, in order of first appearance.
target_value = df['play'].unique()
target_value

array(['no', 'yes'], dtype=object)

In [13]:
# Shannon entropy (in bits) of the target column:
#     H(play) = -sum_k p_k * log2(p_k)
# Computed in one vectorised pass over the class counts instead of
# re-filtering the frame once per class label.
n = df.shape[0]  # total number of observations; reused by later cells
class_probs = df['play'].value_counts() / n
entropy = -(class_probs * np.log2(class_probs)).sum()

In [14]:
# Show the entropy of the 'play' column (log base 2, i.e. bits).
entropy

0.9402859586706311

In [15]:
# Frequency of each 'outlook' value.
# Uses the Series method: the top-level pd.value_counts() is deprecated.
df['outlook'].value_counts()

sunny       5
rainy       5
overcast    4
Name: outlook, dtype: int64

In [23]:
# Count of each 'play' outcome within each 'outlook' value.
df.groupby('outlook')['play'].value_counts()

outlook   play
overcast  yes     4
rainy     yes     3
          no      2
sunny     no      3
          yes     2
Name: play, dtype: int64

In [37]:
def calc_info_gain(col_name, data=None, target='play'):
    """Return the weighted conditional entropy of ``target`` given ``col_name``.

    Despite its name, this function does NOT return the information gain.
    It returns the "remainder" term

        sum_v (|S_v| / |S|) * H(target | col_name == v)

    where the sum runs over the distinct values ``v`` of ``col_name``.
    Callers obtain the information gain by subtracting this value from the
    entropy of the target column.

    Parameters
    ----------
    col_name : str
        Name of the feature column to split on.
    data : pandas.DataFrame, optional
        Frame to evaluate. When omitted, the notebook-level ``df`` is used,
        which keeps existing ``calc_info_gain(col)`` calls working.
    target : str, optional
        Name of the label column. Defaults to ``'play'``.

    Returns
    -------
    float
        Weighted average entropy in bits; ``0.0`` when ``data`` has no rows.
    """
    if data is None:
        data = df  # fall back to the notebook's global frame

    n_rows = len(data)
    if n_rows == 0:
        return 0.0

    remainder = 0.0
    for value in pd.unique(data[col_name]):
        class_counts = data.loc[data[col_name] == value, target].value_counts()
        # A categorical target reports zero-count classes; 0 * log2(0) is NaN,
        # so drop them (their contribution to the entropy is 0 by convention).
        class_counts = class_counts[class_counts > 0]
        subset_size = class_counts.sum()
        if subset_size == 0:
            # Nothing matched (e.g. a NaN feature value never equals itself).
            continue
        probs = class_counts / subset_size
        subset_entropy = -(probs * np.log2(probs)).sum()
        # Weight each subset's entropy by its share of the rows.
        remainder += subset_entropy * (subset_size / n_rows)
    return remainder

In [38]:
# Column labels of df (the feature columns plus the 'play' target).
col_names = df.columns

In [39]:
# Show the column labels.
col_names

Index(['outlook', 'temp', 'humidity', 'windy', 'play'], dtype='object')

In [42]:
# Information gain of each feature:
#     Gain(A) = H(play) - remainder(A)
# calc_info_gain() returns the remainder (weighted conditional entropy), so
# the subtraction happens here. Features are selected by name rather than by
# position, so the result does not depend on 'play' being the last column.
scores = {
    feature: entropy - calc_info_gain(feature)
    for feature in col_names
    if feature != 'play'
}

In [43]:
# Show the score computed for each feature column.
scores

{'outlook': 0.24674981977443933,
 'temp': 0.02922256565895487,
 'humidity': 0.15183550136234159,
 'windy': 0.04812703040826949}

In [44]:
# Key of the entry in `scores` with the largest value.
max(scores, key=scores.get)

'outlook'