In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy 14-observation weather dataset, stored column-wise; "play" is the class label.
data = {
    "outlook": [
        "sunny", "sunny", "overcast", "rainy", "rainy", "rainy", "overcast",
        "sunny", "sunny", "rainy", "sunny", "overcast", "overcast", "rainy",
    ],
    "temp": [
        "hot", "hot", "hot", "mild", "cool", "cool", "cool",
        "mild", "cool", "mild", "mild", "mild", "hot", "mild",
    ],
    "humidity": [
        "high", "high", "high", "high", "normal", "normal", "normal",
        "high", "normal", "normal", "normal", "high", "normal", "high",
    ],
    # NOTE: kept as the strings "false"/"true", not Python booleans.
    "windy": [
        "false", "true", "false", "false", "false", "true", "true",
        "false", "false", "false", "true", "true", "false", "true",
    ],
    "play": [
        "no", "no", "yes", "yes", "yes", "no", "yes",
        "no", "yes", "yes", "yes", "yes", "yes", "no",
    ],
}

In [3]:
# Build the working frame: one column per key of `data`, one row per observation.
df = pd.DataFrame(data)

In [4]:
# Preview the first rows to sanity-check the frame.
df.head()

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [5]:
# Class balance of the target column.
# Uses Series.value_counts: the top-level pd.value_counts helper is deprecated.
df['play'].value_counts()

yes    9
no     5
Name: play, dtype: int64

In [6]:
# Distinct class labels of the target column, as a NumPy array.
target_values = df['play'].unique()

In [7]:
# Show the distinct target labels found above.
target_values

array(['no', 'yes'], dtype=object)

In [8]:
# Step 1: Shannon entropy (in bits) of the target column 'play'.
n = len(df)  # total number of rows
entropy = 0
for target in target_values:
    # share of rows carrying this class label, i.e. p(c)
    share = (df['play'] == target).sum() / n
    # accumulate -p(c) * log2(p(c))
    entropy -= share * np.log2(share)

In [9]:
# Entropy of the target column in bits (log base 2).
entropy

0.9402859586706311

In [10]:
# Row count per outlook value.
# Uses Series.value_counts: the top-level pd.value_counts helper is deprecated.
df['outlook'].value_counts()

sunny       5
rainy       5
overcast    4
Name: outlook, dtype: int64

In [11]:
# Target class counts within each outlook value.
df.groupby('outlook')['play'].value_counts()

outlook   play
overcast  yes     4
rainy     yes     3
          no      2
sunny     no      3
          yes     2
Name: play, dtype: int64

In [12]:
# Target class counts for the rows where outlook is 'rainy'.
df.loc[df['outlook'] == 'rainy', 'play'].value_counts()

yes    3
no     2
Name: play, dtype: int64

In [13]:
def calc_info_gain(col_name, data=None, target='play'):
    """Return the weighted conditional entropy of ``target`` given ``col_name``, in bits.

    NOTE: despite its name, this function does not return the information
    gain itself. It returns ``sum_v (|S_v| / |S|) * H(target | col_name == v)``;
    the caller obtains the gain by subtracting this value from the entropy of
    the target column.

    Parameters
    ----------
    col_name : str
        Feature column to split on, e.g. ``"outlook"``.
    data : pandas.DataFrame, optional
        Frame containing both ``col_name`` and ``target``. Defaults to the
        notebook-level ``df``.
    target : str, default 'play'
        Name of the class-label column.

    Returns
    -------
    float
        Weighted average, over the distinct values of ``col_name``, of the
        entropy of ``target`` inside each subset.
    """
    frame = df if data is None else data
    # Derive the row count from the frame itself instead of relying on a
    # global `n` set in another cell, which can go stale or be undefined.
    n_rows = len(frame)

    weighted_entropy = 0
    # e.g. values = ["sunny", "overcast", "rainy"] for col_name = "outlook"
    for value in pd.unique(frame[col_name]):
        # class counts of the target inside this subset, e.g. yes=3, no=2
        class_counts = frame.loc[frame[col_name] == value, target].value_counts()
        # skip zero counts (possible with categorical targets): 0 * log2(0) would be NaN
        class_counts = class_counts[class_counts > 0]
        subset_size = class_counts.sum()

        probs = class_counts / subset_size
        subset_entropy = -(probs * np.log2(probs)).sum()

        # weight the subset's entropy by the fraction of rows it covers
        weighted_entropy += subset_entropy * (subset_size / n_rows)
    return weighted_entropy

In [14]:
# Weighted conditional entropy of 'play' given 'outlook' (not yet the gain itself).
calc_info_gain('outlook')

0.6935361388961918

In [15]:
# Weighted conditional entropy of 'play' given 'temp' (not yet the gain itself).
calc_info_gain('temp')

0.9110633930116763

In [16]:
# Weighted conditional entropy of 'play' given 'humidity' (not yet the gain itself).
calc_info_gain('humidity')

0.7884504573082896

In [17]:
# Weighted conditional entropy of 'play' given 'windy' (not yet the gain itself).
calc_info_gain('windy')

0.8921589282623617

In [18]:
# Information gain of every feature: target entropy minus the weighted
# conditional entropy returned by calc_info_gain. The last column is
# skipped because it is taken to be the target.
columns = df.columns
scores = {
    feature: entropy - calc_info_gain(feature)
    for feature in columns[:-1]
}

In [19]:
# Information gain per feature (target entropy minus conditional entropy).
scores

{'outlook': 0.24674981977443933,
 'temp': 0.02922256565895487,
 'humidity': 0.15183550136234159,
 'windy': 0.04812703040826949}

In [20]:
# Feature with the largest information gain.
max(scores, key=scores.get)

'outlook'