In [1]:
import numpy as np
import pandas as pd

In [2]:
data = {
    "outlook" : ['sunny','sunny','overcast','rainy','rainy','rainy','overcast','sunny','sunny','rainy','sunny','overcast','overcast','rainy'],
    "temp" : ['hot','hot','hot','mild','cool','cool','cool','mild','cool','mild','mild','mild','hot','mild'],
    'humidity' : ['high','high','high','high','normal','normal','normal','high','normal','normal','normal','high','normal','high'],
    'windy' : ['false','true','false','false','false','true','true','false','false','false','true','true','false','true'],
    'play' : ['no','no','yes','yes','yes','no','yes','no','yes','yes','yes','yes','yes','no']
}

In [3]:
df = pd.DataFrame(data)

In [4]:
df.head()

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [5]:
pd.value_counts(df['play'])

yes    9
no     5
Name: play, dtype: int64

In [6]:
df['play'].value_counts()['no']

5

In [7]:
values = pd.unique(df['play'])
entropy = 0
n = len(df)
for value in values:
    p_of_c = df['play'].value_counts()[value] / n
    entropy += -p_of_c * np.log2(p_of_c)

In [8]:
entropy

0.9402859586706311

In [9]:
df['outlook'].unique()

array(['sunny', 'overcast', 'rainy'], dtype=object)

In [10]:
df['outlook'].value_counts()

sunny       5
rainy       5
overcast    4
Name: outlook, dtype: int64

In [11]:
df.groupby('outlook')['play'].value_counts()

outlook   play
overcast  yes     4
rainy     yes     3
          no      2
sunny     no      3
          yes     2
Name: play, dtype: int64

In [13]:
df[df['outlook'] == 'rainy']

Unnamed: 0,outlook,temp,humidity,windy,play
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
9,rainy,mild,normal,False,yes
13,rainy,mild,high,True,no


In [17]:
df[df['outlook'] == 'rainy'][df['play'] == 'yes']

  df[df['outlook'] == 'rainy'][df['play'] == 'yes']


Unnamed: 0,outlook,temp,humidity,windy,play
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [18]:
df[df['outlook'] == 'rainy'][df['play'] == 'no']

  df[df['outlook'] == 'rainy'][df['play'] == 'no']


Unnamed: 0,outlook,temp,humidity,windy,play
5,rainy,cool,normal,True,no
13,rainy,mild,high,True,no


In [27]:
def calc_avg(df, feature):
    variables = pd.unique(df[feature])
    target_values = pd.unique(df['play'])
    avg = 0
    for var in variables:
        entropy = 0
        for target in target_values:
            x = len(df[df[feature] == var][df['play'] == target][feature])
            y = len(df[df[feature] == var][feature])
            e = x / (y + np.finfo(float).eps)
            entropy += -e * np.log2(e + np.finfo(float).eps)
        attr = y / len(df)
        avg += attr * entropy
    return avg

In [28]:
calc_avg(df, 'outlook')

  x = len(df[df[feature] == var][df['play'] == target][feature])


0.6935361388961914

In [29]:
# np.finfo(float).eps

In [30]:
calc_avg(df, 'humidity')

  x = len(df[df[feature] == var][df['play'] == target][feature])


0.7884504573082889

In [31]:
calc_avg(df, 'windy')

  x = len(df[df[feature] == var][df['play'] == target][feature])


0.892158928262361

In [32]:
entropy - calc_avg(df, 'outlook')

  x = len(df[df[feature] == var][df['play'] == target][feature])


0.24674981977443977

In [33]:
entropy_attrs = {}
for i in range(len(df.columns) - 1):
    entropy_attrs[df.columns[i]] = calc_avg(df, df.columns[i])

  x = len(df[df[feature] == var][df['play'] == target][feature])


In [34]:
entropy_attrs

{'outlook': 0.6935361388961914,
 'temp': 0.9110633930116756,
 'humidity': 0.7884504573082889,
 'windy': 0.892158928262361}

In [35]:
gain = {}
for key in entropy_attrs:
    gain[key] = entropy - entropy_attrs[key]

In [36]:
gain

{'outlook': 0.24674981977443977,
 'temp': 0.029222565658955535,
 'humidity': 0.15183550136234225,
 'windy': 0.048127030408270155}