In [1]:
import numpy as np
import pandas as pd

In [2]:
data = {
    "outlook" : ['sunny','sunny','overcast','rainy','rainy','rainy','overcast','sunny','sunny','rainy','sunny','overcast','overcast','rainy'],
    "temp" : ['hot','hot','hot','mild','cool','cool','cool','mild','cool','mild','mild','mild','hot','mild'],
    'humidity' : ['high','high','high','high','normal','normal','normal','high','normal','normal','normal','high','normal','high'],
    'windy' : ['false','true','false','false','false','true','true','false','false','false','true','true','false','true'],
    'play' : ['no','no','yes','yes','yes','no','yes','no','yes','yes','yes','yes','yes','no']
}

In [3]:
df = pd.DataFrame(data)

In [4]:
df.head()

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [5]:
df.value_counts('play')

play
yes    9
no     5
dtype: int64

In [6]:
pd.unique(df['play'])

array(['no', 'yes'], dtype=object)

In [7]:
values = pd.unique(df['play'])
entropy = 0
n = len(df)
for value in values:
    x = df['play'].value_counts()[value] / n
    entropy += -x * np.log2(x)

In [8]:
entropy

0.9402859586706311

In [9]:
df['outlook'].unique()

array(['sunny', 'overcast', 'rainy'], dtype=object)

In [10]:
df['outlook'].value_counts()

rainy       5
sunny       5
overcast    4
Name: outlook, dtype: int64

In [11]:
df.groupby('outlook')['play'].value_counts()

outlook   play
overcast  yes     4
rainy     yes     3
          no      2
sunny     no      3
          yes     2
Name: play, dtype: int64

In [12]:
df[df['outlook'] == 'rainy']

Unnamed: 0,outlook,temp,humidity,windy,play
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
9,rainy,mild,normal,False,yes
13,rainy,mild,high,True,no


In [14]:
df[df['outlook'] == 'rainy'][df['play'] == 'no']['outlook']

  df[df['outlook'] == 'rainy'][df['play'] == 'no']['outlook']


5     rainy
13    rainy
Name: outlook, dtype: object

In [17]:
def calc_avg(df, feature):
    target_values = pd.unique(df['play'])
    variables = df[feature].unique()
    avg = 0
    for var in variables:
        entropy = 0
        for target in target_values:
            x = len(df[df[feature] == var][df['play'] == target][feature])
            y = len(df[df[feature] == var][feature])
            e = x / (y + np.finfo(float).eps)
            entropy += -e * np.log2(e + np.finfo(float).eps)
        attr = y / len(df)
        avg += attr * entropy
    return avg

In [18]:
calc_avg(df, 'outlook')

  x = len(df[df[feature] == var][df['play'] == target][feature])


0.6935361388961914

In [19]:
calc_avg(df, 'temp')

  x = len(df[df[feature] == var][df['play'] == target][feature])


0.9110633930116756

In [20]:
calc_avg(df, 'windy')

  x = len(df[df[feature] == var][df['play'] == target][feature])


0.892158928262361

In [21]:
calc_avg(df, 'humidity')

  x = len(df[df[feature] == var][df['play'] == target][feature])


0.7884504573082889

In [22]:
entropy - calc_avg(df, 'outlook')

  x = len(df[df[feature] == var][df['play'] == target][feature])


0.24674981977443977

In [23]:
entropy_attrs = {}
for i in range(len(df.columns) - 1):
    entropy_attrs[df.columns[i]] = calc_avg(df, df.columns[i])

  x = len(df[df[feature] == var][df['play'] == target][feature])


In [24]:
entropy_attrs

{'outlook': 0.6935361388961914,
 'temp': 0.9110633930116756,
 'humidity': 0.7884504573082889,
 'windy': 0.892158928262361}

In [25]:
gain = {}
for key in entropy_attrs:
    gain[key] = entropy - entropy_attrs[key]

In [26]:
gain

{'outlook': 0.24674981977443977,
 'temp': 0.029222565658955535,
 'humidity': 0.15183550136234225,
 'windy': 0.048127030408270155}

In [29]:
def calc_avg(df, feature):
    target_values = pd.unique(df['play'])
    variables = df[feature].unique()
    avg = 0
    for var in variables:
        entropy = 0
        for target in target_values:
            x = len(df[df[feature] == var][df['play'] == target][feature])
            y = len(df[df[feature] == var][feature])
            e = x / (y + np.finfo(float).eps)
            entropy += -e * np.log2(e + np.finfo(float).eps)
        attr = y / len(df)
        avg += attr * entropy
    return avg

def calc_entropy():
    values = pd.unique(df['play'])
    entropy = 0
    n = len(df)
    for value in values:
        x = df['play'].value_counts()[value] / n
        entropy += -x * np.log2(x)
    return entropy

def find_node(df):
    entropy_attrs = {}
    entropy = calc_entropy()
    for i in range(len(df.columns) - 1):
        entropy_attrs[df.columns[i]] = calc_avg(df, df.columns[i])
    
    gain = {}
    for key in entropy_attrs:
        gain[key] = entropy - entropy_attrs[key]
    
    node = max(gain, key=gain.get)
    return node

def dropData(df, node, value):
    return df[df[node] == value].reset_index(drop=True)

def buildTree(df, tree=None):
    node = find_node(df)
    
    if tree is None:
        tree = {}
        tree[node] = {}
    
    values = df[node].unique()
    for val in values:
        sub_df = dropData(df, node, val)
        target, count = np.unique(sub_df['play'], return_counts = True)
        #[['no','yes'][2,3]]
        if len(target) == 1:
            tree[node][val] = target[0]
        else:
            tree[node][val] = buildTree(sub_df)
    return tree

In [30]:
buildTree(df)

  x = len(df[df[feature] == var][df['play'] == target][feature])


{'outlook': {'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}},
  'overcast': 'yes',
  'rainy': {'windy': {'false': 'yes', 'true': 'no'}}}}