In [1]:
import numpy as np
import pandas as pd
from numpy import log2 as log

In [2]:
# Toy weather data set: 14 observations, one list per column, in row order.
# Every value is a string, including the 'true'/'false' entries of "windy".
dataset = dict(
    outlook="sunny sunny overcast rainy rainy rainy overcast "
            "sunny sunny rainy sunny overcast overcast rainy".split(),
    humidity="high high high high normal normal normal "
             "high normal normal normal high normal high".split(),
    windy="false true false false false true true "
          "false false false true true false true".split(),
    play="no no yes yes yes no yes no yes yes yes yes yes no".split(),
)

In [3]:
df = pd.DataFrame(dataset)

In [4]:
df

Unnamed: 0,outlook,humidity,windy,play
0,sunny,high,False,no
1,sunny,high,True,no
2,overcast,high,False,yes
3,rainy,high,False,yes
4,rainy,normal,False,yes
5,rainy,normal,True,no
6,overcast,normal,True,yes
7,sunny,high,False,no
8,sunny,normal,False,yes
9,rainy,normal,False,yes


In [5]:
# Class distribution of the target column. Series.value_counts() is used because
# the top-level pd.value_counts() helper is deprecated in recent pandas releases.
df['play'].value_counts()

yes    9
no     5
Name: play, dtype: int64

In [6]:
values = pd.unique(df['play'])

In [7]:
values

array(['no', 'yes'], dtype=object)

In [8]:
df['play'].value_counts()['no']

5

In [9]:
df['play'].value_counts()['yes']

9

In [35]:
# Base-2 entropy of the 'play' column, accumulated one class at a time.
class_counts = df['play'].value_counts()
n_rows = len(df)
target_entropy = 0
for label in values:
    p = class_counts[label] / n_rows
    target_entropy -= p * log(p)

In [36]:
target_entropy

0.9402859586706311

In [12]:
var = df['outlook'].unique()

In [13]:
var

array(['sunny', 'overcast', 'rainy'], dtype=object)

In [14]:
count = df['outlook'].value_counts()

In [15]:
count

rainy       5
sunny       5
overcast    4
Name: outlook, dtype: int64

In [16]:
df.groupby('outlook')['play'].value_counts()

outlook   play
overcast  yes     4
rainy     yes     3
          no      2
sunny     no      3
          yes     2
Name: play, dtype: int64

In [17]:
group = df.groupby('outlook')['play'].value_counts()

In [18]:
# Combine both conditions into one mask. Chaining df[mask1][mask2] applies a
# full-length mask to an already filtered frame, so pandas has to reindex it
# and emits a UserWarning.
df[(df['outlook'] == 'sunny') & (df['play'] == 'yes')]

  """Entry point for launching an IPython kernel.


Unnamed: 0,outlook,humidity,windy,play
8,sunny,normal,False,yes
10,sunny,normal,True,yes


In [19]:
# One .loc call with a combined row mask and the column label replaces the
# chained df[mask1][mask2]['outlook'] lookup (which triggers a reindex warning).
df.loc[(df['outlook'] == 'sunny') & (df['play'] == 'yes'), 'outlook']

  """Entry point for launching an IPython kernel.


8     sunny
10    sunny
Name: outlook, dtype: object

In [21]:
target_var = pd.unique(df['play'])

In [22]:
var

array(['sunny', 'overcast', 'rainy'], dtype=object)

In [33]:
# Weighted average entropy of 'play' after splitting the data on 'outlook'.
avg = 0
for v in var:
    in_group = df['outlook'] == v
    # The group size is the same for every target class, so compute it once.
    y = int(in_group.sum())
    entropy = 0
    for target in target_var:
        # A single combined mask replaces the chained df[...][...] lookup, which
        # made pandas reindex the second mask and emit a UserWarning.
        x = int((in_group & (df['play'] == target)).sum())
        print(x, y)
        e = x / y
        # eps keeps log() finite when a class is absent from the group (e == 0).
        entropy += -e * log(e + np.finfo(float).eps)
    # Weight each group's entropy by its share of all rows.
    attr = y / len(df)
    avg += attr * entropy

3 5
2 5
0 4
4 4
2 5
3 5


  """


In [34]:
entropy, avg

(0.970950594454668, 0.6935361388961914)

In [38]:
target_entropy - avg

0.24674981977443977

In [50]:
def calculate_avg(df, feature):
    """Return the weighted average entropy of ``play`` after splitting on ``feature``.

    The rows of ``df`` are grouped by the distinct values of ``feature``. For
    each group the base-2 entropy of its ``play`` labels is computed, and the
    group entropies are averaged using each group's share of the rows as weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``feature`` and a ``play`` column.
    feature : str
        Name of the column to split on.

    Returns
    -------
    float
        Weighted average entropy in bits (``0`` for an empty frame).
    """
    total_rows = len(df)
    avg = 0
    # sort=False keeps the groups in order of first appearance.
    for _, labels in df.groupby(feature, sort=False)['play']:
        # value_counts only reports classes that occur in this group, so no
        # log(0) can arise and no epsilon padding is needed. The classes are
        # taken from the data itself instead of a notebook-level global.
        probs = labels.value_counts(normalize=True)
        entropy = -(probs * log(probs)).sum()
        avg += (len(labels) / total_rows) * entropy
    return avg

In [51]:
calculate_avg(df,'outlook')

  import sys


0.6935361388961914

In [52]:
calculate_avg(df,'windy')

  import sys


0.892158928262361

In [53]:
calculate_avg(df,'humidity')

  import sys


0.7884504573082889

In [54]:
# Split entropy for every column except the last one ('play' in this frame).
entropy_attributes = {
    column: calculate_avg(df, column) for column in df.columns[:-1]
}

  import sys


In [55]:
entropy_attributes

{'outlook': 0.6935361388961914,
 'humidity': 0.7884504573082889,
 'windy': 0.892158928262361}

In [58]:
# Information gain per feature: target entropy minus the entropy left after the split.
gain = {
    feature: target_entropy - split_entropy
    for feature, split_entropy in entropy_attributes.items()
}

In [59]:
gain

{'outlook': 0.24674981977443977,
 'humidity': 0.15183550136234225,
 'windy': 0.048127030408270155}

In [60]:
max(gain.items(), key = lambda x : x[1])

('outlook', 0.24674981977443977)

In [61]:
gain.items()

dict_items([('outlook', 0.24674981977443977), ('humidity', 0.15183550136234225), ('windy', 0.048127030408270155)])

# Complete Decision Tree Algorithm (ID3)

In [63]:
def calculate_avg(df, feature):
    """Return the weighted average entropy of ``play`` after splitting on ``feature``.

    The rows of ``df`` are grouped by the distinct values of ``feature``. For
    each group the base-2 entropy of its ``play`` labels is computed, and the
    group entropies are averaged using each group's share of the rows as weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``feature`` and a ``play`` column.
    feature : str
        Name of the column to split on.

    Returns
    -------
    float
        Weighted average entropy in bits (``0`` for an empty frame).
    """
    total_rows = len(df)
    avg = 0
    # sort=False keeps the groups in order of first appearance.
    for _, labels in df.groupby(feature, sort=False)['play']:
        # value_counts only reports classes that occur in this group, so no
        # log(0) can arise and no epsilon padding is needed. The classes are
        # taken from the data itself instead of a notebook-level global.
        probs = labels.value_counts(normalize=True)
        entropy = -(probs * log(probs)).sum()
        avg += (len(labels) / total_rows) * entropy
    return avg

def calculate_entropy(data=None, target='play'):
    """Return the base-2 entropy of the ``target`` column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame whose target column is measured. When omitted, the notebook-level
        ``df`` is used, which keeps the original zero-argument call working.
    target : str, default 'play'
        Name of the label column.

    Returns
    -------
    float
        Entropy in bits; ``0.0`` when the column holds a single class or is empty.
    """
    if data is None:
        # Backward-compatible fallback to the global frame.
        data = df
    # Only classes that actually occur are reported, so log(0) cannot arise.
    probs = data[target].value_counts(normalize=True)
    entropy = -(probs * log(probs)).sum()
    # Adding 0.0 turns the -0.0 produced by a pure column into a plain 0.0.
    return entropy + 0.0

def find_node(df):
    """Return the name of the feature column with the highest information gain.

    Every column except the last one is treated as a candidate feature; the
    labels are read from the ``play`` column. Gain is the entropy of ``play``
    within ``df`` minus the weighted split entropy from ``calculate_avg``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame (or subset) to evaluate; must contain a ``play`` column.

    Returns
    -------
    str
        Column name with the largest gain. On ties the earliest column wins.

    Raises
    ------
    ValueError
        If ``df`` has no candidate feature column.
    """
    features = list(df.columns[:-1])
    if not features:
        raise ValueError("find_node needs at least one feature column before the target column")

    entropy_attributes = {feature: calculate_avg(df, feature) for feature in features}

    # Entropy of the frame that was passed in, not of the notebook-level frame,
    # so the gain of a subset is measured against that subset.
    probs = df['play'].value_counts(normalize=True)
    target_entropy = -(probs * log(probs)).sum()

    gain = {
        feature: target_entropy - split_entropy
        for feature, split_entropy in entropy_attributes.items()
    }
    # max() returns the first key among equal gains.
    return max(gain, key=gain.get)

def dropData(df, node, value):
    """Return the rows of ``df`` whose ``node`` column equals ``value``.

    Only rows are filtered; the ``node`` column itself stays in the result.
    The returned frame gets a fresh 0-based index.
    """
    matching_rows = df.loc[df[node] == value]
    return matching_rows.reset_index(drop=True)

def buildTree(df, tree=None):
    """Recursively build a decision tree as nested dictionaries.

    The result has the shape ``{feature: {value: subtree_or_label, ...}}``.
    The split feature comes from ``find_node``. A branch becomes a leaf when
    all of its rows share one ``play`` label, or when no feature column is
    left to split on; in the latter case the most frequent label is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the ``play`` label column.
    tree : dict, optional
        Existing dictionary to add the new node to. A new one is created
        when omitted.

    Returns
    -------
    dict
        The dictionary holding the node built for ``df``.
    """
    node = find_node(df)
    if tree is None:
        tree = {}
    # setdefault also creates the branch container when the caller supplies
    # its own dict; previously that path raised KeyError.
    branches = tree.setdefault(node, {})

    for val in df[node].unique():
        sub_df = dropData(df, node, val)
        targets, count = np.unique(sub_df['play'], return_counts=True)
        if len(targets) == 1:
            branches[val] = targets[0]
            continue

        # Remove the column just used. It is constant within this branch, and
        # keeping it let rows with identical features but different labels
        # recurse on the same data forever.
        remaining = sub_df.drop(columns=[node])
        if remaining.shape[1] <= 1:
            # Only the label column is left: fall back to the majority label
            # (argmax picks the first label on a tie).
            branches[val] = targets[np.argmax(count)]
        else:
            branches[val] = buildTree(remaining)
    return tree

In [64]:
buildTree(df)

  import sys


{'outlook': {'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}},
  'overcast': 'yes',
  'rainy': {'windy': {'false': 'yes', 'true': 'no'}}}}