In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
df = pd.read_csv("play_tennis.csv")
df.head()

Unnamed: 0,day,outlook,temp,humidity,wind,play
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes


In [3]:
print(f'Rows: {df.shape[0]}, Columns: {df.shape[1]}')
print(df.columns)

Rows: 14, Columns: 6
Index(['day', 'outlook', 'temp', 'humidity', 'wind', 'play'], dtype='object')


In [4]:
def find_entropy(df):
    target = df.keys()[-1]
    entropy = 0
    values = df[target].unique()
    for value in values:
        fraction = df[target].value_counts()[value] / len(df[target])
        entropy += -fraction * np.log2(fraction)
    return entropy


In [8]:
def average_information(df, attribute):
    target = df.keys()[-1]
    target_variables = df[target].unique()
    variables = df[attribute].unique()

    entropy2 = 0
    for variable in variables:
        entropy = 0
        for target_variable in target_variables:
            num = len(df[attribute][df[attribute] == variable][df[target] == target_variable])
            den = len(df[attribute][df[attribute] == variable])
            eps = 1e-10
            fraction = num / (den + eps)
            entropy += -fraction * math.log(fraction + eps)

        fraction2 = den / len(df)
        entropy2 += -fraction2 * entropy

    return abs(entropy2)



In [9]:
def find_winner(df):
    IG = []
    for key in df.keys()[:-1]:
        IG.append(find_entropy(df) - average_information(df, key))
    return df.keys()[:-1][np.argmax(IG)]


In [10]:
def get_subtable(df, node, value):
    return df[df[node] == value].reset_index(drop=True)


In [11]:
def build_tree(df, tree=None):
    target = df.keys()[-1]
    node = find_winner(df)
    attValue = np.unique(df[node])

    if tree is None:
        tree = {}
        tree[node] = {}

    for value in attValue:
        subtable = get_subtable(df, node, value)
        clValue, counts = np.unique(subtable[target], return_counts=True)

        if len(counts) == 1:
            tree[node][value] = clValue[0]
        else:
            tree[node][value] = build_tree(subtable)

    return tree


In [12]:
tree = build_tree(df)

import pprint
pprint.pprint(tree)

{'day': {'D1': 'No',
         'D10': 'Yes',
         'D11': 'Yes',
         'D12': 'Yes',
         'D13': 'Yes',
         'D14': 'No',
         'D2': 'No',
         'D3': 'Yes',
         'D4': 'Yes',
         'D5': 'Yes',
         'D6': 'No',
         'D7': 'Yes',
         'D8': 'No',
         'D9': 'Yes'}}
