In [1]:
import pandas as pd
from pandas import DataFrame
from pprint import pprint
df_tennis = pd.read_csv('data.csv')

In [2]:
attribute_names = list(df_tennis.columns)
attribute_names.remove('Play')

print(attribute_names)

['day', 'outlook', 'temperature', 'humidity', 'wind']


In [3]:
def entropy_of_list(lst):
    from collections import Counter
    count = Counter(x for x in lst)
    num_instances = len(lst)*1.
    probs = [x/num_instances for x in count.values()]
    return entropy(probs)

In [4]:
def entropy(probs):
    import math
    return sum([-prob*math.log(prob,2) for prob in probs])

In [5]:
total_entropy = entropy_of_list(df_tennis['Play'])

In [6]:
def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    df_split = df.groupby(split_attribute_name)
    nobs = len(df.index)*1.
    df_agg_ent = df_split.agg({target_attribute_name:[entropy_of_list, lambda x:len(x)/nobs]})
    df_agg_ent.columns = ['Entropy','propobservations']
    new_entropy = sum(df_agg_ent['Entropy'] * df_agg_ent['propobservations'])
    old_entropy = entropy_of_list(df[target_attribute_name])
    print(split_attribute_name, 'IG :',old_entropy - new_entropy)
    return old_entropy - new_entropy

In [7]:
def id3(df, target_attribute_name, attribute_names, default_class = None):
    from collections import Counter
    count = Counter(x for x in df[target_attribute_name])
    if len(count)==1:
        return next(iter(count))
    elif df.empty or (not attribute_names):
        return default_class
    else:
        default_class = max(count.keys())
        gain = [
            information_gain(df, attr, target_attribute_name) for attr in attribute_names
        ]
        print()
        index_of_max = gain.index(max(gain))
        best_attr = attribute_names[index_of_max]
        
        tree = {best_attr:{}}
        
        remaining_attribute_names = [ i for i in attribute_names if i!= best_attr ]
        
        for attr_val, data_subset in df.groupby(best_attr):
                subtree = id3(data_subset, target_attribute_name, remaining_attribute_names, default_class)
                tree[best_attr][attr_val] = subtree
        
        return tree

In [8]:

tree = id3(df_tennis, 'Play', attribute_names)
print("\n\nThe Resultant Decision Tree is:\n")
pprint(tree)

day IG : 0.9402859586706309
outlook IG : 0.2467498197744391
temperature IG : 0.029222565658954647
humidity IG : 0.15183550136234136
wind IG : 0.04812703040826927



The Resultant Decision Tree is:

{'day': {'d1': 'no',
         'd10': 'yes',
         'd11': 'yes',
         'd12': 'yes',
         'd13': 'yes',
         'd14': 'no',
         'd2': 'no',
         'd3': 'yes',
         'd4': 'yes',
         'd5': 'yes',
         'd6': 'no',
         'd7': 'yes',
         'd8': 'no',
         'd9': 'yes'}}
