In [9]:
import pandas as pd
from collections import Counter
# Load the PlayTennis table; the 'PlayTennis' column is used as the class label
# by the cells that follow.
df_tennis = pd.read_csv('PlayTennis.csv')
# Optional hold-out: uncomment the next line to train on the first 11 rows only.
# NOTE(review): while it stays commented out, the tree is built from every row,
# including rows 11+ that the final cell re-reads and scores, so those
# predictions are not an out-of-sample accuracy check.
#df_tennis=df_tennis[:11] #for checking accuracy


In [10]:
#Function to calculate the entropy of collection S
import math
def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        probs: iterable of probabilities.

    Returns:
        The sum of ``-p * log2(p)`` over the non-zero entries of ``probs``
        (0 when ``probs`` is empty).

    Raises:
        ValueError: if an entry is negative (``math.log`` domain error).
    """
    # A zero-probability outcome contributes nothing (p*log p -> 0 as p -> 0);
    # skipping it also keeps math.log(0) from raising ValueError.
    return sum(-prob * math.log(prob, 2) for prob in probs if prob != 0)

def entropy_of_list(ls):
    """Return the entropy of the values in ``ls``.

    Every distinct value is counted, each count is divided by the number of
    items in ``ls``, and the resulting proportions are passed to ``entropy``.
    """
    n_items = len(ls)
    # Tally how many times each distinct value occurs.
    occurrences = Counter(iter(ls))

    proportions = []
    for count in occurrences.values():
        proportions.append(count / n_items)

    return entropy(proportions)

In [11]:
def information_gain(df, split_attr, tg_attr):
    """Return the information gain obtained by splitting ``df`` on ``split_attr``.

    Args:
        df: DataFrame holding both the split column and the target column.
        split_attr: name of the column to group by.
        tg_attr: name of the target (class) column.

    Returns:
        Entropy of the whole target column minus the size-weighted sum of the
        target entropies of each ``split_attr`` group.
    """
    n_rows = len(df.index)

    # Per group: entropy of the target values, and the group's share of all rows.
    per_value = df.groupby(split_attr).agg(
        {tg_attr: [entropy_of_list, lambda grp: len(grp) / n_rows]}
    )[tg_attr]
    per_value.columns = ['Entropy', 'Proportion']

    entropy_before = entropy_of_list(df[tg_attr])
    weighted_terms = per_value['Entropy'] * per_value['Proportion']
    entropy_after = sum(weighted_terms)

    return entropy_before - entropy_after

In [12]:
def id3(df, tg_attr, attr, default_class=None, default_attr='S'):
    """Recursively build an ID3 decision tree represented as nested dicts.

    Args:
        df: DataFrame of training rows.
        tg_attr: name of the target (class) column.
        attr: list of candidate predictor column names. It is not modified.
        default_class: label returned when ``df`` is empty or no candidate
            attributes remain.
        default_attr: unused; kept so existing callers keep working.

    Returns:
        A class label when the node is a leaf, otherwise
        ``{best_attr: {attribute_value: subtree, ...}}``.
    """
    pos_neg = Counter(x for x in df[tg_attr])  # occurrences of each class label

    # Pure node: only one class is present, so that class is the leaf.
    if len(pos_neg) == 1:
        return next(iter(pos_neg))

    # Nothing to split, or nothing left to split on: fall back to the
    # default handed down by the caller.
    if df.empty or (not attr):
        return default_class

    # Majority class of this node, passed down as the fallback for children.
    # (Previously max() over the keys picked the alphabetically last label
    # regardless of how often it occurred.)
    default_class = max(pos_neg, key=pos_neg.get)

    gains = {a: information_gain(df, a, tg_attr) for a in attr}
    best_attr = max(gains, key=gains.get)

    # Build a fresh list instead of attr.remove(): the caller's list stays
    # intact, and every sibling branch sees the same remaining attributes
    # rather than whatever an earlier branch left behind.
    remaining_attr = [a for a in attr if a != best_attr]

    tree = {best_attr: {}}  # this node, keyed by the chosen attribute
    for attr_val, data in df.groupby(best_attr):
        tree[best_attr][attr_val] = id3(
            data, tg_attr, remaining_attr, default_class, best_attr
        )

    return tree

In [13]:
# Candidate predictors: every column name except the 'PlayTennis' class label.
attr = df_tennis.columns.tolist()
print("List of Attributes:", attr)
attr.remove('PlayTennis')  # drop the class label so only predictors remain
print("Predicting Attributes:", attr)

List of Attributes: ['PlayTennis', 'Outlook', 'Temperature', 'Humidity', 'Wind']
Predicting Attributes: ['Outlook', 'Temperature', 'Humidity', 'Wind']


In [14]:
from pprint import pprint

# Give the tree builder its own copy of the predictor list, so that building
# the tree cannot alter the notebook-level `attr` and re-running this cell
# starts from the same attribute set every time.
tree = id3(df_tennis, 'PlayTennis', list(attr))
print("\n\nThe Resultant Decision Tree is :\n")
pprint(tree)
ba = next(iter(tree))  # the root's single key is the attribute chosen first
print("Best Attribute :\n", ba)
print("Tree Keys:\n", tree[ba].keys())



The Resultant Decision Tree is :

{'Outlook': {'Overcast': 'Yes',
             'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
             'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}
Best Attribute :
 Outlook
Tree Keys:
 dict_keys(['Overcast', 'Rain', 'Sunny'])


In [15]:
def classify(instance, tree, default=None):
    """Predict the class label of ``instance`` by walking a nested-dict tree.

    Args:
        instance: mapping-like row (e.g. a dict or pandas Series) indexed by
            attribute name.
        tree: either ``{attribute: {value: subtree_or_label, ...}}`` or a bare
            label (a tree that is a single leaf).
        default: value returned when the instance has an attribute value that
            the tree has no branch for, at any depth.

    Returns:
        The label reached at a leaf, or ``default`` if no branch matches.
    """
    # A tree that is just a leaf label has nothing to descend into.
    if not isinstance(tree, dict):
        return tree

    attribute = next(iter(tree))  # the attribute tested at this node
    branches = tree[attribute]
    value = instance[attribute]

    if value not in branches:
        return default

    result = branches[value]
    if isinstance(result, dict):
        # Subtree: keep descending, carrying `default` along so an unseen
        # value deeper in the tree also yields the caller's default.
        return classify(instance, result, default)
    return result  # leaf label

In [16]:

# Score the rows from position 11 onward of the original file.
df_new = pd.read_csv('PlayTennis.csv')
# .copy() turns the slice into an independent frame, so adding the 'predicted'
# column below does not assign onto a slice of the full table (which raises
# pandas' SettingWithCopyWarning).
df_new = df_new[11:].copy()
# '?' marks rows whose attribute value has no matching branch in the tree.
df_new['predicted'] = df_new.apply(classify, axis=1, args=(tree, '?'))
print(df_new)

   PlayTennis   Outlook Temperature Humidity    Wind predicted
11        Yes  Overcast        Mild     High  Strong       Yes
12        Yes  Overcast         Hot   Normal    Weak       Yes
13         No      Rain        Mild     High  Strong        No
