In [53]:
import pandas as pd
# Load the PlayTennis dataset; the file is resolved relative to the working directory.
df_tennis = pd.read_csv('PlayTennis.csv')
# Optional: keep only the first 11 rows for training so that the remaining
# rows can be used to check accuracy.
#df_tennis=df_tennis[:11] #for checking accuracy


In [54]:
# Just for understanding: how Counter and iter()/next() behave.
from collections import Counter

# Counter tallies how often each distinct value occurs in the 'Wind' column.
cnt = Counter(df_tennis['Wind'])
print(cnt)

# iter() returns an iterator object; next() pulls its next element.
# A list is used (not a set) so that "first element" is well defined:
# iteration order of a set of strings varies between interpreter runs.
lst = ['foo', 'bar', 'baz']
print(next(iter(lst)))

Counter({'Weak': 8, 'Strong': 6})
foo


In [55]:
#Function to calculate the entropy of collection S
import math
def entropy(probs):
    """Return the Shannon entropy, in bits, of a probability distribution.

    Parameters
    ----------
    probs : iterable of float
        Probabilities of each class (expected to sum to 1).

    Returns
    -------
    float
        ``sum(-p * log2(p))`` over the non-zero probabilities; 0 for an
        empty input.

    Notes
    -----
    A zero probability contributes nothing to the entropy (the limit of
    ``p * log2(p)`` as ``p -> 0`` is 0), so it is skipped instead of being
    passed to the logarithm, which would raise ``ValueError``. Negative
    values are not filtered and still raise ``ValueError``.
    """
    return sum(-prob * math.log2(prob) for prob in probs if prob != 0)

def entropy_of_list(ls):
    """Return the entropy of the label distribution found in ``ls``.

    Each distinct value in ``ls`` is counted, the counts are turned into
    relative frequencies, and those frequencies are handed to ``entropy``.

    Parameters
    ----------
    ls : sized iterable
        Class labels (e.g. a list or a pandas Series); must support ``len``.

    Returns
    -------
    Whatever ``entropy`` returns for the relative frequencies.
    """
    from collections import Counter

    # Tally occurrences of every distinct label.
    label_counts = Counter(iter(ls))
    n_labels = len(ls)
    # Relative frequency of each label, in first-seen order.
    frequencies = [count / n_labels for count in label_counts.values()]
    return entropy(frequencies)

In [56]:
def information_gain(df, split_attr,tg_attr):
    """Return the information gain obtained by splitting ``df`` on ``split_attr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Examples to split.
    split_attr : column label
        Attribute whose values define the partitions.
    tg_attr : column label
        Target (class) column whose entropy is measured.

    Returns
    -------
    Entropy of the target over all of ``df`` minus the size-weighted sum of
    the target's entropy within each ``split_attr`` group.
    """
    n_rows = len(df.index)

    def share_of_rows(group):
        # Fraction of all rows that fall into this group.
        return len(group) / n_rows

    # One row per attribute value: entropy of the target inside the group and
    # the group's share of the data.
    per_value = df.groupby(split_attr).agg(
        {tg_attr: [entropy_of_list, share_of_rows]}
    )[tg_attr]
    per_value.columns = ['Entropy', 'Proportion']

    weighted_terms = per_value['Entropy'] * per_value['Proportion']
    entropy_after_split = sum(weighted_terms)
    entropy_before_split = entropy_of_list(df[tg_attr])
    return entropy_before_split - entropy_after_split

In [57]:
def id3(df, tg_attr, attr, default_class=None,default_attr='S'):
    """Build a decision tree with the ID3 algorithm.

    Parameters
    ----------
    df : pandas.DataFrame
        Training examples for the current node.
    tg_attr : column label
        Name of the target (class) column.
    attr : sequence of column labels
        Candidate attributes to split on. The sequence is NOT modified;
        each recursive call receives its own reduced list.
    default_class : optional
        Label returned when ``df`` has no rows (the majority class of the
        parent node during recursion).
    default_attr : optional
        Attribute the parent node split on; used only in the progress output.

    Returns
    -------
    A class label for a leaf, or a nested dict
    ``{best_attr: {attr_value: subtree_or_label, ...}}`` for an internal node.
    """
    from collections import Counter

    pos_neg = Counter(x for x in df[tg_attr])  # label -> number of examples
    print(pos_neg,len(pos_neg))

    # Pure node: every example carries the same label.
    if len(pos_neg) == 1:
        return next(iter(pos_neg))

    # No examples at all: fall back to what the caller supplied.
    if df.empty:
        return default_class

    # Majority label of *this* node (most frequent, not alphabetically last).
    default_class = pos_neg.most_common(1)[0][0]

    # Mixed labels but nothing left to split on: predict the majority label.
    if not attr:
        return default_class

    print(f'default class {default_class}')
    gains={}
    print('On attribute ', default_attr, attr)
    for a in attr:
        gains[a] = information_gain(df, a, tg_attr)
        print(f'Information gain of {a}:{gains[a]:0.4f}')

    # Split on the attribute with the highest information gain.
    best_attr = max(gains, key=gains.get)
    print("\nAttribute with the maximum gain is: ", best_attr)

    tree = {best_attr:{}} # Initiate the tree with best attribute as a node

    # Build a fresh list rather than removing in place: an in-place removal
    # would leak into sibling branches (attributes consumed deep inside one
    # branch would be missing for the next) and into the caller's list.
    remaining_attr = [a for a in attr if a != best_attr]

    for attr_val, data in df.groupby(best_attr):
        subtree = id3(data, tg_attr, remaining_attr, default_class, best_attr)
        tree[best_attr][attr_val] = subtree
        print('best_attr, attr_val',best_attr, attr_val)
        print('Tree',tree)
    return tree

In [58]:
# Predictor names: every column of the dataset except the class column.
attr = df_tennis.columns.tolist()
print("List of Attributes:", attr)
attr.remove('PlayTennis')  # drop the class attribute in place
print("Predicting Attributes:", attr)

List of Attributes: ['PlayTennis', 'Outlook', 'Temperature', 'Humidity', 'Wind']
Predicting Attributes: ['Outlook', 'Temperature', 'Humidity', 'Wind']


In [59]:
from pprint import pprint

# Hand id3 its own copy of the attribute list: the tree builder may consume
# attributes as it recurses, and the shared `attr` list must stay intact so
# that re-running this cell produces the same tree.
tree = id3(df_tennis, 'PlayTennis', list(attr))
print("\n\nThe Resultant Decision Tree is :\n")
pprint(tree)

# The root of the tree is its single top-level key.
ba = next(iter(tree))
print("Best Attribute :\n",ba)
print("Tree Keys:\n",tree[ba].keys())

Counter({'Yes': 9, 'No': 5}) 2
default class Yes
On attribute  S ['Outlook', 'Temperature', 'Humidity', 'Wind']
Information gain of Outlook:0.2467
Information gain of Temperature:0.0292
Information gain of Humidity:0.1518
Information gain of Wind:0.0481

Attribute with the maximum gain is:  Outlook
Counter({'Yes': 4}) 1
best_attr, attr_val Outlook Overcast
Tree {'Outlook': {'Overcast': 'Yes'}}
Counter({'Yes': 3, 'No': 2}) 2
default class Yes
On attribute  Outlook ['Temperature', 'Humidity', 'Wind']
Information gain of Temperature:0.0200
Information gain of Humidity:0.0200
Information gain of Wind:0.9710

Attribute with the maximum gain is:  Wind
Counter({'No': 2}) 1
best_attr, attr_val Wind Strong
Tree {'Wind': {'Strong': 'No'}}
Counter({'Yes': 3}) 1
best_attr, attr_val Wind Weak
Tree {'Wind': {'Strong': 'No', 'Weak': 'Yes'}}
best_attr, attr_val Outlook Rain
Tree {'Outlook': {'Overcast': 'Yes', 'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}}}}
Counter({'No': 3, 'Yes': 2}) 2
default c

In [60]:
def classify(instance, tree,default=None):
    """Predict the class of ``instance`` by walking a nested-dict decision tree.

    Parameters
    ----------
    instance : mapping or pandas.Series
        Maps attribute names to the instance's values.
    tree : dict or label
        Either ``{attribute: {value: subtree_or_label, ...}}`` or, for a
        degenerate tree, a bare class label.
    default : optional
        Returned when the instance has a value that the tree has no branch
        for, at any depth.

    Returns
    -------
    The label stored at the leaf that is reached, or ``default``.
    """
    node = tree
    # Descend until a leaf (anything that is not a dict) is reached.
    while isinstance(node, dict):
        attribute = next(iter(node))  # the attribute this node splits on
        branches = node[attribute]
        value = instance[attribute]
        if value not in branches:
            # Unseen attribute value: honour the caller's default at every
            # level, not only at the root.
            return default
        node = branches[value]
    return node

In [61]:

# Classify the rows from position 11 onward with the learned tree.
# NOTE(review): unless the training frame was restricted to its first 11 rows,
# these rows were also seen during training, so this is not a held-out check.
# .copy() makes df_new an independent frame, so adding a column below does not
# write into a slice of the frame returned by read_csv.
df_new = pd.read_csv('PlayTennis.csv').iloc[11:].copy()
# '?' marks instances whose attribute values have no branch in the tree.
df_new['predicted'] = df_new.apply(classify, axis=1, args=(tree,'?'))
print(df_new)

   PlayTennis   Outlook Temperature Humidity    Wind predicted
11        Yes  Overcast        Mild     High  Strong       Yes
12        Yes  Overcast         Hot   Normal    Weak       Yes
13         No      Rain        Mild     High  Strong        No
