In [42]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
df_tennis=pd.read_csv('tennis.csv')

In [43]:
df_tennis

Unnamed: 0,outlook,temp,humidity,windy,PlayTennis
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [44]:
from collections import Counter
def entropy_list(a_list):
    cnt=Counter(x for x in a_list)
    num_instance=len(a_list)*1.0
    probs=[x/num_instance for x in cnt.values()]
    return entropy(probs)

In [45]:
import math
def entropy(probs):
    return sum([-prob*math.log(prob,2) for prob in probs])

In [46]:
def info_gain(df,split,target,trace=0):
    df_split=df.groupby(split)
    nobs=len(df.index)*1.0
    df_agg_ent=df_split.agg({target:[entropy_list,lambda x:len(x)/nobs]})
    print(df_agg_ent)
    df_agg_ent.columns=['Entropy','PropObserved']
    new_entropy=sum(df_agg_ent['Entropy']*df_agg_ent["PropObserved"])
    old_entropy=entropy_list(df[target])
    return old_entropy-new_entropy

In [47]:
def id3(df,target,attribute_name,default_class=None):
    cnt = Counter(x for x in df[target])
    if len(cnt)==1:
        return next(iter(cnt))
    elif df.empty or (not attribute_name):
        return default_class
    else:
        default_class=max(cnt.keys())
        gains=[info_gain(df,attr,target)for attr in attribute_name]
        index_max=gains.index(max(gains))
        best_attr=attribute_name[index_max]
        tree={best_attr:{}}
        remaining_attr=[x for x in attribute_name if x!=best_attr]
        for attr_val,data_subset in df.groupby(best_attr):
            subtree=id3(data_subset,target,remaining_attr,default_class)
            tree[best_attr][attr_val]=subtree
            return tree

In [48]:
def classify(instance,tree,default=None):
    attribute=next(iter(tree))
    if instance[attribute]in tree[attribute].keys():
        result=tree[attribute][instance[attribute]]
        if isinstance(result,dict):
            return classify(instance,result)
        else:
            return result
    else:
        return default

In [49]:
attribute_names=list(df_tennis.columns)
attribute_names.remove('PlayTennis')
tree=id3(df_tennis,'PlayTennis',attribute_names)
print("\n\nThe Resultant Decision Tree is:\n")
print(tree)

           PlayTennis           
         entropy_list <lambda_0>
outlook                         
overcast     0.000000   0.285714
rainy        0.970951   0.357143
sunny        0.970951   0.357143
       PlayTennis           
     entropy_list <lambda_0>
temp                        
cool     0.811278   0.285714
hot      1.000000   0.285714
mild     0.918296   0.428571
           PlayTennis           
         entropy_list <lambda_0>
humidity                        
high         0.985228        0.5
normal       0.591673        0.5
        PlayTennis           
      entropy_list <lambda_0>
windy                        
False     0.811278   0.571429
True      1.000000   0.428571


The Resultant Decision Tree is:

{'outlook': {'overcast': 'yes'}}


In [50]:
training_data=df_tennis.iloc[1:-4]
test_data=df_tennis.iloc[-4:]
train_tree=id3(training_data,'PlayTennis',attribute_names)
print("\n\nThe Resultant Decision train_tree is:\n")
print(train_tree)
test_data['predicted2']=test_data.apply(classify,axis=1,args=(train_tree,'Yes'))
print('\n\nTraining the model for a few samples,and again predicting\'playtennis\'for remaining attribute')
print('The Accuracy for new trained data is:'+str(sum(test_data['PlayTennis']==test_data['predicted2'])/1.0*len(test_data.index)) )

           PlayTennis           
         entropy_list <lambda_0>
outlook                         
overcast     0.000000   0.222222
rainy        0.811278   0.444444
sunny        0.918296   0.333333
       PlayTennis           
     entropy_list <lambda_0>
temp                        
cool     0.811278   0.444444
hot      1.000000   0.222222
mild     0.918296   0.333333
           PlayTennis           
         entropy_list <lambda_0>
humidity                        
high         1.000000   0.444444
normal       0.721928   0.555556
        PlayTennis           
      entropy_list <lambda_0>
windy                        
False     0.650022   0.666667
True      0.918296   0.333333


The Resultant Decision train_tree is:

{'outlook': {'overcast': 'yes'}}


Training the model for a few samples,and again predicting'playtennis'for remaining attribute
The Accuracy for new trained data is:8.0
