In [87]:
import pandas as pd
# Load the dataset from 'tennis.csv' (a relative path, so it is resolved
# against the current working directory) and print the whole frame.
tennis=pd.read_csv('tennis.csv')
print(tennis)

     outlook  temp humidity    wind play
0      sunny   hot     high    Weak   no
1      sunny   hot     high  Strong   no
2   overcast   hot     high    Weak  yes
3      rainy  mild     high    Weak  yes
4      rainy  cool   normal    Weak  yes
5      rainy  cool   normal  Strong   no
6   overcast  cool   normal  Strong  yes
7      sunny  mild     high    Weak   no
8      sunny  cool   normal    Weak  yes
9      rainy  mild   normal    Weak  yes
10     sunny  mild   normal  Strong  yes
11  overcast  mild     high  Strong  yes
12  overcast   hot   normal    Weak  yes
13     rainy  mild     high  Strong   no


In [88]:
def entropy(probs):
    """Return the Shannon entropy, in nats, of a discrete distribution.

    Parameters
    ----------
    probs : iterable of float
        Probability of each outcome. The values are used as given; they are
        not checked or normalised to sum to 1.

    Returns
    -------
    float or int
        The sum of ``-p * ln(p)`` over every non-zero ``p``. An empty input
        (or one containing only zeros) yields ``0``.

    Raises
    ------
    ValueError
        If any probability is negative (raised by ``math.log``).
    """
    import math
    # An outcome with probability 0 contributes nothing to the entropy
    # (p * log p -> 0 as p -> 0). Skipping it also avoids math.log(0),
    # which raises ValueError.
    return sum([-prob*math.log(prob) for prob in probs if prob != 0])

In [89]:
def entropy_of_list(a_list):
    """Entropy of the empirical distribution of the values in ``a_list``.

    Each distinct value's relative frequency (its count divided by
    ``len(a_list)``) is used as its probability, and the resulting list of
    probabilities is passed to ``entropy``.
    """
    from collections import Counter
    total = float(len(a_list))
    # Count by iterating the items, in first-seen order.
    tally = Counter(iter(a_list))
    return entropy([count / total for count in tally.values()])

In [90]:
def info_gain(df,split_attr,target_attr):
    """Information gain on ``target_attr`` from splitting ``df`` by ``split_attr``.

    The result is the entropy of the target column over the whole frame minus
    the sum, over the groups produced by ``df.groupby(split_attr)``, of each
    group's target entropy weighted by the fraction of rows in that group.
    """
    total_rows = float(len(df))

    def share_of_rows(group):
        # Fraction of all rows of df that fall into this group.
        return len(group) / total_rows

    grouped = df.groupby(split_attr)
    per_group = grouped.agg({target_attr: [entropy_of_list, share_of_rows]})[target_attr]
    per_group.columns = ['entropy', 'obs']

    weighted = per_group['entropy'] * per_group['obs']
    entropy_before = entropy_of_list(df[target_attr])
    entropy_after = sum(weighted)
    return entropy_before - entropy_after

In [91]:
def id3(df,target_attr,attr_names,default_class=None):
    """Recursively build an ID3 decision tree as nested dicts.

    Parameters
    ----------
    df : pandas.DataFrame
        Training examples. Must contain the ``target_attr`` column and, when
        a split is needed, every column named in ``attr_names``.
    target_attr : column label
        Column holding the class label to predict.
    attr_names : list
        Columns that may still be used to split the examples.
    default_class : optional
        Label returned when ``df`` holds no examples.

    Returns
    -------
    A class label when the node is a leaf, otherwise a dict of the form
    ``{best_attr: {attr_value: subtree, ...}}``.
    """
    from collections import Counter
    cnt=Counter(x for x in df[target_attr])

    if not cnt:
        # No examples reached this node: fall back to the caller's default.
        return default_class
    if len(cnt)==1:
        # Every example has the same label: pure leaf.
        return next(iter(cnt))

    # Most frequent label among the current examples. (max(cnt) would pick
    # the lexicographically largest label instead of the most common one.)
    majority_class=cnt.most_common(1)[0][0]
    if not attr_names:
        # Nothing left to split on but the labels are mixed: predict the
        # majority label of these examples.
        return majority_class

    # Split on the attribute with the highest information gain; on ties the
    # first one in attr_names wins.
    gains=[info_gain(df,attr,target_attr) for attr in attr_names]
    best_attr=attr_names[gains.index(max(gains))]
    tree={best_attr:{}}
    remaining_attr=[i for i in attr_names if i!=best_attr]
    for attr_val,data_subset in df.groupby(best_attr):
        tree[best_attr][attr_val]=id3(data_subset,target_attr,remaining_attr,majority_class)
    return tree

In [92]:
from pprint import pprint
# Build the decision tree for the 'play' column of `tennis`, allowing splits
# on the four listed attribute columns, then pretty-print the result.
tree=id3(tennis,'play',['outlook','temp','humidity','wind'])
pprint(tree)

{'outlook': {'overcast': 'yes',
             'rainy': {'wind': {'Strong': 'no', 'Weak': 'yes'}},
             'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}


In [93]:
def classify(instance,tree,default=None):
    """Predict a label for ``instance`` by walking a nested-dict decision tree.

    Parameters
    ----------
    instance : mapping-like
        Supports ``instance[attribute]`` lookup (for example a dict or a
        DataFrame row).
    tree : dict or label
        Either ``{attribute: {value: subtree_or_label, ...}}`` or a bare
        label, which is returned unchanged.
    default : optional
        Returned when the tree is an empty dict or when, at any depth, the
        instance's value for the tested attribute has no branch.

    Returns
    -------
    The label stored at the leaf that is reached, or ``default``.

    Raises
    ------
    KeyError
        If ``instance`` lacks an attribute the tree tests (raised by the
        lookup on ``instance``).
    """
    if not isinstance(tree,dict):
        # The tree is a single leaf (bare label).
        return tree
    if not tree:
        return default
    attribute=next(iter(tree))
    branches=tree[attribute]
    value=instance[attribute]
    if value not in branches:
        return default
    result=branches[value]
    if isinstance(result,dict):
        # Pass `default` down so an unseen value deeper in the tree also
        # falls back to it rather than to None.
        return classify(instance,result,default)
    return result


# Backward-compatible alias for the original, misspelled name.
clssify=classify

In [94]:
# Classify every row with `tree`. Rows whose attribute value has no branch
# fall back to 'no', written in lower case to match the labels in 'play'.
tennis['predicted'] = tennis.apply(classify, axis=1, args=(tree,'no') )

In [95]:
# Fraction of rows whose prediction equals the actual 'play' label. The tree
# was built from these same rows, so this is accuracy on the training data.
print(sum(tennis['predicted']==tennis['play'])/len(tennis))

1.0


In [96]:
# Show predicted and actual labels side by side.
tennis[['predicted','play']]

Unnamed: 0,predicted,play
0,no,no
1,no,no
2,yes,yes
3,yes,yes
4,yes,yes
5,no,no
6,yes,yes
7,no,no
8,yes,yes
9,yes,yes


In [113]:
# Hold out the last five rows for testing and train on the rest.
# Columns are picked by name rather than by position, so the split does not
# depend on 'predicted' happening to be the last column of `tennis`.
feature_names=['outlook','temp','humidity','wind']
split_columns=feature_names+['play']
# .copy() makes both frames independent of `tennis`, so adding 'predicted2'
# below writes to test_data itself and not to a slice of another frame.
train_data=tennis[split_columns].iloc[:-5].copy()
test_data=tennis[split_columns].iloc[-5:].copy()
train_tree=id3(train_data,'play',feature_names)
# The fallback label is lower case so it can match the values in 'play'.
test_data['predicted2'] = test_data.apply(classify, axis=1, args=(train_tree,'yes') )

In [114]:
# Fraction of the held-out rows whose 'predicted2' value equals the actual
# 'play' label.
print(sum(test_data['predicted2']==test_data['play'])/len(test_data))

0.8


In [116]:
# Show predicted and actual labels for the held-out rows side by side.
test_data[['predicted2','play']]

Unnamed: 0,predicted2,play
9,yes,yes
10,no,yes
11,yes,yes
12,yes,yes
13,no,no
