In [1]:
import pandas as pd
# Load the PlayTennis data set; the 'PlayTennis' column holds the class label.
df_tennis = pd.read_csv('PlayTennis.csv')
# Optional: keep only the first 11 rows when checking accuracy on the rest.
# df_tennis = df_tennis[:11]
print(df_tennis)

# Preview how groupby partitions the rows: Outlook value -> row labels.
group = df_tennis.groupby('Outlook')
print(group.groups)

# To inspect every partition individually:
# for name, g in group:
#     print(name)
#     print(g)

# Number of rows in the data set.
print(df_tennis.shape[0])

   PlayTennis   Outlook Temperature Humidity    Wind
0          No     Sunny         Hot     High    Weak
1          No     Sunny         Hot     High  Strong
2         Yes  Overcast         Hot     High    Weak
3         Yes      Rain        Mild     High    Weak
4         Yes      Rain        Cool   Normal    Weak
5          No      Rain        Cool   Normal  Strong
6         Yes  Overcast        Cool   Normal  Strong
7          No     Sunny        Mild     High    Weak
8         Yes     Sunny        Cool   Normal    Weak
9         Yes      Rain        Mild   Normal    Weak
10        Yes     Sunny        Mild   Normal  Strong
11        Yes  Overcast        Mild     High  Strong
12        Yes  Overcast         Hot   Normal    Weak
13         No      Rain        Mild     High  Strong
{'Overcast': [2, 6, 11, 12], 'Rain': [3, 4, 5, 9, 13], 'Sunny': [0, 1, 7, 8, 10]}
14


In [2]:
#Function to calculate the entropy of collection S
import math
def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        probs: iterable of probabilities, one per outcome.

    Returns:
        The sum of ``-p * log2(p)`` over ``probs``; 0 for an empty iterable.

    Raises:
        ValueError: if a probability is negative (``math.log`` domain error).
    """
    # p * log2(p) tends to 0 as p -> 0, so a zero-probability outcome adds
    # nothing; skipping it also avoids the ValueError from math.log(0, 2).
    return sum(-prob * math.log(prob, 2) for prob in probs if prob != 0)

def entropy_of_list(ls):
    """Return the entropy, in bits, of the labels contained in ``ls``.

    The relative frequency of each distinct label is used as its probability
    and the resulting distribution is passed to ``entropy``. The labels are
    echoed to stdout first as a trace of every entropy evaluation.

    Args:
        ls: sized iterable of class labels (e.g. a pandas Series).

    Returns:
        The value returned by ``entropy`` for the label frequencies.
    """
    from collections import Counter

    print('entropy----------\n', ls)

    n_labels = len(ls)
    # Absolute number of occurrences of each distinct label.
    label_counts = Counter(label for label in ls)

    probs = []
    for occurrences in label_counts.values():
        probs.append(occurrences / n_labels)
    return entropy(probs)

In [3]:
def information_gain(df, split_attr, tg_attr):
    """Return the information gain obtained by splitting ``df`` on ``split_attr``.

    The gain is the entropy of ``tg_attr`` over the whole frame minus the
    size-weighted entropy of ``tg_attr`` within each ``split_attr`` group.
    The per-group table is printed as a trace.

    Args:
        df: DataFrame holding both the split column and the target column.
        split_attr: name of the column to group by.
        tg_attr: name of the target (class label) column.

    Returns:
        Entropy before the split minus the weighted entropy after it.
    """
    total_rows = len(df.index)

    def proportion(values):
        # Share of all rows that fall into this group.
        return len(values) / total_rows

    # One row per split value: entropy of the target inside the group and
    # the group's share of the data set.
    per_value = df.groupby(split_attr).agg(
        {tg_attr: [entropy_of_list, proportion]}
    )[tg_attr]
    per_value.columns = ['Entropy', 'Proportion']
    print('*****aggregate****', per_value)

    weighted_entropies = per_value['Entropy'] * per_value['Proportion']
    entropy_after = sum(weighted_entropies)
    entropy_before = entropy_of_list(df[tg_attr])
    return entropy_before - entropy_after

In [4]:
# default_attr is the attribute the current subset was split on; 'S' marks the root call on the whole collection S
def id3(df, tg_attr, attr, default_class=None, default_attr='S'):
    """Recursively build a decision tree with the ID3 algorithm.

    Args:
        df: DataFrame with the examples for this node.
        tg_attr: name of the target (class label) column.
        attr: list of candidate attribute (column) names. It is not modified.
        default_class: label returned when ``df`` has no rows; recursive calls
            pass the majority class of the parent node.
        default_attr: attribute the parent split on, used only in the trace
            output ('S' for the root call).

    Returns:
        A class label when the node is a leaf, otherwise a nested dict of the
        form ``{best_attr: {attr_value: subtree_or_label, ...}}``.
    """
    from collections import Counter

    pos_neg = Counter(x for x in df[tg_attr])  # occurrences of each class label
    print('***********id3 counter**********\n', pos_neg, len(pos_neg), '\n')

    # Pure node: every example carries the same label.
    if len(pos_neg) == 1:
        return next(iter(pos_neg))

    # No examples at all: fall back to the label supplied by the caller.
    if df.empty:
        return default_class

    # Majority label of this node. most_common counts occurrences, whereas
    # max() over the keys would merely pick the alphabetically last label.
    majority_class = pos_neg.most_common(1)[0][0]

    # Mixed labels but nothing left to split on: label the leaf with the
    # majority of the current examples.
    if not attr:
        return majority_class

    default_class = majority_class
    print(f'default class {default_class}')
    gains = {}
    print('On attribute ', default_attr, attr)

    # Information gain of every remaining candidate attribute.
    for a in attr:
        gains[a] = information_gain(df, a, tg_attr)
        print(f'Information gain of {a}:{gains[a]:0.4f}')

    best_attr = max(gains, key=gains.get)
    print("\nAttribute with the maximum gain is: ", best_attr)

    tree = {best_attr: {}}  # the best attribute becomes this node

    # Build a new list instead of calling attr.remove(): the caller's list
    # stays intact and each branch gets the same set of candidates, rather
    # than losing attributes consumed inside sibling branches.
    remaining_attr = [a for a in attr if a != best_attr]

    for attr_val, data in df.groupby(best_attr):
        subtree = id3(data, tg_attr, remaining_attr, default_class, best_attr)
        tree[best_attr][attr_val] = subtree
        print('best_attr, attr_val', best_attr, attr_val)
        print('Tree', tree)
    return tree

In [5]:
# Predictor names: every column except the class label 'PlayTennis'.
attr = df_tennis.columns.tolist()
print("List of Attributes:", attr)
attr.remove('PlayTennis')  # drop the class label, leaving only predictors
print("Predicting Attributes:", attr)

List of Attributes: ['PlayTennis', 'Outlook', 'Temperature', 'Humidity', 'Wind']
Predicting Attributes: ['Outlook', 'Temperature', 'Humidity', 'Wind']


In [6]:
from pprint import pprint
# Hand id3 its own copy of the predictor list so tree construction can never
# alter the notebook-level attr and this cell gives the same tree when re-run.
tree = id3(df_tennis, 'PlayTennis', list(attr))
print("\n\nThe Resultant Decision Tree is :\n")
pprint(tree)
# The single top-level key of the tree is the attribute chosen at the root.
ba = next(iter(tree))
print("Best Attribute :\n", ba)
print("Tree Keys:\n", tree[ba].keys())

***********id3 counter**********
 Counter({'Yes': 9, 'No': 5}) 2 

default class Yes
On attribute  S ['Outlook', 'Temperature', 'Humidity', 'Wind']
entropy----------
 2     Yes
6     Yes
11    Yes
12    Yes
Name: PlayTennis, dtype: object
entropy----------
 3     Yes
4     Yes
5      No
9     Yes
13     No
Name: PlayTennis, dtype: object
entropy----------
 0      No
1      No
7      No
8     Yes
10    Yes
Name: PlayTennis, dtype: object
*****aggregate****            Entropy  Proportion
Outlook                       
Overcast  0.000000    0.285714
Rain      0.970951    0.357143
Sunny     0.970951    0.357143
entropy----------
 0      No
1      No
2     Yes
3     Yes
4     Yes
5      No
6     Yes
7      No
8     Yes
9     Yes
10    Yes
11    Yes
12    Yes
13     No
Name: PlayTennis, dtype: object
Information gain of Outlook:0.2467
entropy----------
 4    Yes
5     No
6    Yes
8    Yes
Name: PlayTennis, dtype: object
entropy----------
 0      No
1      No
2     Yes
12    Yes
Name: PlayTen

In [7]:
def classify(instance, tree, default=None):
    """Predict the class label of ``instance`` by walking a decision tree.

    Args:
        instance: mapping-like object (dict or DataFrame row) indexed by
            attribute name.
        tree: nested dict ``{attribute: {value: subtree_or_label}}``, or a
            bare label when the tree consists of a single leaf.
        default: label returned when the instance has an attribute value
            that the tree has no branch for, at any depth.

    Returns:
        The label stored in the leaf that is reached, or ``default``.
    """
    # A tree that is not a dict is already a leaf label.
    if not isinstance(tree, dict):
        return tree

    attribute = next(iter(tree))  # attribute tested at this node
    branches = tree[attribute]
    value = instance[attribute]

    if value not in branches:
        return default

    result = branches[value]
    if isinstance(result, dict):
        # Subtree: keep descending and carry the caller's default along so an
        # unseen value deeper in the tree does not silently become None.
        return classify(instance, result, default)
    return result  # leaf label

In [8]:

# Classify the rows from position 11 onward with the tree built above.
# .copy() makes df_new an independent frame, so adding the 'predicted' column
# does not write into a slice of another frame (SettingWithCopyWarning).
df_new = pd.read_csv('PlayTennis.csv').iloc[11:].copy()
# Values the tree has no branch for are reported as '?'.
df_new['predicted'] = df_new.apply(classify, axis=1, args=(tree, '?'))
print(df_new)

   PlayTennis   Outlook Temperature Humidity    Wind predicted
11        Yes  Overcast        Mild     High  Strong       Yes
12        Yes  Overcast         Hot   Normal    Weak       Yes
13         No      Rain        Mild     High  Strong        No


In [9]:
#********************************Just for understanding**********************************
from collections import Counter
""" cnt = Counter(x for x in df_tennis['Wind'])# class of YES /NO
print(cnt)
#iter - returns an iterator object and next elements can be accessed using next()
lst ={'foo','bar','baz'}
print(next(iter(lst)))

print(sum([1,2,3,4]))

dic = {'abc':4}
print(len(dic)) """

# Small numeric frame for trying out column selection and agg().
df = pd.DataFrame(
    {
        'A': [1, 4, 7, 20],
        'B': [2, 5, 8, 30],
        'C': [3, 6, 9, 40],
    }
)
print(df,'\n')

# Selecting a single column yields a Series, not a DataFrame.
dfa = df['A']
print(dfa)

# The same list of functions is applied to every column.
same_for_all_columns = df.agg(['sum', 'min'])
print(same_for_all_columns,'\n')

# A dict applies a different list of functions to each named column.
per_column_functions = df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
print(per_column_functions)

    A   B   C
0   1   2   3
1   4   5   6
2   7   8   9
3  20  30  40 

0     1
1     4
2     7
3    20
Name: A, dtype: int64
      A   B   C
sum  32  45  58
min   1   2   3 

        A     B
sum  32.0   NaN
min   1.0   2.0
max   NaN  30.0
