In [1]:
import pandas as pd
import math as mt
from collections import Counter

In [2]:
# Load the weather table from a CSV file located in the working directory.
df = pd.read_csv("Weather - ID3.csv")
# Dump the whole frame as plain text so every row is visible in the output.
print(df)

    id   outlook temperature humidity    wind play
0    1     sunny         hot     high    weak   no
1    2     sunny         hot     high  strong   no
2    3  overcast        mild   normal    weak  yes
3    4     rainy        mild     high    weak  yes
4    5     rainy        cool   normal    weak  yes
5    6     rainy        cool   normal  strong   no
6    7  overcast        cool   normal  strong  yes
7    8     sunny        mild     high    weak   no
8    9     sunny        cool   normal    weak  yes
9   10     rainy        mild   normal    weak  yes
10  11     sunny        mild   normal  strong  yes
11  12  overcast        mild     high  strong  yes
12  13  overcast         hot   normal    weak  yes
13  14     rainy        mild     high  strong   no


In [3]:
def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        probs: Iterable of probabilities, one per outcome.

    Returns:
        The sum of ``-p * log2(p)`` over the non-zero entries of ``probs``.
        An empty iterable yields 0.

    Raises:
        ValueError: If a probability is negative (``math.log`` domain error).
    """
    # A zero-probability outcome contributes nothing (p * log p -> 0 as p -> 0).
    # Skipping it also avoids the ValueError that math.log(0, 2) would raise.
    return sum(-prob * mt.log(prob, 2) for prob in probs if prob != 0)
def entropy_of_list(a_list):
    """Compute the entropy of the empirical value distribution of ``a_list``.

    The relative frequency of each distinct value is used as its probability,
    and the resulting probabilities are passed to ``entropy``.
    """
    total = len(a_list)
    frequencies = Counter(iter(a_list))
    return entropy([count / total for count in frequencies.values()])

In [4]:
def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    """Return the entropy reduction obtained by splitting ``df`` on one column.

    Args:
        df: DataFrame holding both the split column and the target column.
        split_attribute_name: Column whose distinct values define the subsets.
        target_attribute_name: Column whose entropy is measured.
        trace: Accepted but not used by this function.

    Returns:
        Entropy of the target over all rows minus the size-weighted sum of
        the target's entropy within each subset.
    """
    total_rows = len(df.index)
    target_by_value = df.groupby(split_attribute_name)[target_attribute_name]

    # One entry per distinct split value: the subset's entropy and its share of rows.
    subset_entropy = target_by_value.agg(entropy_of_list)
    subset_weight = target_by_value.agg(lambda labels: len(labels) / total_rows)

    entropy_after_split = sum(subset_entropy * subset_weight)
    entropy_before_split = entropy_of_list(df[target_attribute_name])
    return entropy_before_split - entropy_after_split

In [5]:
def id3_algorithm(df, target_attribute_name, attribute_names, default_class=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Args:
        df: DataFrame containing the attribute columns and the target column.
        target_attribute_name: Name of the column holding the class label.
        attribute_names: List of column names that may still be split on.
        default_class: Label returned when ``df`` has no rows; recursive calls
            pass the parent node's majority class here.

    Returns:
        A class label for a leaf, otherwise a dict of the form
        ``{best_attribute: {attribute_value: subtree_or_label, ...}}``.
    """
    cnt = Counter(x for x in df[target_attribute_name])
    # Every row carries the same label: this node is a pure leaf.
    if len(cnt) == 1:
        return next(iter(cnt))
    # No rows at all: nothing to learn from, fall back to the caller's default.
    if df.empty:
        return default_class

    # Majority label of the rows at this node. Counter.most_common picks the
    # most frequent label; max() over the keys would pick the lexicographically
    # largest one instead.
    majority_class = cnt.most_common(1)[0][0]

    # No attributes left to split on: label the leaf with this node's own
    # majority class rather than the parent's default (None at the root).
    if not attribute_names:
        return majority_class

    # Information gain of every candidate attribute; ties go to the first one.
    gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
    best_attr = attribute_names[gainz.index(max(gainz))]

    # Start the tree with the best attribute as its root node.
    tree = {best_attr: {}}
    remaining_attribute_names = [i for i in attribute_names if i != best_attr]

    # Split the rows on the chosen attribute and grow one subtree per value.
    for attr_val, data_subset in df.groupby(best_attr):
        tree[best_attr][attr_val] = id3_algorithm(
            data_subset, target_attribute_name, remaining_attribute_names, majority_class
        )
    return tree

In [6]:
def classify(instance, tree, default=None):
    """Predict a label for ``instance`` by walking a nested-dict decision tree.

    Args:
        instance: Mapping-like object (dict, DataFrame row, ...) indexed by
            attribute name.
        tree: Dict of the form ``{attribute: {value: subtree_or_label}}``.
        default: Label returned when the instance's value for the tested
            attribute has no branch in the tree.

    Returns:
        The label stored at the leaf that is reached, or ``default`` if the
        walk hits an attribute value with no matching branch at any depth.
    """
    attribute = next(iter(tree))
    branches = tree[attribute]
    value = instance[attribute]

    if value not in branches:
        return default

    result = branches[value]
    if isinstance(result, dict):
        # Pass `default` down so an unseen value deeper in the tree still
        # yields the caller's fallback instead of None.
        return classify(instance, result, default)
    return result

In [7]:
# Entropy of the 'play' column over all rows, i.e. before any split is applied.
total_entropy = entropy_of_list(df['play'])
print("Total Entropy of PlayTennis Data Set:",total_entropy)

Total Entropy of PlayTennis Data Set: 0.9402859586706309


In [8]:
# Report the information gain on 'play' for each candidate attribute.
for gain_label, gain_column in (
    ('Info-gain for Outlook is :', 'outlook'),
    ('Info-gain for Humidity is: ', 'humidity'),
    ('Info-gain for Wind is:', 'wind'),
    ('Info-gain for Temperature is:', 'temperature'),
):
    print(gain_label + str(information_gain(df, gain_column, 'play')), "\n")

Info-gain for Outlook is :0.2467498197744391 

Info-gain for Humidity is: 0.23612234796179465 

Info-gain for Wind is:0.04812703040826927 

Info-gain for Temperature is:0.08015424581588682 



In [9]:
attribute_names = df.columns.tolist()
print("List of Attributes:", attribute_names)

# The class label and the row identifier are not used as predictors.
for non_predictor in ('play', 'id'):
    attribute_names.remove(non_predictor)
print("Predicting Attributes:", attribute_names)

# Grow the tree on the full table.
tree = id3_algorithm(df, 'play', attribute_names)
print("Decision tree:\n", tree)

List of Attributes: ['id', 'outlook', 'temperature', 'humidity', 'wind', 'play']
Predicting Attributes: ['outlook', 'temperature', 'humidity', 'wind']
Decision tree:
 {'outlook': {'overcast': 'yes', 'rainy': {'wind': {'strong': 'no', 'weak': 'yes'}}, 'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}


In [10]:
print("Decision tree predict:")
# Classify every row with the tree grown on the full table. The fallback label
# is spelled in lowercase like the labels in 'play', so a fallback prediction
# can actually compare equal to a true label.
df['Predicted'] = df.apply(classify, axis=1, args=(tree, 'no'))
print(df['Predicted'])
# Fraction of rows whose prediction equals the true label.
print('\n Accuracy is:\n' + str((df['play'] == df['Predicted']).mean()))
df[['play', 'Predicted']]

Decision tree predict:
0      no
1      no
2     yes
3     yes
4     yes
5      no
6     yes
7      no
8     yes
9     yes
10    yes
11    yes
12    yes
13     no
Name: Predicted, dtype: object

 Accuracy is:
1.0


Unnamed: 0,play,Predicted
0,no,no
1,no,no
2,yes,yes
3,yes,yes
4,yes,yes
5,no,no
6,yes,yes
7,no,no
8,yes,yes
9,yes,yes


In [11]:
# NOTE(review): iloc[1:10] leaves row 0 out of both the training and the test
# set — confirm this is intended.
training_data = df.iloc[1:10]
# Take an explicit copy: adding a column to a bare slice of df triggers
# pandas' SettingWithCopyWarning.
test_data = df.iloc[-4:].copy()
train_tree = id3_algorithm(training_data, 'play', attribute_names)
# The fallback label is lowercase like the labels in 'play', so a fallback
# prediction can be scored as correct.
test_data['predicted2'] = test_data.apply(classify, axis=1, args=(train_tree, 'yes'))

# Fraction of held-out rows whose prediction equals the true label.
print('\n\n Accuracy is : ' + str((test_data['play'] == test_data['predicted2']).mean()))



 Accuracy is : 0.75


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted2'] = test_data.apply(
