In [1]:
import numpy as np
import pandas as pd
from pprint import pprint

In [2]:
# Load the raw data set; "data.csv" is a relative path, so it is resolved
# against the notebook's current working directory.
dataset = pd.read_csv("data.csv")
# Print the complete frame so every row of the input is visible in the output.
print("Dataset is:\n", dataset)

Dataset is:
      Outlook Temperature Humidity    Wind Target
0      sunny         hot     high    weak     no
1      sunny         hot     high  strong     no
2   overcast         hot     high    weak    yes
3       rain        mild     high    weak    yes
4       rain        cool   normal    weak    yes
5       rain        cool   normal  strong     no
6   overcast        cool   normal  strong    yes
7      sunny        mild     high    weak     no
8      sunny        cool   normal    weak    yes
9       rain        mild   normal    weak    yes
10     sunny        mild   normal  strong    yes
11  overcast        mild     high  strong    yes
12  overcast         hot   normal    weak    yes
13      rain        mild     high  strong     no


In [3]:
# Take (at most) the first 80 rows as training data and renumber them from 0.
# The frame printed by the previous cell has 14 rows, so this slice keeps all of them.
training_data = dataset.iloc[:80].reset_index(drop=True)
print(training_data)

     Outlook Temperature Humidity    Wind Target
0      sunny         hot     high    weak     no
1      sunny         hot     high  strong     no
2   overcast         hot     high    weak    yes
3       rain        mild     high    weak    yes
4       rain        cool   normal    weak    yes
5       rain        cool   normal  strong     no
6   overcast        cool   normal  strong    yes
7      sunny        mild     high    weak     no
8      sunny        cool   normal    weak    yes
9       rain        mild   normal    weak    yes
10     sunny        mild   normal  strong    yes
11  overcast        mild     high  strong    yes
12  overcast         hot   normal    weak    yes
13      rain        mild     high  strong     no


In [4]:
def entropy(column):
    """Return the Shannon entropy (base 2) of the values in ``column``.

    Parameters
    ----------
    column : array-like
        Sequence of labels; anything ``np.unique`` accepts.

    Returns
    -------
    number
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. An empty ``column`` yields the integer ``0``.
    """
    _, counts = np.unique(column, return_counts=True)
    # Total number of observations; identical for every distinct value.
    total = np.sum(counts)

    result = 0
    for count in counts:
        share = count / total
        # Accumulate term by term so the summation order is left-to-right.
        result += -share * np.log2(share)
    return result

In [5]:
def _weighted_entropy(data, feature, target):
    """Return the entropy of ``data[target]`` after splitting ``data`` on ``feature``.

    Each distinct value of ``feature`` contributes the entropy of its own
    partition, weighted by the fraction of rows of ``data`` that fall into it.
    """
    # Values and counts come from the subset being split, not from the full
    # training set, so the weights always sum to 1 for this node.
    values, counts = np.unique(data[feature], return_counts=True)
    n_rows = np.sum(counts)

    weighted = 0
    for value, count in zip(values, counts):
        partition_targets = data.loc[data[feature] == value, target]
        weighted += (count / n_rows) * entropy(partition_targets)
    return weighted


def ID3(data, features, most_freq_target_value, target='Target'):
    """Build a decision tree for ``data`` with the ID3 algorithm.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to learn from; must contain the ``target`` column and every
        column named in ``features``.
    features : sequence of str
        Column names that may still be used for splitting.
    most_freq_target_value : object
        Fallback label supplied by the caller (the majority label of the
        parent node, or of the whole training set at the top-level call).
    target : str, default 'Target'
        Name of the label column.

    Returns
    -------
    dict or label
        A leaf is returned as a bare label. An inner node is returned as
        ``{feature: {feature_value: subtree_or_label, ...}}``.
    """
    # Empty partition: nothing to learn from, so use the caller's fallback.
    # This must be tested first because there is no label to index otherwise.
    if len(data) == 0:
        return most_freq_target_value

    unique_target_values, unique_target_values_frequencies = np.unique(
        data[target], return_counts=True
    )

    # Pure node: every row carries the same label.
    if len(unique_target_values) == 1:
        return unique_target_values[0]

    # No feature left to split on: use the parent's majority label.
    # NOTE(review): textbook ID3 labels this leaf with the majority of the
    # *current* rows instead; the parent-based fallback is kept as-is -- confirm intent.
    if len(features) == 0:
        return most_freq_target_value

    # Majority label of this node, handed down as the fallback for its children.
    most_freq_target_value = unique_target_values[np.argmax(unique_target_values_frequencies)]

    # Information gain of a feature = entropy before the split - weighted entropy after it.
    total_entropy = entropy(data[target])
    ig_of_features = [
        total_entropy - _weighted_entropy(data, feature, target)
        for feature in features
    ]

    # The feature with the largest gain becomes the root of this subtree
    # (np.argmax picks the first one on ties).
    best_feature = features[int(np.argmax(ig_of_features))]
    tree = {best_feature: {}}

    # A feature is used at most once along any root-to-leaf path.
    remaining_features = [feature for feature in features if feature != best_feature]

    # Recurse into one partition per value of the chosen feature.
    for sub_feature in np.unique(data[best_feature]):
        sub_data = data[data[best_feature] == sub_feature]
        tree[best_feature][sub_feature] = ID3(
            sub_data, remaining_features, most_freq_target_value, target
        )

    return tree

In [6]:
# Every column except the last one is treated as an input feature.
features = training_data.columns[:-1]
# Name of the label column.
target = 'Target'

# Majority label of the whole training set: np.unique returns the distinct
# labels with their counts, and np.argmax selects the most frequent one
# (the first one in sorted order on ties).
td_unique_target_values, td_unique_target_values_frequencies = np.unique(training_data[target], return_counts=True)
td_most_freq_target_value = td_unique_target_values[np.argmax(td_unique_target_values_frequencies)]
# At the root there is no parent node, so the training-set majority is the initial fallback label.
most_freq_target_value = td_most_freq_target_value

# Build the tree and pretty-print the nested dict.
pprint(ID3(training_data, features, most_freq_target_value))

{'Outlook': {'overcast': 'yes',
             'rain': {'Wind': {'strong': 'no', 'weak': 'yes'}},
             'sunny': {'Humidity': {'high': 'no', 'normal': 'yes'}}}}
