In [21]:
import pandas as pd
import numpy as np
from math import log2
import pprint

In [22]:
import pandas as pd

# Toy "play tennis" training set: four categorical weather attributes and the
# PlayTennis label, one tuple per observed day (14 days in total).
# NOTE(review): the first and third rows list Wind='Strong', whereas the commonly
# published PlayTennis table has 'Weak' there (which also makes rows 1 and 2
# identical here) — confirm this is intentional. Values are left unchanged.
data = pd.DataFrame(
    columns=['Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayTennis'],
    data=[
        ('Sunny', 'Hot', 'High', 'Strong', 'No'),
        ('Sunny', 'Hot', 'High', 'Strong', 'No'),
        ('Overcast', 'Hot', 'High', 'Strong', 'Yes'),
        ('Rain', 'Mild', 'High', 'Weak', 'Yes'),
        ('Rain', 'Cool', 'Normal', 'Weak', 'Yes'),
        ('Rain', 'Cool', 'Normal', 'Strong', 'No'),
        ('Overcast', 'Cool', 'Normal', 'Strong', 'Yes'),
        ('Sunny', 'Mild', 'High', 'Weak', 'No'),
        ('Sunny', 'Cool', 'Normal', 'Weak', 'Yes'),
        ('Rain', 'Mild', 'Normal', 'Weak', 'Yes'),
        ('Sunny', 'Mild', 'Normal', 'Strong', 'Yes'),
        ('Overcast', 'Mild', 'High', 'Strong', 'Yes'),
        ('Overcast', 'Hot', 'Normal', 'Weak', 'Yes'),
        ('Rain', 'Mild', 'High', 'Strong', 'No'),
    ],
)


In [23]:
def entropy(target_col):
  """Return the Shannon entropy, in bits, of the values in ``target_col``.

  Each distinct value contributes ``-p * log2(p)``, where ``p`` is the
  fraction of entries holding that value. A column with a single distinct
  value yields 0, and an empty column yields 0.0 (the sum of no terms).
  """
  _, class_counts = np.unique(target_col, return_counts=True)
  n_total = np.sum(class_counts)

  terms = []
  for count in class_counts:
    p = count / n_total
    terms.append(-p * log2(p))

  return np.sum(terms)

In [24]:
def info_gain(data, split_attribute_name, target_name="PlayTennis"):
  """Return the information gain from splitting ``data`` on one attribute.

  The gain is the entropy of the ``target_name`` column minus the
  weighted average entropy of that column within each distinct value of
  ``split_attribute_name``; each weight is the share of rows holding that
  value.
  """
  split_column = data[split_attribute_name]
  levels, level_counts = np.unique(split_column, return_counts=True)
  n_rows = np.sum(level_counts)

  # One weighted entropy term per distinct value of the split attribute.
  weighted_terms = []
  for level, level_count in zip(levels, level_counts):
    targets_in_level = data[split_column == level][target_name]
    weighted_terms.append((level_count / n_rows) * entropy(targets_in_level))

  return entropy(data[target_name]) - np.sum(weighted_terms)

In [25]:
def _majority_class(frame, target_attribute_name, default=None):
    """Return the most frequent value of ``target_attribute_name`` in ``frame``.

    Ties go to the value that sorts first (``np.unique`` returns sorted values
    and ``np.argmax`` picks the first maximum). ``default`` is returned when
    the column holds no values.
    """
    classes, counts = np.unique(frame[target_attribute_name], return_counts=True)
    if len(classes) == 0:
        return default
    return classes[np.argmax(counts)]


def ID3(data, original_data, features, target_attribute_name="PlayTennis", parent_node_class=None):
    """Build a decision tree by recursively splitting on the highest-gain feature.

    Parameters:
        data: DataFrame holding the rows of the current partition.
        original_data: DataFrame used to pick a label when ``data`` is empty.
        features: sequence of column names still available for splitting.
            The caller's sequence is not modified.
        target_attribute_name: name of the label column.
        parent_node_class: majority label of the parent partition; returned
            when no features are left to split on.

    Returns:
        A class label (leaf), or a nested dict of the form
        ``{feature: {feature_value: subtree_or_label, ...}}``.
    """
    # The empty case must be tested first: an empty partition has no unique
    # labels, so a purity check would try to index an empty array.
    if len(data) == 0:
        return _majority_class(original_data, target_attribute_name, parent_node_class)

    # Pure partition: every row carries the same label.
    classes = np.unique(data[target_attribute_name])
    if len(classes) <= 1:
        return classes[0]

    # Majority label of this partition; handed down as the children's fallback.
    majority_class = _majority_class(data, target_attribute_name)

    # Nothing left to split on: use the parent's majority label, or this
    # partition's own majority when called without a parent (top-level call).
    if len(features) == 0:
        return parent_node_class if parent_node_class is not None else majority_class

    # Pick the feature with the largest information gain.
    gains = [info_gain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(gains)]

    # Each feature is used at most once along any root-to-leaf path.
    remaining_features = [f for f in features if f != best_feature]

    tree = {best_feature: {}}
    for value in np.unique(data[best_feature]):
        # Boolean-mask filtering keeps dtypes intact and does not discard rows
        # that merely have missing values in unrelated columns.
        sub_data = data[data[best_feature] == value]
        tree[best_feature][value] = ID3(
            sub_data, original_data, remaining_features,
            target_attribute_name, majority_class)

    return tree


In [26]:
import pprint

# Every column except the label is a candidate split attribute.
features = data.columns.tolist()
features.remove('PlayTennis')

# The full frame doubles as the fallback data for empty partitions.
tree = ID3(
    data,
    data,
    features,
    target_attribute_name='PlayTennis',
)
pprint.pprint(tree)


{'Outlook': {'Overcast': 'Yes',
             'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
             'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}


In [27]:
import pprint
# Pretty-print the nested dict stored in `tree` by the earlier ID3 call.
pprint.pprint(tree)

{'Outlook': {'Overcast': 'Yes',
             'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
             'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}


In [28]:
def predict(query, tree, default='Yes'):
    """Classify ``query`` by walking ``tree`` from the root to a leaf.

    Parameters:
        query: mapping of attribute name -> attribute value for one sample.
        tree: nested dict of the form ``{attribute: {value: subtree_or_label}}``,
            or a bare label when the whole tree is a single leaf.
        default: label returned when the walk cannot continue, i.e. the
            query has none of the node's attributes or its value has no
            branch in the tree.

    Returns:
        The label stored at the leaf that was reached, or ``default``.
    """
    node = tree
    # Anything that is not a dict is a leaf label, including a tree that
    # consists of a single leaf.
    while isinstance(node, dict):
        # First query attribute (in query order) that this node splits on.
        for attr in query:
            if attr in node:
                break
        else:
            return default

        try:
            node = node[attr][query[attr]]
        except (LookupError, TypeError):
            # Unseen attribute value, or a malformed branch table.
            return default
    return node


In [29]:
# Classify a single hand-written day with the tree stored in `tree`.
sample = {
    'Outlook': 'Sunny',
    'Temperature': 'Cool',
    'Humidity': 'High',
    'Wind': 'Strong',
}
# NOTE(review): `perdiction` is a misspelling of "prediction"; the name is kept
# because cells outside this view may still reference it.
perdiction = predict(sample, tree)
print("\nPredicted Output for sample is:", perdiction)


Predicted Output for sample is: No
