In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
def entropy(data, target_attr):
    """Return the Shannon entropy, in bits, of the ``target_attr`` labels in ``data``.

    Args:
        data: Sized collection of dict-like rows, each indexable by ``target_attr``.
        target_attr: Key whose values are treated as class labels.

    Returns:
        float: ``-sum(p * log2(p))`` over the distinct labels, where ``p`` is the
        fraction of rows carrying that label. ``0.0`` when ``data`` is empty.
    """
    labels = [record[target_attr] for record in data]

    bits = 0.0
    for label in set(labels):
        # Relative frequency of this label among all rows.
        share = labels.count(label) / len(data)
        bits -= share * math.log2(share)

    return bits

In [3]:
def info_gain_ratio(data, target_attr, feature):
    """Return the gain ratio obtained by partitioning ``data`` on ``feature``.

    Every distinct value of ``feature`` forms its own partition. The ratio is
    (entropy before the split - weighted entropy after the split) divided by
    the split information of the partition sizes.

    Args:
        data: Sized collection of dict-like rows holding ``target_attr`` and ``feature``.
        target_attr: Key of the class label.
        feature: Key of the attribute to evaluate.

    Returns:
        The gain ratio as a float, or ``0`` when the split information is zero
        (for example when ``feature`` takes a single value, or ``data`` is empty).
    """
    entropy_before = entropy(data, target_attr)
    observed = [row[feature] for row in data]

    entropy_after = 0.0
    split_information = 0.0
    for level in set(observed):
        partition = [row for row in data if row[feature] == level]
        weight = len(partition) / len(data)

        entropy_after += weight * entropy(partition, target_attr)
        # Skip empty partitions so log2 is never called with 0.
        if weight > 0:
            split_information -= weight * math.log2(weight)

    if split_information == 0:
        return 0
    return (entropy_before - entropy_after) / split_information

In [4]:
def best_feature(data, target_attr, features):
    """Return the feature in ``features`` with the highest gain ratio on ``data``.

    Args:
        data: Rows passed through to ``info_gain_ratio``.
        target_attr: Key of the class label.
        features: Non-empty iterable of candidate feature keys.

    Returns:
        The feature key with the largest gain ratio; on ties the one that
        appears first in ``features`` wins.

    Raises:
        ValueError: If ``features`` is empty.
    """
    scores = {}
    for candidate in features:
        scores[candidate] = info_gain_ratio(data, target_attr, candidate)

    # max() keeps the first maximal key, and dicts iterate in insertion order.
    return max(scores, key=lambda candidate: scores[candidate])

In [5]:
def c4_5(data, target_attr, features):
    """Recursively build a decision tree, splitting on the best gain-ratio feature.

    Each distinct value of the chosen feature becomes its own branch; numeric
    features are therefore treated like categorical ones (no threshold splits).

    Args:
        data: Non-empty list of dict rows containing ``target_attr`` and every
            key in ``features``.
        target_attr: Key of the class label.
        features: Sequence of candidate feature keys. It is never modified.

    Returns:
        A class label when the node is a leaf, otherwise a nested dict of the
        form ``{feature: {feature_value: subtree_or_label, ...}}``.

    Raises:
        IndexError: If ``data`` is empty.
    """
    target_values = [row[target_attr] for row in data]

    # Pure node: every row carries the same label.
    if target_values.count(target_values[0]) == len(target_values):
        return target_values[0]

    # No features left to split on: fall back to the majority label.
    if not features:
        # Dicts keep insertion order, so ties resolve to the label seen first
        # instead of depending on set iteration order.
        label_counts = {}
        for label in target_values:
            label_counts[label] = label_counts.get(label, 0) + 1
        return max(label_counts, key=label_counts.get)

    best = best_feature(data, target_attr, features)
    tree = {best: {}}

    # Build a fresh list for the next level instead of removing from
    # `features` in place: the same list object is shared with the caller and
    # with every sibling branch, so in-place removal would hide features from
    # branches that are built later.
    remaining = [name for name in features if name != best]

    feature_values = set(row[best] for row in data)
    for value in feature_values:
        subset = [row for row in data if row[best] == value]
        tree[best][value] = c4_5(subset, target_attr, remaining)

    return tree

In [6]:
def predict(tree, sample):
    """Classify ``sample`` by walking ``tree`` from the root to a leaf.

    Args:
        tree: Either a leaf label or a nested dict shaped like
            ``{feature: {feature_value: subtree_or_label, ...}}``.
        sample: Dict mapping feature names to the sample's values.

    Returns:
        The label stored at the leaf that is reached, or ``None`` when the
        sample's value for a tested feature has no branch in the tree
        (including when the sample lacks that feature).
    """
    node = tree
    while isinstance(node, dict):
        # An internal node has a single key: the feature it tests.
        attribute = next(iter(node))
        node = node[attribute].get(sample.get(attribute))
        if node is None:
            return None
    return node

In [8]:
# Training set: 14 labelled rows, one dict per observation.
# 'Decision' is the class label; the other keys are candidate features.
data = [
    {'Outlook': 'Sunny', 'Temp.': 85, 'Humidity': 85, 'Wind': 'Weak', 'Decision': 'No'},
    {'Outlook': 'Sunny', 'Temp.': 80, 'Humidity': 90, 'Wind': 'Strong', 'Decision': 'No'},
    {'Outlook': 'Overcast', 'Temp.': 83, 'Humidity': 78, 'Wind': 'Weak', 'Decision': 'Yes'},
    {'Outlook': 'Rain', 'Temp.': 70, 'Humidity': 96, 'Wind': 'Weak', 'Decision': 'Yes'},
    {'Outlook': 'Rain', 'Temp.': 68, 'Humidity': 80, 'Wind': 'Weak', 'Decision': 'Yes'},
    {'Outlook': 'Rain', 'Temp.': 65, 'Humidity': 70, 'Wind': 'Strong', 'Decision': 'No'},
    {'Outlook': 'Overcast', 'Temp.': 64, 'Humidity': 65, 'Wind': 'Strong', 'Decision': 'Yes'},
    {'Outlook': 'Sunny', 'Temp.': 72, 'Humidity': 95, 'Wind': 'Weak', 'Decision': 'No'},
    {'Outlook': 'Sunny', 'Temp.': 69, 'Humidity': 70, 'Wind': 'Weak', 'Decision': 'Yes'},
    {'Outlook': 'Rain', 'Temp.': 75, 'Humidity': 80, 'Wind': 'Weak', 'Decision': 'Yes'},
    {'Outlook': 'Sunny', 'Temp.': 75, 'Humidity': 70, 'Wind': 'Strong', 'Decision': 'Yes'},
    {'Outlook': 'Overcast', 'Temp.': 72, 'Humidity': 90, 'Wind': 'Strong', 'Decision': 'Yes'},
    {'Outlook': 'Overcast', 'Temp.': 81, 'Humidity': 75, 'Wind': 'Weak', 'Decision': 'Yes'},
    {'Outlook': 'Rain', 'Temp.': 71, 'Humidity': 80, 'Wind': 'Strong', 'Decision': 'No'}
]

target_attr = 'Decision'
features = ['Outlook', 'Temp.', 'Humidity', 'Wind']

tree = c4_5(data, target_attr, features)
print(f'Decision tree : {tree}')

# Feature values must be spelled exactly as in the training rows
# ('Weak', not 'weak'): predict() looks branches up by exact match and
# returns None for a value it has not seen.
new_sample = {'Outlook': 'Sunny', 'Temp.': 75, 'Humidity': 70, 'Wind': 'Weak'}
prediction = predict(tree, new_sample)

print(f'Prediction : {prediction}')

Decision tree : {'Temp.': {64: 'Yes', 65: 'No', 68: 'Yes', 69: 'Yes', 70: 'Yes', 71: 'No', 72: {'Outlook': {'Overcast': 'Yes', 'Sunny': 'No'}}, 75: 'Yes', 80: 'No', 81: 'Yes', 83: 'Yes', 85: 'No'}}
Prediction : Yes
