In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def entropy(data, target_attr):
    """Return the base-2 Shannon entropy of one attribute over a set of rows.

    Args:
        data: Sequence of dict-like rows, each indexable by ``target_attr``.
        target_attr: Key whose value distribution is measured.

    Returns:
        The entropy in bits as a float; ``0.0`` when ``data`` is empty or
        every row carries the same value.
    """
    labels = [record[target_attr] for record in data]
    n_rows = len(data)

    bits = 0.0
    for label in set(labels):
        # Relative frequency of this label among all rows.
        share = labels.count(label) / n_rows
        bits -= share * math.log2(share)

    return bits

In [3]:
def info_gain(data, target_attr, feature):
    """Return the information gain obtained by splitting ``data`` on ``feature``.

    The gain is the entropy of ``target_attr`` over all rows minus the
    size-weighted entropy of ``target_attr`` within each group of rows that
    share the same ``feature`` value.

    Args:
        data: Sequence of dict-like rows containing both keys.
        target_attr: Key of the label being predicted.
        feature: Key of the candidate split attribute.

    Returns:
        The gain as a float (``0.0`` for empty ``data``).
    """
    baseline = entropy(data, target_attr)
    n_rows = len(data)

    weighted = 0.0
    for level in set(record[feature] for record in data):
        # Rows whose feature value equals this level form one partition.
        part = [record for record in data if record[feature] == level]
        weighted += (len(part) / n_rows) * entropy(part, target_attr)

    return baseline - weighted

In [4]:
def id3(data, target_attr, features):
    """Build a decision tree with the ID3 algorithm.

    Args:
        data: Sequence of dict-like rows containing ``target_attr`` and every
            name listed in ``features``.
        target_attr: Key of the label to predict.
        features: List of candidate feature keys. The list is NOT modified;
            each branch receives its own copy without the feature it split on.

    Returns:
        * ``None`` when ``data`` is empty (nothing to learn from),
        * a bare label when all rows agree or no features remain
          (majority label in the latter case),
        * otherwise a nested dict ``{feature: {value: subtree, ...}}``.
    """
    # Guard: without rows there is no label to return.
    if not data:
        return None

    # Base case: every row carries the same label, so return that label.
    target_values = [row[target_attr] for row in data]
    if target_values.count(target_values[0]) == len(target_values):
        return target_values[0]

    # Base case: no features left to split on, so fall back to the majority label.
    if not features:
        return max(target_values, key=target_values.count)

    # On ties, max() keeps the first maximal key, i.e. the earliest feature
    # in ``features`` order.
    gains = {feature: info_gain(data, target_attr, feature) for feature in features}
    best_feature = max(gains, key=gains.get)

    # Build a fresh list instead of calling features.remove(): mutating the
    # shared list would leak into sibling branches (which would then lose
    # features consumed elsewhere) and into the caller's own list.
    remaining = [feature for feature in features if feature != best_feature]

    branches = {}
    for value in set(row[best_feature] for row in data):
        subset = [row for row in data if row[best_feature] == value]
        branches[value] = id3(subset, target_attr, remaining)

    return {best_feature: branches}

In [5]:
def predict(tree, sample):
    """Classify ``sample`` by walking a tree of nested dicts.

    An internal node is a dict whose first key names the feature to test and
    whose value maps feature values to subtrees; anything that is not a dict
    is treated as a leaf label.

    Args:
        tree: A leaf label or a nested ``{feature: {value: subtree}}`` dict.
        sample: Mapping from feature names to the sample's values.

    Returns:
        The leaf label reached, or ``None`` when the sample's value for a
        tested feature has no branch in the tree.
    """
    node = tree
    while isinstance(node, dict):
        split_on = next(iter(node))
        # A missing branch yields None, which is not a dict and so ends the
        # walk with a None result.
        node = node[split_on].get(sample.get(split_on))
    return node

In [6]:
# Toy patient records: one tuple per patient, zipped with the shared field names.
data = [
    dict(zip(('Age', 'Blood Pressure', 'Cholesterol', 'Diagnosis'), record))
    for record in (
        (30, 'High', 'High', 'Sick'),
        (45, 'Low', 'Normal', 'Healthy'),
        (50, 'High', 'High', 'Sick'),
        (35, 'Low', 'Normal', 'Healthy'),
        (60, 'High', 'High', 'Sick'),
        (55, 'Low', 'Normal', 'Healthy'),
        (40, 'High', 'High', 'Sick'),
        (25, 'Low', 'Normal', 'Healthy'),
        (65, 'High', 'High', 'Sick'),
        (45, 'Low', 'Normal', 'Healthy'),
    )
]

# Label column and the candidate split attributes.
target_attr = 'Diagnosis'
features = ['Age', 'Blood Pressure', 'Cholesterol']

# Fit the tree on the records above and show its nested-dict form.
# NOTE: id3 groups rows by exact feature value, so the numeric Age column is
# handled like a categorical label (one branch per distinct age).
tree = id3(data, target_attr, features)
print(f'Decision tree : {tree}')

# Classify a patient that was not part of the training records.
new_patient = {'Age': 50, 'Blood Pressure': 'Low', 'Cholesterol': 'Normal'}
prediction = predict(tree, new_patient)
print('Prediction for new patient: ', prediction)

Decision tree : {'Age': {65: 'Sick', 35: 'Healthy', 40: 'Sick', 45: 'Healthy', 50: 'Sick', 55: 'Healthy', 25: 'Healthy', 60: 'Sick', 30: 'Sick'}}
Prediction for new patient:  Sick
