In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the CSV into a DataFrame. The path is relative, so the file must sit
# in the kernel's current working directory.
data = pd.read_csv(filepath_or_buffer='heart.csv')

In [11]:
def calculate_entropy(series):
    """Return the Shannon entropy, in bits, of the values in ``series``.

    Parameters
    ----------
    series : pandas.Series
        Observations to measure. Missing values are ignored
        (``value_counts`` drops them) and the probabilities are normalised
        over the remaining values, so they always sum to 1.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct observed values. The result
        is exactly 0.0 for an empty, all-missing or single-valued series and
        is never negative.
    """
    # normalize=True divides by the number of counted (non-missing) values.
    probs = series.value_counts(normalize=True)

    # Entries with zero frequency (e.g. unused categories of a categorical
    # dtype) contribute nothing to the entropy. Dropping them avoids log2(0)
    # without adding an epsilon, which would bias every result and make the
    # entropy of a pure series slightly negative.
    probs = probs[probs > 0]
    if probs.empty:
        return 0.0

    entropy = -np.sum(probs * np.log2(probs))
    # A pure series yields -0.0 after negation; adding 0.0 normalises it to 0.0.
    return entropy + 0.0

In [12]:
def calculate_information_gain(data, target, attribute):
    """Return the information gain of splitting ``data`` on ``attribute``.

    The gain is the entropy of the ``target`` column minus the weighted
    average entropy of ``target`` within each group of rows sharing the same
    ``attribute`` value.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both columns.
    target : str
        Name of the label column.
    attribute : str
        Name of the column to split on. Every distinct value forms its own
        branch; missing values are grouped into one branch of their own.

    Returns
    -------
    float
        ``H(target) - sum_v (|rows_v| / |data|) * H(target | attribute == v)``.
        Returns 0.0 when ``data`` has no rows.
    """
    total_rows = len(data)
    if total_rows == 0:
        return 0.0

    target_entropy = calculate_entropy(data[target])

    # dropna=False keeps rows whose attribute is missing as a branch of their
    # own. An equality filter per unique value can never match NaN, so those
    # rows would silently drop out of the weighted sum and inflate the gain.
    # sort=False keeps groups in order of first appearance; observed=True
    # skips unused categories of a categorical column.
    target_by_value = data.groupby(
        attribute, sort=False, dropna=False, observed=True
    )[target]

    weighted_attribute_entropy = sum(
        (len(branch_targets) / total_rows) * calculate_entropy(branch_targets)
        for _, branch_targets in target_by_value
    )

    return target_entropy - weighted_attribute_entropy

In [13]:
# Entropy of the label column over the whole table, i.e. before any split.
target_entropy = calculate_entropy(series=data['HeartDisease'])
print(
    f"Entropy of 'HeartDisease': {target_entropy:.4f}"
)

Entropy of 'HeartDisease': 0.8454


In [14]:
# Print the information gain of each candidate split column against the
# 'HeartDisease' label.
# NOTE(review): every distinct value of a column becomes its own branch, so
# columns with many distinct values tend to score high. The recorded output
# shows 'Cholesterol' with a gain equal to the full target entropy, which is
# what happens when every per-value subset is pure. Consider binning the
# numeric columns before ranking them -- TODO confirm with the data.
attributes = [
    'Age', 'Sex', 'RestingBP',
    'Cholesterol', 'RestingECG', 'ExerciseAngina',
]
for attribute in attributes:
    info_gain = calculate_information_gain(
        data, target='HeartDisease', attribute=attribute
    )
    print(f"Information Gain for '{attribute}': {info_gain:.4f}")

Information Gain for 'Age': 0.4131
Information Gain for 'Sex': 0.0495
Information Gain for 'RestingBP': 0.6635
Information Gain for 'Cholesterol': 0.8454
Information Gain for 'RestingECG': 0.0442
Information Gain for 'ExerciseAngina': 0.4336


In [16]:
# Candidate split columns and their information gains, kept in the same order
# so that a position in one list indexes the other.
attributes = [
    'Age', 'Sex', 'RestingBP',
    'Cholesterol', 'RestingECG', 'ExerciseAngina',
]
info_gains = [
    calculate_information_gain(data, target='HeartDisease', attribute=name)
    for name in attributes
]

In [17]:
# Pick the attribute with the largest gain as the root split. argmax returns
# the first position when several gains tie.
gain_values = np.asarray(info_gains)
root_attribute = attributes[gain_values.argmax()]
root_ig = gain_values.max()

In [18]:
# Report the selected root attribute and its gain, rounded to four decimals.
print(
    f"Root Node: '{root_attribute}' with Information Gain: {root_ig:.4f}"
)

Root Node: 'Cholesterol' with Information Gain: 0.8454
