In [1]:
import pandas as pd
import numpy as np

In [2]:
# name of the label column; every other column is a candidate feature
target = "buys_computer"

# toy dataset: 14 labelled records, one tuple per row in column order
data = pd.DataFrame(
    columns=["age", "income", "student", "credit_rating", target],
    data=[
        ("young", "high", False, "fair", False),
        ("young", "high", False, "excellent", False),
        ("middle", "high", False, "fair", True),
        ("senior", "medium", False, "fair", True),
        ("senior", "low", True, "fair", True),
        ("senior", "low", True, "excellent", False),
        ("middle", "low", True, "excellent", True),
        ("young", "medium", False, "fair", False),
        ("young", "low", True, "fair", True),
        ("senior", "medium", True, "fair", True),
        ("young", "medium", True, "excellent", True),
        ("middle", "medium", False, "excellent", True),
        ("middle", "high", True, "fair", True),
        ("senior", "medium", False, "excellent", False),
    ],
)
# bare last expression so the notebook renders the frame
data

Unnamed: 0,age,income,student,credit_rating,buys_computer
0,young,high,False,fair,False
1,young,high,False,excellent,False
2,middle,high,False,fair,True
3,senior,medium,False,fair,True
4,senior,low,True,fair,True
5,senior,low,True,excellent,False
6,middle,low,True,excellent,True
7,young,medium,False,fair,False
8,young,low,True,fair,True
9,senior,medium,True,fair,True


In [3]:
# Calculate entropy of a series
def entropy(p: pd.Series) -> float:
    """Return the Shannon entropy, in bits, of the values in ``p``.

    Each distinct value's relative frequency is taken as its probability and
    the result is ``sum(-prob * log2(prob))`` over the distinct values.
    Missing values are ignored because ``value_counts`` drops them by default.

    Args:
        p: series of observations; any dtype that ``value_counts`` supports.

    Returns:
        The entropy as a float; 0 when every observation has the same value.
    """
    # relative frequency of each distinct value
    probs = p.value_counts(normalize=True)
    # Categorical series list unobserved categories with frequency 0, and
    # 0 * log2(0) evaluates to NaN; such values contribute nothing to the
    # entropy, so leave them out of the sum.
    probs = probs[probs > 0]
    # negate each term before summing so a single-valued series still sums
    # the individual -p*log2(p) terms
    return (-probs * np.log2(probs)).sum()

# Calculate entropy of a feature splitting the data by the target
def f_entropy(data: pd.DataFrame, feature: str, target: str = target):
    """Return the weighted entropy of ``target`` after splitting on ``feature``.

    For every distinct value of ``data[feature]`` the rows holding that value
    are selected, the entropy of their ``target`` column is computed, and the
    result is weighted by the share of rows that hold the value. The weighted
    entropies are summed.

    Args:
        data: frame containing both the ``feature`` and ``target`` columns.
        feature: name of the column to split on.
        target: name of the label column.

    Returns:
        The sum of the per-value weighted entropies.
    """
    # number of rows per distinct feature value, and the overall row count
    level_counts = data[feature].value_counts()
    n_rows = level_counts.sum()

    # one weighted entropy term per feature value
    weighted_terms = [
        entropy(data.loc[data[feature] == level, target]) * n_level / n_rows
        for level, n_level in level_counts.items()
    ]

    return np.sum(weighted_terms)

# Calculate information gain of a feature splitting the data by the target
def gain(data: pd.DataFrame, feature: str, target: str = target):
    """Return the information gain obtained by splitting ``data`` on ``feature``.

    The gain is the entropy of the ``target`` column over all rows minus the
    weighted entropy that remains after grouping rows by ``feature``.
    """
    entropy_before_split = entropy(data[target])
    entropy_after_split = f_entropy(data, feature, target)
    return entropy_before_split - entropy_after_split

def round(x: np.ndarray, d: int = 3):
    """Round ``x`` to ``d`` decimal places via ``np.round``.

    NOTE(review): this definition shadows Python's built-in ``round`` for all
    code that runs after it; the name is kept because the reporting code below
    calls it.

    Args:
        x: value(s) to round; passed straight to ``np.round``.
        d: number of decimal places to keep (3 by default).

    Returns:
        Whatever ``np.round`` returns for ``x``.
    """
    return np.round(x, d)

# feature columns to evaluate; listed once so both loops below stay in sync
features = ["age", "income", "student", "credit_rating"]

# entropy of the target column over the whole dataset
print("Dataset entropy: %.3f" % round(entropy(data[target]), 3))
print()

# weighted entropy of the target after splitting on each feature
for feature in features:
    print("Entropy of feature %s: %.3f" % (feature, round(f_entropy(data, feature))))
print()

# information gain of each feature
for feature in features:
    print("Information gain of feature %s: %.3f" % (feature, round(gain(data, feature))))

Dataset entropy: 0.940

Entropy of feature age: 0.694
Entropy of feature income: 0.911
Entropy of feature student: 0.788
Entropy of feature credit_rating: 0.892

Information gain of feature age: 0.247
Information gain of feature income: 0.029
Information gain of feature student: 0.152
Information gain of feature credit_rating: 0.048


In [4]:
# Calculate the intrinsic value (split information) of a feature
def iv(data: pd.DataFrame, feature: str):
    """Return the entropy of the ``feature`` column's own value distribution.

    This does not look at the target column: it only measures how evenly the
    rows are spread over the distinct values of ``feature``.
    """
    feature_values = data[feature]
    return entropy(feature_values)

# Calculate the normalized information gain of a feature
def n_gain(data: pd.DataFrame, feature: str, target: str = target):
    """Return the information gain of ``feature`` divided by its intrinsic value.

    Args:
        data: frame containing both the ``feature`` and ``target`` columns.
        feature: name of the column to evaluate.
        target: name of the label column.

    Returns:
        ``gain(data, feature, target) / iv(data, feature)``, or ``0.0`` when
        the intrinsic value is zero.
    """
    info_gain = gain(data, feature, target)
    split_info = iv(data, feature)
    # A feature holding a single distinct value has zero split information,
    # so the ratio is undefined (0/0 would give NaN); such a feature cannot
    # separate the rows, so report it as providing no normalized gain.
    if split_info == 0:
        return 0.0
    return info_gain / split_info

# report the normalized information gain of each feature, one line per feature
for feature in ["age", "income", "student", "credit_rating"]:
    print("Normalized information gain of feature %s: %.3f" % (feature, round(n_gain(data, feature))))

Normalized information gain of feature age: 0.156
Normalized information gain of feature income: 0.019
Normalized information gain of feature student: 0.152
Normalized information gain of feature credit_rating: 0.049
