In [46]:
import pandas as pd
import numpy as np

In [47]:
# define dataset
data = pd.DataFrame(
    [
        ("young", "high", False, "fair", False),
        ("young", "high", False, "excellent", False),
        ("middle", "high", False, "fair", True),
        ("senior", "medium", False, "fair", True),
        ("senior", "low", True, "fair", True),
        ("senior", "low", True, "excellent", False),
        ("middle", "low", True, "excellent", True),
        ("young", "medium", False, "fair", False),
        ("young", "low", True, "fair", True),
        ("senior", "medium", True, "fair", True),
        ("young", "medium", True, "excellent", True),
        ("middle", "medium", False, "excellent", True),
        ("middle", "high", True, "fair", True),
        ("senior", "medium", False, "excellent", False),
    ], 
    columns=["age", "income", "student", "credit_rating", "buys_computer"]
)
target = "buys_computer"
data

Unnamed: 0,age,income,student,credit_rating,buys_computer
0,young,high,False,fair,False
1,young,high,False,excellent,False
2,middle,high,False,fair,True
3,senior,medium,False,fair,True
4,senior,low,True,fair,True
5,senior,low,True,excellent,False
6,middle,low,True,excellent,True
7,young,medium,False,fair,False
8,young,low,True,fair,True
9,senior,medium,True,fair,True


In [48]:
# Calculate entropy of a series
def entropy(p: pd.Series) -> float:
    # count the occurrences of each value
    counts = p.value_counts()
    total = counts.sum()
    # calculate the entropy
    return counts.apply(lambda x: -x/total*np.log2(x/total)).sum()

# Calculate entropy of a feature splitting the data by the target
def f_entropy(data: pd.DataFrame, feature: str, target: str = target) -> float:
    # count the occurrences of each feature value
    counts = data[feature].value_counts()
    total = counts.sum()

    results = []
    for value, count in counts.items():
        # find the target values for the feature value
        split = data[data[feature] == value][target]
        # calculate the entropy weighted by the count of the feature value
        results.append(entropy(split) * count / total)

    # return the sum of the weighted entropies as feature entropy
    return np.sum(results)

# Calculate information gain of a feature splitting the data by the target
def gain(data: pd.DataFrame, feature: str, target: str = target) -> float:
    return entropy(data[target]) - f_entropy(data, feature, target)

def round(x: np.ndarray, d: int = 3) -> float:
    return np.round(x, decimals=d)

print("Dataset entropy: %.3f" % round(entropy(data[target]), 3))
print()

print("Feature entropies and information gain:")
results = pd.DataFrame({
    feature: {
        "entropy": round(f_entropy(data, feature)),
        "gain": round(gain(data, feature))
    } for feature in ["age", "income", "student", "credit_rating"]
}).T.sort_values("gain", ascending=False)
results

Dataset entropy: 0.940

Feature entropies and information gain:


Unnamed: 0,entropy,gain
age,0.694,0.247
student,0.788,0.152
credit_rating,0.892,0.048
income,0.911,0.029


In [49]:
# Calculate the intrinsic value (split information) of a feature
def iv(data: pd.DataFrame, feature: str) -> float:
    return entropy(data[feature])

# Calculate the normalized information gain of a feature
def n_gain(data: pd.DataFrame, feature: str, target: str = target) -> float:
    info_gain = gain(data, feature, target)
    split_info = iv(data, feature)
    return info_gain / split_info

print("Feature intrinsic value, and normalized information gain:")
results = pd.DataFrame({
    feature: {
        "intrinsic_value": round(iv(data, feature)),
        "normalized_gain": round(n_gain(data, feature))
    } for feature in ["age", "income", "student", "credit_rating"]
}).T.sort_values("normalized_gain", ascending=False)
results

Feature intrinsic value, and normalized information gain:


Unnamed: 0,intrinsic_value,normalized_gain
age,1.577,0.156
student,1.0,0.152
credit_rating,0.985,0.049
income,1.557,0.019


In [50]:
# Calculate the Gini impurity of a series
def gini(p: pd.Series) -> float:
    # count the occurrences of each value
    counts = p.value_counts()
    total = counts.sum()
    # calculate the Gini impurity
    return 1 - counts.apply(lambda x: (x/total)**2).sum()

# Calculate the Gini impurity of a feature splitting the data by the target
def f_gini(data: pd.DataFrame, feature: str, target: str = target) -> float:
    # count the occurrences of each feature value
    counts = data[feature].value_counts()
    total = counts.sum()

    best_gini = 0
    for value, count in counts.items():
        # find the target values for the feature value
        split1 = data[data[feature] == value][target]
        split2 = data[data[feature] != value][target]
        # calculate the binary Gini impurity weighted by the count of the feature value
        gini1 = gini(split1) * count / total
        gini2 = gini(split2) * (total - count) / total
        best_gini = np.max([best_gini, gini1 + gini2])

    # return the sum of the weighted impurities as feature impurity
    return best_gini

print("Dataset Gini index: %.3f" % round(gini(data[target]), 3))
print()

print("Feature Gini impurities:")
results = pd.DataFrame({
    feature: {
        "gini": round(f_gini(data, feature))
    } for feature in ["age", "income", "student", "credit_rating"]
}).T.sort_values("gini", ascending=False)
results

Dataset Gini index: 0.459

Feature Gini impurities:


Unnamed: 0,gini
income,0.458
age,0.457
credit_rating,0.429
student,0.367
