In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Iris dataset that ships with scikit-learn. Later cells read
# `iris.data`, `iris.target` and `iris.feature_names` from this object.
from sklearn.datasets import load_iris
iris = load_iris()

In [3]:
# Feature columns (named after iris.feature_names) and the label column
# ("species") are built separately, then glued side by side into `data`.
X_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y_df = pd.DataFrame({"species": iris.target})
data = pd.concat((X_df, y_df), axis="columns")

In [4]:
# Function to calculate entropy
def calculate_entropy(target_col):
    """Return the Shannon entropy (in bits) of the values in ``target_col``.

    Every distinct value counts as one class, and its probability is its
    relative frequency. An empty input yields 0.0.

    Args:
        target_col: Array-like of labels (list, ndarray, pandas Series, ...).

    Returns:
        The entropy ``sum(-p * log2(p))`` over the observed classes.
    """
    _, class_counts = np.unique(target_col, return_counts=True)
    # Relative frequency of each class; np.unique only reports values that
    # occur, so every probability is > 0 and log2 is well defined.
    class_probs = class_counts / class_counts.sum()
    return np.sum(-class_probs * np.log2(class_probs))

In [5]:
# Function to calculate information gain
def calculate_info_gain(data, split_attribute_name, target_name="species"):
    """Return the information gain of splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the weighted average
    entropy of the target within each partition, where a partition is the set
    of rows sharing one distinct value of ``split_attribute_name``. Each
    distinct value forms its own partition; no thresholding or binning is
    applied to numeric attributes.

    Args:
        data: DataFrame containing both the attribute and the target column.
        split_attribute_name: Name of the column to split on.
        target_name: Name of the label column. Defaults to ``"species"``.

    Returns:
        ``entropy(target) - sum_v (n_v / n) * entropy(target | attribute == v)``.
    """
    total_entropy = calculate_entropy(data[target_name])

    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    n_rows = np.sum(counts)  # loop-invariant, so computed once

    # Select each partition's target values with a boolean mask. Using
    # where(...).dropna() on the whole frame instead would also discard rows
    # that merely have a NaN in some unrelated column, making the partition
    # entropy inconsistent with the weights taken from `counts`.
    weighted_entropy = np.sum([
        (count / n_rows)
        * calculate_entropy(data.loc[data[split_attribute_name] == val, target_name])
        for val, count in zip(vals, counts)
    ])

    info_gain = total_entropy - weighted_entropy
    return info_gain

In [6]:
# Entropy of the "species" labels over the whole dataset, i.e. before any
# split. This is the baseline that information gain is measured against.
total_entropy = calculate_entropy(data["species"])
print(f"Total Entropy of dataset: {total_entropy}")

Total Entropy of dataset: 1.584962500721156


In [7]:
# Report the information gain obtained by splitting on each feature in turn.
# NOTE: calculate_info_gain partitions on every distinct value of the column,
# so each numeric measurement value is treated as its own category here.
for feature in iris.feature_names:
    info_gain = calculate_info_gain(data, feature)
    print(f"Information Gain for {feature}: {info_gain}")

Information Gain for sepal length (cm): 0.8769376208910578
Information Gain for sepal width (cm): 0.5166428756804977
Information Gain for petal length (cm): 1.4463165236458
Information Gain for petal width (cm): 1.4358978386754417


In [8]:
# Pick the feature whose split yields the highest information gain. The gain
# is recomputed for every feature inside the key function; on a tie, max()
# keeps the feature that comes first in iris.feature_names.
best_feature = max(iris.feature_names, key=lambda feature: calculate_info_gain(data, feature))
print(f"\nBest feature to split on: {best_feature}")


Best feature to split on: petal length (cm)
