In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from math import log2

In [12]:
# Column names to attach to the frame; the CSV is read with header=None,
# so its first row is treated as data rather than as a header.
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race',
    'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
    'native-country', 'salary',
]

data = pd.read_csv('adult.data.csv', header=None)
data.columns = columns

In [13]:
# Encode every object-dtype column of `data` with its own LabelEncoder and
# keep the fitted encoders, keyed by column name, in `label_encoders`.
# NOTE(review): `data` is overwritten in place, so re-running this cell finds
# no object columns left and resets `label_encoders` to an empty dict.
object_columns = data.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in object_columns}
for col, le in label_encoders.items():
    data[col] = le.fit_transform(data[col])


In [14]:
def entropy(y):
    """Return the Shannon entropy, in bits, of the values in ``y``.

    Parameters
    ----------
    y : sized iterable of hashable values
        The labels whose empirical distribution is measured.

    Returns
    -------
    The negated sum of ``p * log2(p)`` over the relative frequency ``p`` of
    each distinct value. An empty ``y`` yields ``0``.
    """
    frequencies = Counter(y)
    n_items = len(y)
    # Relative frequency of each distinct label.
    probabilities = [occurrences / n_items for occurrences in frequencies.values()]
    return -sum(p * log2(p) for p in probabilities)


In [15]:
def information_gain(data, feature, target):
    """Return the information gain of splitting ``data`` on ``feature``.

    The gain is the entropy of the ``target`` column minus the size-weighted
    average entropy of ``target`` within each group of rows that share the
    same ``feature`` value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the ``feature`` and ``target`` columns.
    feature : str
        Name of the column to split on.
    target : str
        Name of the column whose entropy is being reduced.

    Returns
    -------
    float
        ``entropy(target) - sum_v (n_v / n) * entropy(target | feature == v)``.

    Notes
    -----
    Every distinct value of ``feature`` becomes its own branch, so columns
    with many distinct values (identifiers, continuous measurements) get
    near-pure branches and therefore a large gain even when they carry little
    predictive signal. Bin such columns or use a gain ratio before comparing
    them with low-cardinality columns.
    """
    total_entropy = entropy(data[target])

    # Hoisted out of the loop: the row count does not depend on the branch.
    n_rows = len(data)

    # A single grouped pass replaces one full-frame boolean mask per distinct
    # value. Groups come back sorted by feature value; rows whose feature is
    # NaN are skipped and so contribute nothing to the weighted entropy.
    weighted_entropy = sum(
        (len(branch_target) / n_rows) * entropy(branch_target)
        for _, branch_target in data.groupby(feature)[target]
    )
    return total_entropy - weighted_entropy


In [17]:
target = 'salary'

# Every column except the last one, which holds the target variable.
candidate_features = data.columns[:-1]

# Map each candidate feature to its information gain with respect to `target`.
info_gains = {
    feature: information_gain(data, feature, target)
    for feature in candidate_features
}

In [18]:
# Rank the (feature, gain) pairs from highest to lowest gain.
sorted_info_gains = sorted(
    info_gains.items(),
    key=lambda feature_gain: feature_gain[1],
    reverse=True,
)
best_split, second_best_split = sorted_info_gains[0], sorted_info_gains[1]

print(f"Best split: {best_split}")
print(f"Second-best split: {second_best_split}")

Best split: ('fnlwgt', 0.56849716589775)
Second-best split: ('relationship', 0.16586536515994932)
