In [25]:
import math
import pandas as pd

# Column labels for the header-less CSV; 'play' is the column later used as the class label.
column_names = ['outlook', 'windy', 'play']

# header=None makes pandas treat the first row as data rather than as column names.
data = pd.read_csv(
    'val.csv',
    names=column_names,
    header=None,
)

In [26]:
def entropy(labels):
    """Return the Shannon entropy (in bits) of a pandas Series of class labels.

    Args:
        labels: pandas Series of class labels (must provide ``value_counts()``).

    Returns:
        ``-sum(p * log2(p))`` over the classes that actually occur, where ``p``
        is each class's share of the counted (non-null) labels. An empty or
        all-null input yields 0.
    """
    counts = labels.value_counts()
    # value_counts() skips NaN by default, so normalise by the number of
    # counted labels rather than len(labels); otherwise the proportions would
    # not sum to 1 when some labels are missing.
    total = counts.sum()
    # Categorical data reports unobserved categories with a count of 0, and
    # math.log2(0) raises ValueError; such classes contribute nothing anyway.
    return -sum(
        (count / total) * math.log2(count / total)
        for count in counts
        if count > 0
    )

In [27]:
# Entropy of the class-label column over the whole table, before any split.
labels = data.loc[:, 'play']
H_play = entropy(labels)
print(f"Entropy(Play) = {H_play:.4f}")

Entropy(Play) = 1.0000


In [28]:
def info_gain(df, attr, target='play'):
    """Compute the information gain obtained by splitting ``df`` on ``attr``.

    Args:
        df: pandas DataFrame holding the attribute column(s) and the target.
        attr: name of the column to split on.
        target: name of the class-label column (defaults to ``'play'``).

    Returns:
        A tuple ``(gain, H_after)`` where ``H_after`` is the size-weighted
        entropy of the target within each ``attr`` group and ``gain`` is the
        entropy of ``df[target]`` minus ``H_after``.
    """
    total = len(df)
    # Take the baseline from the frame that was passed in, not from a
    # module-level value computed in another cell; otherwise the gain is wrong
    # for any df that is not the full table (e.g. a subset at a deeper node).
    H_before = entropy(df[target])
    grouped = df.groupby(attr)[target]
    # Weight each group's entropy by the fraction of rows that fall into it.
    H_after = sum((len(group) / total) * entropy(group) for _, group in grouped)
    return H_before - H_after, H_after

In [29]:
# Print the post-split entropy and the information gain for each candidate attribute.
for feature in ('outlook', 'windy'):
    ig, H_after = info_gain(data, feature)
    print(f"Attribute {feature}: Entropy after split = {H_after:.4f}, Information Gain = {ig:.4f}")

Attribute outlook: Entropy after split = 0.3333, Information Gain = 0.6667
Attribute windy: Entropy after split = 0.9183, Information Gain = 0.0817


In [30]:
# Map each candidate attribute to its information gain (first element of info_gain's result).
ig_vals = dict((f, info_gain(data, f)[0]) for f in ['outlook', 'windy'])
# The attribute with the largest gain becomes the root; ties keep the first one listed.
best = max(ig_vals, key=lambda name: ig_vals[name])
print(f"Best attribute for root: {best}\n")

Best attribute for root: outlook



In [31]:
def gini_index(labels):
    """Return the Gini impurity of a pandas Series of class labels.

    Args:
        labels: pandas Series of class labels (must provide ``value_counts()``).

    Returns:
        ``1 - sum(p ** 2)`` where ``p`` is each class's share of the counted
        (non-null) labels. An empty or all-null input yields 0.0.
    """
    counts = labels.value_counts()
    # value_counts() skips NaN by default, so normalise by the number of
    # counted labels rather than len(labels); otherwise the proportions would
    # not sum to 1 when some labels are missing.
    total = counts.sum()
    if total == 0:
        # Nothing to classify: report no impurity instead of the maximum 1.0
        # that the bare formula would give for an empty sum.
        return 0.0
    gini = 1.0 - sum((count / total) ** 2 for count in counts)
    return gini

In [32]:
# Gini impurity of the class-label column over the whole table, before any split.
g_full = gini_index(data.loc[:, 'play'])
print(f"Gini(full) = {g_full:.4f}")

Gini(full) = 0.5000


In [33]:
# For each candidate attribute, print the size-weighted Gini impurity after
# the split and how much it drops relative to the unsplit table.
for feat in ('outlook', 'windy'):
    subsets = data.groupby(feat)['play']
    weighted = sum(
        gini_index(part) * (len(part) / len(data))
        for _, part in subsets
    )
    print(f"Gini after splitting on {feat} = {weighted:.4f}")
    print(f"Gini Gain = {g_full - weighted:.4f}\n")

Gini after splitting on outlook = 0.1667
Gini Gain = 0.3333

Gini after splitting on windy = 0.4444
Gini Gain = 0.0556

