In [211]:
import numpy as np
import pandas as pd

In [212]:
# Generate random binary attribute values
X = pd.DataFrame(np.random.randint(2, size=(1000, 5)), columns=['attr1', 'attr2', 'attr3', 'attr4', 'attr5'])

# Generate random binary class labels
y = pd.DataFrame(np.random.randint(2, size=(1000, 1)), columns=['class_label'])

# Concatenate attribute and label data into a single DataFrame
df = pd.concat([X, y], axis=1)

In [213]:
df.head()

Unnamed: 0,attr1,attr2,attr3,attr4,attr5,class_label
0,0,0,0,0,0,0
1,1,0,0,1,0,1
2,0,1,0,0,1,0
3,1,0,1,1,0,0
4,1,1,0,0,1,0


In [214]:
len(df)

1000

## Write a program to compute the `information gain` in a binary split of a node in a decision tree in terms of `Gini index`. Use this to find the best attribute for a binary split. Assume all attributes are binary and the class labels are also binary.

## Gini Index Calculation

In [215]:
# gini index calculation
def gini(a, b):
    return 1 - np.power(a / (a + b), 2) - np.power(b / (a + b), 2)

## Calculating the `GINI(Parent)`

In [216]:
# before splitting
C_0 = len(df[df['class_label'] == 0]) # no. of classes with 0
C_1 = len(df[df['class_label'] == 1]) # no. of classes with 1

In [217]:
print(C_0, C_1)

500 500


In [218]:
# Gini of parent
M_0 = gini(C_0, C_1)
M_0

0.5

## Calculating the Gain

In [219]:
def gain(attr):
    C_0_0, C_1_0 = 0, 0 # when class is 0
    C_0_1, C_1_1 = 0, 0 # when class is 1

    for i, j in zip(attr, df.class_label):
        if i == 0: # if the value is 0
            if j == 0: # if the class is 0
                C_0_0 += 1
            else: # class is 1
                C_1_0 += 1
        else: # if the value is 1
            if j == 0: # if the class is 0
                C_0_1 += 1
            else: # class is 1
                C_1_1 += 1
    
    s1 = C_0_0 + C_1_0 # total number of points if the value is 0
    s2 = C_0_1 + C_1_1 # total number of points if the value is 1
    
    g1 = gini(C_0_0, C_1_0) # gini index if the values is 0
    g2 = gini(C_0_1, C_1_1) # gini index if the values is 1
    
    M_attr = (s1 * g1 + s2 * g2) / len(df) # gini children
    return M_0 - M_attr # gain

In [220]:
# all the attributes in a list
attbs = [df.attr1, df.attr2, df.attr3, df.attr4, df.attr5]

# store all the results
results = [gain(i) for i in attbs]
results

[0.0001620006480025915,
 0.0030484505213030366,
 0.00019999999999997797,
 0.0002880184331797153,
 2.0058490557972597e-06]

In [221]:
max(results), results.index(max(results))

(0.0030484505213030366, 1)

In [222]:
print(f"So, we are gonna split the attribute '{df.columns[results.index(max(results))]}', which has the maximum Gain.")

So, we are gonna split the attribute 'attr2', which has the maximum Gain.


---
## Write a program to compute the `information gain` in a binary split of a node in a decision tree in terms of `entropy`. Use this to find the best attribute for a binary split. Assume all attributes are binary and the class labels are also binary

In [223]:
def entropy(a, b):
    return - ((a / (a + b)) * np.log2(a / (a + b)) + (b / (a + b)) * np.log2(b / (a + b)))

In [224]:
# Entropy of the parent
E_P = entropy(C_0, C_1)
E_P

1.0

In [225]:
def gain_info(attr):
    C_0_0, C_1_0 = 0, 0 # when class is 0
    C_0_1, C_1_1 = 0, 0 # when class is 1

    for i, j in zip(attr, df.class_label):
        if i == 0: # if the value is 0
            if j == 0: # if the class is 0
                C_0_0 += 1
            else: # class is 1
                C_1_0 += 1
        else: # if the value is 1
            if j == 0: # if the class is 0
                C_0_1 += 1
            else: # class is 1
                C_1_1 += 1
    
    s1 = C_0_0 + C_1_0 # total number of points if the value is 0
    s2 = C_0_1 + C_1_1 # total number of points if the value is 1
    
    E_1 = entropy(C_0_0, C_1_0)
    E_2 = entropy(C_0_1, C_1_1)
    
    E_C = (s1 * E_1 + s2 * E_2) / len(df)
    
    return E_P - E_C 

In [226]:
# store all the results
results = [gain_info(i) for i in attbs]
results

[0.00023373015412930798,
 0.0044025025826093245,
 0.00028855824719009604,
 0.0004155626773865162,
 2.893830443273515e-06]

In [227]:
max(results), results.index(max(results))

(0.0044025025826093245, 1)

In [228]:
print(f"So, we are gonna split the attribute '{df.columns[results.index(max(results))]}', which has the maximum Gain.")

So, we are gonna split the attribute 'attr2', which has the maximum Gain.
