# Study the effect of the logarithm base on information gain

In [1]:
import common
import numpy as np
import pandas as pd

In [2]:
def compute_entropy(y, base):
    """Compute the entropy of the class labels in `y`.

    Parameters
    ----------
    y : array-like
        Class label of every sample. Plain lists are accepted as well; they
        are converted to a NumPy array so that `y == c` yields an
        element-wise mask instead of a single boolean.
    base : float
        Base of the logarithm used in the entropy formula.

    Returns
    -------
    e_p : float
        Total entropy, `-sum(p_c * log_base(p_c))` over the classes that
        occur in `y` (0.0 when `y` is empty).
    n : int
        Number of samples in `y`.
    """
    # without this, a list input makes `y == c` a scalar and all counts 0
    y = np.asarray(y)

    # number of samples
    n = len(y)
    # unique classes in `y`
    classes = set(y)

    # number of observations in each class
    n_c = np.array([np.sum(y == c) for c in classes], dtype=float)
    assert n == n_c.sum(), "class counts do not add up to the number of samples"

    # probability for a sample to be in each class; every class listed in
    # `classes` occurs at least once, so p_c > 0 and the logarithm is finite
    p_c = n_c / n
    # entropy contribution of each class
    e_c = -p_c * np.emath.logn(base, p_c)

    # total entropy and sample count
    return np.sum(e_c), n

## Load data

In [3]:
# columns used as input features
features = [
    "average_revenue",
    "average_basket_size",
    "fraction_canned_food",
    "fraction_national_brands",
]

# raw data set, read from the parquet file next to the notebook
df = pd.read_parquet("dr_d_grocery.parquet")

# distinct values of the target column
target_names = df["segment_name"].unique()

In [4]:
# feature matrix (one column per entry of `features`) and target labels;
# `.to_numpy()` is the recommended replacement for `.values` and always
# returns a NumPy array, which the boolean masking further down relies on
X = df[features].to_numpy()
y = df["segment_name"].to_numpy()

In [5]:
# (rows, columns) of the feature matrix
np.shape(X)

(150, 4)

In [6]:
# number of target labels
y.shape[0]

150

## Look at a single feature and compute the information gain for every cutoff value

In [7]:
def get_best_split_feature(x, base=2, labels=None):
    """Compute the information gain of every candidate cutoff of one feature.

    Candidate cutoffs are the mid-points between consecutive unique values of
    `x`. For each cutoff the samples are split into `x <= cutoff` (left child)
    and `x > cutoff` (right child), and the information gain is the entropy of
    the parent node minus the size-weighted entropy of the two children.

    Note that, despite its name, the function returns the gain of *all*
    cutoffs; pick the best one by sorting the result.

    Parameters
    ----------
    x : array-like
        Values of a single feature, one per sample.
    base : float, default 2
        Base of the logarithm passed to `compute_entropy`.
    labels : array-like, optional
        Class label of every sample. When omitted, the notebook-level target
        vector `y` is used, which keeps existing calls working.

    Returns
    -------
    pandas.DataFrame
        One row per cutoff with the columns `cutoff` and `information_gain`,
        indexed 0..k-1. The frame is empty when `x` has fewer than two unique
        values.

    Raises
    ------
    ValueError
        If `x` and the labels differ in length.
    """
    x = np.asarray(x)
    # fall back to the global `y` so callers that pass only `x` still work
    labels = np.asarray(y if labels is None else labels)
    if len(labels) != len(x):
        raise ValueError("`x` and `labels` must have the same length")

    unique_values = np.unique(x)  # np.unique already returns sorted values
    cutoffs = (unique_values[:-1] + unique_values[1:]) / 2  # mid-point between two feature values

    # parent node: does not depend on the cutoff, so compute it only once
    entropy_before, n_p = compute_entropy(labels, base)

    records = []
    for cutoff in cutoffs:
        # split on the feature that was passed in, not on a fixed column of X
        goes_left = x <= cutoff

        e_cl, n_cl = compute_entropy(labels[goes_left], base)
        e_cr, n_cr = compute_entropy(labels[~goes_left], base)

        # children nodes, weighted by their share of the parent's samples
        entropy_after = n_cl / n_p * e_cl + n_cr / n_p * e_cr

        records.append(
            {
                "cutoff": cutoff,
                "information_gain": entropy_before - entropy_after,
            }
        )

    # build the frame once; also yields a well-formed empty frame when there
    # are no cutoffs, where concatenating an empty list would raise
    return pd.DataFrame(records, columns=["cutoff", "information_gain"], dtype=float)

In [8]:
# information gain with log base 2 for every cutoff of the first feature column
information_gain_base2 = get_best_split_feature(x=X[:, 0], base=2)

In [9]:
# same feature column, but with log base 3
information_gain_base3 = get_best_split_feature(x=X[:, 0], base=3)

In [10]:
# put the gains of both log bases side by side, one row per cutoff
overview = information_gain_base2.merge(
    information_gain_base3,
    on="cutoff",
    suffixes=("_base2", "_base3"),
)

In [11]:
# three cutoffs with the highest information gain under log base 2
overview.sort_values(by="information_gain_base2", ascending=False).iloc[:3]

Unnamed: 0,cutoff,information_gain_base2,information_gain_base3
49,231.87727,0.918296,0.57938
50,282.124807,0.877672,0.553749
48,184.649257,0.864338,0.545336


In [12]:
# three cutoffs with the highest information gain under log base 3
overview.sort_values(by="information_gain_base3", ascending=False).iloc[:3]

Unnamed: 0,cutoff,information_gain_base2,information_gain_base3
49,231.87727,0.918296,0.57938
50,282.124807,0.877672,0.553749
48,184.649257,0.864338,0.545336




<br>
<br>
<b>Learning from Big Data</b> <br>
Sebastian Gabel <br>