# Relative Entropy Experiments

In [1]:
!pip install faker pyarrow

Collecting faker
  Downloading Faker-18.7.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
Installing collected packages: faker
Successfully installed faker-18.7.0


In [20]:
import pyarrow as pa
from pyarrow import compute as pc
from faker import Faker
from rich.jupyter import print
Faker.seed(0)
fake = Faker()

In [106]:
def gini_index(x: pa.Array):
    value_counts = pc.value_counts(x)
    value_probs = pc.divide(value_counts.field('counts'), float(len(x)))
    return pc.subtract(1, pc.sum(pc.multiply(value_probs, value_probs)))

In [None]:
def simpson_index(x: pa.Array):
    value_counts = pc.value_counts(x)
    value_probs = pc.divide(value_counts.field('counts'), float(len(x)))
    return pc.subtract(1, pc.sum(pc.multiply(value_probs, value_probs)))

In [99]:
# Generate some data...

LEN_DATA = 1000
Faker.seed(0)
countries = pa.array([fake.country() for _ in range(LEN_DATA)])
mostly_usa = pa.array(["United States" for _ in range(LEN_DATA - 1)] + ["Canada"])
half_usa = pa.array(["United States" for _ in range(LEN_DATA // 2)] + ["Canada" for _ in range(LEN_DATA // 2)])

In [100]:
len(countries), len(pc.unique(countries))

(1000, 236)

In [101]:
len(mostly_usa), len(pc.unique(mostly_usa))

(1000, 2)

In [102]:
len(half_usa), len(pc.unique(half_usa))

(1000, 2)

In [93]:
def pc_entropy(probs):
    return pc.negate(pc.sum(pc.multiply(probs, pc.log2(probs))))

In [94]:
def relative_entropy(x: pa.Array):
    value_counts = pc.value_counts(x)
    value_probs = pc.divide(value_counts.field('counts'), float(len(x)))
    entropy = pc_entropy(value_probs)

    uniform_prob = pc.divide(1, pc.cast(len(value_counts), pa.float64()))
    uniform_entropy = pc_entropy(uniform_prob)

    score = pc.divide(pc.subtract(uniform_entropy, entropy), uniform_entropy)
    return score.as_py()

In [95]:
relative_entropy(countries)

-229.0526279590857

In [96]:
relative_entropy(mostly_usa)

0.9771844845250777

In [103]:
relative_entropy(half_usa)

-1.0

In [55]:
pc.divide(pa.array([10, 20, 30]), 5.5)

<pyarrow.lib.DoubleArray object at 0x10a3dab00>
[
  1.8181818181818181,
  3.6363636363636362,
  5.454545454545454
]

In [105]:
relative_entropy(pa.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

-9.0