# Surprise
- Self-information
$$I(x)=\log\frac{1}{p(x)}$$

In [8]:
import numpy as np

np.set_printoptions(precision=4)

# Observed counts: three rows, two outcomes per row.
acc = np.array([[6, 1], [1, 10], [7, 7]])
print(f'counts each cases:\n{acc}')

# Each row is normalized by its own total (axis=1), so every row of `prob`
# sums to 1; keepdims keeps the totals as a column for broadcasting.
row_totals = acc.sum(axis=1, keepdims=True)
prob = acc / row_totals
print(f'probability:\n{prob}')

# Self-information in bits: I(x) = log2(1 / p(x)).
inverse_prob = 1 / prob
surp = np.log2(inverse_prob)
print(f'surprise:\n{surp}')

counts each cases:
[[ 6  1]
 [ 1 10]
 [ 7  7]]
probability:
[[0.8571 0.1429]
 [0.0909 0.9091]
 [0.5    0.5   ]]
surprise:
[[0.2224 2.8074]
 [3.4594 0.1375]
 [1.     1.    ]]


# Entropy
- Expected surprise
- Average self-information
- Uncertainty, disorderliness

$$
\begin{align}
H(x) &= \mathbb{E}[I(x)] \\
     &= \sum_xp(x)\log\frac{1}{p(x)} \\
\end{align}
$$

In [10]:
# Entropy per row = expected surprise: H = sum_x p(x) * I(x).
# An outcome with p(x) == 0 has infinite surprise, and 0 * inf evaluates to NaN,
# which would poison the whole row. By the convention 0 * log(1/0) = 0 such
# outcomes contribute nothing, so they are masked out before summing.
with np.errstate(invalid='ignore'):  # silence the 0 * inf warning; result is masked below
    weighted_surp = np.where(prob > 0, prob * surp, 0.0)
entropy = weighted_surp.sum(axis=1)
print('entropy:\n', entropy)

entropy:
 [0.5917 0.4395 1.    ]


# Cross Entropy
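- Measures the difference between two probability distributions $p$ and $q$: the expected surprise under $q$ when outcomes are actually drawn from $p$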

$$
\begin{align}
H(p, q) &= \mathbb{E}_p[\log\frac{1}{q(x)}] \\
     &= \sum_xp(x)\log\frac{1}{q(x)} \\
\end{align}
$$

- If $p$ and $q$ are the same distribution, the cross-entropy reduces to the entropy:
$$
\begin{align}
H(p, p) &= \mathbb{E}_p[\log\frac{1}{p(x)}] \\
     &= \sum_xp(x)\log\frac{1}{p(x)} \\
     &= H(p) \\
\end{align}
$$

- With the concept of Kullback-Leibler divergence,
- The cross-entropy is the sum of the entropy and KL-divergence
$$
\begin{align}
H(p, q) &= \mathbb{E}_p[\log\frac{1}{q(x)}] \\
     &= \sum_xp(x)\log\frac{1}{q(x)} \\
     &= \sum_xp(x)\log\left(\frac{\color{red}{p(x)}}{q(x)}\cdot\frac{1}{\color{red}{p(x)}}\right) \\
     &= \sum_xp(x)\left(\log\frac{p(x)}{q(x)}+\log\frac{1}{p(x)}\right) \\
     &= D_{KL}(p(x)||q(x)) + H(p) \\
D_{KL}(p(x)||q(x)) &= \mathbb{E}_p[\log\frac{p(x)}{q(x)}] \\
\end{align}
$$