In [8]:
import numpy as np

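The Hellinger distance measures how dissimilar two probability distributions are. It is proportional to the Euclidean distance between the element-wise square roots of the two probability vectors, so it is symmetric and bounded between 0 (identical distributions) and 1 (distributions with disjoint support). It is closely related to, but not the same as, the Bhattacharyya distance: $D(P,Q)^2 = 1 - BC(P,Q)$, where $BC(P,Q) = \sum_i \sqrt{p_i q_i}$ is the Bhattacharyya coefficient. The Hellinger distance is used to compare two probability distributions in a variety of applications, including clustering, classification and pattern recognition.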

$$ \mathit{D(P,Q)} = \frac{1}{\sqrt{2}} \sqrt{\sum_{i} \left(\sqrt{p_i} - \sqrt{q_i}\right)^2} $$



where $P$ and $Q$ are two discrete probability distributions over the same set of outcomes, $p_i$ and $q_i$ are the $i$-th elements of $P$ and $Q$ respectively, and $\mathit{D(P,Q)}$ is the Hellinger distance between them.

In [9]:
def H(p, q):
    """Return the Hellinger distance between two discrete distributions.

    Computes ``sqrt(sum((sqrt(p) - sqrt(q))**2) / 2)``. The result is
    symmetric in its arguments and lies in ``[0, 1]`` when ``p`` and ``q``
    each sum to 1.

    Parameters
    ----------
    p, q : array_like
        Non-negative probability values. The two inputs must have the same
        shape or be broadcastable against each other.

    Returns
    -------
    numpy.float64
        The Hellinger distance.

    Raises
    ------
    ValueError
        If either input contains a negative entry (the square root would
        otherwise silently produce NaN).
    """
    # Work in float64 regardless of the input dtype: sqrt(p) - sqrt(q)
    # cancels badly in float32 when the two distributions are close.
    p = np.asarray(p, dtype=np.float64)
    q = np.asarray(q, dtype=np.float64)
    if np.any(p < 0) or np.any(q < 0):
        raise ValueError("Hellinger distance requires non-negative inputs")
    total = np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)
    # sqrt(total / 2) equals (1 / sqrt(2)) * sqrt(total) but needs one fewer
    # rounding step, so disjoint distributions give exactly 1.0.
    return np.sqrt(0.5 * total)

In [10]:
# Example discrete distributions over the same three outcomes; each sums to 1.
p = np.asarray([0.36, 0.48, 0.16], dtype="float32")
q = np.asarray([0.30, 0.50, 0.20], dtype="float32")

In [11]:
# Evaluate the distance with the arguments in both orders.
h_pq, h_qp = H(p, q), H(q, p)

In [12]:
# Report the distance for both argument orders, rounded to five decimals.
print("Hellinger's distance from formula")
print(f"H(P,Q) dist = {np.round(h_pq, 5)}")
print(f"H(Q,P) dist = {np.round(h_qp, 5)}")

Hellinger's distance from formula
H(P,Q) dist = 0.05082
H(Q,P) dist = 0.05082
