In [1]:
import numpy as np
import tensorflow as tf

From Section 9 of http://ai.stanford.edu/~jduchi/projects/general_notes.pdf :


$D_{KL}\Big(P_1 \parallel P_2 \Big) = \{\dots\} = \frac{1}{2}\Big( \log \frac{\det\Sigma_2}{\det\Sigma_1} - n + tr{\Sigma^{-1}_2\Sigma_1}  + (\mu_2 - \mu_1) \Sigma^{-1}_2 (\mu_2 - \mu_1) \Big)$


$\log{\frac{\det\Sigma_2}{\det\Sigma_1}} = \sum_{i=1}^n{\big[\log{\sigma_{2i} - \log{\sigma_{1i}}} \big]}$

$\Sigma_2^{-1} =  diag{(\frac{1}{\sigma_{21}}, \dots, \frac{1}{\sigma_{2n}})}$

$ tr{\Sigma^{-1}_2\Sigma_1}  = \sum_{i=1}^n{\frac{\sigma_{1i}}{\sigma_{2i}}}$

$(\mu_2 - \mu_1) \Sigma^{-1}_2 (\mu_2 - \mu_1) = \sum_{i=1}^n{\frac{(\mu_{2i}-\mu_{1i})^2}{\sigma_{2i}}}$


In [26]:
def gaussian_kl_divergence_np(mu1, ln_var1, mu2, ln_var2): 
    
    shape = mu1.shape
    
    batch_size = shape[0]
    n = shape[1]
    
    log_var_diff = ln_var2 - ln_var1
    
    var_diff_trace = np.sum(np.exp(log_var_diff), axis=1)
    
    mudiff = np.sum(np.square(mu1-mu2) / np.exp(ln_var2), axis=1)
    
    kl_divs = 0.5*(np.sum(log_var_diff, axis=1) - n + var_diff_trace + mudiff)
    
    return np.sum(kl_divs) / batch_size


def gaussian_kl_divergence_tf(mu1, ln_var1, mu2, ln_var2): 
    
    shape = tf.to_float(tf.shape(mu1))
    
    batch_size = shape[0]
    n = shape[1]
    
    log_var_diff = ln_var2 - ln_var1
    
    var_diff_trace = tf.reduce_sum(tf.exp(log_var_diff), axis=1)
    
    mudiff = tf.reduce_sum(tf.square(mu1-mu2) / tf.exp(ln_var2), axis=1)
    
    kl_div = 0.5*(tf.reduce_sum(log_var_diff, axis=1) - n + var_diff_trace + mudiff)
    
    return tf.reduce_sum(kl_div) / batch_size

#### Sanity checks

$ p_1(x) \sim \mathcal{N}(0, 1)$


In [32]:
mu1_tf = tf.placeholder(tf.float32, shape=[None,3])
ln_var1_tf = tf.placeholder(tf.float32, shape=[None,3])
mu2_tf = tf.placeholder(tf.float32, shape=[None,3])
ln_var2_tf = tf.placeholder(tf.float32, shape=[None,3])

kl_div_tf = gaussian_kl_divergence_tf(mu1_tf, ln_var1_tf, mu2_tf, ln_var2_tf)

In [56]:
n = 15
k = 3
mu1 = np.zeros([n,k])
ln_var1 = np.ones([n,k])

$D_{KL}\Big(P_1 \parallel P_1 \Big) $ = 0

In [57]:
with tf.Session() as sess:
    print("KL (numpy): %f" % gaussian_kl_divergence_np(mu1, ln_var1, mu1, ln_var1))
    feed_dict = {mu1_tf: mu1, ln_var1_tf: ln_var1, mu2_tf: mu1, ln_var2_tf: ln_var1}
    print("KL (tensorflow): %f" % kl_div_tf.eval(feed_dict=feed_dict))

KL (numpy): 0.000000
KL (tensorflow): 0.000000


$ p_2(x) \sim \mathcal{N}(10, 1)$

$ p_3(x) \sim \mathcal{N}(5, 10)$

In [59]:
mu2 = np.zeros([n,k])+10
ln_var2 = np.ones([n,k])
mu3 = np.zeros([n,k])+5
ln_var3 = np.ones([n,k])*np.log(10)

$D_{KL}\Big(P_1 \parallel P_2 \Big)  > D_{KL}\Big(P_1 \parallel P_3 \Big)$

In [60]:
with tf.Session() as sess:
    print("KL (numpy): %f" % gaussian_kl_divergence_np(mu1, ln_var1, mu2, ln_var2))
    feed_dict = {mu1_tf: mu1, ln_var1_tf: ln_var1, mu2_tf: mu2, ln_var2_tf: ln_var2}
    print("KL (tensorflow): %f" % kl_div_tf.eval(feed_dict=feed_dict))

KL (numpy): 55.181916
KL (tensorflow): 55.181919


In [61]:
with tf.Session() as sess:
    print("KL (numpy): %f" % gaussian_kl_divergence_np(mu1, ln_var1, mu3, ln_var3))
    feed_dict = {mu1_tf: mu1, ln_var1_tf: ln_var1, mu2_tf: mu3, ln_var2_tf: ln_var3}
    print("KL (tensorflow): %f" % kl_div_tf.eval(feed_dict=feed_dict))

KL (numpy): 9.722069
KL (tensorflow): 9.722071


In [65]:
mu4 = np.zeros([n,k])+0.1
ln_var4 = np.ones([n,k])

In [66]:
with tf.Session() as sess:
    print("KL (numpy): %f" % gaussian_kl_divergence_np(mu1, ln_var1, mu4, ln_var4))
    feed_dict = {mu1_tf: mu1, ln_var1_tf: ln_var1, mu2_tf: mu4, ln_var2_tf: ln_var4}
    print("KL (tensorflow): %f" % kl_div_tf.eval(feed_dict=feed_dict))

KL (numpy): 0.005518
KL (tensorflow): 0.005518


In [67]:
with tf.Session() as sess:
    feed_dict = {mu1_tf: mu1, ln_var1_tf: ln_var1, mu2_tf: mu4, ln_var2_tf: ln_var4}
    kl = kl_div_tf.eval(feed_dict=feed_dict)