This notebook implements the Bayesian GP Latent Variable Model introduced in the paper by Titsias and Lawrence, "Bayesian Gaussian Process Latent Variable Model" (2010), http://proceedings.mlr.press/v9/titsias10a/titsias10a.pdf.

The main difference from the previous notebook is the switch to TensorFlow for model training, i.e. for optimizing the variational bound on the log-likelihood.

In [None]:
import numpy as np
import scipy as sp
import tensorflow as tf
import math
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

## Creating a 2D signal

For simplicity, I generate the signal over a 2D domain using an ARD kernel (it is the default kernel for the variational distribution in the base paper).

In [None]:
# Scale of the i.i.d. Gaussian noise added on top of the sampled signal
SIGMA_OBS = 0.1
# Amplitude of the ARD kernel; it enters the covariance squared
ARD_KERNEL_SIGMA = 1
# ARD weights applied to the squared differences along x0 and x1 respectively
ARD_WEIGHT_0, ARD_WEIGHT_1 = 2, 10

In [None]:
# Draw 1000 random input locations in the 2D domain (one coordinate array per dimension)
x0 = np.random.randn(1000)
x1 = np.random.randn(1000)

# Square matrices of coordinates: every row of x0_matrix is a copy of x0, so
# (x0_matrix - x0_matrixT)[i, j] = x0[j] - x0[i]; the same holds for x1
x0_matrix = np.tile(x0, len(x0)).reshape((len(x0), len(x0)))
x0_matrixT = x0_matrix.transpose()
x1_matrix = np.tile(x1, len(x1)).reshape((len(x1), len(x1)))
x1_matrixT = x1_matrix.transpose()

# ARD kernel matrix over all pairs of input locations
y_K = np.exp(- ARD_WEIGHT_0 * (x0_matrix - x0_matrixT) ** 2 / 2
             - ARD_WEIGHT_1 * (x1_matrix - x1_matrixT) ** 2 / 2 ) * ARD_KERNEL_SIGMA ** 2

# Add jitter to the diagonal until the Cholesky factorisation succeeds.
# Only LinAlgError (matrix not positive definite) triggers a retry; any other
# exception, including KeyboardInterrupt, propagates instead of looping forever.
while True:
    try:
        y_K_cholesky = np.linalg.cholesky(y_K)
        break
    except np.linalg.LinAlgError:
        y_K += 0.0001 * np.eye(len(x0))

# Sample the signal: correlated GP draw (Cholesky factor times white noise) plus observation noise.
# The factor computed in the loop is reused rather than factorising y_K a second time.
y = np.dot(y_K_cholesky, np.random.randn(len(x0)).reshape(-1, 1)) + SIGMA_OBS * np.random.randn(len(x0)).reshape(-1, 1)

In [None]:
# 3D surface plot of the observed signal over the (x0, x1) input locations
fig = plt.figure()
# Create the 3D axes through the figure: constructing Axes3D(fig) directly no longer
# attaches the axes to the figure in recent matplotlib releases, which yields an empty plot.
# The '3d' projection is registered by the mpl_toolkits.mplot3d import at the top of the notebook.
ax = fig.add_subplot(111, projection='3d')
surf = ax.plot_trisurf(x0, x1, y.reshape(-1), cmap=cm.jet, linewidth=0.1)
fig.colorbar(surf, shrink=0.5, aspect=5)

plt.title("Observed signal")
plt.show()

## Defining Latent Variable Model

Setting the environment

In [None]:
# Clear TensorFlow's default graph, so the model cells below add their ops to a fresh graph
# (presumably to allow re-running this section without accumulating duplicate ops)
tf.reset_default_graph()

In [None]:
# Number of observations (rows of the variational mean / variance tables and of y_obs)
N_OBS = 1000
# Number of rows in the `latent_variables` table
N_LATENT_VARIABLES = 25
# Dimensionality of the latent space (columns of the tables above)
N_LATENT_DIMS = 2

In [None]:
# Variational posterior over the latent inputs: one mean and one variance per observation and latent dimension
mu_variational = tf.Variable(
    initial_value=tf.random_normal(shape=[N_OBS, N_LATENT_DIMS]))
# The variance is the square of an unconstrained variable, which keeps it non-negative
var_variational = tf.Variable(
    initial_value=tf.random_normal(shape=[N_OBS, N_LATENT_DIMS])) ** 2
# Trainable table of latent points, one row per point
latent_variables = tf.Variable(
    initial_value=tf.random_normal(shape=[N_LATENT_VARIABLES, N_LATENT_DIMS]))
# Observed signal as a float32 column vector
y_obs = tf.constant(value=y, dtype=tf.float32, shape=(N_OBS, 1))

In [None]:
# Trainable hyperparameters, all initialised to 1
# beta: enters the bound as the multiplier of the data-fit and trace terms
beta = tf.Variable(1.0)
# ard_sigma: kernel amplitude (used squared in K and the psi statistics)
ard_sigma = tf.Variable(1.0)
# ard_weights: one ARD weight per latent dimension; sized from N_LATENT_DIMS so that
# the reshapes to [..., N_LATENT_DIMS, ...] further down stay valid if the dimensionality changes
ard_weights = tf.Variable([1.0] * N_LATENT_DIMS)

Calculating the main auxiliary variables

In [None]:
# K is the ARD kernel matrix between all pairs of rows of `latent_variables`;
# it has [N_LATENT_VARIABLES, N_LATENT_VARIABLES] size.
# All operands are expanded to a [N_LATENT_VARIABLES, N_LATENT_DIMS, N_LATENT_VARIABLES] layout,
# and the weighted squared differences are then summed over the N_LATENT_DIMS axis (axis=1).
K_left_latent = tf.tile(tf.expand_dims(latent_variables, axis=-1), [1, 1, N_LATENT_VARIABLES])
K_right_latent = tf.tile(tf.expand_dims(tf.transpose(latent_variables), axis=0), [N_LATENT_VARIABLES, 1, 1])
K_expanded_weights = tf.reshape(ard_weights, shape=[1, N_LATENT_DIMS, 1])

K_diff = tf.reduce_sum(K_expanded_weights * (K_left_latent - K_right_latent) ** 2, axis=1)
K = ard_sigma ** 2 * tf.exp(-0.5 * K_diff)

# Function-call form of print: same output on Python 2, and valid syntax on Python 3
print(K.shape)

In [None]:
# psi_0 does not depend on the variational parameters: it is the squared kernel amplitude
# counted once per observation
psi_0 = (ard_sigma ** 2) * N_OBS

In [None]:
# The psi_1 table has [N_OBS, N_LATENT_VARIABLES] size
# All operands are expanded to a [N_OBS, N_LATENT_VARIABLES, N_LATENT_DIMS] layout
psi_1_tiled_mu = tf.tile(tf.expand_dims(mu_variational, axis=1), [1, N_LATENT_VARIABLES, 1])
psi_1_tiled_var = tf.tile(tf.expand_dims(var_variational, axis=1), [1, N_LATENT_VARIABLES, 1])
psi_1_tiled_latent = tf.tile(tf.expand_dims(latent_variables, axis=0), [N_OBS, 1, 1])
psi_1_expanded_weights = tf.reshape(ard_weights, shape=[1, 1, N_LATENT_DIMS])

# Per-dimension factor: exp(-0.5 * w * (mu - z)^2 / (w * var + 1)) / sqrt(w * var + 1);
# the factors are multiplied over the N_LATENT_DIMS axis (axis=2) to obtain the table
psi_1_norm = psi_1_tiled_var * psi_1_expanded_weights + 1
psi_1_diff = -0.5 * psi_1_expanded_weights * (psi_1_tiled_mu - psi_1_tiled_latent) ** 2 / psi_1_norm
psi_1 = ard_sigma ** 2 * tf.reduce_prod(tf.exp(psi_1_diff) / tf.sqrt(psi_1_norm), axis=2)
# Function-call form of print: same output on Python 2, and valid syntax on Python 3
print(psi_1.shape)

In [None]:
# The psi_2 table has [N_LATENT_VARIABLES, N_LATENT_VARIABLES] size
# All operands are expanded to a [N_LATENT_VARIABLES, N_OBS, N_LATENT_DIMS, N_LATENT_VARIABLES] layout;
# the construction multiplies over N_LATENT_DIMS (axis=2) and then sums over N_OBS (axis=1)
psi_2_left_latent = tf.tile(tf.expand_dims(tf.expand_dims(latent_variables, axis=1), axis=-1), [1, N_OBS, 1, N_LATENT_VARIABLES])
psi_2_right_latent = tf.tile(tf.expand_dims(tf.expand_dims(tf.transpose(latent_variables), axis=0), axis=0), [N_LATENT_VARIABLES, N_OBS, 1, 1])
psi_2_tiled_mu = tf.tile(tf.expand_dims(tf.expand_dims(mu_variational, axis=0), axis=-1), [N_LATENT_VARIABLES, 1, 1, N_LATENT_VARIABLES])
psi_2_tiled_var = tf.tile(tf.expand_dims(tf.expand_dims(var_variational, axis=0), axis=-1), [N_LATENT_VARIABLES, 1, 1, N_LATENT_VARIABLES])
psi_2_expanded_weights = tf.reshape(ard_weights, shape=[1, 1, N_LATENT_DIMS, 1])

# Per-dimension exponent: -w * (z_left - z_right)^2 / 4 - w * (mu - (z_left + z_right) / 2)^2 / (2 * w * var + 1),
# with each factor normalised by sqrt(2 * w * var + 1)
psi_2_norm = 2 * psi_2_tiled_var * psi_2_expanded_weights + 1
psi_2_diff = (-0.25 * psi_2_expanded_weights * (psi_2_left_latent - psi_2_right_latent) ** 2
              - psi_2_expanded_weights * (psi_2_tiled_mu - 0.5 * psi_2_left_latent - 0.5 * psi_2_right_latent) ** 2 / psi_2_norm)
psi_2 = ard_sigma ** 4 * tf.reduce_sum(tf.reduce_prod(tf.exp(psi_2_diff) / tf.sqrt(psi_2_norm), axis=2), axis=1)
# Function-call form of print: same output on Python 2, and valid syntax on Python 3
print(psi_2.shape)

Defining the variational bound on the log-likelihood (dropping additive constants)

In [None]:
# Variational lower bound (additive constants dropped):
#   bound = log_term + trace_term - KL(q(X) || p(X))

# Trace part: beta / 2 * tr(K^-1 psi_2) - beta / 2 * psi_0
trace_term = 0.5 * beta * tf.trace(tf.matmul(tf.matrix_inverse(K), psi_2)) - 0.5 * beta * psi_0

# W = beta * I - beta^2 * psi_1 (beta * psi_2 + K)^-1 psi_1^T
# The first summand must be beta times the identity matrix: a bare scalar would be
# broadcast onto every entry of the [N_OBS, N_OBS] matrix, not only onto its diagonal.
W = beta * tf.eye(N_OBS) - beta ** 2 * tf.matmul(tf.matmul(psi_1, tf.matrix_inverse(beta * psi_2 + K)), tf.transpose(psi_1))
# Data-fit part; the matmul chain makes this a [1, 1] tensor
log_term = (-0.5 * tf.matmul(tf.matmul(tf.transpose(y_obs), W), y_obs)
            + 0.5 * N_OBS * tf.log(beta)
            + 0.5 * tf.log(tf.matrix_determinant(K))
            - 0.5 * tf.log(tf.matrix_determinant(beta * psi_2 + K))
           )

# kl_term is calculated separately for each component of the variational posterior against the standard normal
# distribution (standard normal is the prior; the per-component treatment follows the mean-field approach)
# formula used: https://stats.stackexchange.com/questions/7440/kl-divergence-between-two-univariate-gaussians
#   KL(N(mu, var) || N(0, 1)) = -log(sqrt(var)) + var / 2 + mu^2 / 2 - 1 / 2   (non-negative)
kl_term = tf.reduce_sum(-tf.log(tf.sqrt(var_variational)) + 0.5 * var_variational + 0.5 * mu_variational ** 2 - 0.5)

# The KL divergence is a penalty, so it is subtracted: adding it would reward the optimiser
# for pushing the variational posterior away from the prior.
variational_bound = trace_term + log_term - kl_term
# Maximise the bound by minimising its negation
grad_step = tf.train.AdamOptimizer(1e-2).minimize(-variational_bound)

## Training the model

In [None]:
# Open a session and initialise every variable defined in the graph
sess = tf.Session()
sess.run(tf.global_variables_initializer())
N_GRAD_STEPS = 100
# Each iteration applies one Adam step and fetches the current bound and hyperparameters for monitoring
for _ in range(N_GRAD_STEPS):
    _, cur_bound, cur_beta, cur_sigma, cur_weights = sess.run([grad_step, variational_bound, beta, ard_sigma, ard_weights])
    # Single formatted string: prints the same space-separated values as the Python 2
    # print statement did, while also being valid syntax on Python 3
    print("%s %s %s %s" % (cur_bound, cur_beta, cur_sigma, cur_weights))