In [11]:
from VariationalBayes import VectorParam, ScalarParam, PosDefMatrixParam, ModelParamsDict
from autograd import grad, hessian, jacobian, hessian_vector_product
import math
import autograd.numpy as np
import autograd.numpy.random as npr
import copy
import scipy
from scipy import optimize
from scipy import stats

In [7]:
# Build an object to contain a variational approximation to a K-dimensional multivariate normal.

K = 30
mvn_par = ModelParamsDict()

mvn_par.push_param(VectorParam('e_mu', K))
mvn_par.push_param(VectorParam('var_mu', K, lb=0))

mvn_par['e_mu'].set(np.full(K, 0.1))
mvn_par['var_mu'].set(np.full(K, 2.))



0.622459331202
[ 0.95257413  0.98201379]


array([-0.01814993, -0.04858735])

In [None]:
@primitive
def Logistic(rho):
    exp_rho = np.exp(rho) 
    return exp_rho / (1 + exp_rho)

@primitive
def LogisticGradient(logit_rho):
    return logit_rho * (1 - logit_rho)

def LogisticHessian(logit_rho_gradient, logit_rho):
    return logit_rho_gradient * (1 - 2 * logit_rho)

def Logistic_vjp(g, ans, vs, gvs, x):
    return np.full(x.shape, g) * LogisticGradient(ans)

def LogisticGradient_vjp(g, ans, vs, gvs, x):
    return np.full(x.shape, g) * LogisticGradient(ans)

print Logistic(0.5)
print Logistic(np.array([3., 4.]))

# From Stan:
#     inline double log1m_inv_logit(double u) {
#       using std::exp;
#       if (u > 0.0)
#         return -u - log1p(exp(-u));  // prevent underflow
#       return -log1p(exp(u));
#     }

def Log1mInvLogit(u):
    return -np.log1p(np.exp(-u))
    
Log1mInvLogit(5.0)
Log1mInvLogit(np.array([4., 3.]))


In [8]:
# Generate data

N = 20000
true_mu = np.random.rand(K).T - 0.5
x_mat = np.full([N, K], float('nan'))
y_vec = np.full([N], float('nan'))
for n in range(N):
    x_mat[n, :] = np.random.random(K) - 0.5
    y_vec[n] = np.random.random(1) < Logistic(np.dot(x_mat[n, :], true_mu))


In [12]:
# Define the variational objective
def LogLikelihood(x_row, y, e_mu, mu_var, std_draws):
    # logit(rho) is the probability of y being 1, which has a normal distribution under q().
    rho_mean = np.dot(x_row, e_mu)
    rho_sd = np.sqrt(np.sum(x_row * x_row * mu_var))
    # e_log_1mrho = np.mean([ Log1mInvLogit(std_draw * rho_sd + rho_mean) for std_draw in std_draws ])
    e_log_1mrho = np.mean(Log1mInvLogit(std_draws * rho_sd + rho_mean))
    return y * rho_mean + e_log_1mrho


def UnivariateNormalExpectedEntropy(var_mu):
    return 0.5 * np.log(var_mu)


def Elbo(y_vec, x_mat, mvn_par_elbo, num_draws=10):
    var_mu = mvn_par_elbo['var_mu'].get()
    e_mu = mvn_par_elbo['e_mu'].get()

    num_draws = 10
    draw_spacing = 1 / float(num_draws + 1)
    target_quantiles = np.linspace(draw_spacing, 1 - draw_spacing, num_draws)
    std_draws = scipy.stats.norm.ppf(target_quantiles)

    assert y_vec.size == x_mat.shape[0]
    assert e_mu.size == x_mat.shape[1]

    ll = 0
    for n in range(y_vec.size):
        ll += LogLikelihood(x_mat[n, :], y_vec[n], e_mu, var_mu, std_draws)

    entropy = sum([ UnivariateNormalExpectedEntropy(var_mu_k) for var_mu_k in var_mu])

    return ll + entropy


class KLWrapper():
    def __init__(self, mvn_par, x_mat, y_vec, num_draws):
        self.__mvn_par_ad = copy.deepcopy(mvn_par)
        self.x_mat = x_mat
        self.y_vec = y_vec
        self.num_draws = num_draws
        
    def Eval(self, free_par_vec, verbose=False):
        self.__mvn_par_ad.set_free(free_par_vec)
        kl = -Elbo(self.y_vec, self.x_mat, self.__mvn_par_ad, num_draws=self.num_draws)
        if verbose: print kl
        return kl
    
    # Return a posterior moment of interest as a function of
    # unconstrained parameters.  In this case it is a bit silly,
    # but in full generality posterior moments may be a complicated
    # function of moment parameters.
    def GetMu(self, free_par_vec):
        self.__mvn_par_ad.set_free(free_par_vec)
        return self.__mvn_par_ad['e_mu'].get()

    
kl_wrapper = KLWrapper(mvn_par, x_mat, y_vec, 10)
KLGrad = grad(kl_wrapper.Eval)
KLHess = hessian(kl_wrapper.Eval)
MomentJacobian = jacobian(kl_wrapper.GetMu)
KLHessVecProd = hessian_vector_product(kl_wrapper.Eval)  

In [13]:
# Check that the AD functions are working:
mvn_par['e_mu'].set(true_mu)
mvn_par['var_mu'].set(np.abs(true_mu) * 0.1)
free_par_vec = mvn_par.get_free()
print kl_wrapper.Eval(free_par_vec)
if K < 10:
    print KLGrad(free_par_vec)
    print KLHess(free_par_vec)
    print MomentJacobian(free_par_vec)
    print KLHessVecProd(free_par_vec, free_par_vec + 1)

57330.7052117


In [14]:
import timeit

time_num = 10

print 'Function time:'
print timeit.timeit(lambda: kl_wrapper.Eval(free_par_vec), number=time_num) / time_num

print 'Grad time:'
print timeit.timeit(lambda: KLGrad(free_par_vec), number=time_num) / time_num

print 'Hessian vector product time:'
print timeit.timeit(lambda: KLHessVecProd(free_par_vec, free_par_vec + 1), number=time_num) / time_num

if K < 10:
    print 'Hessian time:'
    print timeit.timeit(lambda: KLHess(free_par_vec), number=time_num) / time_num


Function time:
0.469360494614
Grad time:
7.69713962078
Hessian vector product time:
20.4013067961


In [None]:
# Set initial values.

# Is there not a better way than reduce?
true_means = reduce(lambda x, y: x + y, x_draws) / N

mvn_par['e_mu'].set(np.full(K, 1.0))
init_par_vec = mvn_par.get_free()

In [None]:
# Optimize.

print 'Running BFGS'
vb_opt_bfgs = optimize.minimize(
    lambda par: kl_wrapper.Eval(par, verbose=True), init_par_vec,
    method='bfgs', jac=KLGrad, tol=1e-6)
print 'Running Newton Trust Region'
vb_opt = optimize.minimize(
    lambda par: kl_wrapper.Eval(par, verbose=True),
    vb_opt_bfgs.x, method='trust-ncg', jac=KLGrad, hess=KLHess)
mvn_par_opt = copy.deepcopy(mvn_par)
mvn_par_opt.set_free(vb_opt.x)
print 'Done.'

In [None]:
# The mean parameters match, as expected.
print mvn_par_opt['e_mu']
print true_means

In [None]:
# LRVB
moment_jac = MomentJacobian(vb_opt.x)
opt_hess = KLHess(vb_opt.x)
mu_cov = np.matmul(moment_jac, np.linalg.solve(opt_hess, moment_jac.T))

# The VB variance is underestimated.
print np.diag(mu_cov)
print mvn_par_opt['var_mu']