In [1]:
from VariationalBayes import VectorParam, ScalarParam, PosDefMatrixParam, ModelParamsDict
import math

from autograd import grad, hessian, jacobian, hessian_vector_product
from autograd.core import primitive
from autograd.numpy.numpy_grads import unbroadcast

import autograd.numpy as np
import autograd.numpy.random as npr

import copy
import scipy
from scipy import optimize
from scipy import stats

In [2]:
# Container for the variational approximation to a K-dimensional
# multivariate normal: a mean vector and a variance vector bounded below by 0.

K = 5
mvn_par = ModelParamsDict()

for vb_param in (VectorParam('e_mu', K), VectorParam('var_mu', K, lb=0)):
    mvn_par.push_param(vb_param)

# Starting values for the means and the variances.
mvn_par['e_mu'].set(np.full(K, 0.1))
mvn_par['var_mu'].set(np.full(K, 2.))



In [8]:
# Log1mInvLogit as a function of u, but with caching through the second and third arguments.

@primitive
def Log1mInvLogit(u, exp_u, logit_u):
    """Return -log(1 + exp_u).

    When ``exp_u`` equals exp(u) this is log(1 - invlogit(u)).  Only the
    cached ``exp_u`` is read; ``u`` and ``logit_u`` are accepted but unused
    in the body.
    """
    log_one_plus_exp_u = np.log1p(exp_u)
    return -log_one_plus_exp_u
    
@primitive
def Log1mInvLogitDerivative(u, exp_u, logit_u):
    """Return the negation of the cached ``logit_u``.

    NOTE(review): this equals d/du of -log(1 + exp(u)) only when ``logit_u``
    holds exp(u) / (1 + exp(u)) -- presumably what callers pass; confirm.
    ``u`` and ``exp_u`` are accepted but unused in the body.
    """
    negated = -logit_u
    return negated

@primitive
def Log1mInvLogitSecondDerivative(u, exp_u, logit_u):
    """Return -logit_u * (1 - logit_u) from the cached ``logit_u``.

    NOTE(review): this equals the second derivative in u of
    -log(1 + exp(u)) only when ``logit_u`` holds exp(u) / (1 + exp(u)) --
    presumably what callers pass; confirm.  ``u`` and ``exp_u`` are accepted
    but unused in the body.
    """
    complement = 1 - logit_u
    return -logit_u * complement



    
    

In [9]:
# Generate data: N covariate rows with entries uniform on [-0.5, 0.5) and
# 0/1 responses with P(y = 1) = invlogit(x . true_mu).

N = 2000
true_mu = np.random.rand(K) - 0.5
x_mat = np.random.random((N, K)) - 0.5

# The inverse logit is written out here because no `Logistic` helper is
# defined in this notebook; relying on one only worked with leftover kernel
# state.
y_prob = 1. / (1. + np.exp(-np.dot(x_mat, true_mu)))

# One uniform draw per row, compared against that row's success probability.
# Stored as floats (0. / 1.), as the rest of the notebook expects.
y_vec = (np.random.random(N) < y_prob).astype(float)


In [14]:
# Define the variational objective
def LogLikelihood(x_row, y, e_mu, mu_var, std_draws):
    """Approximate expected log likelihood of one 0/1 observation under q().

    With rho = P(y = 1) and logit(rho) = x_row . mu, the log likelihood is
    y * logit(rho) + log(1 - rho).  Under q(), logit(rho) is normal with mean
    x_row . e_mu and variance sum(x_row**2 * mu_var).  The expectation of
    log(1 - rho) is approximated by averaging over `std_draws`.

    Args:
        x_row: Covariate vector for the observation.
        y: Observed response.
        e_mu: Variational means of the coefficients.
        mu_var: Variational variances of the coefficients.
        std_draws: Sized iterable of standard-normal points to average over.

    Returns:
        y * E[logit(rho)] plus the averaged log(1 - rho).
    """
    rho_mean = np.dot(x_row, e_mu)
    rho_sd = np.sqrt(np.sum(x_row * x_row * mu_var))
    e_log_1mrho = 0.
    for std_draw in std_draws:
        logit_rho = std_draw * rho_sd + rho_mean
        # log(1 - invlogit(u)) = -log(1 + exp(u)).  Computed inline: the
        # Log1mInvLogit primitive defined in this notebook takes three
        # arguments, so the former one-argument call could not run.
        e_log_1mrho += -np.log1p(np.exp(logit_rho))
    e_log_1mrho /= len(std_draws)
    return y * rho_mean + e_log_1mrho


def UnivariateNormalExpectedEntropy(var_mu):
    """Return half the natural log of the variance `var_mu`.

    This is the entropy of a univariate normal with variance `var_mu`, minus
    the additive constant 0.5 * log(2 * pi * e), which does not depend on
    the variance.
    """
    log_variance = np.log(var_mu)
    return log_variance * 0.5


def Elbo(y_vec, x_mat, mvn_par_elbo, num_draws=10):
    """Evidence lower bound: summed expected log likelihood plus entropy.

    Args:
        y_vec: Responses, one per row of `x_mat`.
        x_mat: Covariate matrix with one row per observation.
        mvn_par_elbo: Parameter container; `['e_mu'].get()` and
            `['var_mu'].get()` must return the variational means and
            variances.
        num_draws: Number of standard-normal points used to approximate the
            expectation inside LogLikelihood.

    Returns:
        The sum of LogLikelihood over all rows plus the sum of
        UnivariateNormalExpectedEntropy over the entries of var_mu.
    """
    var_mu = mvn_par_elbo['var_mu'].get()
    e_mu = mvn_par_elbo['e_mu'].get()

    # Deterministic "draws": standard-normal quantiles at num_draws evenly
    # spaced probabilities strictly inside (0, 1).  The argument is honoured
    # here; it used to be overwritten with a hard-coded 10.
    draw_spacing = 1 / float(num_draws + 1)
    target_quantiles = np.linspace(draw_spacing, 1 - draw_spacing, num_draws)
    std_draws = scipy.stats.norm.ppf(target_quantiles)

    assert y_vec.size == x_mat.shape[0]
    assert e_mu.size == x_mat.shape[1]

    ll = 0
    for n in range(y_vec.size):
        ll += LogLikelihood(x_mat[n, :], y_vec[n], e_mu, var_mu, std_draws)

    entropy = sum([UnivariateNormalExpectedEntropy(var_mu_k) for var_mu_k in var_mu])

    return ll + entropy


class KLWrapper(object):
    """Expose the objective and a posterior moment as functions of a free vector.

    The wrapper keeps its own deep copy of the parameter container, so
    evaluating it never modifies the container passed to the constructor.
    """

    def __init__(self, mvn_par, x_mat, y_vec, num_draws):
        """Store the data and a private deep copy of `mvn_par`.

        Args:
            mvn_par: Parameter container supporting `set_free` and
                `['e_mu'].get()`.
            x_mat: Covariate matrix passed through to Elbo.
            y_vec: Response vector passed through to Elbo.
            num_draws: Forwarded to Elbo as its `num_draws` argument.
        """
        self.__mvn_par_ad = copy.deepcopy(mvn_par)
        self.x_mat = x_mat
        self.y_vec = y_vec
        self.num_draws = num_draws

    def Eval(self, free_par_vec, verbose=False):
        """Return the negative ELBO at the free parameter vector.

        When `verbose` is true the value is also printed.
        """
        self.__mvn_par_ad.set_free(free_par_vec)
        kl = -Elbo(self.y_vec, self.x_mat, self.__mvn_par_ad, num_draws=self.num_draws)
        if verbose:
            # Parenthesised single-argument form prints identically on
            # Python 2 and is valid syntax on Python 3.
            print(kl)
        return kl

    # Return a posterior moment of interest as a function of
    # unconstrained parameters.  In this case it is a bit silly,
    # but in full generality posterior moments may be a complicated
    # function of moment parameters.
    def GetMu(self, free_par_vec):
        """Return `e_mu` after loading `free_par_vec` into the private copy."""
        self.__mvn_par_ad.set_free(free_par_vec)
        return self.__mvn_par_ad['e_mu'].get()

    
# Bind the objective to the data (with num_draws=10) and build autograd
# derivative functions of the free parameter vector.
kl_wrapper = KLWrapper(mvn_par, x_mat, y_vec, 10)
# Gradient and Hessian of the objective returned by Eval.
KLGrad = grad(kl_wrapper.Eval)
KLHess = hessian(kl_wrapper.Eval)
# Jacobian of the moment e_mu with respect to the free parameters.
MomentJacobian = jacobian(kl_wrapper.GetMu)
# Hessian-vector product; called below as KLHessVecProd(free_par_vec, vector).
KLHessVecProd = hessian_vector_product(kl_wrapper.Eval)

In [15]:
# Check that the AD functions are working:
mvn_par['e_mu'].set(true_mu)
mvn_par['var_mu'].set(np.abs(true_mu) * 0.1)
free_par_vec = mvn_par.get_free()
print(kl_wrapper.Eval(free_par_vec))
if K < 10:
    # The derivative outputs are only printed for small K.
    print(KLGrad(free_par_vec))
    print(KLHess(free_par_vec))
    print(MomentJacobian(free_par_vec))
    print(KLHessVecProd(free_par_vec, free_par_vec + 1))

2579.63452921
[-115.07877279  -85.3541339   -89.38439757  -77.76775508 -103.60087118
    6.69562197   10.34572124   10.86197333    4.49925244    9.70880184]
[[  2.04679907e+01  -4.13480773e+00  -3.62714885e+00  -4.67434094e+00
   -3.62158347e+00   6.37240467e-01   4.27846693e-01   4.79172288e-01
    1.74230068e-01   3.79573780e-01]
 [ -4.13480773e+00   2.19003055e+01  -2.49240733e+00  -1.89805624e+00
   -1.75929790e+00   2.04668793e-01   6.97109712e-01   4.05351554e-01
    1.58815788e-01   2.97076745e-01]
 [ -3.62714885e+00  -2.49240733e+00   2.23301967e+01  -2.60297719e+00
   -2.37872576e+00   1.72397323e-01   3.08149750e-01   5.98881571e-01
    1.61331665e-01   2.93191353e-01]
 [ -4.67434094e+00  -1.89805624e+00  -2.60297719e+00   2.10269172e+01
   -2.99783040e+00   2.14655270e-01   3.93053296e-01   3.95711575e-01
    3.38140342e-01   3.22622279e-01]
 [ -3.62158347e+00  -1.75929790e+00  -2.37872576e+00  -2.99783040e+00
    2.16954031e+01   2.02499307e-01   3.72006603e-01   3.84069508

In [24]:
import timeit

time_num = 3

# (label, zero-argument callable) pairs, timed in the order listed.
timed_calls = [
    ('Function time:', lambda: kl_wrapper.Eval(free_par_vec)),
    ('Grad time:', lambda: KLGrad(free_par_vec)),
    ('Hessian vector product time:',
     lambda: KLHessVecProd(free_par_vec, free_par_vec + 1)),
]
if K < 10:
    # The dense Hessian is only timed for small K.
    timed_calls.append(('Hessian time:', lambda: KLHess(free_par_vec)))

for label, call in timed_calls:
    print(label)
    # Mean seconds per call over time_num repetitions.
    print(timeit.timeit(call, number=time_num) / time_num)


Function time:
0.116110960642
Grad time:
3.5185303688
Hessian vector product time:
7.93109003703
Hessian time:
40.0598433812


In [18]:
import cProfile
# Profile one objective evaluation and write the raw stats to the file that
# the pstats cell reads.
cProfile.run('kl_wrapper.Eval(free_par_vec)', '/tmp/cprofilestats')
# Alternative target: profile the gradient instead of the objective.
# cProfile.run('KLGrad(free_par_vec)', '/tmp/cprofilestats')

In [23]:
import pstats
# Load the stats file written by the cProfile cell.
p = pstats.Stats('/tmp/cprofilestats')
# Alternative report: per-function totals instead of callers.
# p.strip_dirs().sort_stats('cumulative').print_stats(100)
# Sort by cumulative time and print the callers of the first 100 entries.
p.strip_dirs().sort_stats('cumulative').print_callers(100)

   Ordered by: cumulative time
   List reduced from 170 to 100 due to restriction <100>

Function                                          was called by...
                                                      ncalls  tottime  cumtime
<string>:1(<module>)                              <-
convenience_wrappers.py:21(gradfun)               <-       1    0.000    8.578  <string>:1(<module>)
core.py:18(<lambda>)                              <-       1    0.047    4.564  convenience_wrappers.py:21(gradfun)
core.py:31(backward_pass)                         <-       1    0.628    4.517  core.py:18(<lambda>)
core.py:59(__call__)                              <-    6000    0.023    0.066  <ipython-input-14-eb31a6dbdeac>:2(LogLikelihood)
                                                           5    0.000    0.000  <ipython-input-14-eb31a6dbdeac>:14(UnivariateNormalExpectedEntropy)
                                                           7    0.000    0.000  <ipython-input-14-eb31a6dbdeac>:18(El

<pstats.Stats instance at 0x7fde598b87a0>

In [None]:
# Set initial values.

# Reference values to compare the fitted variational means against.
# NOTE(review): `x_draws` is not defined anywhere in this notebook, so the
# former `reduce(...) / N` average could not run.  The coefficients used to
# simulate the data are the comparable reference here -- confirm intent.
true_means = true_mu

mvn_par['e_mu'].set(np.full(K, 1.0))
init_par_vec = mvn_par.get_free()

In [None]:
# Optimize.

# First pass: BFGS with the autograd gradient.
print('Running BFGS')
vb_opt_bfgs = optimize.minimize(
    lambda par: kl_wrapper.Eval(par, verbose=True), init_par_vec,
    method='bfgs', jac=KLGrad, tol=1e-6)

# Second pass: trust-region Newton-CG started from the BFGS solution, using
# the autograd gradient and Hessian.
print('Running Newton Trust Region')
vb_opt = optimize.minimize(
    lambda par: kl_wrapper.Eval(par, verbose=True),
    vb_opt_bfgs.x, method='trust-ncg', jac=KLGrad, hess=KLHess)

# Load the optimum into a separate copy so mvn_par itself is left untouched.
mvn_par_opt = copy.deepcopy(mvn_par)
mvn_par_opt.set_free(vb_opt.x)
print('Done.')

In [None]:
# The mean parameters match, as expected.
print(mvn_par_opt['e_mu'])
print(true_means)

In [None]:
# LRVB: covariance of e_mu computed as J H^{-1} J^T, where J is the moment
# Jacobian and H the Hessian of the objective, both evaluated at the optimum.
moment_jac = MomentJacobian(vb_opt.x)
opt_hess = KLHess(vb_opt.x)
mu_cov = np.matmul(moment_jac, np.linalg.solve(opt_hess, moment_jac.T))

# The VB variance is underestimated.
print(np.diag(mu_cov))
print(mvn_par_opt['var_mu'])