In [1]:
from VariationalBayes.Models.LinearMixedModel.ConjugateModel import \
    get_base_prior_parameters, get_base_parameters, get_base_moment_parameters, set_moments, \
    LMMDataCache, CoordinateAscentUpdater, KLWrapper, MomentWrapper, ASISCoordinateAscent

from autograd import grad, hessian, jacobian, hessian_vector_product
import autograd.numpy as np
import autograd.numpy.random as npr
import autograd.scipy as asp
import scipy as sp

import copy
from scipy import optimize

import time

In [2]:
# Load data saved by stan_results_to_json.R and run_stan.R in LRVBLogitGLMM.
import os
import json

# Simulate data
N = 100     # observations per group
K = 4      # dimension of regressors
NG = 300      # number of groups

# Generate data
NObs = NG * N
true_beta = np.array(range(K))
true_beta = true_beta - np.mean(true_beta)
true_y_info = 1.0

true_mu = 1.0
true_mu_info = 4.0
true_u_sufficient = np.random.normal(true_mu, 1 / np.sqrt(true_mu_info), NG)
true_u_ancillary = true_u_sufficient - true_mu

x_mat = np.random.random(K * NObs).reshape(NObs, K) - 0.5
x_rot = np.full((K, K), 0.5)
for k in range(K):
    x_rot[k, k] = 1.0
x_mat = np.matmul(x_mat, x_rot)

y_g_vec = np.array([ g for g in range(NG) for n in range(N) ])
true_mean = np.matmul(x_mat, true_beta) + true_u_sufficient[y_g_vec]
y_vec = np.random.normal(true_mean, 1 / np.sqrt(true_y_info), NG * N)

print N * NG

30000


In [3]:
# Whether or not you are optimizing a sufficient or ancillary model.
sufficient = False

prior_par = get_base_prior_parameters(K)
lmm_par = get_base_parameters(K, NG, sufficient=sufficient)
moment_par = get_base_moment_parameters(K, NG, sufficient=sufficient)
data_cache = LMMDataCache(x_mat, y_vec, y_g_vec)

set_moments(lmm_par, moment_par)
init_moment_par = copy.deepcopy(moment_par)
init_par_vec = lmm_par.get_free()

moment_indices = copy.deepcopy(moment_par)
moment_indices.set_vector(np.array(range(moment_indices.vector_size())))

In [4]:
# Perform coordinate ascent.
ca_updater = CoordinateAscentUpdater(init_moment_par, data_cache, prior_par, sufficient=sufficient)

ca_time = time.time()
tol = 1e-9
delta = float('inf')
i = 0
max_iters = 5000
while delta > tol and i < max_iters:
    i += 1
    delta = ca_updater.update()
    if i % 5 == 0:
        print 'Iter %d, delta %f' % (i, delta)

if i < max_iters:
    ca_time = time.time() - ca_time
else:
    print 'Coordinate ascent did not converge.'
    ca_time = float('inf')

ca_moment_par_opt = copy.deepcopy(ca_updater.moment_par)

print 'Coordinate ascent time (in seconds): %f, iters: %d' % (ca_time, i)

Iter 5, delta 0.009902
Iter 10, delta 0.005716
Iter 15, delta 0.004646
Iter 20, delta 0.003777
Iter 25, delta 0.003070
Iter 30, delta 0.002496
Iter 35, delta 0.002029
Iter 40, delta 0.001649
Iter 45, delta 0.001340
Iter 50, delta 0.001090
Iter 55, delta 0.000886
Iter 60, delta 0.000720
Iter 65, delta 0.000585
Iter 70, delta 0.000476
Iter 75, delta 0.000387
Iter 80, delta 0.000314
Iter 85, delta 0.000256
Iter 90, delta 0.000208
Iter 95, delta 0.000169
Iter 100, delta 0.000137
Iter 105, delta 0.000112
Iter 110, delta 0.000091
Iter 115, delta 0.000074
Iter 120, delta 0.000060
Iter 125, delta 0.000049
Iter 130, delta 0.000040
Iter 135, delta 0.000032
Iter 140, delta 0.000026
Iter 145, delta 0.000021
Iter 150, delta 0.000017
Iter 155, delta 0.000014
Iter 160, delta 0.000011
Iter 165, delta 0.000009
Iter 170, delta 0.000008
Iter 175, delta 0.000006
Iter 180, delta 0.000005
Iter 185, delta 0.000004
Iter 190, delta 0.000003
Iter 195, delta 0.000003
Iter 200, delta 0.000002
Iter 205, delta 0.00

In [5]:
# Sanity check
asis_updater = ASISCoordinateAscent(moment_par, data_cache, prior_par)
asis_updater.update()

print asis_updater.ca_updater_sufficient.moment_par['e_u'].get()[0:10]
print asis_updater.ca_updater_ancillary.moment_par['e_u'].get()[0:10]
print asis_updater.ca_updater_sufficient.moment_par['e_mu'].get()
print asis_updater.ca_updater_ancillary.moment_par['e_mu'].get()

[ 0.35886896  0.70614818  0.94900817  1.65293124  1.10183254  1.16019296
  1.73666465  1.76863363  1.35841258  0.85290213]
[ 0.29811471  0.65621773  0.90669686  1.6324248   1.06420746  1.12434608
  1.71874874  1.75169979  1.32874683  0.80750303]
[ 0.97939297]
[-0.03420781]


In [6]:
asis_updater = ASISCoordinateAscent(moment_par, data_cache, prior_par)

asis_time = time.time()
tol = 1e-9
delta = float('inf')
i = 0
max_iters = 5000
while delta > tol and i < max_iters:
    i += 1
    delta = asis_updater.update()
    if i % 5 == 0:
        print 'Iter %d, delta %f' % (i, delta)

if i < max_iters:
    asis_time = time.time() - asis_time
else:
    print 'ASIS did not converge.'
    asis_time = float('inf')

asis_moment_par_opt = copy.deepcopy(asis_updater.ca_updater_sufficient.moment_par)

print 'ASIS ascent time (in seconds): %f, iters: %d' % (asis_time, i)


Iter 5, delta 72.354110
Iter 10, delta 5.871719
Iter 15, delta 0.479287
Iter 20, delta 0.039113
Iter 25, delta 0.003192
Iter 30, delta 0.000260
Iter 35, delta 0.000021
Iter 40, delta 0.000002
Iter 45, delta 0.000000
Iter 50, delta 0.000000
Iter 55, delta 0.000000
ASIS ascent time (in seconds): 1.116681, iters: 55


In [7]:
kl_wrapper = KLWrapper(lmm_par, moment_par, prior_par, x_mat, y_vec, y_g_vec, sufficient=sufficient)
print kl_wrapper.kl(init_par_vec)

import timeit

time_num = 10

print 'Function time:'
print timeit.timeit(lambda: kl_wrapper.kl(init_par_vec), number=time_num) / time_num

print 'Grad time:'
print timeit.timeit(lambda: kl_wrapper.kl_grad(init_par_vec), number=time_num) / time_num

print 'Hessian vector product time:'
print timeit.timeit(lambda: kl_wrapper.kl_hvp(init_par_vec, init_par_vec + 1), number=time_num) / time_num

83782.7092914
Function time:
0.000552582740784
Grad time:
0.00740149021149
Hessian vector product time:
0.0141293048859


In [8]:
# Optimize.
vb_time = time.time()

print 'Running Newton Trust Region'
trust_init = copy.deepcopy(init_par_vec)
vb_opt = optimize.minimize(
    lambda par: kl_wrapper.kl(par, verbose=True),
    trust_init, method='trust-ncg', jac=kl_wrapper.kl_grad, hessp=kl_wrapper.kl_hvp,
    tol=1e-6, options={'maxiter': 500, 'disp': True, 'gtol': 1e-9 })

vb_time = time.time() - vb_time

lmm_par_opt = copy.deepcopy(lmm_par)
lmm_par_opt.set_free(vb_opt.x)

moment_par_opt = copy.deepcopy(moment_par)
set_moments(lmm_par_opt, moment_par_opt)

print 'trust region time (in seconds): %f' %  vb_time

Running Newton Trust Region
83782.7092914
57095.1018492
42342.5975137
34314.5682244
29521.8579457
27441.9162435
25232.9817797
25177.1793399
22615.0061669
22458.2822843
21646.4608607
21601.2740807
18126.4642266
17996.2333833
15809.4893033
15714.3025977
15697.5088795
15210.3859753
15207.3586398
15099.582485
15099.4146695
15086.6084903
15086.604903
15086.5557405
15086.1698211
15086.1619663
15085.5951002
15085.4105657
15085.3939714
15085.1083818
15084.9195026
15084.90731
15093.7581556
15084.5777475
15084.5465229
15084.5296709
15084.4880181
15084.4731365
15084.3276711
15084.1651925
15084.1511129
15084.0982322
15084.0824352
15084.0810147
15084.0619352
15084.0277804
15084.0277443
15084.0277423
15084.0259901
15083.9740427
15083.9739303
15083.9722122
15083.9716408
15083.9590642
15083.9590363
15083.9583751
15083.9583747
15083.9583739
15083.9583739
Optimization terminated successfully.
         Current function value: 15083.958374
         Iterations: 58
         Function evaluations: 59
        

In [9]:
# Make sure the coordinate ascent and trust region are close
moment_diff = np.sum(np.abs(ca_moment_par_opt.get_vector() - moment_par_opt.get_vector()))
moment_diff_asis = np.sum(np.abs(ca_moment_par_opt.get_vector() - asis_moment_par_opt.get_vector()))

print kl_wrapper.kl(vb_opt.x)

print 'total moment difference (ca vs asis): %f' % moment_diff_asis
print 'total moment difference (ca vs tr): %f' % moment_diff
print 'trust region seconds: %f' % vb_time
print 'coordinate ascent seconds: %f' % ca_time
print 'asis ascent seconds: %f' % asis_time

15083.9583739
total moment difference (ca vs asis): 490.620436
total moment difference (ca vs tr): 0.000000
trust region seconds: 15.932497
coordinate ascent seconds: 2.241893
asis ascent seconds: 1.116681


In [10]:
from VariationalBayes.Models.LinearMixedModel.ConjugateModel import get_elbo_model_term, get_entropy_from_moments
    
ca_entropy = get_entropy_from_moments(ca_moment_par_opt, lmm_par_guess=lmm_par_opt)
# The optima are essentially the same.
print ca_entropy + get_elbo_model_term(data_cache, ca_moment_par_opt, prior_par, sufficient=sufficient)
print -kl_wrapper.kl(vb_opt.x)

NameError: global name 'moment_par_opt' is not defined

In [None]:
# print(glmm_par_opt)
print '--------------- beta:\n'
print true_beta
print lmm_par_opt['beta'].e()

print '--------------- mu:\n'
print lmm_par_opt['mu'].e()[0]
print true_mu

print '--------------- mu info:\n'
print lmm_par_opt['mu_info'].e()[0]
print true_mu_info

print '--------------- y info:\n'
print lmm_par_opt['y_info'].e()[0]
print true_y_info


# Check the random effect estimates.  This requires simulated data.
from ggplot import *
import pandas as pd
%matplotlib inline

if sufficient:
    plot_df = pd.DataFrame({ 'opt': lmm_par_opt['u'].e(), 'true': true_u_sufficient })
else:
    plot_df = pd.DataFrame({ 'opt': lmm_par_opt['u'].e(), 'true': true_u_ancillary })
print ggplot(plot_df, aes(x='true', y='opt')) + geom_point() + geom_abline(slope=1, intercept=0)

plot_df = pd.DataFrame({ 'opt': lmm_par_opt['beta'].e(), 'true': true_beta })
print ggplot(plot_df, aes(x='true', y='opt')) + geom_point() + geom_abline(slope=1, intercept=0)


In [None]:
# print(glmm_par_opt)

print '--------------- mu:\n'
print ca_moment_par_opt['e_mu'].get()
print moment_par_opt['e_mu'].get()

print '--------------- mu info:\n'
print ca_moment_par_opt['e_mu_info'].get()
print moment_par_opt['e_mu_info'].get()

print '--------------- y info:\n'
print ca_moment_par_opt['e_y_info'].get()
print moment_par_opt['e_y_info'].get()


plot_df = pd.DataFrame({ 'tr_opt': moment_par_opt['e_u'].get(), 'ca_opt': ca_moment_par_opt['e_u'].get() })
print ggplot(plot_df, aes(x='tr_opt', y='ca_opt')) + geom_point() + geom_abline(slope=1, intercept=0)

plot_df = pd.DataFrame({ 'tr_opt': moment_par_opt['e_beta'].get(), 'ca_opt': ca_moment_par_opt['e_beta'].get() })
print ggplot(plot_df, aes(x='tr_opt', y='ca_opt')) + geom_point() + geom_abline(slope=1, intercept=0)


In [None]:
# Check differences if any

print moment_par_opt['e_mu_info']
print ca_moment_par_opt['e_mu_info']

print moment_par_opt['e_log_mu_info']
print ca_moment_par_opt['e_log_mu_info']

diffs = moment_par_opt.get_vector() - ca_moment_par_opt.get_vector()
print np.where(np.abs(diffs) > 1e-3)
print moment_indices

# plot_df = pd.DataFrame({ 'tr_opt': moment_par_opt['e_beta_outer'].get(), 'ca_opt': ca_moment_par_opt['e_beta_outer'].get() })
# print ggplot(plot_df, aes(x='tr_opt', y='ca_opt')) + geom_point() + geom_abline(slope=1, intercept=0)


In [None]:
# # Slow, but probably faster than using CG if you want the covariance of many parameters.
# hess_time = time.time()
# print 'Calculating KL Hessian...'
# kl_hess = kl_wrapper.kl_hess(vb_opt.x)
# hess_time =  time.time() - hess_time
# elbo_hess = -kl_hess
# print 'Done.'

# print 'Hessian time (seconds): %f' % hess_time

In [None]:
# hess_norm = np.sqrt(np.sum(kl_hess**2))
# diag_norm = np.sqrt(np.sum(np.diag(kl_hess)**2))

# # How diagonal is the Hessian?
# print 'Proportion of the frobenius norm off the diagonal:'
# print (hess_norm - diag_norm) / hess_norm