In [2]:
from VariationalBayes import ScalarParam, ModelParamsDict, VectorParam, PosDefMatrixParam
from VariationalBayes.NormalParams import UVNParam, UVNParamVector
from VariationalBayes.ExponentialFamilies import UnivariateNormalEntropy, UVNPrior

from autograd import grad, hessian, jacobian, hessian_vector_product
import autograd.numpy as np
import autograd.numpy.random as npr
import autograd.scipy as asp
import scipy as sp

import copy
from scipy import optimize

In [47]:
# Load data saved by stan_results_to_json.R and run_stan.R in LRVBLogitGLMM.
import os
import json

simulate_data = False
prior_par = ModelParamsDict('Prior Parameters')

# Simulate data
N = 200     # observations per group
K = 5      # dimension of regressors
NG = 200      # number of groups

# Generate data
NObs = NG * N
true_beta = np.array(range(5))
true_beta = true_beta - np.mean(true_beta)
true_y_info = 1.0

true_mu = 0.0
true_mu_info = 40.0
true_u = np.random.normal(true_mu, 1 / np.sqrt(true_mu_info), NG)

x_mat = np.random.random(K * NObs).reshape(NObs, K) - 0.5
y_g_vec = [ g for g in range(NG) for n in range(N) ]
true_mean = np.matmul(x_mat, true_beta) + true_u[y_g_vec]
print true_mean
y_vec = np.random.normal(true_mean, 1 / np.sqrt(true_y_info), NG * N)

prior_par.push_param(ScalarParam('beta_mean', val=0.0))
prior_par.push_param(ScalarParam('beta_info', val=0.01))

prior_par.push_param(ScalarParam('mu_mean', val=0.))
prior_par.push_param(ScalarParam('mu_info', val=0.5))

prior_par.push_param(ScalarParam('mu_log_info_mean', val=0.))
prior_par.push_param(ScalarParam('mu_log_info_info', val=10.0))

prior_par.push_param(ScalarParam('y_log_info_mean', val=0.0))
prior_par.push_param(ScalarParam('y_log_info_info', val=10.0))

print N * NG

[ 0.45290787  0.66319061 -1.71068699 ...,  0.10880838  0.79806391
  1.09614029]
40000


In [44]:
# Build an object to contain a variational approximation to a K-dimensional multivariate normal.

# ADVI-style, we will represent each parameter as a univariate normal.
# Note: you need to include the Jacobian of the transform because the log prior is not a density
# with respect to the transformed space.
lmm_par = ModelParamsDict('LMM Parameters')

lmm_par.push_param(UVNParamVector('beta', K))
lmm_par.push_param(UVNParam('mu'))
lmm_par.push_param(UVNParam('mu_log_info'))
lmm_par.push_param(UVNParam('y_log_info'))
lmm_par.push_param(UVNParamVector('u', NG))

lmm_par['beta'].mean.set(np.full(K, -0.2))
lmm_par['beta'].info.set(np.full(K, 1.0))

lmm_par['mu'].mean.set(0.2)
lmm_par['mu'].info.set(1.5)

lmm_par['mu_log_info'].mean.set(0.3)
lmm_par['mu_log_info'].info.set(1.1)

lmm_par['y_log_info'].mean.set(0.4)
lmm_par['y_log_info'].info.set(1.1)

lmm_par['u'].mean.set(np.full(NG, -0.1))
lmm_par['u'].info.set(np.full(NG, 0.8))

free_par_vec = lmm_par.get_free()

In [61]:
# A single draw from the variational distribution.
lmm_draw = ModelParamsDict('LMM Parameter draw')

lmm_draw.push_param(VectorParam('beta', K))
lmm_draw.push_param(ScalarParam('mu'))
lmm_draw.push_param(ScalarParam('mu_log_info'))
lmm_draw.push_param(ScalarParam('y_log_info'))
lmm_draw.push_param(VectorParam('u', NG))

def trans_normal_param(draw, par):
    return draw.get() / np.sqrt(par.info.get()) + par.mean.get()

# Set the draw object from the normal parameters in lmm_par by scaling and centering the
# standard normal vector std_normal_vec.
def set_draw(lmm_draw, lmm_par, std_normal_vec):
    lmm_draw.set_vector(std_normal_vec)

    lmm_draw['beta'].set(trans_normal_param(lmm_draw['beta'], lmm_par['beta']))
    lmm_draw['mu'].set(trans_normal_param(lmm_draw['mu'], lmm_par['mu']))
    lmm_draw['mu_log_info'].set(trans_normal_param(lmm_draw['mu_log_info'], lmm_par['mu_log_info']))
    lmm_draw['y_log_info'].set(trans_normal_param(lmm_draw['y_log_info'], lmm_par['y_log_info']))
    lmm_draw['u'].set(trans_normal_param(lmm_draw['u'], lmm_par['u']))

num_draws = 10
std_normal_mat = np.random.normal(size=(num_draws, lmm_draw.vector_size()))
std_normal_vec = std_normal_mat[0, :]
set_draw(lmm_draw, lmm_par, std_normal_vec)

# Sanity check
print np.mean(lmm_draw['u'].get())
print 1 / np.var(lmm_draw['u'].get())


-0.0832294522347
0.812207235754


In [29]:
# Define moment parameters
moment_par = ModelParamsDict('Moment Parameters')
moment_par.push_param(VectorParam('e_beta', K))
moment_par.push_param(ScalarParam('e_mu'))
moment_par.push_param(ScalarParam('e_mu_log_info'))
moment_par.push_param(ScalarParam('e_y_log_info'))
moment_par.push_param(VectorParam('e_u', NG))

def set_moments(lmm_par, moment_par):
    moment_par['e_beta'].set(lmm_par['beta'].e())
    moment_par['e_mu'].set(lmm_par['mu'].e())
    moment_par['e_mu_log_info'].set(lmm_par['mu_log_info'].e())
    moment_par['e_y_log_info'].set(lmm_par['y_log_info'].e())
    moment_par['e_u'].set(lmm_par['u'].e())
    
set_moments(lmm_par, moment_par)

# Moment indices.
moment_indices = copy.deepcopy(moment_par)
moment_indices.set_vector(1 + np.array(range(moment_indices.vector_size())))

In [50]:
def normal_log_prior(draw, prior_mean, prior_info):
    return -0.5 * prior_info * np.sum((draw - prior_mean) ** 2)

def LogPriorDraw(lmm_draw, prior_par):
    return \
        normal_log_prior(lmm_draw['beta'].get(),
                         prior_par['beta_mean'].get(),
                         prior_par['beta_info'].get()) + \
        normal_log_prior(lmm_draw['mu'].get(),
                         prior_par['mu_mean'].get(),
                         prior_par['mu_info'].get()) + \
        normal_log_prior(lmm_draw['mu_log_info'].get(),
                         prior_par['mu_log_info_mean'].get(),
                         prior_par['mu_log_info_info'].get()) + \
        normal_log_prior(lmm_draw['y_log_info'].get(),
                         prior_par['y_log_info_mean'].get(),
                         prior_par['y_log_info_info'].get())

LogPriorDraw(lmm_draw, prior_par)

-0.53996120470196751

In [65]:
def DataLogLikelihoodDraw(x_mat, y_vec, y_g_vec, draw):
    # TODO: this could be way faster by cacheing certain matrix calculations.  See notes.
    y_centered = y_vec - (np.matmul(x_mat, draw['beta'].get()) + draw['u'].get()[y_g_vec])
    y_log_info = draw['y_log_info'].get()[0]
    return -0.5 * np.exp(y_log_info) * np.dot(y_centered, y_centered) - 0.5 * len(y_vec) * y_log_info
    
DataLogLikelihood(x_mat, y_vec, y_g_vec, lmm_draw)

-165013.48768429295

In [66]:
def RandomEffectLogLikelihoodDraw(draw):
    u_center = draw['u'].get() - draw['mu'].get()
    mu_log_info = draw['mu_log_info'].get()[0]
    return -0.5 * np.exp(mu_log_info) * np.dot(u_center, u_center) - 0.5 * len(u_center) * mu_log_info

RandomEffectLogLikelihood(lmm_draw)

-55.497723797661912

In [92]:
def ELBODataTerm(y_vec, x_mat, y_g_vec, lmm_draw, prior_par):
    ll_data = DataLogLikelihoodDraw(x_mat, y_vec, y_g_vec, lmm_draw)
    if np.isnan(ll_data):
        print 'bad data log likelihood'
        return -np.inf

    ll_rf = RandomEffectLogLikelihoodDraw(lmm_draw)
    if np.isnan(ll_rf):
        print 'bad random effect log likelihood'
        return -np.inf

    e_log_prior = LogPriorDraw(lmm_draw, prior_par)
    if np.isnan(e_log_prior):
        print 'bad prior'
        return -np.inf

    return ll_data + ll_rf + e_log_prior

def ELBOEntropyTerm(lmm_par):
    return UnivariateNormalEntropy(lmm_par['beta'].info.get()) + \
           UnivariateNormalEntropy(lmm_par['mu'].info.get()) + \
           UnivariateNormalEntropy(lmm_par['u'].info.get()) + \
           UnivariateNormalEntropy(lmm_par['y_log_info'].info.get()) + \
           UnivariateNormalEntropy(lmm_par['mu_log_info'].info.get())

def ELBO(y_vec, x_mat, y_g_vec, lmm_par, lmm_draw, prior_par, std_normal_mat):
    entropy = ELBOEntropyTerm(lmm_par)
    
    data_term = 0.0
    num_draws = std_normal_mat.shape[0]
    for draw in range(num_draws):
        set_draw(lmm_draw, lmm_par, std_normal_mat[draw, :])
        data_term += ELBODataTerm(y_vec, x_mat, y_g_vec, lmm_draw, prior_par)

    data_term /= num_draws
    
    return data_term + entropy
        
class KLWrapper(object):
    def __init__(self, lmm_par, lmm_draw, prior_par, x_mat, y_vec, y_g_vec, num_draws):
        self.__lmm_par_ad = copy.deepcopy(lmm_par)
        self.__prior_par_ad = copy.deepcopy(prior_par)
        self.__lmm_draw_ad = copy.deepcopy(lmm_draw)
        self.x_mat = x_mat
        self.y_vec = y_vec
        self.y_g_vec = y_g_vec
        self.randomize(num_draws)

    def randomize(self, num_draws):
        self.std_normal_mat = np.random.normal(size=(num_draws, self.__lmm_draw_ad.vector_size()))
        
    def KL(self, free_par_vec, verbose=False):
        self.__lmm_par_ad.set_free(free_par_vec)
        kl = -ELBO(self.y_vec, self.x_mat, self.y_g_vec,
                   self.__lmm_par_ad, self.__lmm_draw_ad,
                   self.__prior_par_ad, self.std_normal_mat)
        if verbose: print kl

        return kl
        

class MomentWrapper(object):
    def __init__(self, lmm_par, moment_par):
        self.__lmm_par_ad = copy.deepcopy(lmm_par)
        self.__moment_par = copy.deepcopy(moment_par)

    # Return a posterior moment of interest as a function of unconstrained parameters.
    def GetMoments(self, free_par_vec):
        self.__lmm_par_ad.set_free(free_par_vec)
        set_moments(self.__lmm_par_ad, self.__moment_par)
        return self.__moment_par.get_vector()
    
    def GetMomentParameters(self, free_par_vec):
        self.__glmm_par_ad.set_free(free_par_vec)
        set_moments(self.__glmm_par_ad, self.__moment_par)
        return self.__moment_par


kl_wrapper = KLWrapper(lmm_par, lmm_draw, prior_par, x_mat, y_vec, y_g_vec, 20)
KLGrad = grad(kl_wrapper.KL)
KLHess = hessian(kl_wrapper.KL)
KLHessVecProd = hessian_vector_product(kl_wrapper.KL)  
print kl_wrapper.KL(free_par_vec)

moment_wrapper = MomentWrapper(glmm_par, moment_par)
MomentJacobian = jacobian(moment_wrapper.GetMoments)


212926.588541


In [None]:
for i in range(20):
    kl_wrapper.randomize(100)
    print kl_wrapper.KL(free_par_vec)

188177.53048
162593.041651
161145.033685
192736.296669
164586.133713
193432.045625
216665.020875
193305.197449
186711.445584
155214.22451
180269.635569
153917.20475
176383.755201
181361.909578
186164.417782
150720.142767
202345.782268
182852.594666
164742.622555
193238.87702


In [None]:
import timeit

kl_wrapper.randomize(10)

time_num = 10

print 'Function time:'
print timeit.timeit(lambda: kl_wrapper.KL(free_par_vec), number=time_num) / time_num

print 'Grad time:'
print timeit.timeit(lambda: KLGrad(free_par_vec), number=time_num) / time_num

print 'Hessian vector product time:'
print timeit.timeit(lambda: KLHessVecProd(free_par_vec, free_par_vec + 1), number=time_num) / time_num

# print 'Moment jacobian time:'
# print timeit.timeit(lambda: MomentJacobian(free_par_vec), number=time_num) / time_num

# time_num = 1
# print 'Prior Hessian time:'
# print timeit.timeit(lambda: PriorHess(combined_free_par_vec), number=time_num) / time_num

# so slow
# print 'Hessian time:'
# print timeit.timeit(lambda: KLHess(free_par_vec), number=time_num) / time_num


Function time:
0.0214753866196
Grad time:
0.0961890935898
Hessian vector product time:


In [91]:
import time

kl_wrapper.randomize(10)

class OptimizationPath(object):
    def __init__(self):
        self.x_history = []
        pass
    
    def save(self, x):
        self.x_history.append(x)

bfgs_path = OptimizationPath()
init_par_vec = copy.deepcopy(free_par_vec)

# Optimize.
vb_time = time.time()
print 'Running BFGS'
vb_opt_bfgs = optimize.minimize(
    lambda par: kl_wrapper.KL(par, verbose=True), init_par_vec,
    method='bfgs', jac=KLGrad, tol=1e-2, callback=bfgs_path.save,
    options={'maxiter': 100, 'gtol': 1e-2, 'disp': True})

trust_path = OptimizationPath()
print 'Running Newton Trust Region'
trust_init = copy.deepcopy(vb_opt_bfgs.x)
# trust_init = copy.deepcopy(init_par_vec)
vb_opt = optimize.minimize(
    lambda par: kl_wrapper.KL(par, verbose=True),
    trust_init, method='trust-ncg', jac=KLGrad, hessp=KLHessVecProd,
    tol=1e-6, callback=trust_path.save, options={'maxiter': 100, 'disp': True, 'gtol': 1e-6 })

vb_time = time.time() - vb_time

glmm_par_opt = copy.deepcopy(glmm_par)
glmm_par_opt.set_free(vb_opt.x)

print 'Done.'

print vb_time / 60

Running BFGS
159718.546755
inf
159402.162332
37754.8221728
inf
37753.8659136
33238.1654293
25618.4058358
9395.87803999
290111.322036
-5318.7119765
18048.8330503
-22133.104067
16931078.3526
24341.7221337
-22803.4747705
-24139.6836062
-29136.6647224
-41848.3920472
-89822.9417122
-156525.375573
-139682.315077
-156348.53069
-166809.115896
4.93818361347e+14
135605.603767
-171812.02376
-181012.206736
-197654.243268
-225054.551338
-261070.101376
-280651.464958
-287279.695547
-333925.042353
1301866.47321
-359672.037703
-362882.383091
-369045.576239
-386915.797514
-419121.935952
-447921.323809
-441164.474096
-471706.714064
-459202.093036
-302408.949904
-471243.690217
-494433.886884
inf
-494433.886884
-494433.886884
inf
-505233.014517
inf
-505233.073975
inf
-505233.073975
inf
inf
inf
inf
inf
-507351.754144
-508393.92301
-508911.418222
inf
-509040.446395
inf
-509072.682488
         Current function value: -505233.014517
         Iterations: 25
         Function evaluations: 68
         Gradient e

KeyboardInterrupt: 

In [10]:
print len(trust_path.x_history)

# for i in range(len(path.x_history) - 1):
#     print np.sum(path.x_history[i + 1] - path.x_history[i])

glmm_par.set_free(trust_path.x_history[len(trust_path.x_history) - 1])
# print glmm_par
    
for i in range(len(trust_path.x_history)):
    glmm_par.set_free(trust_path.x_history[i])
    print 'Iteration ' + str(i) + '\n'
    print str(glmm_par['tau']) + '\n'


34
Iteration 0

tau:
tau_shape: [ 793.03053848]
tau_rate: [ 0.00111538]

Iteration 1

tau:
tau_shape: [ 481.18719791]
tau_rate: [ 0.00183676]

Iteration 2

tau:
tau_shape: [ 292.03452586]
tau_rate: [ 0.00302497]

Iteration 3

tau:
tau_shape: [ 177.26119529]
tau_rate: [ 0.00498212]

Iteration 4

tau:
tau_shape: [ 107.6048248]
tau_rate: [ 0.00820574]

Iteration 5

tau:
tau_shape: [ 65.32508746]
tau_rate: [ 0.0135152]

Iteration 6

tau:
tau_shape: [ 39.66110148]
tau_rate: [ 0.02225919]

Iteration 7

tau:
tau_shape: [ 24.08359501]
tau_rate: [ 0.03665527]

Iteration 8

tau:
tau_shape: [ 14.63045394]
tau_rate: [ 0.0603385]

Iteration 9

tau:
tau_shape: [ 8.89778191]
tau_rate: [ 0.09921749]

Iteration 10

tau:
tau_shape: [ 5.42828539]
tau_rate: [ 0.16267264]

Iteration 11

tau:
tau_shape: [ 3.34104773]
tau_rate: [ 0.26460639]

Iteration 12

tau:
tau_shape: [ 2.10881876]
tau_rate: [ 0.42142371]

Iteration 13

tau:
tau_shape: [ 1.42623704]
tau_rate: [ 0.63695399]

Iteration 14

tau:
tau_shape: 

In [11]:
# print(glmm_par_opt)
if simulate_data:
    print true_beta
    print glmm_par_opt['beta']
    print '---------------\n'
    print true_tau
    print glmm_par_opt['tau'].e()

    e_u = glmm_par_opt['u'].e()
    info_u = glmm_par_opt['u'].info.get()
    var_u = 1 / info_u
    e_beta = glmm_par_opt['beta'].e()
    e_beta_outer = glmm_par_opt['beta'].e_outer()
    std_draws = kl_wrapper.std_draws

    rho_mean = e_u[y_g_vec] + np.matmul(x_mat, e_beta)
    rho_sd = np.sqrt(var_u[y_g_vec] + np.einsum('nk,kj,nj->n', x_mat, e_beta_outer, x_mat))
    z = np.einsum('i,j->ij', rho_sd, std_draws) + np.expand_dims(rho_mean, 1)
    logit_term = -np.einsum('ij->i', np.log1p(np.exp(z))) / std_draws.size

    print rho_sd
    print var_u[y_g_vec]
    # print np.mean(var_u)


In [12]:
# Check the random effect estimates.  This requires simulated data.
if simulate_data:
    from ggplot import *
    import pandas as pd
    %matplotlib inline
    
    print glmm_par_opt['mu'].e()
    print true_mu

    print glmm_par_opt['tau'].e()
    print true_tau

    plot_df = pd.DataFrame({ 'opt': glmm_par_opt['u'].mean.get(), 'true': true_u })
    print ggplot(plot_df, aes(x='true', y='opt')) + geom_point() + geom_abline(slope=1, intercept=0)
    
    plot_df = pd.DataFrame({ 'opt': glmm_par_opt['beta'].mean.get(), 'true': true_beta })
    print ggplot(plot_df, aes(x='true', y='opt')) + geom_point() + geom_abline(slope=1, intercept=0)
    
    plot_df = pd.DataFrame({ 'opt': logit_term, 'true': np.log(1 - true_rho) })
    print ggplot(plot_df, aes(x='true', y='opt')) + geom_point() + geom_abline(slope=1, intercept=0)

In [13]:
# LRVB with conjugate gradient.  This turns out to be way slower with any appreciable number of moments.
if False:
    from scipy.sparse.linalg import LinearOperator
    import sys

    # This will actually compute Hess^1 * moment_jac.T, leading to perhaps confusing
    # naming of "columns".  
    ObjHessVecProdLO = LinearOperator((vb_opt.x.size, vb_opt.x.size), lambda par: KLHessVecProd(vb_opt.x, par))
    # print moment_jac.T.shape
    # print ObjHessVecProdLO.shape
    # cg_res, info = scipy.sparse.linalg.cg(ObjHessVecProdLO, moment_jac.T)

    cg_time = time.time()
    lrvb_term = np.full(moment_jac.T.shape, float('nan'))
    for col in range(moment_jac.shape[0]):
        sys.stdout.write('.')
        sys.stdout.flush()
        cg_res, info = sp.sparse.linalg.cg(ObjHessVecProdLO, moment_jac[col, :])
        assert info == 0
        lrvb_term[:, col] = cg_res
    cg_time = time.time() - cg_time

    print 'all done dude'
else:
    cg_time = float('inf')

In [14]:
# Slow, but probably faster than using CG.
combined_free_par_vec = encode_combined_parameters(glmm_par_opt, prior_par)

hess_time = time.time()
print 'KL Hessian:\n'
kl_hess = KLHess(vb_opt.x)
print 'Log prior Hessian:\n'
log_prior_hess_full = PriorHess(combined_free_par_vec)
hess_time =  time.time() - hess_time
elbo_hess = -kl_hess

print 'hess_time: %f' % hess_time
print 'cg_time: %f' % cg_time

KL Hessian:

Log prior Hessian:

hess_time: 68.996338
cg_time: inf


In [15]:
glmm_inds = range(glmm_par_opt.free_size())
prior_inds = range(glmm_par_opt.free_size(), len(combined_free_par_vec))
log_prior_hess = log_prior_hess_full[np.ix_(prior_inds, glmm_inds)]

moment_jac = MomentJacobian(vb_opt.x)
lrvb_cov = np.matmul(moment_jac, np.linalg.solve(kl_hess, moment_jac.T))

prior_indices = copy.deepcopy(prior_par)
prior_indices.set_vector(1 + np.array(range(prior_indices.vector_size())))

vp_indices = copy.deepcopy(glmm_par_opt)
vp_indices.set_vector(1 + np.array(range(vp_indices.vector_size())))

In [16]:
if not simulate_data:
    run_name = 'debug'
    result_dict = { 'glmm_par_opt': glmm_par_opt.dictval(), 'run_name': run_name,
                    'vb_time': vb_time,'hess_time': hess_time, 
                    'moment_indices': moment_indices.dictval(),
                    'prior_indices': prior_indices.dictval(),
                    'vp_indices': vp_indices.dictval(),
                    'lrvb_cov': lrvb_cov.tolist(), 'moment_jac': moment_jac.tolist(),
                    'elbo_hess': elbo_hess.tolist(), 'log_prior_hess': log_prior_hess.tolist() }

    result_json = json.dumps(result_dict)
    json_file = open(json_output_filename, 'w')
    json_file.write(result_json)
    json_file.close()

    print(json_output_filename)

/home/rgiordan/Documents/git_repos/LRVBLogitGLMM/LogitGLMMLRVB/inst/data/simulated_data_large_python_vb_results.json


In [32]:
import math

mu_ind = moment_indices['e_mu'].get() - 1 # moment indices is one-indexed
print moment_par['e_mu']
print math.sqrt(lrvb_cov[mu_ind, mu_ind])

e_mu: -0.5253
0.0706139471208
