In [1]:
from VariationalBayes import ScalarParam, ModelParamsDict, VectorParam, PosDefMatrixParam
from VariationalBayes.NormalParams import MVNParam, UVNParam, UVNParamVector
from VariationalBayes.GammaParams import GammaParam
from VariationalBayes.ExponentialFamilies import \
    UnivariateNormalEntropy, MultivariateNormalEntropy, GammaEntropy, \
    MVNPrior, UVNPrior, GammaPrior


from autograd import grad, hessian, jacobian, hessian_vector_product
import autograd.numpy as np
import autograd.numpy.random as npr
import autograd.scipy as asp
import scipy as sp

import copy
from scipy import optimize

In [2]:
# Load data saved by stan_results_to_json.R and run_stan.R in LRVBLogitGLMM.
import os
import json

analysis_name = 'simulated_data_small'
data_dir = os.path.join(os.environ['GIT_REPO_LOC'], 'LRVBLogitGLMM/LogitGLMMLRVB/inst/data/')
json_filename = os.path.join(data_dir, '%s_stan_dat.json' % analysis_name)
json_output_filename = os.path.join(data_dir, '%s_python_vb_results.json' % analysis_name)

json_file = open(json_filename, 'r')
stan_dat = json.load(json_file)
json_file.close()

print stan_dat.keys()
K = stan_dat['K'][0]
NObs = stan_dat['N'][0]
NG = stan_dat['NG'][0]
N = NObs / NG
y_g_vec = np.array(stan_dat['y_group'])
y_vec = np.array(stan_dat['y'])
x_mat = np.array(stan_dat['x'])

[u'y_group', u'mu_prior_var', u'mu_prior_t', u'mu_prior_var_c', u'K', u'beta_prior_var', u'tau_prior_beta', u'N', u'mu_prior_mean_c', u'mu_prior_epsilon', u'mu_prior_mean', u'y', u'x', u'NG', u'beta_prior_mean', u'tau_prior_alpha']


In [3]:
# Define a class to contain prior parameters.
prior_par = ModelParamsDict('Prior Parameters')

prior_par.push_param(VectorParam('beta_prior_mean', K, val=np.array(stan_dat['beta_prior_mean'])))
prior_par.push_param(PosDefMatrixParam('beta_prior_var', K, val=np.array(stan_dat['beta_prior_var'])))

prior_par.push_param(ScalarParam('mu_prior_mean', val=stan_dat['mu_prior_mean'][0]))
prior_par.push_param(ScalarParam('mu_prior_var', val=stan_dat['mu_prior_var'][0]))

prior_par.push_param(ScalarParam('tau_prior_alpha', val=stan_dat['tau_prior_alpha'][0]))
prior_par.push_param(ScalarParam('tau_prior_beta', val=stan_dat['tau_prior_beta'][0]))

# An index set to make sure jacobians match the order expected by R.
prior_par_indices = copy.deepcopy(prior_par)
prior_par_indices.set_name('Prior Indices')
prior_par_indices.set_vector(np.array(range(prior_par_indices.vector_size())))

In [4]:
# Simulate data instead of loading it if you like
if False:
    N = 20     # observations per group
    K = 25      # dimension of regressors
    NG = 500      # number of groups

    # Generate data
    def Logistic(u):
        return np.exp(u) / (1 + np.exp(u))

    NObs = NG * N
    true_beta = np.random.rand(K) - 0.5
    true_mu = 0.2
    true_tau = 4.0
    true_u = np.random.normal(true_mu, 1 / np.sqrt(true_tau), NG)

    x_mat = np.random.random(K * NObs).reshape(NObs, K) - 0.5
    y_g_vec = [ g for g in range(NG) for n in range(N) ]
    true_rho = Logistic(np.matmul(x_mat, true_beta) + true_u[y_g_vec])
    y_vec = np.random.random(NObs) < true_rho


In [5]:
# Build an object to contain a variational approximation to a K-dimensional multivariate normal.
glmm_par = ModelParamsDict('GLMM Parameters')

glmm_par.push_param(UVNParam('mu'))
glmm_par.push_param(GammaParam('tau'))
glmm_par.push_param(MVNParam('beta', K))
glmm_par.push_param(UVNParamVector('u', NG))

glmm_par['mu'].mean.set(0.1)
glmm_par['mu'].var.set(1.0)

glmm_par['tau'].shape.set(2.1)
glmm_par['tau'].rate.set(2.1)

glmm_par['beta'].mean.set(np.full(K, 0.))
glmm_par['beta'].cov.set(0.2 * np.eye(K))

glmm_par['u'].mean.set(np.full(NG, 0.))
glmm_par['u'].var.set(np.full(NG, 0.1))
    
print N * NG
print glmm_par

1000
GLMM Parameters:
	mu:
mu_mean: 0.1
mu_var: 1.0
	tau:
tau_shape: 2.1
tau_rate: 2.1
	beta:
beta_mean:
[ 0.  0.  0.  0.  0.]
beta_cov:
[[ 0.2  0.   0.   0.   0. ]
 [ 0.   0.2  0.   0.   0. ]
 [ 0.   0.   0.2  0.   0. ]
 [ 0.   0.   0.   0.2  0. ]
 [ 0.   0.   0.   0.   0.2]]
	u:
u_mean:
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
u_var:
[ 0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1
  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1
  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1
  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0

In [11]:
def DataLogLikelihood(x_mat, y_vec, e_beta, e_beta_outer, e_u, var_u, std_draws):
    rho_mean = e_u + np.matmul(x_mat, e_beta)
    rho_sd = np.sqrt(var_u + np.einsum('nk,kj,nj->n', x_mat, e_beta_outer, x_mat))
    z = np.einsum('i,j->ij', rho_sd, std_draws) + np.expand_dims(rho_mean, 1)

    # The sum is over observations and draws, so dividing by the draws size
    # gives the sum of sample expectations over the draws.
    # log(1 - p) = log(1 / (1 + exp(u))) = -log(1 + exp(u))
    logit_term = -np.sum(np.log1p(np.exp(z))) / std_draws.size
    y_term = np.sum(y_vec * rho_mean)
    return y_term + logit_term


def RandomEffectLogLikelihood(e_u, var_u, e_mu, var_mu, e_tau, e_log_tau):
    return -0.5 * e_tau * np.sum((e_mu - e_u) ** 2 + var_mu + var_u) + \
           0.5 * e_log_tau * len(e_u)

    
def ELogPrior(prior_par, glmm_par_elbo):
    e_beta = glmm_par_elbo['beta'].mean.get()
    cov_beta = glmm_par_elbo['beta'].cov.get()
    beta_prior_info = np.linalg.inv(prior_par['beta_prior_var'].get())
    e_log_p_beta = MVNPrior(prior_par['beta_prior_mean'].get(), beta_prior_info, e_beta, cov_beta)
    
    e_mu = glmm_par_elbo['mu'].mean.get()
    var_mu = glmm_par_elbo['mu'].var.get()
    e_log_p_mu = UVNPrior(prior_par['mu_prior_mean'].get(), 1 / prior_par['mu_prior_var'].get(), e_mu, var_mu) 

    e_tau = glmm_par_elbo['tau'].e()
    e_log_tau = glmm_par_elbo['tau'].e_log()
    e_log_p_tau = GammaPrior(prior_par['tau_prior_alpha'].get(), prior_par['tau_prior_beta'].get(), e_tau, e_log_tau)
    
    return  e_log_p_beta + e_log_p_mu + e_log_p_tau
           

def Elbo(y_vec, x_mat, y_g_vec, glmm_par_elbo, std_draws, prior_par):
    e_beta = glmm_par_elbo['beta'].mean.get()
    cov_beta = glmm_par_elbo['beta'].cov.get()
    e_beta_outer = glmm_par_elbo['beta'].e_outer()
    
    e_u = glmm_par_elbo['u'].mean.get()
    var_u = glmm_par_elbo['u'].var.get()
    
    e_mu = glmm_par_elbo['mu'].mean.get()
    var_mu = glmm_par_elbo['mu'].var.get()
    
    e_tau = glmm_par_elbo['tau'].e()
    e_log_tau = glmm_par_elbo['tau'].e_log()

    ll = \
        DataLogLikelihood(x_mat, y_vec, e_beta, e_beta_outer,
                          e_u[y_g_vec], var_u[y_g_vec], std_draws) + \
        RandomEffectLogLikelihood(e_u, var_u, e_mu, var_mu, e_tau, e_log_tau)

    e_log_prior = ELogPrior(prior_par, glmm_par_elbo)

    #UnivariateNormalEntropy(var_mu) + \

    entropy = \
        np.sum(asp.stats.norm.entropy(loc=e_mu, scale=np.sqrt(var_mu))) + \
        MultivariateNormalEntropy(cov_beta) + \
        UnivariateNormalEntropy(var_u) + \
        GammaEntropy(glmm_par_elbo['tau'].shape.get(), glmm_par_elbo['tau'].rate.get())

    return ll[0] + e_log_prior[0] + entropy


class KLWrapper():
    def __init__(self, glmm_par, prior_par, x_mat, y_vec, y_g_vec, num_draws):
        self.__glmm_par_ad = copy.deepcopy(glmm_par)
        self.__prior_par_ad = copy.deepcopy(prior_par)
        self.x_mat = x_mat
        self.y_vec = y_vec
        self.y_g_vec = y_g_vec
        draw_spacing = 1 / float(num_draws + 1)
        target_quantiles = np.linspace(draw_spacing, 1 - draw_spacing, num_draws)
        self.std_draws = sp.stats.norm.ppf(target_quantiles)

    def Eval(self, free_par_vec, verbose=False):
        self.__glmm_par_ad.set_free(free_par_vec)
        kl = -Elbo(self.y_vec, self.x_mat, self.y_g_vec,
                   self.__glmm_par_ad, self.std_draws, self.__prior_par_ad)
        if verbose: print kl
            
        # TODO: this is returning an array when it should be a scalar.
        return kl

    # Return a posterior moment of interest as a function of
    # unconstrained parameters.  In this case it is a bit silly,
    # but in full generality posterior moments may be a complicated
    # function of moment parameters.
    def GetMoments(self, free_par_vec):
        self.__glmm_par_ad.set_free(free_par_vec)
        return self.__glmm_par_ad['beta'].mean.get()



In [9]:
kl_wrapper = KLWrapper(glmm_par, prior_par, x_mat, y_vec, y_g_vec, 10)
KLGrad = grad(kl_wrapper.Eval)
KLHess = hessian(kl_wrapper.Eval)
MomentJacobian = jacobian(kl_wrapper.GetMoments)
KLHessVecProd = hessian_vector_product(kl_wrapper.Eval)  
free_par_vec = glmm_par.get_free()
kl_wrapper.Eval(free_par_vec)

934.630009976937

In [12]:
import timeit

time_num = 10

print 'Function time:'
print timeit.timeit(lambda: kl_wrapper.Eval(free_par_vec), number=time_num) / time_num

print 'Grad time:'
print timeit.timeit(lambda: KLGrad(free_par_vec), number=time_num) / time_num

print 'Hessian vector product time:'
print timeit.timeit(lambda: KLHessVecProd(free_par_vec, free_par_vec + 1), number=time_num) / time_num

# so slow
# print 'Hessian time:'
# print timeit.timeit(lambda: KLHess(free_par_vec), number=time_num) / time_num


Function time:


AttributeError: 'module' object has no attribute 'entropy'

In [None]:
import time

init_par_vec = free_par_vec

# Optimize.
vb_time = time.time()
print 'Running BFGS'
vb_opt_bfgs = optimize.minimize(
    lambda par: kl_wrapper.Eval(par, verbose=True), init_par_vec,
    method='bfgs', jac=KLGrad, tol=1e-2)

init_par_vec = free_par_vec
print 'Running Newton Trust Region'
vb_opt = optimize.minimize(
    lambda par: kl_wrapper.Eval(par, verbose=True),
    vb_opt_bfgs.x, method='trust-ncg', jac=KLGrad, hessp=KLHessVecProd, options={'maxiter': 5000})

vb_time = time.time() - vb_time

glmm_par_opt = copy.deepcopy(glmm_par)
glmm_par_opt.set_free(vb_opt.x)
print 'Done.'

In [None]:
print vb_time / 60

In [None]:
print(glmm_par_opt)

In [None]:
# Check the random effect estimates.  This requires simulated data.
if False:
    from ggplot import *
    import pandas as pd
    %matplotlib inline

    plot_df = pd.DataFrame({ 'opt': glmm_par_opt['u'].mean.get(), 'true': true_u })
    ggplot(plot_df, aes(x='true', y='opt')) + geom_point() + geom_abline(slope=1, intercept=0)

In [None]:
# LRVB with conjugate gradient
from scipy.sparse.linalg import LinearOperator
import sys

# The we will actually compute Hess^1 * moment_jac.T, leading to perhaps confusing
# naming of "columns".  
moment_jac = MomentJacobian(vb_opt.x)
ObjHessVecProdLO = LinearOperator((vb_opt.x.size, vb_opt.x.size), lambda par: KLHessVecProd(vb_opt.x, par))
# print moment_jac.T.shape
# print ObjHessVecProdLO.shape
# cg_res, info = scipy.sparse.linalg.cg(ObjHessVecProdLO, moment_jac.T)

cg_time = timeit.timeit()
lrvb_term = np.full(moment_jac.T.shape, float('nan'))
for col in range(moment_jac.shape[0]):
    sys.stdout.write('.')
    sys.stdout.flush()
    cg_res, info = sp.sparse.linalg.cg(ObjHessVecProdLO, moment_jac[col, :])
    assert info == 0
    lrvb_term[:, col] = cg_res
cg_time = timeit.timeit() - cg_time

print 'all done dude'

In [None]:
# Slow, but maybe faster than using CG.
hess_time = timeit.timeit()
kl_hess = KLHess(vb_opt.x)
hess_time =  timeit.timeit() - hess_time

In [None]:
print kl_hess[:,:].shape
print moment_jac.shape

beta_cov_hess = np.matmul(moment_jac, np.linalg.solve(kl_hess[:, :], moment_jac.T))
beta_cov = np.matmul(moment_jac, lrvb_term)
print np.diag(beta_cov)
print np.diag(beta_cov_hess)
print np.diag(glmm_par_opt['beta'].cov.get())

In [None]:
result_dict = { 'glmm_par_opt': glmm_par_opt.dictval(), 'vb_time': vb_time }

result_json = json.dumps(result_dict)
json_file = open(json_output_filename, 'w')
json_file.write(result_json)
json_file.close()

print(json_output_filename)

