In [1]:
from VariationalBayes import ScalarParam, ModelParamsDict, VectorParam
from VariationalBayes.NormalParams import MVNParam, UVNParam
from VariationalBayes.GammaParams import GammaParam

# import math

from autograd import grad, hessian, jacobian, hessian_vector_product
import autograd.numpy as np
import autograd.numpy.random as npr
import autograd.scipy as asp
import scipy as sp

import copy
from scipy import optimize


In [2]:
# This should go in the main library.
from VariationalBayes.Parameters import set_free_offset, get_free_offset

class UVNParamVector(object):
    def __init__(self, name, length, min_var=0.0):
        self.name = name
        self.mean = VectorParam(name + '_mean', length)
        self.var = VectorParam(name + '_var', length, lb=min_var)
        self.__free_size = self.mean.free_size() + self.var.free_size()
    def __str__(self):
        return self.name + ':\n' + str(self.mean) + '\n' + str(self.var)
    def names(self):
        return self.mean.names() + self.var.names()
    def e(self):
        return self.mean.get()
    def e_outer(self):
        mean = self.mean.get() ** 2 + self.var.get()
    def set_free(self, free_val):
        if free_val.size != self.__free_size: \
            raise ValueError('Wrong size for UVNParam ' + self.name)
        offset = 0
        offset = set_free_offset(self.mean, free_val, offset)
        offset = set_free_offset(self.var, free_val, offset)
    def get_free(self):
        vec = np.empty(self.__free_size)
        offset = 0
        offset = get_free_offset(self.mean, vec, offset)
        offset = get_free_offset(self.var, vec, offset)
        return vec
    def free_size(self):
        return self.__free_size

In [3]:
N = 20     # observations per group
K = 25      # dimension of regressors
NG = 500      # number of groups

# Build an object to contain a variational approximation to a K-dimensional multivariate normal.
glmm_par = ModelParamsDict()

glmm_par.push_param(UVNParam('mu'))
glmm_par.push_param(GammaParam('tau'))
glmm_par.push_param(MVNParam('beta', K))
glmm_par.push_param(UVNParamVector('u', NG))

glmm_par['mu'].mean.set(0.1)
glmm_par['mu'].var.set(1.0)

glmm_par['tau'].shape.set(2.1)
glmm_par['tau'].rate.set(2.1)

glmm_par['beta'].mean.set(np.full(K, 0.))
glmm_par['beta'].cov.set(0.2 * np.eye(K))

glmm_par['u'].mean.set(np.full(NG, 0.))
glmm_par['u'].var.set(np.full(NG, 0.1))
    
print N * NG
print glmm_par

10000
ModelParamsList:
	mu:
mu_mean: 0.1
mu_var: 1.0
	tau:
tau_shape: 2.1
tau_rate: 2.1
	beta:
beta_mean:
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.]
beta_cov:
[[ 0.2  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.2  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.2  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.2  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.2  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0.   0.2  0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0. ]
 [ 0.  

In [4]:
# Generate data

def Logistic(u):
    return np.exp(u) / (1 + np.exp(u))

NObs = NG * N
true_beta = np.random.rand(K) - 0.5
true_mu = 0.2
true_tau = 4.0
true_u = np.random.normal(true_mu, 1 / np.sqrt(true_tau), NG)

x_mat = np.random.random(K * NObs).reshape(NObs, K) - 0.5
y_g_vec = [ g for g in range(NG) for n in range(N) ]
true_rho = Logistic(np.matmul(x_mat, true_beta) + true_u[y_g_vec])
y_vec = np.random.random(NObs) < true_rho


In [5]:
def DataLogLikelihood(x_mat, y_vec, e_beta, e_beta_outer, e_u, var_u, std_draws):
    rho_mean = e_u + np.matmul(x_mat, e_beta)
    rho_sd = np.sqrt(var_u + np.einsum('nk,kj,nj->n', x_mat, e_beta_outer, x_mat))
    z = np.einsum('i,j->ij', rho_sd, std_draws) + np.expand_dims(rho_mean, 1)

    # The sum is over observations and draws, so dividing by the draws size
    # gives the sum of sample expectations over the draws.
    # log(1 - p) = log(1 / (1 + exp(u))) = -log(1 + exp(u))
    logit_term = -np.sum(np.log1p(np.exp(z))) / std_draws.size
    y_term = np.sum(y_vec * rho_mean)
    return y_term + logit_term


def RandomEffectLogLikelihood(e_u, var_u, e_mu, var_mu, e_tau, e_log_tau):
    return -0.5 * e_tau * np.sum((e_mu - e_u) ** 2 + var_mu + var_u) + \
           0.5 * e_log_tau * len(e_u)

def UnivariateNormalEntropy(var_obs):
    return 0.5 * np.sum(np.log(var_obs))

def MultivariateNormalEntropy(cov_obs):
    sign, logdet = np.linalg.slogdet(cov_obs)
    assert sign > 0
    return 0.5 * logdet

def GammaEntropy(shape, rate):
    return np.sum(shape - np.log(rate) + asp.special.gammaln(shape) + \
                  (1 - shape) * asp.special.digamma(shape))

def MVNPrior(prior_mean, prior_info, e_obs, cov_obs):
    obs_diff = e_obs - prior_mean
    return -0.5 * (np.dot(obs_diff, np.matmul(prior_info, obs_diff)) + \
                   np.trace(np.matmul(prior_info, cov_obs)))

def UVNPrior(prior_mean, prior_info, e_obs, var_obs):
    return -0.5 * (prior_info * ((e_obs - prior_mean) ** 2 + var_obs))


def Elbo(y_vec, x_mat, y_g_vec, glmm_par_elbo, std_draws):
    e_beta = glmm_par_elbo['beta'].mean.get()
    cov_beta = glmm_par_elbo['beta'].cov.get()
    e_beta_outer = glmm_par_elbo['beta'].e_outer()
    
    e_u = glmm_par_elbo['u'].mean.get()
    var_u = glmm_par_elbo['u'].var.get()
    
    e_mu = glmm_par_elbo['mu'].mean.get()
    var_mu = glmm_par_elbo['mu'].var.get()
    
    e_tau = glmm_par_elbo['tau'].e()
    e_log_tau = glmm_par_elbo['tau'].e_log()

    ll = \
        DataLogLikelihood(x_mat, y_vec, e_beta, e_beta_outer,
                          e_u[y_g_vec], var_u[y_g_vec], std_draws) + \
        RandomEffectLogLikelihood(e_u, var_u, e_mu, var_mu, e_tau, e_log_tau)

    K = len(e_beta)
    beta_prior_info = 0.01 * np.eye(K)
    beta_prior_mean = np.full(K, 0.)
    e_log_prior = \
        MVNPrior(beta_prior_mean, beta_prior_info, e_beta, cov_beta) + \
        UVNPrior(0., 0.01, e_mu, var_mu)
    # TODO: add the other priors.
        
    entropy = \
        UnivariateNormalEntropy(var_mu) + \
        MultivariateNormalEntropy(cov_beta) + \
        UnivariateNormalEntropy(var_u) + \
        GammaEntropy(glmm_par_elbo['tau'].shape.get(), glmm_par_elbo['tau'].rate.get())

    return ll + e_log_prior + entropy


class KLWrapper():
    def __init__(self, glmm_par, x_mat, y_vec, y_g_vec, num_draws):
        self.__glmm_par_ad = copy.deepcopy(glmm_par)
        self.x_mat = x_mat
        self.y_vec = y_vec
        self.y_g_vec = y_g_vec
        draw_spacing = 1 / float(num_draws + 1)
        target_quantiles = np.linspace(draw_spacing, 1 - draw_spacing, num_draws)
        self.std_draws = sp.stats.norm.ppf(target_quantiles)
    def Eval(self, free_par_vec, verbose=False):
        self.__glmm_par_ad.set_free(free_par_vec)
        kl = -Elbo(self.y_vec, self.x_mat, self.y_g_vec,
                   self.__glmm_par_ad, std_draws=self.std_draws)
        if verbose: print kl
            
        # TODO: this is returning an array when it should be a scalar.
        return kl
    
    # Return a posterior moment of interest as a function of
    # unconstrained parameters.  In this case it is a bit silly,
    # but in full generality posterior moments may be a complicated
    # function of moment parameters.
    def GetMoments(self, free_par_vec):
        self.__glmm_par_ad.set_free(free_par_vec)
        return self.__glmm_par_ad['beta'].mean.get()



In [6]:
kl_wrapper = KLWrapper(glmm_par, x_mat, y_vec, y_g_vec, 10)
KLGrad = grad(kl_wrapper.Eval)
KLHess = hessian(kl_wrapper.Eval)
MomentJacobian = jacobian(kl_wrapper.GetMoments)
KLHessVecProd = hessian_vector_product(kl_wrapper.Eval)  
free_par_vec = glmm_par.get_free()
kl_wrapper.Eval(free_par_vec)

array([ 8258.03559159])

In [7]:
import timeit

time_num = 10

print 'Function time:'
print timeit.timeit(lambda: kl_wrapper.Eval(free_par_vec), number=time_num) / time_num

print 'Grad time:'
print timeit.timeit(lambda: KLGrad(free_par_vec), number=time_num) / time_num

print 'Hessian vector product time:'
print timeit.timeit(lambda: KLHessVecProd(free_par_vec, free_par_vec + 1), number=time_num) / time_num

# so slow
# print 'Hessian time:'
# print timeit.timeit(lambda: KLHess(free_par_vec), number=time_num) / time_num


Function time:
0.0267796039581
Grad time:
0.0617132902145
Hessian vector product time:
0.124184703827


In [20]:
import time

init_par_vec = free_par_vec

# Optimize.
vb_time = time.time()
print 'Running BFGS'
vb_opt_bfgs = optimize.minimize(
    lambda par: kl_wrapper.Eval(par, verbose=True), init_par_vec,
    method='bfgs', jac=KLGrad, tol=1e-2)

init_par_vec = free_par_vec
print 'Running Newton Trust Region'
vb_opt = optimize.minimize(
    lambda par: kl_wrapper.Eval(par, verbose=True),
    vb_opt_bfgs.x, method='trust-ncg', jac=KLGrad, hessp=KLHessVecProd, options={'maxiter': 5000})

vb_time = time.time() - vb_time

glmm_par_opt = copy.deepcopy(glmm_par)
glmm_par_opt.set_free(vb_opt.x)
print 'Done.'

Running BFGS
[ 8258.03559159]
[ 7896.90175157]
[ 7733.0211885]
[ 7961.65628904]
[ 7728.25538486]
[ 7718.72530915]
[ 7683.52173482]
[ 7609.73983243]
[ 7551.44749102]
[ 7487.15548114]
[ 7413.63723853]
[ 7284.91011658]
[ 7135.99017236]
[ 8867.0610226]
[ 7128.90258369]
[ 7126.44582448]
[ 7121.82615834]
[ 7112.9496807]
[ 7086.3516225]
[ 7060.0125588]
[ 7055.45721978]
[ 7049.46344753]
[ 7041.15251789]
[ 7029.99093826]
[ 7037.34355311]
[ 7027.90051709]
[ 7024.64796729]
[ 7018.9362333]
[ 7010.50785824]
[ 7027.27089144]
[ 7010.15397752]
[ 7008.39875653]
[ 7006.37804853]
[ 7002.96490371]
[ 6997.61646134]
[ 6998.84054996]
[ 6996.16524046]
[ 6996.17298279]
[ 6995.50934295]
[ 6994.72046378]
[ 6993.31586376]
[ 6990.67688393]
[ 6986.38640103]
[ 7007.49994518]
[ 6986.24861762]
[ 6985.65346028]
[ 6984.73870776]
[ 6983.04822998]
[ 6980.01171048]
[ 6980.84284367]
[ 6978.61211662]
[ 6977.98377532]
[ 6976.80697659]
[ 6974.86823006]
[ 6976.3127161]
[ 6974.5749048]
[ 6974.20397652]
[ 6973.50942508]
[ 6972.16

In [22]:
print vb_time / 60

1.81472855012


In [None]:
# Check the random effect estimates
from ggplot import *
import pandas as pd
%matplotlib inline

plot_df = pd.DataFrame({ 'opt': glmm_par_opt['u'].mean.get(), 'true': true_u })
ggplot(plot_df, aes(x='true', y='opt')) + geom_point() + geom_abline(slope=1, intercept=0)

In [None]:
print moment_jac.shape
print K

In [None]:
# LRVB with conjugate gradient
from scipy.sparse.linalg import LinearOperator
import sys

# The we will actually compute Hess^1 * moment_jac.T, leading to perhaps confusing
# naming of "columns".  
moment_jac = MomentJacobian(vb_opt.x)
ObjHessVecProdLO = LinearOperator((vb_opt.x.size, vb_opt.x.size), lambda par: KLHessVecProd(vb_opt.x, par))
# print moment_jac.T.shape
# print ObjHessVecProdLO.shape
# cg_res, info = scipy.sparse.linalg.cg(ObjHessVecProdLO, moment_jac.T)

cg_time = timeit.timeit()
lrvb_term = np.full(moment_jac.T.shape, float('nan'))
for col in range(moment_jac.shape[0]):
    sys.stdout.write('.')
    sys.stdout.flush()
    cg_res, info = sp.sparse.linalg.cg(ObjHessVecProdLO, moment_jac[col, :])
    assert info == 0
    lrvb_term[:, col] = cg_res
cg_time = timeit.timeit() - cg_time

print 'all done dude'

In [None]:
# Slow, but maybe faster than using CG.
hess_time = timeit.timeit()
kl_hess = KLHess(vb_opt.x)
hess_time =  timeit.timeit() - hess_time

In [None]:
print kl_hess[0,:,:].shape
print moment_jac.shape

beta_cov_hess = np.matmul(moment_jac, np.linalg.solve(kl_hess[0, :, :], moment_jac.T))
beta_cov = np.matmul(moment_jac, lrvb_term)
print np.diag(beta_cov)
print np.diag(beta_cov_hess)
print np.diag(glmm_par_opt['beta'].cov.get())