In [3]:
from VariationalBayes import VectorParam, ScalarParam, PosDefMatrixParam, ModelParamsDict
from autograd import grad, hessian, jacobian
import math
import autograd.numpy as np
import autograd.numpy.random as npr
import copy
from scipy import optimize

In [94]:
# Build an object to contain a variational approximation to a K-dimensional multivariate normal.

K = 3
mvn_par = ModelParamsDict()

mvn_par.push_param(VectorParam('e_mu', K))
mvn_par.push_param(VectorParam('var_mu', K, lb=0))

mvn_par['e_mu'].set(np.full(K, 0.1))
mvn_par['var_mu'].set(np.full(K, 2.))

def Logistic(rho):
    return np.exp(rho) / (1 + np.exp(rho))

print Logistic(0.5)
print Logistic(np.array([3., 4.]))


# From Stan:
#     inline double log1m_inv_logit(double u) {
#       using std::exp;
#       if (u > 0.0)
#         return -u - log1p(exp(-u));  // prevent underflow
#       return -log1p(exp(u));
#     }

def Log1mInvLogit(u):
    return -np.log1p(np.exp(-u))
    
Log1mInvLogit(5.0)
Log1mInvLogit(np.array([4., 3.]))


0.622459331202
[ 0.95257413  0.98201379]


array([-0.01814993, -0.04858735])

In [79]:
# Generate data

N = 100
true_mu = np.random.rand(K).T - 0.5
x_mat = np.full([N, K], float('nan'))
y_vec = np.full([N], float('nan'))
for n in range(N):
    x_mat[n, :] = np.random.random(K) - 0.5
    y_vec[n] = np.random.random(1) < Logistic(np.dot(x_mat[n, :], true_mu))

print K
print np.random.random(K)
print x_mat[1,:]

3
[ 0.20925728  0.68612165  0.1123016 ]
[-0.34514176 -0.08677326  0.11753542]


In [101]:
a = np.array([2., 3.])
a * a
np.sum(a * a * a)

foo_mat = np.full([10, 3], 2.)

# for foo_row in foo_mat:
#     print '-----\n'
#     print foo_row

print foo_mat.shape[0]  


10


In [93]:
from scipy import stats


num_draws = 10
target_quantiles = np.linspace(1 / float(num_draws + 1), 1 - 1 / float(num_draws + 1), num_draws)
std_draws = scipy.stats.norm.ppf(target_quantiles)

x_row = x_mat[1]
y = y_vec[1]

e_mu = true_mu
mu_var = 2

rho_mean = np.dot(x_row, e_mu)
rho_sd = math.sqrt(np.sum(x_row * x_row * mu_var))
e_log_1mrho = np.mean(Log1mInvLogit(std_draws * rho_sd + rho_mean))

print rho_mean
print rho_sd
print e_log_1mrho
print Log1mInvLogit(rho_mean)
print std_draws * rho_sd + rho_mean




0.0456006707902
2.65016042208
-1.09201937507
-0.670606750294
[-3.49283452 -2.36195842 -1.55664749 -0.87865787 -0.25700868  0.34821002
  0.96985921  1.64784883  2.45315976  3.58403586]


In [111]:
# Define the variational objective
def LogLikelihood(x_row, y, e_mu, mu_var, std_draws):
    # logit(rho) is the probability of y being 1, which has a normal distribution under q().
    rho_mean = np.dot(x_row, e_mu)
    rho_sd = np.sqrt(np.sum(x_row * x_row * mu_var))
    e_log_1mrho = np.mean(Log1mInvLogit(std_draws * rho_sd + rho_mean))
    return y * rho_mean + e_log_1mrho


def UnivariateNormalExpectedEntropy(var_mu):
    return 0.5 * np.log(var_mu)


def Elbo(y_vec, x_mat, mvn_par_elbo, num_draws=10):
    var_mu = mvn_par_elbo['var_mu'].get()
    e_mu = mvn_par_elbo['e_mu'].get()

    num_draws = 10
    draw_spacing = 1 / float(num_draws + 1)
    target_quantiles = np.linspace(draw_spacing, 1 - draw_spacing, num_draws)
    std_draws = scipy.stats.norm.ppf(target_quantiles)

    assert y_vec.size == x_mat.shape[0]
    assert e_mu.size == x_mat.shape[1]

    ll = 0
    for n in range(y_vec.size):
        ll += LogLikelihood(x_mat[n, :], y_vec[n], e_mu, var_mu, std_draws)

    entropy = sum([ UnivariateNormalExpectedEntropy(var_mu_k) for var_mu_k in var_mu])

    return ll + entropy


class KLWrapper():
    def __init__(self, mvn_par, x_mat, y_vec, num_draws):
        self.__mvn_par_ad = copy.deepcopy(mvn_par)
        self.x_mat = x_mat
        self.y_vec = y_vec
        self.num_draws = num_draws
        
    def Eval(self, free_par_vec, verbose=False):
        self.__mvn_par_ad.set_free(free_par_vec)
        kl = -Elbo(self.y_vec, self.x_mat, self.__mvn_par_ad, num_draws=self.num_draws)
        if verbose: print kl
        return kl
    
    # Return a posterior moment of interest as a function of
    # unconstrained parameters.  In this case it is a bit silly,
    # but in full generality posterior moments may be a complicated
    # function of moment parameters.
    def GetMu(self, free_par_vec):
        self.__mvn_par_ad.set_free(free_par_vec)
        return self.__mvn_par_ad['e_mu'].get()

    
kl_wrapper = KLWrapper(mvn_par, x_mat, y_vec, 10)
KLGrad = grad(kl_wrapper.Eval)
KLHess = hessian(kl_wrapper.Eval)
MomentJacobian = jacobian(kl_wrapper.GetMu)

print mvn_par
mvn_par_ad = copy.deepcopy(mvn_par)
print mvn_par_ad
        
mvn_par['e_mu'].set(np.array([1., 2., 3.]))
print mvn_par['e_mu']
print mvn_par_ad['e_mu']


ModelParamsList:
	var_mu: [ 2.  2.  2.]
	e_mu: [ 1.  2.  3.]
ModelParamsList:
	e_mu: [ 1.  2.  3.]
	var_mu: [ 2.  2.  2.]
e_mu: [ 1.  2.  3.]
e_mu: [ 1.  2.  3.]


In [112]:
# Check that the AD functions are working:
free_par_vec = mvn_par.get_free()
print kl_wrapper.Eval(free_par_vec)
print KLGrad(free_par_vec)
print KLHess(free_par_vec)
print MomentJacobian(free_par_vec)

90.3637408438
[ 1.68791352  6.6138131   4.75812245  1.09712622  3.45784933  8.02444811]
[[ 1.76989961 -0.04916582 -0.35450619 -0.03815249 -0.06505899 -0.06444692]
 [-0.04916582  1.60702487  0.25394906 -0.02481552 -0.12399148 -0.1236297 ]
 [-0.35450619  0.25394906  1.25392492 -0.00833073 -0.03277036 -0.16887471]
 [-0.03815249 -0.02481552 -0.00833073  1.54802586 -0.06801167 -0.12478301]
 [-0.06505899 -0.12399148 -0.03277036 -0.06801167  3.6678677  -0.27765126]
 [-0.06444692 -0.1236297  -0.16887471 -0.12478301 -0.27765126  7.24896791]]
[[ 1.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.]]


In [114]:
import timeit

time_num = 10

# Pretty fast!

print 'Function time:'
print timeit.timeit(lambda: kl_wrapper.Eval(free_par_vec), number=time_num) / time_num

print 'Grad time:'
print timeit.timeit(lambda: KLGrad(free_par_vec), number=time_num) / time_num

print 'Hessian time:'
print timeit.timeit(lambda: KLHess(free_par_vec), number=time_num) / time_num

print 'Hessian vector product time:'
print timeit.timeit(lambda: KLHess(free_par_vec), number=time_num) / time_num


Function time:
0.00340960025787
Grad time:
0.0393475055695
Hessian time:
0.291237211227


In [7]:
# Set initial values.

# Is there not a better way than reduce?
true_means = reduce(lambda x, y: x + y, x_draws) / N

mvn_par['e_mu'].set(np.full(K, 1.0))
init_par_vec = mvn_par.get_free()

In [8]:
# Optimize.

print 'Running BFGS'
vb_opt_bfgs = optimize.minimize(
    lambda par: kl_wrapper.Eval(par, verbose=True), init_par_vec,
    method='bfgs', jac=KLGrad, tol=1e-6)
print 'Running Newton Trust Region'
vb_opt = optimize.minimize(
    lambda par: kl_wrapper.Eval(par, verbose=True),
    vb_opt_bfgs.x, method='trust-ncg', jac=KLGrad, hess=KLHess)
mvn_par_opt = copy.deepcopy(mvn_par)
mvn_par_opt.set_free(vb_opt.x)
print 'Done.'

Running BFGS
2732.76192496
inf
2732.73603852


  result = self.fun(*argvals, **kwargs)
  result = self.fun(*argvals, **kwargs)
  result = self.fun(*argvals, **kwargs)


1558.32455446
303669.763199
1553.74963671
1544.56032712
1508.83904737
1382.23377577
1095.86488507
641.186715507
268.675994152
115.096105596
55541.5199031
114.867294796
114.405755296
112.572701931
105.446077532
79.9825880193
27.2497601176
69.2804084502
17.5547960997
1.43803056927
-9.24001870708
-25.0536097994
-34.3067174834
-38.7341808603
-40.6332768776
-41.3151337798
-41.4932009191
-41.5201766956
-41.5229853189
-41.524823725
-41.5280095513
-41.5332868461
-41.5399419253
-41.5415517145
-41.5421710368
-41.5422947856
-41.5423871811
-41.5425438616
-41.5427929597
-41.5430401867
-41.5431013571
-41.5431166283
-41.543117084
-41.5431170911
-41.5431170911
Running Newton Trust Region
-41.5431170911
-41.5431170911
Done.


In [9]:
# The mean parameters match, as expected.
print mvn_par_opt['e_mu']
print true_means

e_mu: [ 0.89950874  0.62786963  0.8276449 ]
[ 0.89950874  0.62786963  0.8276449 ]


In [10]:
# LRVB
moment_jac = MomentJacobian(vb_opt.x)
opt_hess = KLHess(vb_opt.x)
mu_cov = np.matmul(moment_jac, np.linalg.solve(opt_hess, moment_jac.T))

# The VB variance is underestimated.
print np.diag(mu_cov)
print mvn_par_opt['var_mu']

[ 0.01  0.01  0.01]
var_mu: [ 0.00147368  0.00147368  0.00147368]
