In [1]:
import VariationalBayes as vb
import LogisticGLMM_lib as logit_glmm
from VariationalBayes.SparseObjectives import Objective, SparseObjective

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np

from autograd import jacobian

import copy
from scipy import optimize

import os
import json


In [2]:
# Load data saved by stan_results_to_json.R and run_stan.R in LRVBLogitGLMM,
# or simulate a data set when simulate_data is True.  Either branch defines
# K, NG, x_mat, y_vec, y_g_vec, prior_par and the *_min lower bounds that are
# passed to the variational parameter constructors.

simulate_data = True

if not simulate_data:
    #analysis_name = 'simulated_data_small'
    #analysis_name = 'simulated_data_large'
    analysis_name = 'criteo_subsampled'

    data_dir = os.path.join(os.environ['GIT_REPO_LOC'], 'LRVBLogitGLMM/LogitGLMMLRVB/inst/data/')
    json_filename = os.path.join(data_dir, '%s_stan_dat.json' % analysis_name)
    json_output_filename = os.path.join(data_dir, '%s_python_vb_results.json' % analysis_name)

    # The context manager closes the file even if json.load raises.
    with open(json_filename, 'r') as json_file:
        json_dat = json.load(json_file)

    stan_dat = json_dat['stan_dat']
    vp_base = json_dat['vp_base']

    print(stan_dat.keys())
    # Scalars are stored in the JSON as one-element lists, hence the [0].
    K = stan_dat['K'][0]
    NObs = stan_dat['N'][0]
    NG = stan_dat['NG'][0]
    #N = NObs / NG
    y_g_vec = np.array(stan_dat['y_group'])
    y_vec = np.array(stan_dat['y'])
    x_mat = np.array(stan_dat['x'])

    mu_info_min = vp_base['mu_info_min'][0]
    tau_alpha_min = vp_base['tau_alpha_min'][0]
    tau_beta_min = vp_base['tau_beta_min'][0]
    beta_diag_min = vp_base['beta_diag_min'][0]
    u_info_min = vp_base['u_info_min'][0]

    # Define a class to contain prior parameters.
    prior_par = logit_glmm.get_default_prior_params(K)
    prior_par['beta_prior_mean'].set(np.array(stan_dat['beta_prior_mean']))
    prior_par['beta_prior_var'].set(np.array(stan_dat['beta_prior_var']))

    # NOTE(review): this inverse is computed but the prior below is filled from
    # the value stored in the JSON, not from beta_prior_info -- confirm which
    # of the two is intended.
    beta_prior_info = np.linalg.inv(np.array(stan_dat['beta_prior_var']))
    prior_par['beta_prior_info'].set(np.array(stan_dat['beta_prior_info']))

    prior_par.push_param(vb.ScalarParam('mu_prior_mean', val=stan_dat['mu_prior_mean'][0]))
    prior_par.push_param(vb.ScalarParam('mu_prior_info', val=1 / stan_dat['mu_prior_var'][0]))

    # These repeat the values already given to the ScalarParam constructors.
    prior_par['mu_prior_mean'].set(stan_dat['mu_prior_mean'][0])
    prior_par['mu_prior_info'].set(1 / stan_dat['mu_prior_var'][0])

    prior_par['tau_prior_alpha'].set(stan_dat['tau_prior_alpha'][0])
    prior_par['tau_prior_beta'].set(stan_dat['tau_prior_beta'][0])

    # An index set to make sure jacobians match the order expected by R.
    prior_par_indices = copy.deepcopy(prior_par)
    prior_par_indices.set_name('Prior Indices')
    prior_par_indices.set_vector(np.array(range(prior_par_indices.vector_size())))
else:
    # Simulate data instead of loading it if you like
    N = 200     # observations per group
    K = 5      # dimension of regressors
    NG = 200      # number of groups

    # Generate data.
    # NOTE(review): no random seed is set in this notebook; if
    # logit_glmm.simulate_data draws from a global RNG the printed results
    # will differ between runs -- consider seeding before this call.

    # Centered regression coefficients.  Sized by K (rather than a literal 5)
    # so the truth stays consistent with the K passed to the prior and to the
    # variational parameters.
    true_beta = np.arange(K)
    true_beta = true_beta - np.mean(true_beta)
    true_mu = 0.
    true_tau = 40.0

    x_mat, y_g_vec, y_vec, true_rho, true_u = \
        logit_glmm.simulate_data(N, NG, true_beta, true_mu, true_tau)

    prior_par = logit_glmm.get_default_prior_params(K)

    mu_info_min = 0.001
    tau_alpha_min = 0.001
    tau_beta_min = 0.001
    beta_diag_min = 0.001
    u_info_min = 0.001


# Fraction of positive responses, as a quick sanity check on the data.
print(np.mean(y_vec))

0.499975


In [3]:
# Build the container for the variational parameters of the GLMM:
#   mu   -- UVNParam
#   tau  -- GammaParam (shape / rate)
#   beta -- MVNParam of dimension K
#   u    -- UVNParamVector of length NG
# The *_min arguments are the lower bounds defined in the data cell.
glmm_par = vb.ModelParamsDict('GLMM Parameters')

# print(vp_base)

glmm_par.push_param(
    vb.UVNParam('mu', min_info=mu_info_min))
glmm_par.push_param(
    vb.GammaParam('tau', min_shape=tau_alpha_min, min_rate=tau_beta_min))
glmm_par.push_param(vb.MVNParam('beta', K, min_info=beta_diag_min))
glmm_par.push_param(vb.UVNParamVector('u', NG, min_info=u_info_min))


# Both initialisation paths report zero extra time.
# Don't forget to add the ADVI computation time to your final VB time if the
# GLMM initialisation is used!
glmm_time = 0.

glmm_init = False
if glmm_init and not simulate_data:
    # Initialize with GLMM.
    glmm_fit = json_dat['glmm_fit']
    glmm_par['mu'].mean.set(glmm_fit['mu_mean'][0])
    glmm_par['mu'].info.set(1.0)

    tau_mean = 1.0 / glmm_fit['mu_sd'][0] ** 2
    tau_var = 1.0
    # A gamma distribution with mean m and variance v has
    # shape = m^2 / v and rate = m / v  (so that shape / rate == m).
    glmm_par['tau'].shape.set((tau_mean ** 2) / tau_var)
    glmm_par['tau'].rate.set(tau_mean / tau_var)

    glmm_par['beta'].mean.set(np.array(glmm_fit['beta_mean']))
    glmm_par['beta'].info.set(np.eye(K))

    glmm_par['u'].mean.set(np.array(glmm_fit['u_map']))
    glmm_par['u'].info.set(np.full(NG, 1.0))
else:
    # Generic starting point: zero means, unit informations, tau shape = rate = 2.
    glmm_par['mu'].mean.set(0.0)
    glmm_par['mu'].info.set(1.0)

    glmm_par['tau'].shape.set(2.0)
    glmm_par['tau'].rate.set(2.0)

    glmm_par['beta'].mean.set(np.full(K, 0.0))
    glmm_par['beta'].info.set(np.eye(K))

    glmm_par['u'].mean.set(np.full(NG, 0.0))
    glmm_par['u'].info.set(np.full(NG, 1.0))

# Free (unconstrained) parameter vector at the initial values; init_par_vec is
# an independent copy kept as the optimizer's starting point.
free_par_vec = glmm_par.get_free()
init_par_vec = copy.deepcopy(free_par_vec)


In [5]:
# Moment parameters: a wrapper around glmm_par plus the autograd Jacobian of
# its get_moments function.

moment_wrapper = logit_glmm.MomentWrapper(glmm_par)
get_moment_jacobian = jacobian(moment_wrapper.get_moments)

# Moment indices: a copy of the moment parameters whose vector is filled with
# 1, 2, ..., vector_size (1-based positions).
moment_indices = copy.deepcopy(moment_wrapper.moment_par)
moment_indices.set_vector(1 + np.array(range(moment_indices.vector_size())))

# The model, evaluated once on each of its components; the return values are
# discarded, so these calls only check that the components run.
model = logit_glmm.LogisticGLMM(glmm_par, prior_par, x_mat, y_vec, y_g_vec, num_gh_points=10)
model.get_e_log_prior()
model.get_log_lik()
model.get_entropy()

# Objective over the free parameterization of model.glmm_par, wrapping model.get_kl.
objective = Objective(model.glmm_par, model.get_kl)
objective.fun_free(free_par_vec)

# # PriorHess evaluates the second order derivative d2 EPrior / dpar dprior_par
# PriorModelGrad = grad(kl_wrapper.ExpectedLogPrior, argnum=0)
# PriorHess = jacobian(PriorModelGrad, argnum=1)

# kl_wrapper.ExpectedLogPrior(free_par_vec, prior_par.get_vector())

import timeit

# Number of repetitions each timing is averaged over.
time_num = 10


def _seconds_per_call(fn):
    """Return the average time of one call to fn(), over time_num calls."""
    return timeit.timeit(fn, number=time_num) / time_num


# Time the objective, its gradient and a Hessian-vector product for several
# numbers of Gauss-Hermite points.
for num_draws in [3, 10, 50]:
    model.set_gh_points(num_draws)
    print('num_draws = ', num_draws)
    print('\tFunction time:',
          _seconds_per_call(lambda: objective.fun_free(free_par_vec)))

    print('\tGrad time:',
          _seconds_per_call(lambda: objective.fun_free_grad(free_par_vec)))

    print('\tHessian vector product time:',
          _seconds_per_call(lambda: objective.fun_free_hvp(free_par_vec, free_par_vec + 1)))

num_draws =  3
	Function time: 0.01842404900235124
	Grad time: 0.06247951309778728
	Hessian vector product time: 0.13204940749856178
num_draws =  10
	Function time: 0.03588724230066873
	Grad time: 0.08114857070031575
	Hessian vector product time: 0.1539680525980657
num_draws =  50
	Function time: 0.10707351379969623
	Grad time: 0.1642403385019861
	Hessian vector product time: 0.3056439498992404


In [6]:
glmm_par_opt = copy.deepcopy(glmm_par)
def tr_optimize(trust_init, num_gh_points, gtol=1e-6, maxiter=500):
    """Minimize the global objective with scipy's 'trust-ncg' method.

    Sets the number of Gauss-Hermite points on the global model, resets the
    objective's logger (printing every 5th evaluation) and runs the optimizer
    using the objective's gradient and Hessian-vector product.

    Args:
        trust_init: starting free-parameter vector.
        num_gh_points: value passed to model.set_gh_points.
        gtol: gradient tolerance passed in the solver options.
        maxiter: iteration limit passed in the solver options.

    Returns:
        The x attribute of the scipy result.  The result's success flag is
        not inspected, so x is returned whether or not the solver converged.
    """
    model.set_gh_points(num_gh_points)
    objective.logger.initialize()
    objective.logger.print_every = 5

    def verbose_objective(par):
        # Objective value with the logger's progress output enabled.
        return objective.fun_free(par, verbose=True)

    solver_options = {'maxiter': maxiter, 'disp': True, 'gtol': gtol}
    result = optimize.minimize(
        verbose_objective,
        x0=trust_init,
        method='trust-ncg',
        jac=objective.fun_free_grad,
        hessp=objective.fun_free_hvp,
        tol=1e-6,
        options=solver_options)
    return result.x

def get_moment_vec(vb_opt_x):
    """Return the moment vector at the free-parameter vector vb_opt_x.

    The previous body called set_moments(...) and moment_par, neither of which
    is defined in this notebook; the moment machinery lives on moment_wrapper
    (its glmm_par, set_moments() and moment_par are what the other cells use).

    Side effects: glmm_par_opt and moment_wrapper.glmm_par are both set to
    vb_opt_x, and moment_wrapper.moment_par is refreshed by set_moments().

    Args:
        vb_opt_x: free-parameter vector.

    Returns:
        moment_wrapper.moment_par.get_vector() after the refresh.
    """
    glmm_par_opt.set_free(vb_opt_x)
    # NOTE(review): assumes set_moments() recomputes moment_par from
    # moment_wrapper.glmm_par -- confirm against LogisticGLMM_lib.
    moment_wrapper.glmm_par.set_free(vb_opt_x)
    moment_wrapper.set_moments()
    return moment_wrapper.moment_par.get_vector()


In [7]:
import pandas as pd
#print(glmm_par)

# Least-squares coefficients of y on x (solution of the normal equations
# X'X b = X'y).
x_t_x = np.matmul(x_mat.transpose(), x_mat)
x_t_y = np.matmul(x_mat.transpose(), y_vec)
beta_init = np.linalg.solve(x_t_x, x_t_y)
#print(beta_init)
#plt.plot(sp.special.expit(np.matmul(x_mat, beta_init)), y_vec, 'k.')

# Per-group mean response, one entry per distinct value of y_g_vec.
# Dividing the (n_groups, 1) array of sums by the (n_groups,) array of counts
# would broadcast to an (n_groups, n_groups) matrix, so take the grouped mean
# of the 'y' column directly to get a 1-D result.
df = pd.DataFrame({ 'y_g': y_g_vec, 'y': y_vec}).groupby('y_g')
#print(df.sum())
u_init = np.array(df['y'].mean())
#plt.figure()
# Smallest group label (shows whether the labels are 0- or 1-based).
print(np.min(y_g_vec))
#plt.plot(u_init[y_g_vec], y_vec, 'k.')


0


In [8]:
import time

# Optimize in two stages: a coarse fit with few Gauss-Hermite points and a
# loose gradient tolerance, then a refinement started from that solution.
#
# The start timestamp is kept in its own variable.  vb_time is only assigned
# once both stages have finished, so an interrupted run can never leave a raw
# time.time() timestamp in vb_time for later cells to report as a duration.

print('Running Newton Trust Region with few draws.')
num_gh_points = 3
vb_start_time = time.time()
opt_x = tr_optimize(init_par_vec, num_gh_points, gtol=1e-3, maxiter=100)
vb_time_opt_1 = time.time() - vb_start_time
print('vb_time_opt_1: ', vb_time_opt_1)

print('Running Newton Trust Region with more draws')
num_gh_points = 20
opt_x = tr_optimize(opt_x, num_gh_points, gtol=1e-6, maxiter=100)
# Total elapsed seconds over both stages.
vb_time = time.time() - vb_start_time

print('VB time: ', vb_time)
print('Done.')

Running Newton Trust Region with few draws.
Iter  0  value:  [ 33760.43159946]
Iter  5  value:  [ 26145.72346039]
Iter  10  value:  [ 24256.55874011]
Iter  15  value:  [ 24256.02612667]
Optimization terminated successfully.
         Current function value: 24256.024586
         Iterations: 18
         Function evaluations: 19
         Gradient evaluations: 19
         Hessian evaluations: 0
Running Newton Trust Region with more draws


KeyboardInterrupt: 

In [None]:

# Load the optimum into glmm_par; for simulated data, print each fitted
# expectation next to the value used to generate the data.
glmm_par.set_free(opt_x)
if simulate_data:
    fit_vs_truth = (
        ('Fit beta: ', glmm_par['beta'].e()),
        ('True beta: ', true_beta),
        ('Fit mu: ', glmm_par['mu'].e()),
        ('True mu: ', true_mu),
        ('Fit tau: ', glmm_par['tau'].e()),
        ('True tau: ', true_tau),
    )
    for label, value in fit_vs_truth:
        print(label, value)

    # Red points: the identity line (true_u against itself).
    # Black points: fitted E[u] against true_u.
    plt.plot(true_u, true_u, 'r.')
    plt.plot(true_u, glmm_par['u'].e(), 'k.')
    

In [None]:
# First line: time of the first optimization stage.
# Second line: remainder of the total VB time (i.e. the second stage).
print(vb_time_opt_1, vb_time - vb_time_opt_1, sep='\n')


In [None]:
print('VB time: ', vb_time)

import scipy as sp

# Copy of the variational parameters evaluated at the optimum.
glmm_par_opt = copy.deepcopy(glmm_par)
glmm_par_opt.set_free(opt_x)
#glmm_par_opt.set_free(init_par_vec)
# glmm_fit is only assigned when the GLMM initialisation branch ran, so only
# print it under the same condition (otherwise this is a NameError).
if glmm_init and not simulate_data:
    print(glmm_fit['beta_mean'])
print(glmm_par_opt['beta'].e())
print(glmm_par_opt)

#plt.plot(glmm_par_opt['u'].e(), glmm_par_opt['u'].var(), 'k.')

e_beta = glmm_par_opt['beta'].e()
# Expected random effect for each observation, looked up by its group index.
e_u = glmm_par_opt['u'].e()[model.y_g_vec]

# Per-observation linear predictor at the variational means.
z_mean = e_u + np.matmul(model.x_mat, e_beta)
#plt.plot(sp.special.expit(z_mean), model.y_vec, 'k.')


# One normalized histogram of the linear predictor per response value.
# `density=True` replaces the `normed` keyword, which matplotlib has removed.
for outcome in (1, 0):
    plt.figure()
    plt.hist(z_mean[model.y_vec == outcome], 50, density=True, facecolor='green', alpha=0.75)
    plt.title('y == %d' % outcome)




In [None]:
# Print the length of the optimum next to the free-parameter count of
# glmm_par so the two can be compared by eye.
print(len(opt_x))
print(glmm_par.free_size())

# Rebuild the copy of the variational parameters and set it to the optimum.
glmm_par_opt = copy.deepcopy(glmm_par)
glmm_par_opt.set_free(opt_x)
# Load the optimum into the moment wrapper's own parameters and call
# set_moments() -- presumably this refreshes moment_wrapper.moment_par;
# confirm against LogisticGLMM_lib.
moment_wrapper.glmm_par.set_free(opt_x)
moment_wrapper.set_moments()

# vb_time is a difference of time.time() values (seconds), so this is minutes.
print(vb_time / 60)

In [None]:
if False:
    # Investigate the performance of different numbers of draws.   It doesn't appear to
    # converge.
    #
    # This block is disabled, but it is still parsed together with the rest of
    # the cell, so it has to be valid Python 3: the print statements are
    # written as function calls like everywhere else in the notebook.
    opt_x_20 = tr_optimize(init_par_vec, 20)
    opt_x_60 = tr_optimize(opt_x_20, 60)
    opt_x_100 = tr_optimize(opt_x_60, 100)
    opt_x_200 = tr_optimize(opt_x_100, 200)
    opt_x_400 = tr_optimize(opt_x_200, 400)
    opt_x_800 = tr_optimize(opt_x_400, 800)

    mom_20 = get_moment_vec(opt_x_20)
    mom_60 = get_moment_vec(opt_x_60)
    mom_100 = get_moment_vec(opt_x_100)
    mom_200 = get_moment_vec(opt_x_200)
    mom_400 = get_moment_vec(opt_x_400)
    mom_800 = get_moment_vec(opt_x_800)

    # Largest relative change in any moment between successive settings.
    print(np.max(np.abs((mom_20 - mom_60) / mom_20)))
    print(np.max(np.abs((mom_60 - mom_100) / mom_60)))
    print(np.max(np.abs((mom_100 - mom_200) / mom_100)))
    print(np.max(np.abs((mom_200 - mom_400) / mom_200)))
    print(np.max(np.abs((mom_400 - mom_800) / mom_400)))

    # Largest absolute change in any moment between successive settings.
    print('-------\n')
    print(np.max(np.abs((mom_20 - mom_60))))
    print(np.max(np.abs((mom_60 - mom_100))))
    print(np.max(np.abs((mom_100 - mom_200))))
    print(np.max(np.abs((mom_200 - mom_400))))
    print(np.max(np.abs((mom_400 - mom_800))))

    #diff_inds = np.where(np.abs(mom_60 - mom_100) > 1e-2)
    #print(diff_inds)
    #print(moment_indices)

    #print((get_moment_vec(opt_x_60) - get_moment_vec(opt_x_100)) / np.abs(get_moment_vec(opt_x_100)))

    def _e_u_at(free_par):
        # Load free_par into the moment wrapper, refresh its moments and
        # return a copy of the 'e_u' entry.  The bare name moment_par is not
        # defined in this notebook; the moments live on moment_wrapper.
        # NOTE(review): assumes moment_wrapper.moment_par has an 'e_u' entry,
        # as the old moment_par did -- confirm against LogisticGLMM_lib.
        moment_wrapper.glmm_par.set_free(free_par)
        moment_wrapper.set_moments()
        return copy.deepcopy(moment_wrapper.moment_par['e_u'].get())

    u200 = _e_u_at(opt_x_200)
    u400 = _e_u_at(opt_x_400)
    u800 = _e_u_at(opt_x_800)


In [None]:
# Examine why the means are different for different number of simulations.
def get_logit_terms(num_draws):
    """Evaluate log(1 + exp(z)) for every observation and every draw.

    Reads the global model, glmm_par_opt, x_mat and y_g_vec.  For each
    observation the linear predictor z is given mean and standard deviation
    from the variational means and informations of beta and u, and is then
    evaluated at z_mean + z_sd * draw for every entry of model.std_draws.

    Args:
        num_draws: value passed to model.set_draws.

    Returns:
        A tuple (logit_term, logit_terms, z):
        logit_term  -- minus the sum of logit_terms divided by the number of
                       draws;
        logit_terms -- array of log(1 + exp(z)), one row per observation and
                       one column per draw;
        z           -- the array of linear predictor values.
    """
    # NOTE(review): the rest of this notebook configures the model with
    # model.set_gh_points(...); confirm that set_draws / std_draws still
    # exist on LogisticGLMM.
    model.set_draws(num_draws)
    std_draws = model.std_draws

    e_beta = glmm_par_opt['beta'].mean.get()
    info_beta = glmm_par_opt['beta'].info.get()
    cov_beta = np.linalg.inv(info_beta)

    # Per-observation random effect mean / variance, looked up by group index.
    e_u = glmm_par_opt['u'].mean.get()[y_g_vec]
    info_u = glmm_par_opt['u'].info.get()[y_g_vec]
    var_u = 1 / info_u

    z_mean = e_u + np.matmul(x_mat, e_beta)
    # Var(z_n) = Var(u_n) + x_n' Cov(beta) x_n.
    z_sd = np.sqrt(var_u + np.einsum('nk,kj,nj->n', x_mat, cov_beta, x_mat))
    z = np.einsum('i,j->ij', z_sd, std_draws) + np.expand_dims(z_mean, 1)

    # The sum is over observations and draws, so dividing by the draws size
    # gives the sum of sample expectations over the draws.
    # p = exp(z) / (1 + exp(z))
    # log(1 - p) = log(1 / (1 + exp(z))) = -log(1 + exp(z))
    #
    # logaddexp(0, z) == log(exp(0) + exp(z)) == log(1 + exp(z)), but unlike
    # log1p(exp(z)) it does not overflow to inf when z is large.
    logit_terms = np.logaddexp(0, z)
    logit_term = -np.sum(logit_terms) / std_draws.size

    return logit_term, logit_terms, z
    
# Evaluate the logit terms with 50 and with 800 draws.
logit_term_50, logit_terms_50, z_50 = get_logit_terms(50)
logit_term_800, logit_terms_800, z_800 = get_logit_terms(800)

print(logit_term_50, logit_term_800, sep='\n')

# Average over draws (axis 1) to get one value per observation.
logit_terms_50_mean = logit_terms_50.mean(axis=1)
logit_terms_800_mean = logit_terms_800.mean(axis=1)

# Largest per-observation discrepancy, then the observations whose
# discrepancy exceeds 1e-3.
print(np.abs(logit_terms_50_mean - logit_terms_800_mean).max())
print(np.nonzero(np.abs(logit_terms_50_mean - logit_terms_800_mean) > 1e-3))

# Plot the terms against z for a single observation under both settings.
ind = 3
plt.plot(z_800[ind, :], logit_terms_800[ind, :])
plt.plot(z_50[ind, :], logit_terms_50[ind, :])
print(logit_terms_50_mean[ind], logit_terms_800_mean[ind], sep='\n')


In [None]:
# Get the Hessians at the number of draws used for optimization.

# NOTE(review): num_mc_draws is not assigned in any cell visible in this
# notebook, and the optimization cells configure the model through
# model.set_gh_points(num_gh_points) instead of set_draws -- confirm which
# call and which variable are intended before running this cell.
model.set_draws(num_mc_draws)

# hess_time first holds the start timestamp and is overwritten below with the
# elapsed seconds.
hess_time = time.time()
print('KL Hessian:\n')
# Hessian of the objective at the optimum opt_x.
kl_hess = objective.fun_free_hessian(opt_x)

print('Log prior Hessian:\n')
# NOTE(review): PriorHess appears only in commented-out code in an earlier
# cell (built from a kl_wrapper object that is not defined here), so this
# line raises NameError on a fresh kernel.
log_prior_hess = PriorHess(opt_x, prior_par.get_vector())

hess_time =  time.time() - hess_time
# The objective is the KL divergence, so the ELBO Hessian is its negative.
elbo_hess = -kl_hess

print('hess_time: %f' % hess_time)

In [None]:
# Jacobian of the moments at the optimum.  The notebook defines this function
# as get_moment_jacobian (the autograd jacobian of moment_wrapper.get_moments);
# the name MomentJacobian is not defined anywhere in the notebook.
moment_jac = get_moment_jacobian(opt_x)
# Linear response covariance: J H^{-1} J^T, with J the moment Jacobian and H
# the KL Hessian.  solve() is used rather than forming H^{-1} explicitly.
lrvb_cov = np.matmul(moment_jac, np.linalg.solve(kl_hess, moment_jac.T))

# Copies of the prior and variational parameters whose vectors are filled
# with 1, 2, ..., vector_size (1-based positions).
prior_indices = copy.deepcopy(prior_par)
prior_indices.set_vector(1 + np.arange(prior_indices.vector_size()))

vp_indices = copy.deepcopy(glmm_par_opt)
vp_indices.set_vector(1 + np.arange(vp_indices.vector_size()))

In [None]:
if not simulate_data:
    # Write the result to a JSON file for use in R.

    run_name = 'production'
    # NOTE(review): num_mc_draws is not assigned in any cell visible in this
    # notebook -- confirm where it is meant to come from.
    result_dict = { 'glmm_par_opt': glmm_par_opt.dictval(), 'run_name': run_name,
                    'vb_time': vb_time, 'hess_time': hess_time, 'num_mc_draws': num_mc_draws, 
                    'moment_indices': moment_indices.dictval(),
                    'prior_indices': prior_indices.dictval(),
                    'vp_indices': vp_indices.dictval(),
                    'lrvb_cov': lrvb_cov.tolist(), 'moment_jac': moment_jac.tolist(),
                    'elbo_hess': elbo_hess.tolist(), 'log_prior_hess': log_prior_hess.tolist() }

    # Serialize before opening the file so a serialization error cannot leave
    # a truncated output file behind; the context manager closes the file
    # even if the write fails.
    result_json = json.dumps(result_dict)
    with open(json_output_filename, 'w') as json_file:
        json_file.write(result_json)

    print(json_output_filename)