In [1]:
import autograd.numpy as np
import autograd.scipy as sp
import autograd
from autograd.core import primitive
import copy

from collections import OrderedDict

from VariationalBayes import Parameters
from VariationalBayes.Parameters import \
    ScalarParam, VectorParam, ArrayParam, \
    PosDefMatrixParam, PosDefMatrixParamVector
from VariationalBayes.ParameterDictionary import ModelParamsDict
import scipy as osp
from scipy.sparse import csr_matrix


In [2]:
# Side length of the square test matrices.
k = 2

# Identity plus a constant 0.2 in every entry.
mat = np.eye(k) + np.full((k, k), 0.2)
vp_mat1 = PosDefMatrixParam('mat1', k, val=mat)
vp_mat2 = PosDefMatrixParam('mat2', k, val=2. * mat)

# Register both matrix parameters, in order, in one parameter dictionary.
mp = ModelParamsDict()
for vp_mat in (vp_mat1, vp_mat2):
    mp.push_param(vp_mat)

print(mp)

# Free (unconstrained) vector of the whole dictionary; later cells index into
# it through mp.free_indices_dict.
free_vec = mp.get_free()


ModelParamsDict:
	mat1:
[[ 1.2  0.2]
 [ 0.2  1.2]]
	mat2:
[[ 2.4  0.4]
 [ 0.4  2.4]]


In [3]:
def get_param(mp, free_vec, par_name):
    """Set one parameter from its slice of the free vector and return its value.

    Args:
        mp: Parameter container, indexable by parameter name and exposing
            ``free_indices_dict`` (name -> index into ``free_vec``).
        free_vec: Free vector covering every parameter in ``mp``.
        par_name: Name of the parameter to update and read back.

    Returns:
        Whatever ``mp[par_name].get()`` returns after the update.

    Note:
        This mutates ``mp[par_name]`` as a side effect.
    """
    par = mp[par_name]
    par_free_vec = free_vec[mp.free_indices_dict[par_name]]
    par.set_free(par_free_vec)
    return par.get()

# Round trip: pushing the current free vector back in should reproduce the
# matrices printed when the dictionary was built.
for par_name in ('mat1', 'mat2'):
    print(get_param(mp, free_vec, par_name))

# Jacobian of the parameter value with respect to the full free vector
# (positional argument 1 of get_param).
get_param_jac = autograd.jacobian(get_param, argnum=1)

for par_name in ('mat1', 'mat2'):
    print('--------')
    print(get_param_jac(mp, free_vec, par_name))



[[ 1.2  0.2]
 [ 0.2  1.2]]
[[ 2.4  0.4]
 [ 0.4  2.4]]
--------
[[[ 2.4         0.          0.          0.          0.          0.        ]
  [ 0.2         1.09544512  0.          0.          0.          0.        ]]

 [[ 0.2         1.09544512  0.          0.          0.          0.        ]
  [ 0.          0.36514837  2.33333333  0.          0.          0.        ]]]
--------
[[[ 0.          0.          0.          4.8         0.          0.        ]
  [ 0.          0.          0.          0.4         1.54919334  0.        ]]

 [[ 0.          0.          0.          0.4         1.54919334  0.        ]
  [ 0.          0.          0.          0.          0.51639778  4.66666667]]]


In [4]:
@primitive
def get_param_sparse(mp, free_vec, par_name):
    """Return the same value as ``get_param``, wrapped as an autograd primitive.

    Being a primitive, autograd does not trace through the body; the
    derivative with respect to ``free_vec`` is instead supplied by the
    ``defvjp`` registration made later in this notebook.
    """
    return get_param(mp, free_vec, par_name)

def get_free_vec(mp, free_vec, par_name):
    """Return the entries of ``free_vec`` that belong to parameter ``par_name``.

    The index used is ``mp.free_indices_dict[par_name]``.
    """
    par_indices = mp.free_indices_dict[par_name]
    return free_vec[par_indices]

def set_free_and_get(free_vec_par, par):
    """Load ``free_vec_par`` into ``par`` and return the resulting value.

    Args:
        free_vec_par: Free vector for this single parameter.
        par: Parameter object providing ``set_free`` and ``get``.

    Returns:
        ``par.get()`` evaluated after ``par.set_free(free_vec_par)``.
    """
    par.set_free(free_vec_par)
    constrained_value = par.get()
    return constrained_value

# Per-parameter slices of the free vector, and a round-trip check that they
# reproduce the original matrices.
mat1_sub_vec = get_free_vec(mp, free_vec, 'mat1')
mat2_sub_vec = get_free_vec(mp, free_vec, 'mat2')
print(set_free_and_get(mat1_sub_vec, mp['mat1']))
print(set_free_and_get(mat2_sub_vec, mp['mat2']))


def _make_par_jacobian(par_name, getter=set_free_and_get):
    """Jacobian of one parameter's value w.r.t. its own free sub-vector.

    ``getter`` is bound when this function is defined (default-argument
    binding).  A bare lambda would instead look up the global name
    ``set_free_and_get`` on every call, and a later cell of this notebook
    rebinds that name to a function with the arguments in the opposite order,
    which would silently break every Jacobian stored in ``jac_dict``.
    """
    return autograd.jacobian(
        lambda free_sub_vec: getter(free_sub_vec, mp[par_name]))


jac_dict = OrderedDict()
jac_dict['mat1'] = _make_par_jacobian('mat1')
jac_dict['mat2'] = _make_par_jacobian('mat2')

print("-----------------")
print(jac_dict['mat1'](mat1_sub_vec))
print("-----------------")
print(jac_dict['mat2'](mat2_sub_vec))
print("-----------------")


[[ 1.2  0.2]
 [ 0.2  1.2]]
[[ 2.4  0.4]
 [ 0.4  2.4]]
-----------------
[[[ 2.4         0.          0.        ]
  [ 0.2         1.09544512  0.        ]]

 [[ 0.2         1.09544512  0.        ]
  [ 0.          0.36514837  2.33333333]]]
-----------------
[[[ 4.8         0.          0.        ]
  [ 0.4         1.54919334  0.        ]]

 [[ 0.4         1.54919334  0.        ]
  [ 0.          0.51639778  4.66666667]]]
-----------------


In [5]:
# Scratch check of the broadcasting needed to contract a (2, 2) array against
# the two leading axes of a (2, 2, 3) array.
foo = np.random.random((2, 2, 3))
bar = np.random.random((2, 2))

# bar gains a trailing length-1 axis so it broadcasts along foo's last axis.
weighted = foo * np.expand_dims(bar, axis=2)

print(weighted.shape)
print(np.sum(weighted, (0, 1)))
print(np.sum(weighted, -2))
# Reference computation, one trailing index at a time; it should match the
# (0, 1) sum printed above.
print(np.sum([ foo[:, :, last] * bar for last in range(3) ], (1, 2)))

(2, 2, 3)
[ 0.93946763  0.91549415  1.01996112]
[[ 0.47065522  0.83499778  0.58313265]
 [ 0.46881241  0.08049637  0.43682847]]
[ 0.93946763  0.91549415  1.01996112]


In [7]:
def get_param_sparse_vjp(g, ans, vs, gvs, mp, free_vec, par_name):
    """Vector-Jacobian product of ``get_param_sparse`` w.r.t. ``free_vec``.

    Only the entries of ``free_vec`` that belong to ``par_name`` can be
    non-zero, so the small per-parameter Jacobian from ``jac_dict`` is
    contracted with ``g`` and scattered into a zero vector shaped like
    ``free_vec``.

    Args:
        g: Incoming cotangent, shaped like the parameter value.
        ans, vs, gvs: Supplied by autograd's VJP calling convention; unused.
        mp: Parameter container exposing ``free_indices_dict``.
        free_vec: Full free vector.
        par_name: Name of the parameter being read.

    Returns:
        Dense array with the shape of ``free_vec``.
    """
    jac = jac_dict[par_name](get_free_vec(mp, free_vec, par_name))

    # jac has the output axes first and the free-vector axis last.  Contract g
    # against every output axis, whatever the rank of the parameter value
    # (for a matrix parameter this is exactly expand_dims(axis=2) / sum (0, 1)).
    output_axes = tuple(range(np.ndim(g)))
    par_jac = np.sum(jac * np.expand_dims(g, axis=-1), output_axes)

    full_jac = np.zeros(free_vec.shape)
    full_jac[mp.free_indices_dict[par_name]] = par_jac

    # NOTE: returning a scipy sparse matrix here instead of a dense vector
    # doesn't work, e.g.
    #   csr_matrix((np.full(6, 0), (np.array(range(6)), np.full(6, 0))), (6, 1))
    return full_jac
    
# Attach the hand-written VJP for the free_vec argument (argument 1).
get_param_sparse.defvjp(get_param_sparse_vjp, argnum=1)

get_param_sparse_jac = autograd.jacobian(get_param_sparse, argnum=1)

# The custom-VJP Jacobian should agree with the fully traced one, so each
# printed difference should be all zeros.
for par_name in ('mat2', 'mat1'):
    custom_jac = get_param_sparse_jac(mp, free_vec, par_name)
    traced_jac = get_param_jac(mp, free_vec, par_name)
    print(custom_jac - traced_jac)



[[[ 0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.]]

 [[ 0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.]]]
[[[ 0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.]]

 [[ 0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.]]]


In [8]:
# Shape of the sparse column built in both experiments below.
column_shape = (5, 1)

# Distinct row indices: every value lands in its own row.
data, rows, cols = [1., 3., 4.], [0, 2, 4], [0, 0, 0]
full_jac_sparse = csr_matrix((data, (rows, cols)), shape=column_shape)
print(full_jac_sparse.toarray())


# Are duplicates summed?  Yes.
# Row 2 appears twice, and the two values (3 and 4) are added together.
data, rows, cols = [1., 3., 4.], [0, 2, 2], [0, 0, 0]
full_jac_sparse = csr_matrix((data, (rows, cols)), shape=column_shape)
full_jac_sparse.toarray()



[[ 1.]
 [ 0.]
 [ 3.]
 [ 0.]
 [ 4.]]


array([[ 1.],
       [ 0.],
       [ 7.],
       [ 0.],
       [ 0.]])

In [9]:

# autograd puts the derivative axes after the axes of the function's output.
get_param_hess = autograd.hessian(get_param, argnum=1)
mat1_hess = get_param_hess(mp, free_vec, 'mat1')
mat1_jac = get_param_jac(mp, free_vec, 'mat1')

# Hessian: output axes followed by two free-vector axes.
print(mat1_hess.shape)

# Jacobian: output axes followed by one free-vector axis.
print(mat1_jac.shape)


(2, 2, 6, 6)
(2, 2, 6)


I think we need two steps.  Let $L$ be the objective and $f$ be the constraining function, so that 

$$
\theta = f(z) \\
L(\theta) = L(f(z))
$$

We need

$$
\frac{dL}{dz^T} = \frac{dL}{d\theta^T} \frac{d\theta}{dz^T} = \frac{dL}{d\theta^T} \frac{df}{dz^T}
$$

and, using Einstein summation notation,

$$
\frac{d^2 L}{dz_i dz_j} =
    \frac{d^2 L}{d\theta_a d\theta_b} \frac{d\theta_a}{dz_i} \frac{d\theta_b}{dz_j} +
    \frac{d L}{d\theta_a} \frac{d^2 \theta_a}{dz_i dz_j}
$$

The term $\frac{d^2 L}{d\theta_a d\theta_b}$ can be expressed using a combination of our `get_vector()` functions and a sparse matrix for the local variables.  $\frac{d\theta_a}{dz_i}$ can also be represented as a sparse matrix.  It may be best to store the term $\frac{d^2 \theta_a}{dz_i dz_j}$ in `(value, a, i, j)` format and to write a custom aggregator that returns a sparse matrix when this term is contracted with $\frac{d L}{d\theta_a}$, since a general-purpose multi-dimensional sparse array may not be efficient [(discussion)](https://stackoverflow.com/questions/29871669/python-multi-dimensional-sparse-array).


In [10]:
def set_free_and_get(par, free_vec_par):
    """Load ``free_vec_par`` into ``par`` and return its vectorized value.

    Args:
        par: Parameter object providing ``set_free`` and ``get_vector``.
        free_vec_par: Free vector for this single parameter.

    Returns:
        ``par.get_vector()`` evaluated after ``par.set_free(free_vec_par)``.
    """
    par.set_free(free_vec_par)
    constrained_vector = par.get_vector()
    return constrained_vector

# Derivatives of the vectorized parameter value with respect to its free
# vector: argnum=1 selects free_vec_par, the second positional argument of
# set_free_and_get.
set_free_and_get_jacobian = autograd.jacobian(set_free_and_get, argnum=1)
set_free_and_get_hessian = autograd.hessian(set_free_and_get, argnum=1)


In [11]:
# For an ArrayParam
s = ArrayParam(name='scalar', shape=(5, 3), lb=0.0)
s.set(np.exp(np.random.random(s.shape())))

# Dense autograd derivatives, used as the reference for the sparse versions.
target_jac = set_free_and_get_jacobian(s, s.get_free())
target_hess = set_free_and_get_hessian(s, s.get_free())

# Pre-compute necessary stuff.
# The bounds are kept in plain variables.  Assigning s.__lb / s.__ub from
# outside the class only creates new attributes with those literal names
# (name mangling applies inside class bodies only), which reads as if it
# configured the parameter's private bounds when it does not.
array_lb = 0.0
array_ub = float('inf')
constrain_grad = autograd.grad(Parameters.constrain)
free_val = s.get_free()

# Get a sparse version of the Jacobian: one entry per element, on the diagonal.
rows = np.array(range(s.vector_size()))
grads = [ constrain_grad(free_val[vec_ind], array_lb, array_ub)
          for vec_ind in range(s.vector_size()) ]

jac_sparse = csr_matrix((grads, (rows, rows)), (s.vector_size(), s.free_size()))

print(np.max(np.abs(jac_sparse - target_jac)))

constrain_hess = autograd.hessian(Parameters.constrain)

def get_ind_hess(vec_ind):
    """Sparse Hessian of vector element ``vec_ind`` w.r.t. the free vector.

    The matrix is free_size x free_size (both axes index free parameters)
    and holds a single entry at (vec_ind, vec_ind).
    """
    hess = constrain_hess(free_val[vec_ind], array_lb, array_ub)
    return csr_matrix(([ hess ], ([vec_ind], [vec_ind])), (s.free_size(), s.free_size()))

# Get a sparse version of the Hessian: a plain list with one sparse matrix per
# vector element.
hesses = [ get_ind_hess(vec_ind) for vec_ind in range(s.vector_size()) ]

print(np.max(np.abs([ hesses[ind].toarray() - target_hess[ind] for ind in range(s.vector_size()) ])))

0.0
0.0


In [42]:
# For a SimplexParam
from VariationalBayes.MultinomialParams import SimplexParam
s = SimplexParam(name='simplex', shape=(2, 3))

# Random positive entries, with each row rescaled to sum to one.
s_val = np.random.random(s.shape())
s_val = s_val / np.sum(s_val, axis=1, keepdims=True)
s.set(s_val)

free_val = s.get_free()

# Dense autograd derivatives, used as the reference for the sparse versions.
target_jac = set_free_and_get_jacobian(s, free_val)
target_hess = set_free_and_get_hessian(s, free_val)

# Evidently the free params are in the columns and the vector params are in the rows.
print(target_jac.shape)


(6, 4)


In [50]:
def constrain_simplex_vector(free_vec):
    """Map a free vector to a vector one element longer that sums to one.

    A zero is prepended as the reference value, and the augmented vector is
    exponentiated after subtracting its log-sum-exp.
    """
    # The first column is the reference value.
    augmented = np.hstack([[0.], free_vec])
    return np.exp(augmented - sp.misc.logsumexp(augmented))

# Derivatives of the single-row simplex constraint.
constrain_grad = autograd.jacobian(constrain_simplex_vector)
constrain_hess = autograd.hessian(constrain_simplex_vector)

# Smoke test on the global free_vec (not a simplex free vector, just a handy
# 1-d array): the first two calls only check that evaluation succeeds, and
# the final expression is the one displayed.
constrain_simplex_vector(free_vec)
constrain_grad(free_vec)
constrain_hess(free_vec).shape

(7, 6, 6)

In [47]:

# (row, column, value) triplets of the sparse Jacobian.
jac_rows = []
jac_cols = []
grads = []
free_cols = range(s.free_shape()[1])
vec_cols = range(s.shape()[1])
for row in range(s.shape()[0]):
    # Each row of the output depends only on the same row of the input, so
    # the Jacobian is filled in one row-block at a time.
    free_inds = np.ravel_multi_index([[row], free_cols], s.free_shape())
    vec_inds = np.ravel_multi_index([[row], vec_cols], s.shape())
    row_jac = constrain_grad(free_val[free_inds])
    for vec_col in vec_cols:
        # One triplet per free column, all sharing the same output index.
        jac_rows.extend(vec_inds[vec_col] for _ in free_cols)
        jac_cols.extend(free_inds[free_col] for free_col in free_cols)
        grads.extend(row_jac[vec_col, free_col] for free_col in free_cols)

jac_sparse = csr_matrix((grads, (jac_rows, jac_cols)), (s.vector_size(), s.free_size()))
print(np.max(np.abs(jac_sparse - target_jac)))


0.0


In [62]:
free_cols = range(s.free_shape()[1])
vec_cols = range(s.shape()[1])
# One sparse (free_size x free_size) Hessian per vector element, in order.
hesses = []
hess_shape = (s.free_size(), s.free_size())

for row in range(s.shape()[0]):
    # Each row of the output depends only on the same row of the input.
    free_inds = np.ravel_multi_index([[row], free_cols], s.free_shape())
    vec_inds = np.ravel_multi_index([[row], vec_cols], s.shape())
    row_hess = constrain_hess(free_val[free_inds])
    # Every ordered pair of free columns within this row.
    col_pairs = [ (col1, col2) for col1 in free_cols for col2 in free_cols ]
    for vec_col in vec_cols:
        hess_rows = [ free_inds[col1] for col1, _ in col_pairs ]
        hess_cols = [ free_inds[col2] for _, col2 in col_pairs ]
        hess_vals = [ row_hess[vec_col, col1, col2] for col1, col2 in col_pairs ]
        hesses.append(csr_matrix((hess_vals, (hess_rows, hess_cols)), hess_shape))

print(np.max(np.abs([ hesses[ind].toarray() - target_hess[ind] for ind in range(s.vector_size()) ])))

[array([[-0.03473296,  0.03607573,  0.        ,  0.        ],
       [ 0.03607573, -0.02944033,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ]]), array([[ 0.06552671, -0.03079374,  0.        ,  0.        ],
       [-0.03079374, -0.00528199,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ]]), array([[-0.03079374, -0.00528199,  0.        ,  0.        ],
       [-0.00528199,  0.03472232,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ]]), array([[ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.00331646,  0.03308621],
       [ 0.        ,  0.        ,  0.03308621, -0.00730627]]), array([[ 0.    