In [113]:
import autograd.numpy as np
import autograd
from autograd.core import primitive
import copy

from collections import OrderedDict

from VariationalBayes import Parameters
from VariationalBayes.Parameters import \
    ScalarParam, VectorParam, ArrayParam, \
    PosDefMatrixParam, PosDefMatrixParamVector
from VariationalBayes.ParameterDictionary import ModelParamsDict
import scipy as sp
from scipy.sparse import csr_matrix


In [7]:
# Side length of the square test matrices.
k = 2

# 0.2 in every entry plus the identity: 1.2 on the diagonal, 0.2 elsewhere.
mat = np.full((k, k), 0.2) + np.eye(k)
vp_mat1 = PosDefMatrixParam('mat1', k, val=mat)
vp_mat2 = PosDefMatrixParam('mat2', k, val=mat * 2.)

# Register both matrices, in this order, in one parameter dictionary.
mp = ModelParamsDict()
for _vp_mat in (vp_mat1, vp_mat2):
    mp.push_param(_vp_mat)

print(mp)

# Vector returned by get_free(); later cells slice it per parameter using
# mp.free_indices_dict.
free_vec = mp.get_free()


ModelParamsDict:
	mat1:
[[ 1.2  0.2]
 [ 0.2  1.2]]
	mat2:
[[ 2.4  0.4]
 [ 0.4  2.4]]


In [44]:
def get_param(mp, free_vec, par_name):
    """Load one parameter from the full free vector and return its value.

    The entries of ``free_vec`` selected by ``mp.free_indices_dict[par_name]``
    are passed to the parameter's ``set_free``; the result of its ``get`` is
    returned.  Note that this updates the parameter stored in ``mp``.
    """
    param = mp[par_name]
    free_inds = mp.free_indices_dict[par_name]
    param.set_free(free_vec[free_inds])
    return param.get()

# Round trip: each parameter recovered from its slice of the full free vector.
for _par_name in ('mat1', 'mat2'):
    print(get_param(mp, free_vec, _par_name))

# Jacobian with respect to the *full* free vector (argument index 1).
get_param_jac = autograd.jacobian(get_param, argnum=1)

for _par_name in ('mat1', 'mat2'):
    print('--------')
    print(get_param_jac(mp, free_vec, _par_name))



[[ 1.2  0.2]
 [ 0.2  1.2]]
[[ 2.4  0.4]
 [ 0.4  2.4]]
--------
[[[ 2.4         0.          0.          0.          0.          0.        ]
  [ 0.2         1.09544512  0.          0.          0.          0.        ]]

 [[ 0.2         1.09544512  0.          0.          0.          0.        ]
  [ 0.          0.36514837  2.33333333  0.          0.          0.        ]]]
--------
[[[ 0.          0.          0.          4.8         0.          0.        ]
  [ 0.          0.          0.          0.4         1.54919334  0.        ]]

 [[ 0.          0.          0.          0.4         1.54919334  0.        ]
  [ 0.          0.          0.          0.          0.51639778  4.66666667]]]


In [61]:
@primitive
def get_param_sparse(mp, free_vec, par_name):
    """Return the same value as ``get_param``, declared as an autograd primitive.

    The derivative with respect to ``free_vec`` is not obtained by tracing this
    body; it is registered separately on this function with ``defvjp``.
    """
    param_value = get_param(mp, free_vec, par_name)
    return param_value

def get_free_vec(mp, free_vec, par_name):
    """Return the entries of ``free_vec`` that belong to parameter ``par_name``.

    The positions are looked up in ``mp.free_indices_dict``.
    """
    par_inds = mp.free_indices_dict[par_name]
    return free_vec[par_inds]

def set_free_and_get(free_vec_par, par):
    """Set ``par`` from its own free sub-vector and return ``par.get()``.

    Unlike ``get_param`` this takes only the slice belonging to ``par``, so a
    Jacobian of this function has one column per entry of that slice.
    """
    par.set_free(free_vec_par)
    constrained_value = par.get()
    return constrained_value

# Each parameter's own slice of the full free vector.
mat1_sub_vec = get_free_vec(mp, free_vec, 'mat1')
mat2_sub_vec = get_free_vec(mp, free_vec, 'mat2')
_sub_vecs = (('mat1', mat1_sub_vec), ('mat2', mat2_sub_vec))

for _par_name, _sub_vec in _sub_vecs:
    print(set_free_and_get(_sub_vec, mp[_par_name]))

# One Jacobian function per parameter, each taken with respect to that
# parameter's own sub-vector only (so no columns for the other parameter).
jac_dict = OrderedDict([
    ('mat1', autograd.jacobian(
        lambda free_sub_vec: set_free_and_get(free_sub_vec, mp['mat1']))),
    ('mat2', autograd.jacobian(
        lambda free_sub_vec: set_free_and_get(free_sub_vec, mp['mat2']))),
])

_rule = "-----------------"
for _par_name, _sub_vec in _sub_vecs:
    print(_rule)
    print(jac_dict[_par_name](_sub_vec))
print(_rule)


[[ 1.2  0.2]
 [ 0.2  1.2]]
[[ 2.4  0.4]
 [ 0.4  2.4]]
-----------------
[[[ 2.4         0.          0.        ]
  [ 0.2         1.09544512  0.        ]]

 [[ 0.2         1.09544512  0.        ]
  [ 0.          0.36514837  2.33333333]]]
-----------------
[[[ 4.8         0.          0.        ]
  [ 0.4         1.54919334  0.        ]]

 [[ 0.4         1.54919334  0.        ]
  [ 0.          0.51639778  4.66666667]]]
-----------------


In [98]:
# Scratch check of the contraction used for the vjp: foo plays the role of a
# (2, 2, 3) Jacobian and bar the (2, 2) upstream gradient.
foo = np.random.random((2, 2, 3))
bar = np.random.random((2, 2))

# Broadcast bar along foo's trailing axis once and reuse the product.
weighted = foo * np.expand_dims(bar, axis=2)

print(weighted.shape)
# Summing over the two leading axes ...
print(np.sum(weighted, axis=(0, 1)))
# ... is not the same as summing over the middle axis only ...
print(np.sum(weighted, axis=-2))
# ... but does match an explicit slice-by-slice weighted sum.
print(np.sum([foo[:, :, idx] * bar for idx in range(3)], axis=(1, 2)))

(2, 2, 3)
[ 0.35932676  0.70286063  0.7597611 ]
[[ 0.30010286  0.60179243  0.53072853]
 [ 0.0592239   0.1010682   0.22903258]]
[ 0.35932676  0.70286063  0.7597611 ]


In [133]:
def get_param_sparse_vjp(g, ans, vs, gvs, mp, free_vec, par_name):
    """Vector-Jacobian product of ``get_param_sparse`` w.r.t. ``free_vec``.

    Only the entries of ``free_vec`` that belong to ``par_name`` are read when
    the parameter is set, so the parameter's own Jacobian (taken from the
    module-level ``jac_dict``) is contracted with ``g`` and the result is
    written into an otherwise-zero vector shaped like ``free_vec``.

    Args:
        g: Upstream gradient; expected to have the shape of the parameter
            value returned by ``get_param_sparse``.
        ans, vs, gvs: Part of the vjp call signature; not used here.
        mp: Parameter dictionary providing ``free_indices_dict``.
        free_vec: Full free vector at which the primitive was evaluated.
        par_name: Key of the parameter in both ``mp`` and ``jac_dict``.

    Returns:
        Dense array with the same shape as ``free_vec``.
    """
    inds = mp.free_indices_dict[par_name]
    # Jacobian w.r.t. this parameter's sub-vector only; its leading axes are
    # the parameter's axes and its last axis indexes the sub-vector.
    jac = jac_dict[par_name](free_vec[inds])
    # Contract g against all leading axes of jac.  Using the rank of g rather
    # than hard-coding two axes keeps this valid for scalar, vector and
    # higher-rank parameters as well as for matrices.
    par_jac = np.tensordot(g, jac, axes=np.ndim(g))
    full_jac = np.zeros(free_vec.shape)
    full_jac[inds] = par_jac

    # Returning a scipy sparse matrix instead of the dense vector doesn't work:
    #return csr_matrix((np.full(6, 0), (np.array(range(6)), np.full(6, 0))), (6, 1))
    return full_jac
    
# Register the hand-written vjp for the free_vec argument of the primitive.
get_param_sparse.defvjp(get_param_sparse_vjp, argnum=1)

# Use the qualified name: only the `autograd` module itself is imported, so a
# bare `jacobian` raises NameError on a fresh kernel.
get_param_sparse_jac = autograd.jacobian(get_param_sparse, argnum=1)

# Both differences should be identically zero if the custom vjp agrees with
# the Jacobian autograd derives by tracing get_param.
print(get_param_sparse_jac(mp, free_vec, 'mat2') - get_param_jac(mp, free_vec, 'mat2'))
print(get_param_sparse_jac(mp, free_vec, 'mat1') - get_param_jac(mp, free_vec, 'mat1'))



[[[ 0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.]]

 [[ 0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.]]]
[[[ 0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.]]

 [[ 0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.]]]


In [141]:
# A 5 x 1 sparse column with non-zeros in rows 0, 2 and 4.
data = [1., 3., 4.]
rows = [0, 2, 4]
cols = [0] * len(data)

full_jac_sparse = csr_matrix((data, (rows, cols)), shape=(5, 1))
print(full_jac_sparse.toarray())


# Are duplicates summed?  Yes: row 2 appears twice below and ends up
# holding 3 + 4 = 7.
data = [1., 3., 4.]
rows = [0, 2, 2]
cols = [0] * len(data)

full_jac_sparse = csr_matrix((data, (rows, cols)), shape=(5, 1))
full_jac_sparse.toarray()



[[ 1.]
 [ 0.]
 [ 3.]
 [ 0.]
 [ 4.]]


array([[ 1.],
       [ 0.],
       [ 7.],
       [ 0.],
       [ 0.]])

In [140]:

# The axes for the Hessian are appended after the parameter's own axes:
# two trailing axes, one per derivative with respect to free_vec.
get_param_hess = autograd.hessian(get_param, argnum=1)
mat1_hess = get_param_hess(mp, free_vec, 'mat1')
print(np.shape(mat1_hess))

# The same holds for Jacobians, which get a single trailing axis.
mat1_jac = get_param_jac(mp, free_vec, 'mat1')
print(np.shape(mat1_jac))


(2, 2, 6, 6)
(2, 2, 6)


I think we need two steps.  Let $L$ be the objective and $f$ be the constraining function, so that 

$$
\theta = f(z) \\
L(\theta) = L(f(z))
$$

We need

$$
\frac{dL}{dz^T} = \frac{dL}{d\theta^T} \frac{d\theta}{dz^T} = \frac{dL}{d\theta^T} \frac{df}{dz^T}
$$

and, using Einstein summation notation,

$$
\frac{d^2 L}{dz_i dz_j} =
    \frac{d^2 L}{d\theta_a d\theta_b} \frac{d\theta_a}{dz_i} \frac{d\theta_b}{dz_j} +
    \frac{d L}{d\theta_a} \frac{d^2 \theta_a}{dz_i dz_j}
$$

The term $\frac{d^2 L}{d\theta_a d\theta_b}$ can be expressed using a combination of our `get_vector()` functions and a sparse matrix for the local variables.  $\frac{d\theta_a}{dz_i}$ can also be represented as a sparse matrix.  It may be best to store the term $\frac{d^2 \theta_a}{dz_i dz_j}$ in `(value, a, i, j)` format, and write a custom aggregator that returns a sparse matrix when it is multiplied by $\frac{d L}{d\theta_a}$, since a general-purpose three-dimensional sparse array may not be efficient [(discussion)](https://stackoverflow.com/questions/29871669/python-multi-dimensional-sparse-array).
