In [1]:
import paragami

import autograd
from autograd import numpy as np

# Use the original scipy for functions we don't need to differentiate.
import scipy as osp

For illustration, let's consider a simple example: a Gaussian maximum likelihood estimator.

$$
x_n \overset{iid}\sim \mathcal{N}(\mu, \Sigma)\textrm{, for }n=1,...,N.
$$

Let $X = (x_1, ..., x_N)$.  We will minimize the loss

$$
\ell(X, \mu, \Sigma) = \frac{1}{2}\sum_{n=1}^N \left((x_n - \mu)^T \Sigma^{-1} (x_n - \mu) + \log |\Sigma| \right).
$$



In [8]:
np.random.seed(42)

num_obs = 1000

# True values of parameters
true_sigma = \
    np.eye(3) * np.diag(np.array([1, 2, 3])) + \
    np.random.random((3, 3)) * 0.1
true_sigma = 0.5 * (true_sigma + true_sigma.T)

true_mu = np.array([0, 1, 2])

# Data
def draw_data(num_obs, true_sigma, true_mu):
    return np.random.multivariate_normal(
        mean=true_mu, cov=true_sigma, size=(num_obs, ))

x = draw_data(num_obs, true_sigma, true_mu)
print('X shape: {}'.format(x.shape))

# It will be useful later to have a function that evalutes the loss at each datapoint.
# The loss is the negative log likelihood.
def get_observation_loss(x, sigma, mu):
    print('mu: ', mu)
    sigma_inv = np.linalg.inv(sigma)
    sigma_det_sign, sigma_log_det = np.linalg.slogdet(sigma)
    if sigma_det_sign <= 0:
        return np.full(float('inf'), x.shape[0])
    else:
        x_centered = x - np.expand_dims(mu, axis=0)
        return 0.5 * (
            np.einsum('ni,ij,nj->n', x_centered, sigma_inv, x_centered) + \
            sigma_log_det)

def get_loss(x, norm_param_dict):
    print('dict mu: ', norm_param_dict['mu'])
    return np.sum(
        get_observation_loss(
            x, norm_param_dict['sigma'], norm_param_dict['mu']))

true_norm_param_dict = dict()
true_norm_param_dict['sigma'] = true_sigma
true_norm_param_dict['mu'] = true_mu

print('Loss at true parameter: {}'.format(get_loss(x, true_norm_param_dict)))

X shape: (1000, 3)
dict mu:  [0 1 2]
mu:  [0 1 2]
Loss at true parameter: 2392.751922600241


Note that we have written our loss as a function of a *dictionary of parameters*.  We can use `paragami` to convert such a dictionary to and from a flat, unconstrained parameterization for optimization and sensitivity analysis.

In [3]:
# We make a pattern that matches the input to `get_loss`.
norm_param_pattern = paragami.PatternDict()
norm_param_pattern['sigma'] = paragami.PSDMatrixPattern(size=3)
norm_param_pattern['mu'] = paragami.NumericArrayPattern(shape=(3, ))

norm_param_freeflat = norm_param_pattern.flatten(true_norm_param_dict, free=True)

print('Here is the pattern definition: {}'.format(norm_param_pattern))
print('The flat parameter has shape: {}'.format(norm_param_freeflat.shape))
print('The flat parameter has value: {}'.format(norm_param_freeflat))

Here is the pattern definition: OrderedDict:
	[sigma] = PDMatrix 3x3 (diag_lb = 0.0)
	[mu] = Array (3,) (lb=-inf, ub=inf)
The flat parameter has shape: (9,)
The flat parameter has value: [0.01838482 0.0760574  0.34902187 0.03878424 0.03397008 0.55879116
 0.         1.         2.        ]


In [13]:
def pack(free_vec):
    res_dict = norm_param_pattern.fold(free_vec, free=True)
    return np.sum(get_observation_loss(x, true_sigma, res_dict['mu']))

pack(init_param)
gradtest = autograd.jacobian(pack)
gradtest(init_param)


mu:  [1 5 1]
mu:  Autograd ArrayBox with value [1 5 1]


array([   0,    0,    0,    0,    0,    0,  874, 1943, -323])

We can use this flat parameter to optimize the likelihood direclty without worrying about the PSD constraint on $\Sigma$.

In [14]:
print('First, wrap the loss to be a function of the flat parameter.')
get_freeflat_loss = paragami.FlattenedFunction(
    original_fun=get_loss,
    patterns=norm_param_pattern,
    free=True,
    argnums=1)
print('The resulting function: {}'.format(get_freeflat_loss))

print('\nNow, use the flattened function to optimize with autograd.\n')

get_freeflat_loss_grad = autograd.grad(get_freeflat_loss, argnum=1)
get_freeflat_loss_hessian = autograd.hessian(get_freeflat_loss, argnum=1)

def get_optimum(x, init_param):
    loss_opt = osp.optimize.minimize(
        method='trust-ncg',
        x0=init_param,
        fun=lambda par: get_freeflat_loss(x, par),
        jac=lambda par: get_freeflat_loss_grad(x, par),
        hess=lambda par: get_freeflat_loss_hessian(x, par),
        options={'gtol': 1e-8, 'disp': True})
    return loss_opt

# Initialize with zeros as long as the flat length of the normal parameters.
init_param = np.full(norm_param_pattern.flat_length(free=True), 1)
mle_opt = get_optimum(x, init_param)

First, wrap the loss to be a function of the flat parameter.
The resulting function: Function: <function get_loss at 0x7f5722113b70>
argnums: [1]
free: [ True]
patterns: [<paragami.pattern_containers.PatternDict object at 0x7f575c573550>]

Now, use the flattened function to optimize with autograd.

dict mu:  Autograd ArrayBox with value [1. 1. 1.]
mu:  Autograd ArrayBox with value [1. 1. 1.]
dict mu:  Autograd ArrayBox with value Autograd ArrayBox with value [1. 1. 1.]
mu:  Autograd ArrayBox with value Autograd ArrayBox with value [1. 1. 1.]
dict mu:  [1. 1. 1.]
mu:  [1. 1. 1.]
dict mu:  [1. 1. 1.]
mu:  [1. 1. 1.]
dict mu:  Autograd ArrayBox with value [1. 1. 1.]
mu:  Autograd ArrayBox with value [1. 1. 1.]
dict mu:  Autograd ArrayBox with value Autograd ArrayBox with value [1. 1. 1.]
mu:  Autograd ArrayBox with value Autograd ArrayBox with value [1. 1. 1.]
dict mu:  [1. 1. 1.]
mu:  [1. 1. 1.]
dict mu:  Autograd ArrayBox with value [1. 1. 1.]
mu:  Autograd ArrayBox with value [1. 1. 1.

In [5]:
#print(mle_opt)
init_param[7]= 5
get_freeflat_loss(x, init_param)
print(np.mean(x, axis=0))
print(get_freeflat_loss_grad(x, init_param))

print(norm_param_pattern.fold(init_param, free=True))

[1 5 1]
[-0.04469438  1.03094019  1.85511868]
Autograd ArrayBox with value [1 5 1]
[  787  -224 -1496   144   489  -325     0     0     0]
OrderedDict([('sigma', array([[7.3890561 , 2.71828183, 2.71828183],
       [2.71828183, 8.3890561 , 3.71828183],
       [2.71828183, 3.71828183, 9.3890561 ]])), ('mu', array([1, 5, 1]))])


We can now "fold" the optimum back into its original shape.

In [6]:
norm_param_opt = norm_param_pattern.fold(mle_opt.x, free=True)

for param in ['sigma', 'mu']:
    print('Parmeter {}\nOptimal:\n{}\n\nTrue:\n{}\n\n'.format(
        param, norm_param_opt[param], norm_param_dict[param]))

NameError: name 'norm_param_dict' is not defined