import logging
import warnings
import collections
import numpy as np
import tqdm
import pymc3 as pm
from pymc3.variational import test_functions
from pymc3.variational.approximations import (
MeanField, FullRank, Empirical, NormalizingFlow
)
from pymc3.variational.operators import KL, KSD
from . import opvi
logger = logging.getLogger(__name__)
__all__ = [
'ADVI',
'FullRankADVI',
'SVGD',
'ASVGD',
'NFVI',
'Inference',
'ImplicitGradient',
'KLqp',
'fit'
]
State = collections.namedtuple('State', 'i,step,callbacks,score')
class Inference:
R"""**Base class for Variational Inference**
Communicates the Operator, Approximation, and Test Function to build the Objective Function
Parameters
----------
op : Operator class
approx : Approximation class or instance
tf : TestFunction instance
model : Model
PyMC3 Model
kwargs : kwargs passed to :class:`Operator`
"""
def __init__(self, op, approx, tf, **kwargs):
self.hist = np.asarray(())
self.objective = op(approx, **kwargs)(tf)
self.state = None
approx = property(lambda self: self.objective.approx)
def _maybe_score(self, score):
returns_loss = self.objective.op.returns_loss
if score is None:
score = returns_loss
elif score and not returns_loss:
warnings.warn('method `fit` got `score == True` but %s '
'does not return loss. Ignoring `score` argument'
% self.objective.op)
score = False
else:
pass
return score
def run_profiling(self, n=1000, score=None, **kwargs):
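"""Profile the compiled step function using theano's profiler.
A usage sketch (assumes `inference` is any :class:`Inference` instance;
the returned object is theano's `ProfileStats`):
>>> profile = inference.run_profiling(n=100)
>>> profile.summary()
"""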
score = self._maybe_score(score)
fn_kwargs = kwargs.pop('fn_kwargs', dict())
fn_kwargs['profile'] = True
step_func = self.objective.step_function(
score=score, fn_kwargs=fn_kwargs,
**kwargs
)
progress = tqdm.trange(n)
try:
for _ in progress:
step_func()
except KeyboardInterrupt:
pass
finally:
progress.close()
return step_func.profile
def fit(self, n=10000, score=None, callbacks=None, progressbar=True,
**kwargs):
"""Perform Operator Variational Inference
Parameters
----------
n : int
number of iterations
score : bool
evaluate loss on each iteration or not
callbacks : list[function : (Approximation, losses, i) -> None]
calls provided functions after each iteration step
progressbar : bool
whether to show progressbar or not
Other Parameters
----------------
obj_n_mc : `int`
Number of monte carlo samples used for approximation of objective gradients
tf_n_mc : `int`
Number of monte carlo samples used for approximation of test function gradients
obj_optimizer : function (grads, params) -> updates
Optimizer that is used for objective params
test_optimizer : function (grads, params) -> updates
Optimizer that is used for test function params
more_obj_params : `list`
Add custom params for objective optimizer
more_tf_params : `list`
Add custom params for test function optimizer
more_updates : `dict`
Add custom updates to resulting updates
total_grad_norm_constraint : `float`
Bounds gradient norm, prevents exploding gradient problem
fn_kwargs : `dict`
Add kwargs to theano.function (e.g. `{'profile': True}`)
more_replacements : `dict`
Apply custom replacements before calculating gradients
Returns
-------
:class:`Approximation`
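Examples
--------
A minimal sketch (the model is illustrative; the optimizer and callback
settings below are arbitrary choices, not recommendations):
>>> with pm.Model():
...     pm.Normal('x', 0., 1.)
...     inference = pm.ADVI()
...     approx = inference.fit(
...         n=10000,
...         obj_optimizer=pm.adagrad_window(learning_rate=0.01),
...         callbacks=[pm.callbacks.CheckParametersConvergence(diff='absolute')],
...     )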
"""
if callbacks is None:
callbacks = []
score = self._maybe_score(score)
step_func = self.objective.step_function(score=score, **kwargs)
with tqdm.trange(n, disable=not progressbar) as progress:
if score:
state = self._iterate_with_loss(0, n, step_func, progress, callbacks)
else:
state = self._iterate_without_loss(0, n, step_func, progress, callbacks)
# hack to allow pm.fit() access to loss hist
self.approx.hist = self.hist
self.state = state
return self.approx
def _iterate_without_loss(self, s, _, step_func, progress, callbacks):
i = 0
try:
for i in progress:
step_func()
current_param = self.approx.params[0].get_value()
if np.isnan(current_param).any():
name_slc = []
tmp_hold = list(range(current_param.size))
vmap = self.approx.groups[0].bij.ordering.vmap
for vmap_ in vmap:
slclen = len(tmp_hold[vmap_.slc])
for j in range(slclen):  # do not shadow the outer loop counter `i`
name_slc.append((vmap_.var, j))
index = np.where(np.isnan(current_param))[0]
errmsg = ['NaN occurred in optimization. ']
suggest_solution = 'Try tracking this parameter: ' \
'http://docs.pymc.io/notebooks/variational_api_quickstart.html#Tracking-parameters'
try:
for ii in index:
errmsg.append('The current approximation of RV `{}`.ravel()[{}]'
' is NaN.'.format(*name_slc[ii]))
errmsg.append(suggest_solution)
except IndexError:
pass
raise FloatingPointError('\n'.join(errmsg))
for callback in callbacks:
callback(self.approx, None, i+s+1)
except (KeyboardInterrupt, StopIteration) as e:
progress.close()
if isinstance(e, StopIteration):
logger.info(str(e))
finally:
progress.close()
return State(i+s, step=step_func,
callbacks=callbacks,
score=False)
def _iterate_with_loss(self, s, n, step_func, progress, callbacks):
def _infmean(input_array):
"""Return the mean of the finite values of the array"""
input_array = input_array[np.isfinite(input_array)].astype('float64')
if len(input_array) == 0:
return np.nan
else:
return np.mean(input_array)
scores = np.empty(n)
scores[:] = np.nan
i = 0
try:
for i in progress:
e = step_func()
if np.isnan(e): # pragma: no cover
scores = scores[:i]
self.hist = np.concatenate([self.hist, scores])
current_param = self.approx.params[0].get_value()
name_slc = []
tmp_hold = list(range(current_param.size))
vmap = self.approx.groups[0].bij.ordering.vmap
for vmap_ in vmap:
slclen = len(tmp_hold[vmap_.slc])
for j in range(slclen):  # do not shadow the outer loop counter `i`
name_slc.append((vmap_.var, j))
index = np.where(np.isnan(current_param))[0]
errmsg = ['NaN occurred in optimization. ']
suggest_solution = 'Try tracking this parameter: ' \
'http://docs.pymc.io/notebooks/variational_api_quickstart.html#Tracking-parameters'
try:
for ii in index:
errmsg.append('The current approximation of RV `{}`.ravel()[{}]'
' is NaN.'.format(*name_slc[ii]))
errmsg.append(suggest_solution)
except IndexError:
pass
raise FloatingPointError('\n'.join(errmsg))
scores[i] = e
if i % 10 == 0:
# use the inf-safe running mean and refresh the description every 10th step
avg_loss = _infmean(scores[max(0, i - 1000):i + 1])
progress.set_description('Average Loss = {:,.5g}'.format(avg_loss))
for callback in callbacks:
callback(self.approx, scores[:i + 1], i+s+1)
except (KeyboardInterrupt, StopIteration) as e: # pragma: no cover
# do not print log on the same line
progress.close()
scores = scores[:i]
if isinstance(e, StopIteration):
logger.info(str(e))
if n < 10:
# `scores` was truncated to `scores[:i]` above, so index from the end
logger.info('Interrupted at {:,d} [{:.0f}%]: Loss = {:,.5g}'.format(
i, 100 * i // n, scores[-1] if len(scores) else np.nan))
else:
avg_loss = _infmean(scores[max(0, i - 1000):i + 1])
logger.info('Interrupted at {:,d} [{:.0f}%]: Average Loss = {:,.5g}'.format(
i, 100 * i // n, avg_loss))
else:
if n < 10:
logger.info(
'Finished [100%]: Loss = {:,.5g}'.format(scores[-1]))
else:
avg_loss = _infmean(scores[max(0, i - 1000):i + 1])
logger.info(
'Finished [100%]: Average Loss = {:,.5g}'.format(avg_loss))
finally:
progress.close()
self.hist = np.concatenate([self.hist, scores])
return State(i+s, step=step_func,
callbacks=callbacks,
score=True)
def refine(self, n, progressbar=True):
"""Refine the solution using the last compiled step function
"""
if self.state is None:
raise TypeError('Need to call `.fit` first')
i, step, callbacks, score = self.state
with tqdm.trange(n, disable=not progressbar) as progress:
if score:
state = self._iterate_with_loss(i, n, step, progress, callbacks)
else:
state = self._iterate_without_loss(i, n, step, progress, callbacks)
self.state = state
class KLqp(Inference):
"""**Kullback Leibler Divergence Inference**
General approach to fit Approximations that define :math:`logq`
by maximizing ELBO (Evidence Lower Bound). In some cases
rescaling the regularization term KL may be beneficial
.. math::
ELBO_\beta = \log p(D|\theta) - \beta KL(q||p)
Parameters
----------
approx : :class:`Approximation`
Approximation to fit; it is required to define :math:`\log q`
beta : float
Scales the regularization term in ELBO (see Christopher P. Burgess et al., 2017)
References
----------
- Christopher P. Burgess et al. (NIPS, 2017)
Understanding disentangling in :math:`\beta`-VAE
arXiv preprint arXiv:1804.03599
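Examples
--------
A sketch of rescaling the KL term (the model is illustrative;
`beta < 1` down-weights the regularizer):
>>> with pm.Model():
...     pm.Normal('x', 0., 1.)
...     approx = pm.KLqp(pm.MeanField(), beta=0.5).fit(n=10000)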
"""
def __init__(self, approx, beta=1.):
super().__init__(KL, approx, None, beta=beta)
class ADVI(KLqp):
R"""**Automatic Differentiation Variational Inference (ADVI)**
This class implements mean-field ADVI, in which the variational
posterior distribution is assumed to be a spherical Gaussian without
correlation between parameters and is fit to the true posterior distribution.
The means and standard deviations of the variational posterior are referred
to as variational parameters.
For explanation, we classify random variables in probabilistic models into
three types. Observed random variables
:math:`{\cal Y}=\{\mathbf{y}_{i}\}_{i=1}^{N}` are :math:`N` observations.
Each :math:`\mathbf{y}_{i}` can be a set of observed random variables,
i.e., :math:`\mathbf{y}_{i}=\{\mathbf{y}_{i}^{k}\}_{k=1}^{V_{o}}`, where
:math:`V_{o}` is the number of types of observed random variables
in the model.
The next ones are global random variables
:math:`\Theta=\{\theta^{k}\}_{k=1}^{V_{g}}`, which are used to calculate
the probabilities for all observed samples.
The last ones are local random variables
:math:`{\cal Z}=\{\mathbf{z}_{i}\}_{i=1}^{N}`, where
:math:`\mathbf{z}_{i}=\{\mathbf{z}_{i}^{k}\}_{k=1}^{V_{l}}`.
These RVs are used only in AEVB.
The goal of ADVI is to approximate the posterior distribution
:math:`p(\Theta,{\cal Z}|{\cal Y})` by variational posterior
:math:`q(\Theta)\prod_{i=1}^{N}q(\mathbf{z}_{i})`. All of these terms
are normal distributions (mean-field approximation).
:math:`q(\Theta)` is parametrized with its means and standard deviations.
These parameters are denoted as :math:`\gamma`. While :math:`\gamma` is
a constant, the parameters of :math:`q(\mathbf{z}_{i})` are dependent on
each observation. Therefore these parameters are denoted as
:math:`\xi(\mathbf{y}_{i}; \nu)`, where :math:`\nu` is the parameters
of :math:`\xi(\cdot)`. For example, :math:`\xi(\cdot)` can be a
multilayer perceptron or convolutional neural network.
In addition to :math:`\xi(\cdot)`, we can also include deterministic
mappings for the likelihood of observations. We denote the parameters of
the deterministic mappings as :math:`\eta`. An example of such mappings is
the deconvolutional neural network used in the convolutional VAE example
in the PyMC3 notebook directory.
This inference maximizes the evidence lower bound (ELBO)
:math:`{\cal L}(\gamma, \nu, \eta)` defined as follows:
.. math::
{\cal L}(\gamma,\nu,\eta) & =
\mathbf{c}_{o}\mathbb{E}_{q(\Theta)}\left[
\sum_{i=1}^{N}\mathbb{E}_{q(\mathbf{z}_{i})}\left[
\log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta)
\right]\right] \\ &
- \mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\right]
- \mathbf{c}_{l}\sum_{i=1}^{N}
KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\right],
where :math:`KL[q(v)||p(v)]` is the Kullback-Leibler divergence
.. math::
KL[q(v)||p(v)] = \int q(v)\log\frac{q(v)}{p(v)}dv,
:math:`\mathbf{c}_{o/g/l}` are vectors for weighting each term of ELBO.
More precisely, we can write each of the terms in ELBO as follows:
.. math::
\mathbf{c}_{o}\log p(\mathbf{y}_{i}|\mathbf{z}_{i},\Theta,\eta) & = &
\sum_{k=1}^{V_{o}}c_{o}^{k}
\log p(\mathbf{y}_{i}^{k}|
{\rm pa}(\mathbf{y}_{i}^{k},\Theta,\eta)) \\
\mathbf{c}_{g}KL\left[q(\Theta)||p(\Theta)\right] & = &
\sum_{k=1}^{V_{g}}c_{g}^{k}KL\left[
q(\theta^{k})||p(\theta^{k}|{\rm pa(\theta^{k})})\right] \\
\mathbf{c}_{l}KL\left[q(\mathbf{z}_{i})||p(\mathbf{z}_{i})\right] & = &
\sum_{k=1}^{V_{l}}c_{l}^{k}KL\left[
q(\mathbf{z}_{i}^{k})||
p(\mathbf{z}_{i}^{k}|{\rm pa}(\mathbf{z}_{i}^{k}))\right],
where :math:`{\rm pa}(v)` denotes the set of parent variables of :math:`v`
in the directed acyclic graph of the model.
When using mini-batches, :math:`c_{o}^{k}` and :math:`c_{l}^{k}` should be
set to :math:`N/M`, where :math:`M` is the number of observations in each
mini-batch. This is done with supplying `total_size` parameter to
observed nodes (e.g. :code:`Normal('x', 0, 1, observed=data, total_size=10000)`).
In this case it is possible to automatically determine the appropriate scaling for :math:`\log p`
of observed nodes. It is interesting to note that it is possible to have two independent
observed variables with different `total_size` values and to iterate over them independently
during inference.
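For instance (a sketch; `data` stands for an illustrative array of
10,000 rows and is not defined here):
>>> batch = pm.Minibatch(data, batch_size=100)
>>> with pm.Model():
...     mu = pm.Normal('mu', 0., 1.)
...     pm.Normal('x', mu, 1., observed=batch, total_size=10000)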
For working with ADVI, we need to give
- The probabilistic model
`model` with three types of RVs (`observed_RVs`,
`global_RVs` and `local_RVs`).
- (optional) Minibatches
The tensors to which mini-batched samples are supplied are
handled separately: either by callbacks in the :func:`Inference.fit` method
that change the storage of a shared theano variable, or by
:func:`pymc3.generator`, which automatically iterates over minibatches
and is defined beforehand.
- (optional) Parameters of deterministic mappings
They have to be passed along with other params to the :func:`Inference.fit`
method as the `more_obj_params` argument.
For more information concerning the training stage, please refer to
:func:`pymc3.variational.opvi.ObjectiveFunction.step_function`
Parameters
----------
local_rv : dict[var->tuple]
mapping {model_variable -> approx params}
Local Vars are used for Autoencoding Variational Bayes
See (AEVB; Kingma and Welling, 2014) for details
model : :class:`pymc3.Model`
PyMC3 model for inference
random_seed : None or int
leave None to use package global RandomStream or other
valid value to create instance specific one
start : `Point`
starting point for inference
References
----------
- Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A.,
and Blei, D. M. (2016). Automatic Differentiation Variational
Inference. arXiv preprint arXiv:1603.00788.
- Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016
Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI
approximateinference.org/accepted/RoederEtAl2016.pdf
- Kingma, D. P., & Welling, M. (2014).
Auto-Encoding Variational Bayes. stat, 1050, 1.
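Examples
--------
A minimal sketch (`y_obs` stands for an illustrative 1-d data array):
>>> with pm.Model():
...     mu = pm.Normal('mu', 0., 10.)
...     pm.Normal('y', mu, 1., observed=y_obs)
...     approx = pm.ADVI().fit(n=10000)
>>> trace = approx.sample(500)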
"""
def __init__(self, *args, **kwargs):
super().__init__(MeanField(*args, **kwargs))
class FullRankADVI(KLqp):
R"""**Full Rank Automatic Differentiation Variational Inference (ADVI)**
Parameters
----------
local_rv : dict[var->tuple]
mapping {model_variable -> approx params}
Local Vars are used for Autoencoding Variational Bayes
See (AEVB; Kingma and Welling, 2014) for details
model : :class:`pymc3.Model`
PyMC3 model for inference
random_seed : None or int
leave None to use package global RandomStream or other
valid value to create instance specific one
start : `Point`
starting point for inference
References
----------
- Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A.,
and Blei, D. M. (2016). Automatic Differentiation Variational
Inference. arXiv preprint arXiv:1603.00788.
- Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016
Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI
approximateinference.org/accepted/RoederEtAl2016.pdf
- Kingma, D. P., & Welling, M. (2014).
Auto-Encoding Variational Bayes. stat, 1050, 1.
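Examples
--------
A minimal sketch; unlike mean-field ADVI, the full-rank Gaussian can
capture posterior correlations between parameters:
>>> with pm.Model():
...     pm.Normal('x', 0., 1., shape=2)
...     approx = pm.FullRankADVI().fit(n=10000)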
"""
def __init__(self, *args, **kwargs):
super().__init__(FullRank(*args, **kwargs))
class ImplicitGradient(Inference):
"""**Implicit Gradient for Variational Inference**
**not suggested to use**
An approach to fit an arbitrary approximation by computing a kernel-based gradient.
By default the RBF kernel is used for gradient estimation. The default estimator is
Kernelized Stein Discrepancy with temperature equal to 1. This temperature works
only for a large number of samples; a larger temperature is needed for a small number of
samples, but there is no theoretical approach to choosing the best one in such a case.
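A usage sketch (assumes a model context; `temperature` is forwarded to the
:class:`KSD` estimator via `kwargs`):
>>> with pm.Model():
...     pm.Normal('x', 0., 1.)
...     approx = pm.Empirical(size=100)
...     inference = pm.ImplicitGradient(approx, temperature=1.)
...     inference.fit(n=1000)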
"""
def __init__(self, approx, estimator=KSD, kernel=test_functions.rbf, **kwargs):
super().__init__(op=estimator, approx=approx, tf=kernel, **kwargs)
class SVGD(ImplicitGradient):
R"""**Stein Variational Gradient Descent**
This inference method is based on the Kernelized Stein Discrepancy;
its main idea is to move a set of initial noisy particles so that
they best fit the target distribution.
Algorithm is outlined below
*Input:* A target distribution with density function :math:`p(x)`
and a set of initial particles :math:`\{x^0_i\}^n_{i=1}`
*Output:* A set of particles :math:`\{x^{*}_i\}^n_{i=1}` that approximates the target distribution.
.. math::
x_i^{l+1} &\leftarrow x_i^{l} + \epsilon_l \hat{\phi}^{*}(x_i^l) \\
\hat{\phi}^{*}(x) &= \frac{1}{n}\sum^{n}_{j=1}[k(x^l_j,x) \nabla_{x^l_j} \log p(x^l_j)+ \nabla_{x^l_j} k(x^l_j,x)]
Parameters
----------
n_particles : `int`
number of particles to use for approximation
jitter : `float`
noise sd for initial point
model : :class:`pymc3.Model`
PyMC3 model for inference
kernel : `callable`
kernel function for KSD :math:`f(histogram) -> (k(x,.), \nabla_x k(x,.))`
temperature : float
parameter responsible for exploration; a higher temperature gives a broader posterior estimate
start : `dict`
initial point for inference
random_seed : None or int
leave None to use package global RandomStream or other
valid value to create instance specific one
kwargs : other keyword arguments passed to estimator
References
----------
- Qiang Liu, Dilin Wang (2016)
Stein Variational Gradient Descent: A General Purpose Bayesian Inference Algorithm
arXiv:1608.04471
- Yang Liu, Prajit Ramachandran, Qiang Liu, Jian Peng (2017)
Stein Variational Policy Gradient
arXiv:1704.02399
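Examples
--------
A minimal sketch (particle count, jitter, and iteration count are
arbitrary choices):
>>> with pm.Model():
...     pm.Normal('x', 0., 1.)
...     approx = pm.SVGD(n_particles=200, jitter=1.).fit(n=300)
>>> trace = approx.sample(200)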
"""
def __init__(self, n_particles=100, jitter=1, model=None, start=None,
random_seed=None, estimator=KSD, kernel=test_functions.rbf, **kwargs):
if kwargs.get('local_rv') is not None:
raise opvi.AEVBInferenceError('SVGD does not support local groups')
empirical = Empirical(
size=n_particles, jitter=jitter,
start=start, model=model, random_seed=random_seed)
super().__init__(approx=empirical, estimator=estimator, kernel=kernel, **kwargs)
class ASVGD(ImplicitGradient):
R"""**Amortized Stein Variational Gradient Descent**
**not suggested to use**
This inference method is based on the Kernelized Stein Discrepancy;
its main idea is to move samples from a parametric generator so that
they best fit the target distribution, training the generator via the
particle updates (see the formulas below).
Algorithm is outlined below
*Input:* Parametrized random generator :math:`R_{\theta}`
*Output:* :math:`R_{\theta^{*}}` that approximates the target distribution.
.. math::
\Delta x_i &= \hat{\phi}^{*}(x_i) \\
\hat{\phi}^{*}(x) &= \frac{1}{n}\sum^{n}_{j=1}[k(x_j,x) \nabla_{x_j} \log p(x_j)+ \nabla_{x_j} k(x_j,x)] \\
\Delta_{\theta} &= \frac{1}{n}\sum^{n}_{i=1}\Delta x_i\frac{\partial x_i}{\partial \theta}
Parameters
----------
approx : :class:`Approximation`
default is :class:`FullRank` but can be any
kernel : `callable`
kernel function for KSD :math:`f(histogram) -> (k(x,.), \nabla_x k(x,.))`
model : :class:`Model`
kwargs : kwargs for gradient estimator
References
----------
- Dilin Wang, Yihao Feng, Qiang Liu (2016)
Learning to Sample Using Stein Discrepancy
http://bayesiandeeplearning.org/papers/BDL_21.pdf
- Dilin Wang, Qiang Liu (2016)
Learning to Draw Samples: With Application to Amortized MLE for Generative Adversarial Learning
arXiv:1611.01722
- Yang Liu, Prajit Ramachandran, Qiang Liu, Jian Peng (2017)
Stein Variational Policy Gradient
arXiv:1704.02399
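Examples
--------
A minimal sketch (triggers the experimental-operator warning emitted in
`__init__` below):
>>> with pm.Model():
...     pm.Normal('x', 0., 1.)
...     approx = pm.ASVGD().fit(n=1000)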
"""
def __init__(self, approx=None, estimator=KSD, kernel=test_functions.rbf, **kwargs):
warnings.warn('You are using an experimental inference Operator. '
'It requires careful choice of temperature, default is 1. '
'The default temperature works well for low dimensional problems and '
'for significant `obj_n_mc`. Temperature > 1 gives more exploration '
'power to the algorithm, < 1 leads to undesirable results. Please take '
'this into account when looking at inference results. Posterior variance '
'is often **underestimated** when using temperature = 1.')
if approx is None:
approx = FullRank(
model=kwargs.pop('model', None),
local_rv=kwargs.pop('local_rv', None)
)
super().__init__(estimator=estimator, approx=approx, kernel=kernel, **kwargs)
def fit(self, n=10000, score=None, callbacks=None, progressbar=True,
obj_n_mc=500, **kwargs):
return super().fit(
n=n, score=score, callbacks=callbacks,
progressbar=progressbar, obj_n_mc=obj_n_mc, **kwargs)
def run_profiling(self, n=1000, score=None, obj_n_mc=500, **kwargs):
return super().run_profiling(n=n, score=score, obj_n_mc=obj_n_mc, **kwargs)
class NFVI(KLqp):
R"""**Normalizing Flow based :class:`KLqp` inference**
Normalizing flow is a series of invertible transformations applied to an initial distribution.
.. math::
z_K = f_K \circ \dots \circ f_2 \circ f_1(z_0)
In that case we can compute a tractable density for the flow.
.. math::
\ln q_K(z_K) = \ln q_0(z_0) - \sum_{k=1}^{K}\ln \left|\det\frac{\partial f_k}{\partial z_{k-1}}\right|
Every :math:`f_k` here is a parametric function with a tractable Jacobian
determinant, and we are free to choose every step. For example, here is a
simple flow, an affine transform:
.. math::
z = loc(scale(z_0)) = \mu + \sigma * z_0
Here we recover the mean-field approximation if :math:`z_0 \sim \mathcal{N}(0, 1)`
**Flow Formulas**
In PyMC3 there is a flexible way to define flows with formulas. Five are available at the moment:
- Loc (:code:`loc`): :math:`z' = z + \mu`
- Scale (:code:`scale`): :math:`z' = \sigma * z`
- Planar (:code:`planar`): :math:`z' = z + u * \tanh(w^T z + b)`
- Radial (:code:`radial`): :math:`z' = z + \beta (\alpha + (z-z_r))^{-1}(z-z_r)`
- Householder (:code:`hh`): :math:`z' = H z`
A formula can be written as a string, e.g. `'scale-loc'`, `'scale-hh*4-loc'`, `'planar*10'`.
Every step is separated with `'-'`, repeated flow is marked with `'*'` producing `'flow*repeats'`.
Parameters
----------
flow : str|AbstractFlow
formula or initialized Flow; default is `'scale-loc'`, which
is identical to MeanField
model : :class:`pymc3.Model`
PyMC3 model for inference
random_seed : None or int
leave None to use package global RandomStream or other
valid value to create instance specific one
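Examples
--------
A sketch of formula-based flow construction (the formula and iteration
count are arbitrary):
>>> with pm.Model():
...     pm.Normal('x', 0., 1., shape=3)
...     approx = pm.NFVI('planar*4-loc').fit(n=10000)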
"""
def __init__(self, *args, **kwargs):
super().__init__(NormalizingFlow(*args, **kwargs))
def fit(n=10000, local_rv=None, method='advi', model=None,
random_seed=None, start=None, inf_kwargs=None, **kwargs):
R"""Handy shortcut for using inference methods in functional way
Parameters
----------
n : `int`
number of iterations
local_rv : dict[var->tuple]
mapping {model_variable -> approx params}
Local Vars are used for Autoencoding Variational Bayes
See (AEVB; Kingma and Welling, 2014) for details
method : str or :class:`Inference`
string name (case insensitive), one of:
- 'advi' for ADVI
- 'fullrank_advi' for FullRankADVI
- 'svgd' for Stein Variational Gradient Descent
- 'asvgd' for Amortized Stein Variational Gradient Descent
- 'nfvi' for Normalizing Flow with default `scale-loc` flow
- 'nfvi=<formula>' for Normalizing Flow using formula
model : :class:`Model`
PyMC3 model for inference
random_seed : None or int
leave None to use package global RandomStream or other
valid value to create instance specific one
inf_kwargs : dict
additional kwargs passed to :class:`Inference`
start : `Point`
starting point for inference
Other Parameters
----------------
score : bool
evaluate loss on each iteration or not
callbacks : list[function : (Approximation, losses, i) -> None]
calls provided functions after each iteration step
progressbar : bool
whether to show progressbar or not
obj_n_mc : `int`
Number of monte carlo samples used for approximation of objective gradients
tf_n_mc : `int`
Number of monte carlo samples used for approximation of test function gradients
obj_optimizer : function (grads, params) -> updates
Optimizer that is used for objective params
test_optimizer : function (grads, params) -> updates
Optimizer that is used for test function params
more_obj_params : `list`
Add custom params for objective optimizer
more_tf_params : `list`
Add custom params for test function optimizer
more_updates : `dict`
Add custom updates to resulting updates
total_grad_norm_constraint : `float`
Bounds gradient norm, prevents exploding gradient problem
fn_kwargs : `dict`
Add kwargs to theano.function (e.g. `{'profile': True}`)
more_replacements : `dict`
Apply custom replacements before calculating gradients
Returns
-------
:class:`Approximation`
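Examples
--------
A minimal sketch of the functional interface:
>>> with pm.Model():
...     pm.Normal('x', 0., 1.)
...     approx = pm.fit(n=10000, method='advi')
...     approx_nf = pm.fit(n=3000, method='nfvi=scale-hh*2-loc')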
"""
if inf_kwargs is None:
inf_kwargs = dict()
else:
inf_kwargs = inf_kwargs.copy()
if local_rv is not None:
inf_kwargs['local_rv'] = local_rv
if random_seed is not None:
inf_kwargs['random_seed'] = random_seed
if start is not None:
inf_kwargs['start'] = start
if model is None:
model = pm.modelcontext(model)
_select = dict(
advi=ADVI,
fullrank_advi=FullRankADVI,
svgd=SVGD,
asvgd=ASVGD,
nfvi=NFVI
)
if isinstance(method, str):
method = method.lower()
if method.startswith('nfvi='):
formula = method[5:]
inference = NFVI(
formula,
**inf_kwargs
)
elif method in _select:
inference = _select[method](
model=model,
**inf_kwargs
)
else:
raise KeyError('method should be one of %s '
'or Inference instance' %
set(_select.keys()))
elif isinstance(method, Inference):
inference = method
else:
raise TypeError('method should be one of %s '
'or Inference instance' %
set(_select.keys()))
return inference.fit(n, **kwargs)