diff --git a/pymc3/model.py b/pymc3/model.py
index 92eaa4604e..37d1508e03 100644
--- a/pymc3/model.py
+++ b/pymc3/model.py
@@ -329,7 +329,7 @@ class ValueGradFunction(object):
         The value that we compute with its gradient.
     grad_vars : list of named theano variables or None
         The arguments with respect to which the gradient is computed.
-    extra_args : list of named theano variables or None
+    extra_vars : list of named theano variables or None
         Other arguments of the function that are assumed constant. They
         are stored in shared variables and can be set using
         `set_extra_values`.
diff --git a/pymc3/sampling.py b/pymc3/sampling.py
index ea739e2fa9..af145f824e 100644
--- a/pymc3/sampling.py
+++ b/pymc3/sampling.py
@@ -842,7 +842,7 @@ def init_nuts(init='auto', njobs=1, n_init=500000, model=None,
         if njobs == 1:
             start = start[0]
     elif init == 'advi_map':
-        start = pm.find_MAP()
+        start = pm.find_MAP(include_transformed=True)
         approx = pm.MeanField(model=model, start=start)
         pm.fit(
             random_seed=random_seed,
@@ -859,7 +859,7 @@ def init_nuts(init='auto', njobs=1, n_init=500000, model=None,
         if njobs == 1:
             start = start[0]
     elif init == 'map':
-        start = pm.find_MAP()
+        start = pm.find_MAP(include_transformed=True)
         cov = pm.find_hessian(point=start)
         start = [start] * njobs
         potential = quadpotential.QuadPotentialFull(cov)
diff --git a/pymc3/tests/test_examples.py b/pymc3/tests/test_examples.py
index 79d3736c79..6254733ea4 100644
--- a/pymc3/tests/test_examples.py
+++ b/pymc3/tests/test_examples.py
@@ -2,7 +2,6 @@
 import numpy as np
 import pandas as pd
 import pymc3 as pm
-import scipy.optimize as opt
 import theano.tensor as tt
 import pytest
 import theano
@@ -195,7 +194,7 @@ def build_model(self):

     def test_run(self):
         with self.build_model():
-            start = pm.find_MAP(fmin=opt.fmin_powell)
+            start = pm.find_MAP(method="Powell")
             pm.sample(50, pm.Slice(), start=start)


diff --git a/pymc3/tests/test_starting.py b/pymc3/tests/test_starting.py
index 722a16d52c..1ceea801cf 100644
--- a/pymc3/tests/test_starting.py
+++ b/pymc3/tests/test_starting.py
@@ -2,7 +2,7 @@
 import numpy as np
 from pymc3.tuning import starting
 from pymc3 import Model, Uniform, Normal, Beta, Binomial, find_MAP, Point
-from .models import simple_model, non_normal, exponential_beta, simple_arbitrary_det
+from .models import simple_model, non_normal, simple_arbitrary_det
 from .helpers import select_by_precision


@@ -20,19 +20,6 @@ def test_accuracy_non_normal():
     close_to(newstart['x'], mu, select_by_precision(float64=1e-5, float32=1E-4))


-def test_errors():
-    _, model, _ = exponential_beta(2)
-    with model:
-        try:
-            newstart = find_MAP(Point(x=[-.5, .01], y=[.5, 4.4]))
-        except ValueError as e:
-            msg = str(e)
-            assert "x.logp" in msg, msg
-            assert "x.value" not in msg, msg
-        else:
-            assert False, newstart
-
-
 def test_find_MAP_discrete():
     tol = 2.0**-11
     alpha = 4
@@ -41,8 +28,8 @@ def test_find_MAP_discrete():
     yes = 15

     with Model() as model:
-        p = Beta('p', alpha, beta, transform=None)
-        Binomial('ss', n=n, p=p, transform=None)
+        p = Beta('p', alpha, beta)
+        Binomial('ss', n=n, p=p)
         Binomial('s', n=n, p=p, observed=yes)

         map_est1 = starting.find_MAP()
@@ -68,14 +55,14 @@ def test_find_MAP():
     data = (data - np.mean(data)) / np.std(data)

     with Model():
-        mu = Uniform('mu', -1, 1, transform=None)
-        sigma = Uniform('sigma', .5, 1.5, transform=None)
+        mu = Uniform('mu', -1, 1)
+        sigma = Uniform('sigma', .5, 1.5)
         Normal('y', mu=mu, tau=sigma**-2, observed=data)

         # Test gradient minimization
         map_est1 = starting.find_MAP()
         # Test non-gradient minimization
-        map_est2 = starting.find_MAP(fmin=starting.optimize.fmin_powell)
+        map_est2 = starting.find_MAP(method="Powell")

     close_to(map_est1['mu'], 0, tol)
     close_to(map_est1['sigma'], 1, tol)
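
Note on the test changes above: callers now select the optimizer through the `method` string understood by `scipy.optimize.minimize` instead of passing an `fmin` function, and the tests no longer disable automatic transforms with `transform=None`, since the MAP is now found on the transformed parameters. A minimal usage sketch of the new call style (the toy model below is invented for illustration and is not part of this patch):

    import numpy as np
    import pymc3 as pm

    with pm.Model():
        mu = pm.Normal('mu', mu=0., sd=10.)
        pm.Normal('y', mu=mu, sd=1., observed=np.random.randn(100))

        # Gradient-based default: scipy.optimize.minimize with method="L-BFGS-B"
        map_default = pm.find_MAP()

        # Gradient-free optimizer, selected by name rather than via fmin=...
        map_powell = pm.find_MAP(method="Powell")
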
diff --git a/pymc3/tests/test_tuning.py b/pymc3/tests/test_tuning.py
index 1391e907a5..9da803d693 100644
--- a/pymc3/tests/test_tuning.py
+++ b/pymc3/tests/test_tuning.py
@@ -22,13 +22,13 @@ def test_mle_jacobian():

     start, model, _ = models.simple_normal(bounded_prior=False)
     with model:
-        map_estimate = find_MAP(model=model)
+        map_estimate = find_MAP(method="BFGS", model=model)

     rtol = 1E-5  # this rtol should work on both floatX precisions
     np.testing.assert_allclose(map_estimate["mu_i"], truth, rtol=rtol)

     start, model, _ = models.simple_normal(bounded_prior=True)
     with model:
-        map_estimate = find_MAP(model=model)
+        map_estimate = find_MAP(method="BFGS", model=model)

     np.testing.assert_allclose(map_estimate["mu_i"], truth, rtol=rtol)
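
Because `find_MAP` now delegates to `scipy.optimize.minimize` (see the rewrite of `pymc3/tuning/starting.py` below), `return_raw=True` hands back SciPy's `OptimizeResult` rather than the assorted tuple formats of the old `fmin_*` routines. A hedged sketch of calling code (toy model; it assumes the optimizer runs to completion so the raw result is not `None`):

    import numpy as np
    import pymc3 as pm

    with pm.Model():
        mu = pm.Normal('mu', mu=0., sd=1.)
        pm.Normal('y', mu=mu, sd=1., observed=np.random.randn(20))

        map_est, opt_result = pm.find_MAP(method="BFGS", return_raw=True)

    print(map_est['mu'], opt_result.fun)  # MAP value and final objective value
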
diff --git a/pymc3/tuning/starting.py b/pymc3/tuning/starting.py
index 7738c13326..43f8c8a5d8 100644
--- a/pymc3/tuning/starting.py
+++ b/pymc3/tuning/starting.py
@@ -3,45 +3,51 @@

 @author: johnsalvatier
 '''
-from scipy import optimize
+from scipy.optimize import minimize
 import numpy as np
-from numpy import isfinite, nan_to_num, logical_not
+from numpy import isfinite, nan_to_num
+from tqdm import tqdm
 import pymc3 as pm
 from ..vartypes import discrete_types, typefilter
 from ..model import modelcontext, Point
 from ..theanof import inputvars
 from ..blocking import DictToArrayBijection, ArrayOrdering
-from ..util import update_start_vals
+from ..util import update_start_vals, get_default_varnames
+import warnings
 from inspect import getargspec

 __all__ = ['find_MAP']


-def find_MAP(start=None, vars=None, fmin=None,
-             return_raw=False, model=None, callback=None,
+def find_MAP(start=None, vars=None, method="L-BFGS-B",
+             return_raw=False, include_transformed=False, progressbar=True, maxeval=5000, model=None,
              *args, **kwargs):
     """
-    Sets state to the local maximum a posteriori point given a model.
-    Current default of fmin_Hessian does not deal well with optimizing close
-    to sharp edges, especially if they are the minimum.
+    Finds the local maximum a posteriori point given a model.

     Parameters
     ----------
     start : `dict` of parameter values (Defaults to `model.test_point`)
     vars : list
         List of variables to optimize and set to optimum (Defaults to all continuous).
-    fmin : function
-        Optimization algorithm (Defaults to `scipy.optimize.fmin_bfgs` unless
+    method : string or callable
+        Optimization algorithm (Defaults to 'L-BFGS-B' unless
         discrete variables are specified in `vars`, then
-        `scipy.optimize.fmin_powell` which will perform better).
-    return_raw : Bool
-        Whether to return extra value returned by fmin (Defaults to `False`)
+        `Powell`, which will perform better). For instructions on the use of a callable,
+        refer to SciPy's documentation of `optimize.minimize`.
+    return_raw : bool
+        Whether to return the full output of scipy.optimize.minimize (Defaults to `False`)
+    include_transformed : bool
+        Flag for reporting automatically transformed variables in addition
+        to original variables (defaults to False).
+    progressbar : bool
+        Whether or not to display a progress bar in the command line.
+    maxeval : int
+        The maximum number of times the posterior distribution is evaluated.
     model : Model (optional if in `with` context)
-    callback : callable
-        Callback function to pass to scipy optimization routine.
     *args, **kwargs
-        Extra args passed to fmin
+        Extra args passed to scipy.optimize.minimize
     """
     model = modelcontext(model)
     if start is None:
@@ -59,108 +65,76 @@ def find_MAP(start=None, vars=None, fmin=None,
         vars = model.cont_vars
     vars = inputvars(vars)
     disc_vars = list(typefilter(vars, discrete_types))
+    allinmodel(vars, model)
+
+    start = Point(start, model=model)
+    bij = DictToArrayBijection(ArrayOrdering(vars), start)
+    logp_func = bij.mapf(model.fastlogp_nojac)
+    x0 = bij.map(start)

     try:
-        model.fastdlogp(vars)
-        gradient_avail = True
+        dlogp_func = bij.mapf(model.fastdlogp_nojac(vars))
+        compute_gradient = True
     except AttributeError:
-        gradient_avail = False
+        compute_gradient = False

-    if disc_vars or not gradient_avail:
+    if disc_vars or not compute_gradient:
         pm._log.warning("Warning: gradient not available." +
                         "(E.g. vars contains discrete variables). MAP " +
                         "estimates may not be accurate for the default " +
                         "parameters. Defaulting to non-gradient minimization " +
-                        "fmin_powell.")
-        fmin = optimize.fmin_powell
-
-    if fmin is None:
-        if disc_vars:
-            fmin = optimize.fmin_powell
+                        "'Powell'.")
+        method = "Powell"
+
+    if "fmin" in kwargs:
+        fmin = kwargs.pop("fmin")
+        warnings.warn('In future versions, set the optimization algorithm with a string. '
+                      'For example, use `method="L-BFGS-B"` instead of '
+                      '`fmin=sp.optimize.fmin_l_bfgs_b`.')
+
+        cost_func = CostFuncWrapper(maxeval, progressbar, logp_func)
+
+        # Check to see if minimization function actually uses the gradient
+        if 'fprime' in getargspec(fmin).args:
+            def grad_logp(point):
+                return nan_to_num(-dlogp_func(point))
+            opt_result = fmin(cost_func, bij.map(start), fprime=grad_logp, *args, **kwargs)
         else:
-            fmin = optimize.fmin_bfgs
-
-    allinmodel(vars, model)
-
-    start = Point(start, model=model)
-    bij = DictToArrayBijection(ArrayOrdering(vars), start)
-    logp_func = bij.mapf(model.fastlogp)
-    x0 = bij.map(start)
-    logp = bij.mapf(model.fastlogp_nojac)
-    def logp_o(point):
-        return nan_to_high(-logp(point))
-
-    # Check to see if minimization function actually uses the gradient
-    if 'fprime' in getargspec(fmin).args:
-        dlogp = bij.mapf(model.fastdlogp_nojac(vars))
-        def grad_logp_o(point):
-            return nan_to_num(-dlogp(point))
-
-        r = fmin(logp_o, bij.map(start), fprime=grad_logp_o, callback=callback, *args, **kwargs)
-        compute_gradient = True
-    else:
-        # Check to see if minimization function uses a starting value
-        if 'x0' in getargspec(fmin).args:
-            r = fmin(logp_o, bij.map(start), callback=callback, *args, **kwargs)
+            # Check to see if minimization function uses a starting value
+            if 'x0' in getargspec(fmin).args:
+                opt_result = fmin(cost_func, bij.map(start), *args, **kwargs)
+            else:
+                opt_result = fmin(cost_func, *args, **kwargs)
+
+        if isinstance(opt_result, tuple):
+            mx0 = opt_result[0]
         else:
-            r = fmin(logp_o, callback=callback, *args, **kwargs)
-        compute_gradient = False
-
-    if isinstance(r, tuple):
-        mx0 = r[0]
+            mx0 = opt_result
     else:
-        mx0 = r
-
-    mx = bij.rmap(mx0)
-
-    allfinite_mx0 = allfinite(mx0)
-    allfinite_logp = allfinite(model.logp(mx))
-    if compute_gradient:
-        allfinite_dlogp = allfinite(model.dlogp()(mx))
-    else:
-        allfinite_dlogp = True
-
-    if (not allfinite_mx0 or
-            not allfinite_logp or
-            not allfinite_dlogp):
-
-        messages = []
-        for var in vars:
-            vals = {
-                "value": mx[var.name],
-                "logp": var.logp(mx)}
-            if compute_gradient:
-                vals["dlogp"] = var.dlogp()(mx)
-
-            def message(name, values):
-                if np.size(values) < 10:
-                    return name + " bad: " + str(values)
-                else:
-                    idx = np.nonzero(logical_not(isfinite(values)))
-                    return name + " bad at idx: " + str(idx) + " with values: " + str(values[idx])
-
-            messages += [
-                message(var.name + "." + k, v)
-                for k, v in vals.items()
-                if not allfinite(v)]
-
-        specific_errors = '\n'.join(messages)
-        raise ValueError("Optimization error: max, logp or dlogp at " +
-                         "max have non-finite values. Some values may be " +
-                         "outside of distribution support. max: " +
-                         repr(mx) + " logp: " + repr(model.logp(mx)) +
-                         " dlogp: " + repr(model.dlogp()(mx)) + "Check that " +
-                         "1) you don't have hierarchical parameters, " +
-                         "these will lead to points with infinite " +
-                         "density. 2) your distribution logp's are " +
-                         "properly specified. Specific issues: \n" +
-                         specific_errors)
-
-    vars = model.unobserved_RVs
-    mx = {var.name: value for var, value in zip(vars, model.fastfn(vars)(mx))}
+        # remove 'if' part, keep just this 'else' block after version change
+        if compute_gradient:
+            cost_func = CostFuncWrapper(maxeval, progressbar, logp_func, dlogp_func)
+        else:
+            cost_func = CostFuncWrapper(maxeval, progressbar, logp_func)
+
+        try:
+            opt_result = minimize(cost_func, x0, method=method,
+                                  jac=compute_gradient, *args, **kwargs)
+            mx0 = opt_result["x"]  # r -> opt_result
+            cost_func.progress.total = cost_func.progress.n + 1
+            cost_func.progress.update()
+        except (KeyboardInterrupt, StopIteration) as e:
+            mx0, opt_result = cost_func.previous_x, None
+            cost_func.progress.close()
+            if isinstance(e, StopIteration):
+                pm._log.info(e)
+        finally:
+            cost_func.progress.close()
+
+    vars = get_default_varnames(model.unobserved_RVs, include_transformed)
+    mx = {var.name: value for var, value in zip(vars, model.fastfn(vars)(bij.rmap(mx0)))}

     if return_raw:
-        return mx, r
+        return mx, opt_result
     else:
         return mx
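
The `get_default_varnames` call above means transformed copies of bounded variables are only included in the returned dictionary when `include_transformed=True`, which is also why `init_nuts` in `pymc3/sampling.py` now requests them explicitly. A small sketch (the automatic transform names such as `sigma_interval__` follow PyMC3's naming convention and may differ between versions):

    import numpy as np
    import pymc3 as pm

    with pm.Model():
        sigma = pm.Uniform('sigma', 0.5, 1.5)
        pm.Normal('y', mu=0., sd=sigma, observed=np.random.randn(50))

        map_plain = pm.find_MAP()
        map_full = pm.find_MAP(include_transformed=True)

    print(sorted(map_plain.keys()))  # e.g. ['sigma']
    print(sorted(map_full.keys()))   # e.g. ['sigma', 'sigma_interval__']
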
@@ -179,13 +153,53 @@ def allinmodel(vars, model):
         raise ValueError("Some variables not in the model: " + str(notin))


-def format_values(val):
-    fmt = "{:8.3f}"
-    if val.size == 1:
-        return fmt.format(np.float(val))
-    elif val.size < 9:
-        return "[" + ", ".join([fmt.format(v) for v in val]) + "]"
-    else:
-        start = "[" + ", ".join([fmt.format(v) for v in val[:4]])
-        end = ", ".join([fmt.format(v) for v in val[-4:]]) +"]"
-        return start + ", ... , " + end
+class CostFuncWrapper(object):
+    def __init__(self, maxeval=5000, progressbar=True, logp_func=None, dlogp_func=None):
+        self.n_eval = 0
+        self.maxeval = maxeval
+        self.logp_func = logp_func
+        if dlogp_func is None:
+            self.use_gradient = False
+            self.desc = 'logp = {:,.5g}'
+        else:
+            self.dlogp_func = dlogp_func
+            self.use_gradient = True
+            self.desc = 'logp = {:,.5g}, ||grad|| = {:,.5g}'
+        self.previous_x = None
+        self.progress = tqdm(total=maxeval, disable=not progressbar)
+
+    def __call__(self, x):
+        neg_value = np.float64(self.logp_func(pm.floatX(x)))
+        value = -1.0 * nan_to_high(neg_value)
+        if self.use_gradient:
+            neg_grad = self.dlogp_func(pm.floatX(x))
+            if np.all(np.isfinite(neg_grad)):
+                self.previous_x = x
+            grad = nan_to_num(-1.0*neg_grad)
+            grad = grad.astype(np.float64)
+        else:
+            self.previous_x = x
+            grad = None
+
+        if self.n_eval % 10 == 0:
+            self.update_progress_desc(neg_value, grad)
+
+        if self.n_eval > self.maxeval:
+            self.update_progress_desc(neg_value, grad)
+            self.progress.close()
+            raise StopIteration
+
+        self.n_eval += 1
+        self.progress.update(1)
+
+        if self.use_gradient:
+            return value, grad
+        else:
+            return value
+
+    def update_progress_desc(self, neg_value, grad=None):
+        if grad is None:
+            self.progress.set_description(self.desc.format(neg_value))
+        else:
+            norm_grad = np.linalg.norm(grad)
+            self.progress.set_description(self.desc.format(neg_value, norm_grad))
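
The `CostFuncWrapper` added above is what lets a single callable serve `scipy.optimize.minimize` as both objective and gradient (with `jac=True` the callable returns a `(value, grad)` pair) while driving a `tqdm` progress bar and enforcing `maxeval`. A stripped-down sketch of the same pattern, independent of PyMC3 (the quadratic objective and class name are invented purely for illustration):

    import numpy as np
    from scipy.optimize import minimize
    from tqdm import tqdm


    class CountingObjective(object):
        """Toy analogue of CostFuncWrapper: returns (value, grad) and drives a progress bar."""

        def __init__(self, maxeval=100):
            self.maxeval = maxeval
            self.n_eval = 0
            self.previous_x = None
            self.progress = tqdm(total=maxeval)

        def __call__(self, x):
            if self.n_eval > self.maxeval:
                self.progress.close()
                raise StopIteration
            self.n_eval += 1
            self.progress.update(1)
            self.previous_x = x
            value = np.sum((x - 3.0) ** 2)  # invented quadratic objective
            grad = 2.0 * (x - 3.0)          # its analytic gradient
            return value, grad              # jac=True: scipy expects (value, grad)


    cost = CountingObjective()
    try:
        opt = minimize(cost, x0=np.zeros(2), method="L-BFGS-B", jac=True)
        best_x = opt.x
    except (KeyboardInterrupt, StopIteration):
        best_x = cost.previous_x  # fall back to the last point seen, as find_MAP does
    finally:
        cost.progress.close()

    print(best_x)  # approximately [3., 3.]
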