diff --git a/pymc3/model.py b/pymc3/model.py
index 92eaa4604e..37d1508e03 100644
--- a/pymc3/model.py
+++ b/pymc3/model.py
@@ -329,7 +329,7 @@ class ValueGradFunction(object):
         The value that we compute with its gradient.
     grad_vars : list of named theano variables or None
         The arguments with respect to which the gradient is computed.
-    extra_args : list of named theano variables or None
+    extra_vars : list of named theano variables or None
         Other arguments of the function that are assumed constant. They
         are stored in shared variables and can be set using
         `set_extra_values`.
diff --git a/pymc3/sampling.py b/pymc3/sampling.py
index ea739e2fa9..af145f824e 100644
--- a/pymc3/sampling.py
+++ b/pymc3/sampling.py
@@ -842,7 +842,7 @@ def init_nuts(init='auto', njobs=1, n_init=500000, model=None,
         if njobs == 1:
             start = start[0]
     elif init == 'advi_map':
-        start = pm.find_MAP()
+        start = pm.find_MAP(include_transformed=True)
         approx = pm.MeanField(model=model, start=start)
         pm.fit(
             random_seed=random_seed,
@@ -859,7 +859,7 @@ def init_nuts(init='auto', njobs=1, n_init=500000, model=None,
         if njobs == 1:
             start = start[0]
     elif init == 'map':
-        start = pm.find_MAP()
+        start = pm.find_MAP(include_transformed=True)
         cov = pm.find_hessian(point=start)
         start = [start] * njobs
         potential = quadpotential.QuadPotentialFull(cov)
diff --git a/pymc3/tests/test_examples.py b/pymc3/tests/test_examples.py
index 79d3736c79..6254733ea4 100644
--- a/pymc3/tests/test_examples.py
+++ b/pymc3/tests/test_examples.py
@@ -2,7 +2,6 @@
 import numpy as np
 import pandas as pd
 import pymc3 as pm
-import scipy.optimize as opt
 import theano.tensor as tt
 import pytest
 import theano
@@ -195,7 +194,7 @@ def build_model(self):

     def test_run(self):
         with self.build_model():
-            start = pm.find_MAP(fmin=opt.fmin_powell)
+            start = pm.find_MAP(method="Powell")
             pm.sample(50, pm.Slice(), start=start)


diff --git a/pymc3/tests/test_starting.py b/pymc3/tests/test_starting.py
index 722a16d52c..1ceea801cf 100644
--- a/pymc3/tests/test_starting.py
+++ b/pymc3/tests/test_starting.py
@@ -2,7 +2,7 @@
 import numpy as np
 from pymc3.tuning import starting
 from pymc3 import Model, Uniform, Normal, Beta, Binomial, find_MAP, Point
-from .models import simple_model, non_normal, exponential_beta, simple_arbitrary_det
+from .models import simple_model, non_normal, simple_arbitrary_det
 from .helpers import select_by_precision


@@ -20,19 +20,6 @@ def test_accuracy_non_normal():
     close_to(newstart['x'], mu, select_by_precision(float64=1e-5, float32=1E-4))


-def test_errors():
-    _, model, _ = exponential_beta(2)
-    with model:
-        try:
-            newstart = find_MAP(Point(x=[-.5, .01], y=[.5, 4.4]))
-        except ValueError as e:
-            msg = str(e)
-            assert "x.logp" in msg, msg
-            assert "x.value" not in msg, msg
-        else:
-            assert False, newstart
-
-
 def test_find_MAP_discrete():
     tol = 2.0**-11
     alpha = 4
@@ -41,8 +28,8 @@ def test_find_MAP_discrete():
     yes = 15

     with Model() as model:
-        p = Beta('p', alpha, beta, transform=None)
-        Binomial('ss', n=n, p=p, transform=None)
+        p = Beta('p', alpha, beta)
+        Binomial('ss', n=n, p=p)
         Binomial('s', n=n, p=p, observed=yes)

         map_est1 = starting.find_MAP()
@@ -68,14 +55,14 @@ def test_find_MAP():
     data = (data - np.mean(data)) / np.std(data)

     with Model():
-        mu = Uniform('mu', -1, 1, transform=None)
-        sigma = Uniform('sigma', .5, 1.5, transform=None)
+        mu = Uniform('mu', -1, 1)
+        sigma = Uniform('sigma', .5, 1.5)
         Normal('y', mu=mu, tau=sigma**-2, observed=data)

         # Test gradient minimization
         map_est1 = starting.find_MAP()
         # Test non-gradient minimization
-        map_est2 = starting.find_MAP(fmin=starting.optimize.fmin_powell)
+        map_est2 = starting.find_MAP(method="Powell")

     close_to(map_est1['mu'], 0, tol)
     close_to(map_est1['sigma'], 1, tol)
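
Note on the test changes above: callers now select the optimizer through the `method` string understood by `scipy.optimize.minimize` instead of passing an `fmin` function, and the tests no longer disable automatic transforms with `transform=None`, since the MAP is now found on the transformed parameters. A minimal usage sketch of the new call style (the toy model below is invented for illustration and is not part of this patch):

    import numpy as np
    import pymc3 as pm

    with pm.Model():
        mu = pm.Normal('mu', mu=0., sd=10.)
        pm.Normal('y', mu=mu, sd=1., observed=np.random.randn(100))

        # Gradient-based default: scipy.optimize.minimize with method="L-BFGS-B"
        map_default = pm.find_MAP()

        # Gradient-free optimizer, selected by name rather than via fmin=...
        map_powell = pm.find_MAP(method="Powell")
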
diff --git a/pymc3/tests/test_tuning.py b/pymc3/tests/test_tuning.py
index 1391e907a5..9da803d693 100644
--- a/pymc3/tests/test_tuning.py
+++ b/pymc3/tests/test_tuning.py
@@ -22,13 +22,13 @@ def test_mle_jacobian():

     start, model, _ = models.simple_normal(bounded_prior=False)
     with model:
-        map_estimate = find_MAP(model=model)
+        map_estimate = find_MAP(method="BFGS", model=model)

     rtol = 1E-5  # this rtol should work on both floatX precisions
     np.testing.assert_allclose(map_estimate["mu_i"], truth, rtol=rtol)

     start, model, _ = models.simple_normal(bounded_prior=True)
     with model:
-        map_estimate = find_MAP(model=model)
+        map_estimate = find_MAP(method="BFGS", model=model)

     np.testing.assert_allclose(map_estimate["mu_i"], truth, rtol=rtol)
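
Because `find_MAP` now delegates to `scipy.optimize.minimize` (see the rewrite of `pymc3/tuning/starting.py` below), `return_raw=True` hands back SciPy's `OptimizeResult` rather than the assorted tuple formats of the old `fmin_*` routines. A hedged sketch of calling code (toy model; it assumes the optimizer runs to completion so the raw result is not `None`):

    import numpy as np
    import pymc3 as pm

    with pm.Model():
        mu = pm.Normal('mu', mu=0., sd=1.)
        pm.Normal('y', mu=mu, sd=1., observed=np.random.randn(20))

        map_est, opt_result = pm.find_MAP(method="BFGS", return_raw=True)

    print(map_est['mu'], opt_result.fun)  # MAP value and final objective value
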
diff --git a/pymc3/tuning/starting.py b/pymc3/tuning/starting.py
index 7738c13326..43f8c8a5d8 100644
--- a/pymc3/tuning/starting.py
+++ b/pymc3/tuning/starting.py
@@ -3,45 +3,51 @@

 @author: johnsalvatier
 '''
-from scipy import optimize
+from scipy.optimize import minimize
 import numpy as np
-from numpy import isfinite, nan_to_num, logical_not
+from numpy import isfinite, nan_to_num
+from tqdm import tqdm
 import pymc3 as pm
 from ..vartypes import discrete_types, typefilter
 from ..model import modelcontext, Point
 from ..theanof import inputvars
 from ..blocking import DictToArrayBijection, ArrayOrdering
-from ..util import update_start_vals
+from ..util import update_start_vals, get_default_varnames
+import warnings
 from inspect import getargspec

 __all__ = ['find_MAP']


-def find_MAP(start=None, vars=None, fmin=None,
-             return_raw=False, model=None, callback=None,
+def find_MAP(start=None, vars=None, method="L-BFGS-B",
+             return_raw=False, include_transformed=False, progressbar=True, maxeval=5000, model=None,
              *args, **kwargs):
     """
-    Sets state to the local maximum a posteriori point given a model.
-    Current default of fmin_Hessian does not deal well with optimizing close
-    to sharp edges, especially if they are the minimum.
+    Finds the local maximum a posteriori point given a model.

     Parameters
     ----------
     start : `dict` of parameter values (Defaults to `model.test_point`)
     vars : list
         List of variables to optimize and set to optimum (Defaults to all continuous).
-    fmin : function
-        Optimization algorithm (Defaults to `scipy.optimize.fmin_bfgs` unless
+    method : string or callable
+        Optimization algorithm (Defaults to 'L-BFGS-B' unless
         discrete variables are specified in `vars`, then
-        `scipy.optimize.fmin_powell` which will perform better).
-    return_raw : Bool
-        Whether to return extra value returned by fmin (Defaults to `False`)
+        `Powell`, which will perform better). For instructions on the use of a callable,
+        refer to SciPy's documentation of `optimize.minimize`.
+    return_raw : bool
+        Whether to return the full output of scipy.optimize.minimize (Defaults to `False`)
+    include_transformed : bool
+        Flag for reporting automatically transformed variables in addition
+        to original variables (defaults to False).
+    progressbar : bool
+        Whether or not to display a progress bar in the command line.
+    maxeval : int
+        The maximum number of times the posterior distribution is evaluated.
     model : Model (optional if in `with` context)
-    callback : callable
-        Callback function to pass to scipy optimization routine.
     *args, **kwargs
-        Extra args passed to fmin
+        Extra args passed to scipy.optimize.minimize
     """
     model = modelcontext(model)
     if start is None:
@@ -59,108 +65,76 @@ def find_MAP(start=None, vars=None, fmin=None,
         vars = model.cont_vars
     vars = inputvars(vars)
     disc_vars = list(typefilter(vars, discrete_types))
+    allinmodel(vars, model)
+
+    start = Point(start, model=model)
+    bij = DictToArrayBijection(ArrayOrdering(vars), start)
+    logp_func = bij.mapf(model.fastlogp_nojac)
+    x0 = bij.map(start)

     try:
-        model.fastdlogp(vars)
-        gradient_avail = True
+        dlogp_func = bij.mapf(model.fastdlogp_nojac(vars))
+        compute_gradient = True
     except AttributeError:
-        gradient_avail = False
+        compute_gradient = False

-    if disc_vars or not gradient_avail:
+    if disc_vars or not compute_gradient:
         pm._log.warning("Warning: gradient not available." +
                         "(E.g. vars contains discrete variables). MAP " +
                         "estimates may not be accurate for the default " +
                         "parameters. Defaulting to non-gradient minimization " +
-                        "fmin_powell.")
-        fmin = optimize.fmin_powell
-
-    if fmin is None:
-        if disc_vars:
-            fmin = optimize.fmin_powell
+                        "'Powell'.")
+        method = "Powell"
+
+    if "fmin" in kwargs:
+        fmin = kwargs.pop("fmin")
+        warnings.warn('In future versions, set the optimization algorithm with a string. '
+                      'For example, use `method="L-BFGS-B"` instead of '
+                      '`fmin=sp.optimize.fmin_l_bfgs_b`.')
+
+        cost_func = CostFuncWrapper(maxeval, progressbar, logp_func)
+
+        # Check to see if minimization function actually uses the gradient
+        if 'fprime' in getargspec(fmin).args:
+            def grad_logp(point):
+                return nan_to_num(-dlogp_func(point))
+            opt_result = fmin(cost_func, bij.map(start), fprime=grad_logp, *args, **kwargs)
         else:
-            fmin = optimize.fmin_bfgs
-
-    allinmodel(vars, model)
-
-    start = Point(start, model=model)
-    bij = DictToArrayBijection(ArrayOrdering(vars), start)
-    logp_func = bij.mapf(model.fastlogp)
-    x0 = bij.map(start)
-    logp = bij.mapf(model.fastlogp_nojac)
-    def logp_o(point):
-        return nan_to_high(-logp(point))
-
-    # Check to see if minimization function actually uses the gradient
-    if 'fprime' in getargspec(fmin).args:
-        dlogp = bij.mapf(model.fastdlogp_nojac(vars))
-        def grad_logp_o(point):
-            return nan_to_num(-dlogp(point))
-
-        r = fmin(logp_o, bij.map(start), fprime=grad_logp_o, callback=callback, *args, **kwargs)
-        compute_gradient = True
-    else:
-        # Check to see if minimization function uses a starting value
-        if 'x0' in getargspec(fmin).args:
-            r = fmin(logp_o, bij.map(start), callback=callback, *args, **kwargs)
+            # Check to see if minimization function uses a starting value
+            if 'x0' in getargspec(fmin).args:
+                opt_result = fmin(cost_func, bij.map(start), *args, **kwargs)
+            else:
+                opt_result = fmin(cost_func, *args, **kwargs)
+
+        if isinstance(opt_result, tuple):
+            mx0 = opt_result[0]
         else:
-            r = fmin(logp_o, callback=callback, *args, **kwargs)
-        compute_gradient = False
-
-    if isinstance(r, tuple):
-        mx0 = r[0]
+            mx0 = opt_result
     else:
-        mx0 = r
-
-    mx = bij.rmap(mx0)
-
-    allfinite_mx0 = allfinite(mx0)
-    allfinite_logp = allfinite(model.logp(mx))
-    if compute_gradient:
-        allfinite_dlogp = allfinite(model.dlogp()(mx))
-    else:
-        allfinite_dlogp = True
-
-    if (not allfinite_mx0 or
-            not allfinite_logp or
-            not allfinite_dlogp):
-
-        messages = []
-        for var in vars:
-            vals = {
-                "value": mx[var.name],
-                "logp": var.logp(mx)}
-            if compute_gradient:
-                vals["dlogp"] = var.dlogp()(mx)
-
-            def message(name, values):
-                if np.size(values) < 10:
-                    return name + " bad: " + str(values)
-                else:
-                    idx = np.nonzero(logical_not(isfinite(values)))
-                    return name + " bad at idx: " + str(idx) + " with values: " + str(values[idx])
-
-            messages += [
-                message(var.name + "." + k, v)
-                for k, v in vals.items()
-                if not allfinite(v)]
-
-        specific_errors = '\n'.join(messages)
-        raise ValueError("Optimization error: max, logp or dlogp at " +
-                         "max have non-finite values. Some values may be " +
-                         "outside of distribution support. max: " +
-                         repr(mx) + " logp: " + repr(model.logp(mx)) +
-                         " dlogp: " + repr(model.dlogp()(mx)) + "Check that " +
-                         "1) you don't have hierarchical parameters, " +
-                         "these will lead to points with infinite " +
-                         "density. 2) your distribution logp's are " +
-                         "properly specified. Specific issues: \n" +
-                         specific_errors)
-
-    vars = model.unobserved_RVs
-    mx = {var.name: value for var, value in zip(vars, model.fastfn(vars)(mx))}
+        # remove 'if' part, keep just this 'else' block after version change
+        if compute_gradient:
+            cost_func = CostFuncWrapper(maxeval, progressbar, logp_func, dlogp_func)
+        else:
+            cost_func = CostFuncWrapper(maxeval, progressbar, logp_func)
+
+        try:
+            opt_result = minimize(cost_func, x0, method=method,
+                                  jac=compute_gradient, *args, **kwargs)
+            mx0 = opt_result["x"]  # r -> opt_result
+            cost_func.progress.total = cost_func.progress.n + 1
+            cost_func.progress.update()
+        except (KeyboardInterrupt, StopIteration) as e:
+            mx0, opt_result = cost_func.previous_x, None
+            cost_func.progress.close()
+            if isinstance(e, StopIteration):
+                pm._log.info(e)
+        finally:
+            cost_func.progress.close()
+
+    vars = get_default_varnames(model.unobserved_RVs, include_transformed)
+    mx = {var.name: value for var, value in zip(vars, model.fastfn(vars)(bij.rmap(mx0)))}

     if return_raw:
-        return mx, r
+        return mx, opt_result
     else:
         return mx
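
The `get_default_varnames` call above means transformed copies of bounded variables are only included in the returned dictionary when `include_transformed=True`, which is also why `init_nuts` in `pymc3/sampling.py` now requests them explicitly. A small sketch (the automatic transform names such as `sigma_interval__` follow PyMC3's naming convention and may differ between versions):

    import numpy as np
    import pymc3 as pm

    with pm.Model():
        sigma = pm.Uniform('sigma', 0.5, 1.5)
        pm.Normal('y', mu=0., sd=sigma, observed=np.random.randn(50))

        map_plain = pm.find_MAP()
        map_full = pm.find_MAP(include_transformed=True)

    print(sorted(map_plain.keys()))  # e.g. ['sigma']
    print(sorted(map_full.keys()))   # e.g. ['sigma', 'sigma_interval__']
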
@@ -179,13 +153,53 @@ def allinmodel(vars, model):
         raise ValueError("Some variables not in the model: " + str(notin))


-def format_values(val):
-    fmt = "{:8.3f}"
-    if val.size == 1:
-        return fmt.format(np.float(val))
-    elif val.size < 9:
-        return "[" + ", ".join([fmt.format(v) for v in val]) + "]"
-    else:
-        start = "[" + ", ".join([fmt.format(v) for v in val[:4]])
-        end = ", ".join([fmt.format(v) for v in val[-4:]]) +"]"
-        return start + ", ... , " + end
+class CostFuncWrapper(object):
+    def __init__(self, maxeval=5000, progressbar=True, logp_func=None, dlogp_func=None):
+        self.n_eval = 0
+        self.maxeval = maxeval
+        self.logp_func = logp_func
+        if dlogp_func is None:
+            self.use_gradient = False
+            self.desc = 'logp = {:,.5g}'
+        else:
+            self.dlogp_func = dlogp_func
+            self.use_gradient = True
+            self.desc = 'logp = {:,.5g}, ||grad|| = {:,.5g}'
+        self.previous_x = None
+        self.progress = tqdm(total=maxeval, disable=not progressbar)
+
+    def __call__(self, x):
+        neg_value = np.float64(self.logp_func(pm.floatX(x)))
+        value = -1.0 * nan_to_high(neg_value)
+        if self.use_gradient:
+            neg_grad = self.dlogp_func(pm.floatX(x))
+            if np.all(np.isfinite(neg_grad)):
+                self.previous_x = x
+            grad = nan_to_num(-1.0*neg_grad)
+            grad = grad.astype(np.float64)
+        else:
+            self.previous_x = x
+            grad = None
+
+        if self.n_eval % 10 == 0:
+            self.update_progress_desc(neg_value, grad)
+
+        if self.n_eval > self.maxeval:
+            self.update_progress_desc(neg_value, grad)
+            self.progress.close()
+            raise StopIteration
+
+        self.n_eval += 1
+        self.progress.update(1)
+
+        if self.use_gradient:
+            return value, grad
+        else:
+            return value
+
+    def update_progress_desc(self, neg_value, grad=None):
+        if grad is None:
+            self.progress.set_description(self.desc.format(neg_value))
+        else:
+            norm_grad = np.linalg.norm(grad)
+            self.progress.set_description(self.desc.format(neg_value, norm_grad))
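
The `CostFuncWrapper` added above is what lets a single callable serve `scipy.optimize.minimize` as both objective and gradient (with `jac=True` the callable returns a `(value, grad)` pair) while driving a `tqdm` progress bar and enforcing `maxeval`. A stripped-down sketch of the same pattern, independent of PyMC3 (the quadratic objective and class name are invented purely for illustration):

    import numpy as np
    from scipy.optimize import minimize
    from tqdm import tqdm


    class CountingObjective(object):
        """Toy analogue of CostFuncWrapper: returns (value, grad) and drives a progress bar."""

        def __init__(self, maxeval=100):
            self.maxeval = maxeval
            self.n_eval = 0
            self.previous_x = None
            self.progress = tqdm(total=maxeval)

        def __call__(self, x):
            if self.n_eval > self.maxeval:
                self.progress.close()
                raise StopIteration
            self.n_eval += 1
            self.progress.update(1)
            self.previous_x = x
            value = np.sum((x - 3.0) ** 2)  # invented quadratic objective
            grad = 2.0 * (x - 3.0)          # its analytic gradient
            return value, grad              # jac=True: scipy expects (value, grad)


    cost = CountingObjective()
    try:
        opt = minimize(cost, x0=np.zeros(2), method="L-BFGS-B", jac=True)
        best_x = opt.x
    except (KeyboardInterrupt, StopIteration):
        best_x = cost.previous_x  # fall back to the last point seen, as find_MAP does
    finally:
        cost.progress.close()

    print(best_x)  # approximately [3., 3.]
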