
[MRG] Adding max_fun parameter to MLP for use in lbfgs optimization #9274

Merged
merged 28 commits on Jul 2, 2019
Changes from 23 commits
Commits
28 commits
71e6d30
Setting both maxiter and maxfun in call to lbfgs.
daniel-perry Jul 3, 2017
f46d716
Merge branch 'master' of github.com:scikit-learn/scikit-learn into ML…
daniel-perry Sep 7, 2017
646a7fd
[MRG] adding max_fun parameter to MLP
daniel-perry Sep 7, 2017
9e30730
better comments, increasing default max_fun to 15000
daniel-perry Sep 8, 2017
71daad8
better comments, fixing some PEP8 warnings
daniel-perry Oct 18, 2017
27a212a
Merge branch 'master' of github.com:scikit-learn/scikit-learn into ML…
daniel-perry Oct 18, 2017
63dadf1
Merge branch 'master' into MLP-maxiters
agramfort Feb 25, 2019
97268c1
test exception
agramfort Feb 25, 2019
3b721b8
update what's new
agramfort Feb 25, 2019
77e9074
doctests
agramfort Feb 25, 2019
0182849
Merge branch 'master' into MLP-maxiters
agramfort Feb 26, 2019
06d1312
lint
agramfort Feb 26, 2019
8a27683
clarifying comments about function call
daniel-perry May 12, 2019
7864227
fixing PEP8 errors
daniel-perry May 12, 2019
508a42d
Merge branch 'master' into MLP-maxiters
daniel-perry May 14, 2019
60e5b07
moving datasets to @pytest.mark.parameterize
daniel-perry May 15, 2019
13b1e11
Merge branch 'master' into MLP-maxiters
daniel-perry May 15, 2019
e056c9c
fixing PEP8 errors.
daniel-perry May 15, 2019
8084f40
Update sklearn/neural_network/multilayer_perceptron.py
daniel-perry Jun 27, 2019
772a828
Update sklearn/neural_network/multilayer_perceptron.py
daniel-perry Jun 27, 2019
211dbb1
Update sklearn/neural_network/multilayer_perceptron.py
daniel-perry Jun 27, 2019
f65e291
moving feature announcement to v0.22.rst
daniel-perry Jun 27, 2019
b60859a
Merge branch 'master' into MLP-maxiters
daniel-perry Jun 27, 2019
28e73c9
addressing review comments.
daniel-perry Jun 28, 2019
2af8196
Merge branch 'master' into MLP-maxiters
daniel-perry Jun 28, 2019
e11ca94
adding convergence warnings
daniel-perry Jun 28, 2019
11b9fab
Merge branch 'master' into MLP-maxiters
rth Jul 2, 2019
72e38d1
Lint
rth Jul 2, 2019
14 changes: 14 additions & 0 deletions doc/whats_new/v0.22.rst
@@ -19,6 +19,8 @@ random sampling procedures.

- :class:`decomposition.SparsePCA` where `normalize_components` has no effect
due to deprecation.
..
TO FILL IN AS WE GO

Details are listed in the changelog below.

@@ -122,6 +124,18 @@ Changelog
rather than variance in this case.
:pr:`13704` by `Roddy MacSween <rlms>`.


:mod:`sklearn.neural_network`
.............................

- |Feature| Add `max_fun` parameter in
:class:`neural_network.BaseMultilayerPerceptron`,
:class:`neural_network.MLPRegressor`, and
:class:`neural_network.MLPClassifier` to give control over the maximum
number of loss function calls when training with the ``lbfgs`` solver.
:issue:`9274` by :user:`Daniel Perry <daniel-perry>`.
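
As an editorial aside (not part of the diff), a minimal sketch of how the new
parameter reads from the public API; the dataset and values below are
illustrative only:

    from sklearn.datasets import load_iris
    from sklearn.neural_network import MLPClassifier

    X, y = load_iris(return_X_y=True)
    # max_fun caps the number of loss function calls; it only has an
    # effect when solver='lbfgs'.
    clf = MLPClassifier(solver='lbfgs', max_iter=150, max_fun=15000,
                        random_state=1)
    clf.fit(X, y)
    print(clf.n_iter_)  # outer lbfgs iterations actually performed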


Miscellaneous
.............

38 changes: 29 additions & 9 deletions sklearn/neural_network/multilayer_perceptron.py
@@ -51,7 +51,7 @@ def __init__(self, hidden_layer_sizes, activation, solver,
max_iter, loss, shuffle, random_state, tol, verbose,
warm_start, momentum, nesterovs_momentum, early_stopping,
validation_fraction, beta_1, beta_2, epsilon,
n_iter_no_change):
n_iter_no_change, max_fun):
self.activation = activation
self.solver = solver
self.alpha = alpha
@@ -75,6 +75,7 @@ def __init__(self, hidden_layer_sizes, activation, solver,
self.beta_2 = beta_2
self.epsilon = epsilon
self.n_iter_no_change = n_iter_no_change
self.max_fun = max_fun

def _unpack(self, packed_parameters):
"""Extract the coefficients and intercepts from packed_parameters."""
@@ -172,7 +173,6 @@ def _loss_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas,
self._unpack(packed_coef_inter)
loss, coef_grads, intercept_grads = self._backprop(
X, y, activations, deltas, coef_grads, intercept_grads)
self.n_iter_ += 1
grad = _pack(coef_grads, intercept_grads)
return loss, grad

@@ -381,6 +381,8 @@ def _validate_hyperparameters(self):
self.shuffle)
if self.max_iter <= 0:
raise ValueError("max_iter must be > 0, got %s." % self.max_iter)
if self.max_fun <= 0:
raise ValueError("max_fun must be > 0, got %s." % self.max_fun)
if self.alpha < 0.0:
raise ValueError("alpha must be >= 0, got %s." % self.alpha)
if (self.learning_rate in ["constant", "invscaling", "adaptive"] and
@@ -459,10 +461,12 @@ def _fit_lbfgs(self, X, y, activations, deltas, coef_grads,
optimal_parameters, self.loss_, d = fmin_l_bfgs_b(
x0=packed_coef_inter,
func=self._loss_grad_lbfgs,
maxfun=self.max_iter,
maxfun=self.max_fun,
maxiter=self.max_iter,
iprint=iprint,
pgtol=self.tol,
args=(X, y, activations, deltas, coef_grads, intercept_grads))
self.n_iter_ = d['nit']

self._unpack(optimal_parameters)
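
For context on the maxfun/maxiter split above, a small standalone sketch
(editorial, not part of the diff) of how scipy's fmin_l_bfgs_b treats the two
limits separately and reports both counters in its info dict; max_iter and
max_fun now map onto maxiter and maxfun respectively, and n_iter_ is read
from d['nit']:

    import numpy as np
    from scipy.optimize import fmin_l_bfgs_b

    def rosen_and_grad(w):
        # Rosenbrock loss and gradient, standing in for the packed MLP loss.
        loss = 100.0 * (w[1] - w[0] ** 2) ** 2 + (1.0 - w[0]) ** 2
        grad = np.array([
            -400.0 * w[0] * (w[1] - w[0] ** 2) - 2.0 * (1.0 - w[0]),
            200.0 * (w[1] - w[0] ** 2),
        ])
        return loss, grad

    x_opt, loss_opt, info = fmin_l_bfgs_b(
        func=rosen_and_grad,
        x0=np.array([-1.2, 1.0]),
        maxfun=15,    # like max_fun: caps loss/gradient evaluations
        maxiter=150,  # like max_iter: caps outer iterations
        pgtol=1e-5)
    # Line searches may evaluate the loss several times per iteration,
    # so info['funcalls'] >= info['nit'].
    print(info['nit'], info['funcalls'], info['warnflag'])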

Expand Down Expand Up @@ -833,6 +837,15 @@ class MLPClassifier(BaseMultilayerPerceptron, ClassifierMixin):

.. versionadded:: 0.20

max_fun : int, optional, default 15000
Only used when solver='lbfgs'. Maximum number of loss function calls.
The solver iterates until convergence (determined by 'tol'), until the
number of iterations reaches max_iter, or until this number of loss
function calls is reached. Note that the number of loss function calls
will be greater than or equal to the number of iterations for the
`MLPClassifier`.

.. versionadded:: 0.22
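
As an editorial illustration of the parameter described above (and assuming
the ConvergenceWarning behaviour added later in this PR), a deliberately
small max_fun exhausts the loss-call budget before 'tol'-level improvement,
so training stops early and warns:

    import warnings
    from sklearn.datasets import load_digits
    from sklearn.exceptions import ConvergenceWarning
    from sklearn.neural_network import MLPClassifier

    X, y = load_digits(return_X_y=True)
    clf = MLPClassifier(solver='lbfgs', max_iter=150, max_fun=10,
                        random_state=1)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        clf.fit(X, y)
    # Each lbfgs iteration makes at least one loss call, so the iteration
    # count cannot exceed the loss-call budget.
    assert clf.n_iter_ <= 10
    print(any(issubclass(w.category, ConvergenceWarning) for w in caught))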

Attributes
----------
classes_ : array or list of array of shape (n_classes,)
@@ -898,8 +911,7 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu",
verbose=False, warm_start=False, momentum=0.9,
nesterovs_momentum=True, early_stopping=False,
validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
epsilon=1e-8, n_iter_no_change=10):

epsilon=1e-8, n_iter_no_change=10, max_fun=15000):
super().__init__(
hidden_layer_sizes=hidden_layer_sizes,
activation=activation, solver=solver, alpha=alpha,
@@ -912,7 +924,7 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu",
early_stopping=early_stopping,
validation_fraction=validation_fraction,
beta_1=beta_1, beta_2=beta_2, epsilon=epsilon,
n_iter_no_change=n_iter_no_change)
n_iter_no_change=n_iter_no_change, max_fun=max_fun)

def _validate_input(self, X, y, incremental):
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
@@ -1216,6 +1228,15 @@ class MLPRegressor(BaseMultilayerPerceptron, RegressorMixin):

.. versionadded:: 0.20

max_fun : int, optional, default 15000
Only used when solver='lbfgs'. Maximum number of function calls.
The solver iterates until convergence (determined by 'tol'), until the
number of iterations reaches max_iter, or until this number of function
calls is reached. Note that the number of function calls will be greater
than or equal to the number of iterations for the MLPRegressor.

.. versionadded:: 0.22
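
A matching regression-side sketch (editorial, synthetic data), mirroring the
note above that iterations can never exceed loss function calls:

    from sklearn.datasets import make_regression
    from sklearn.neural_network import MLPRegressor

    X, y = make_regression(n_samples=200, n_features=10, random_state=0)
    reg = MLPRegressor(solver='lbfgs', max_iter=150, max_fun=20,
                       random_state=1)
    reg.fit(X, y)
    # With such a small max_fun, the loss-call budget, not max_iter,
    # ends training, and n_iter_ stays at or below that budget.
    print(reg.n_iter_ <= 20)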

Attributes
----------
loss_ : float
@@ -1279,8 +1300,7 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu",
verbose=False, warm_start=False, momentum=0.9,
nesterovs_momentum=True, early_stopping=False,
validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
epsilon=1e-8, n_iter_no_change=10):

epsilon=1e-8, n_iter_no_change=10, max_fun=15000):
super().__init__(
hidden_layer_sizes=hidden_layer_sizes,
activation=activation, solver=solver, alpha=alpha,
@@ -1293,7 +1313,7 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu",
early_stopping=early_stopping,
validation_fraction=validation_fraction,
beta_1=beta_1, beta_2=beta_2, epsilon=epsilon,
n_iter_no_change=n_iter_no_change)
n_iter_no_change=n_iter_no_change, max_fun=max_fun)

def predict(self, X):
"""Predict using the multi-layer perceptron model.
72 changes: 52 additions & 20 deletions sklearn/neural_network/tests/test_mlp.py
@@ -49,6 +49,8 @@
Xboston = StandardScaler().fit_transform(boston.data)[: 200]
yboston = boston.target[:200]

regression_datasets = [(Xboston, yboston)]

iris = load_iris()

X_iris = iris.data
@@ -229,32 +231,31 @@ def loss_grad_fun(t):
assert_almost_equal(numgrad, grad)


def test_lbfgs_classification():
@pytest.mark.parametrize('X,y', classification_datasets)
def test_lbfgs_classification(X, y):
# Test lbfgs on classification.
# It should achieve a score higher than 0.95 for the binary and multi-class
# versions of the digits dataset.
for X, y in classification_datasets:
X_train = X[:150]
y_train = y[:150]
X_test = X[150:]

expected_shape_dtype = (X_test.shape[0], y_train.dtype.kind)

for activation in ACTIVATION_TYPES:
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50,
max_iter=150, shuffle=True, random_state=1,
activation=activation)
mlp.fit(X_train, y_train)
y_predict = mlp.predict(X_test)
assert_greater(mlp.score(X_train, y_train), 0.95)
assert_equal((y_predict.shape[0], y_predict.dtype.kind),
expected_shape_dtype)
X_train = X[:150]
y_train = y[:150]
X_test = X[150:]

expected_shape_dtype = (X_test.shape[0], y_train.dtype.kind)

def test_lbfgs_regression():
for activation in ACTIVATION_TYPES:
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50,
max_iter=150, shuffle=True, random_state=1,
activation=activation)
mlp.fit(X_train, y_train)
y_predict = mlp.predict(X_test)
assert_greater(mlp.score(X_train, y_train), 0.95)
assert_equal((y_predict.shape[0], y_predict.dtype.kind),
expected_shape_dtype)


@pytest.mark.parametrize('X,y', regression_datasets)
def test_lbfgs_regression(X, y):
# Test lbfgs on the boston dataset, a regression problem.
X = Xboston
y = yboston
for activation in ACTIVATION_TYPES:
mlp = MLPRegressor(solver='lbfgs', hidden_layer_sizes=50,
max_iter=150, shuffle=True, random_state=1,
@@ -267,6 +268,37 @@ def test_lbfgs_regression():
assert_greater(mlp.score(X, y), 0.95)


@pytest.mark.parametrize('X,y', classification_datasets)
def test_lbfgs_classification_maxfun(X, y):
# Test lbfgs parameter max_fun.
# It should independently limit the number of iterations for lbfgs.
max_fun = 10
# classification tests
for activation in ACTIVATION_TYPES:
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50,
max_iter=150, max_fun=max_fun, shuffle=True,
random_state=1, activation=activation)
mlp.fit(X, y)
assert max_fun >= mlp.n_iter_


@pytest.mark.parametrize('X,y', regression_datasets)
def test_lbfgs_regression_maxfun(X, y):
# Test lbfgs parameter max_fun.
# It should independently limit the number of iterations for lbfgs.
max_fun = 10
# regression tests
for activation in ACTIVATION_TYPES:
mlp = MLPRegressor(solver='lbfgs', hidden_layer_sizes=50,
max_iter=150, max_fun=max_fun, shuffle=True,
random_state=1, activation=activation)
mlp.fit(X, y)
assert max_fun >= mlp.n_iter_

mlp.max_fun = -1
assert_raises(ValueError, mlp.fit, X, y)


def test_learning_rate_warmstart():
# Tests that warm_start reuse past solutions.
X = [[3, 2], [1, 6], [5, 6], [-2, -4]]