scikit-learn · amueller · Oct 19, 2016 · Sep 28, 2016 · Sep 28, 2016 · Sep 28, 2016
diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
@@ -642,7 +642,8 @@ def _shuffle(y, groups, random_state):
 def learning_curve(estimator, X, y, groups=None,
                    train_sizes=np.linspace(0.1, 1.0, 5), cv=None, scoring=None,
                    exploit_incremental_learning=False, n_jobs=1,
-                   pre_dispatch="all", verbose=0):
+                   pre_dispatch="all", verbose=0, shuffle=False,
+                   random_state=None):
     """Learning curve.
 
     Determines cross-validated training and test scores for different training
@@ -718,7 +719,15 @@ def learning_curve(estimator, X, y, groups=None,
     verbose : integer, optional
         Controls the verbosity: the higher, the more messages.
 
+    shuffle : boolean, optional
+        Whether to shuffle training data before taking prefixes of it
+        based on ``train_sizes``.
+
+    random_state : None, int or RandomState
+        When shuffle=True, pseudo-random number generator state used for
+        shuffling. If None, use default numpy RNG for shuffling.
+
     Returns
     -------
     train_sizes_abs : array, shape = (n_unique_ticks,), dtype int
         Numbers of training examples that has been used to generate the
@@ -759,17 +767,26 @@ def learning_curve(estimator, X, y, groups=None,
 
     parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                         verbose=verbose)
+
+    rng = check_random_state(random_state) if shuffle else None
+
     if exploit_incremental_learning:
         classes = np.unique(y) if is_classifier(estimator) else None
         out = parallel(delayed(_incremental_fit_estimator)(
-            clone(estimator), X, y, classes, train, test, train_sizes_abs,
-            scorer, verbose) for train, test in cv.split(X, y, groups))
+            clone(estimator), X, y, classes, _get_train_indices(train, rng),
+            test, train_sizes_abs, scorer, verbose)
+            for train, test in cv_iter)
     else:
+        train_test_proportions = []
+        for train, test in cv_iter:
+            train = _get_train_indices(train, rng)
+            for n_train_samples in train_sizes_abs:
+                train_test_proportions.append((train[:n_train_samples], test))
+
         out = parallel(delayed(_fit_and_score)(
-            clone(estimator), X, y, scorer, train[:n_train_samples], test,
+            clone(estimator), X, y, scorer, train, test,
             verbose, parameters=None, fit_params=None, return_train_score=True)
-            for train, test in cv_iter
-            for n_train_samples in train_sizes_abs)
+            for train, test in train_test_proportions)
         out = np.array(out)
         n_cv_folds = out.shape[0] // n_unique_ticks
         out = out.reshape(n_cv_folds, n_unique_ticks, 2)
@@ -779,6 +796,13 @@ def learning_curve(estimator, X, y, groups=None,
     return train_sizes_abs, out[0], out[1]
 
 
+def _get_train_indices(train, rng):
+    """Shuffle training indices if random number generator is specified."""
+    if rng is not None:
+        train = rng.permutation(train)
+    return train
+
+
 def _translate_train_sizes(train_sizes, n_max_training_samples):
     """Determine absolute sizes of training subsets and validate 'train_sizes'.
 

diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py
@@ -670,6 +670,31 @@ def test_learning_curve_batch_and_incremental_learning_are_equal():
                               test_scores_batch.mean(axis=1))
 
 
+def test_learning_curve_batch_and_incremental_shuffle():
+    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
+                               n_redundant=0, n_classes=2,
+                               n_clusters_per_class=1, random_state=0)
+    estimator = MockIncrementalImprovingEstimator(20)
+    train_sizes, train_scores, test_scores = learning_curve(
+        estimator, X, y, cv=3, exploit_incremental_learning=True,
+        train_sizes=np.linspace(0.1, 1.0, 10),
+        random_state=1, shuffle=True)
+    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
+    assert_array_almost_equal(train_scores.mean(axis=1),
+                              np.linspace(1.9, 1.0, 10))
+    assert_array_almost_equal(test_scores.mean(axis=1),
+                              np.linspace(0.1, 1.0, 10))
+
+    estimator = MockImprovingEstimator(20)
+    train_sizes, train_scores, test_scores = learning_curve(
+        estimator, X, y, cv=3, train_sizes=np.linspace(0.1, 1.0, 10),
+        random_state=1, shuffle=True)
+    assert_array_almost_equal(train_scores.mean(axis=1),
+                              np.linspace(1.9, 1.0, 10))
+    assert_array_almost_equal(test_scores.mean(axis=1),
+                              np.linspace(0.1, 1.0, 10))
+
+
 def test_learning_curve_n_sample_range_out_of_bounds():
     X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                                n_redundant=0, n_classes=2,