
[MRG] Set random seed in init for CV splitters #15177

Closed
Changes from 7 commits
9 changes: 4 additions & 5 deletions doc/modules/cross_validation.rst
@@ -805,11 +805,10 @@ to shuffle the data indices before splitting them. Note that:
validation performed by specifying ``cv=some_integer`` to
:func:`cross_val_score`, grid search, etc. Keep in mind that
:func:`train_test_split` still returns a random split.
* The ``random_state`` parameter defaults to ``None``, meaning that the
shuffling will be different every time ``KFold(..., shuffle=True)`` is
iterated. However, ``GridSearchCV`` will use the same shuffling for each set
of parameters validated by a single call to its ``fit`` method.
* To get identical results for each split, set ``random_state`` to an integer.
* The ``random_state`` parameter defaults to ``None``, meaning that different
instances of e.g. ``KFold(..., shuffle=True)`` will give different splits.
* No matter what the ``random_state`` is, calling ``split()`` multiple times
on the same instance will always yield the same splits.
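The two bullets above can be sketched with a toy splitter (a hypothetical `TinyKFold`, not part of scikit-learn) that draws its seed once in `__init__`, as this PR proposes:

```python
import numpy as np

class TinyKFold:
    """Toy splitter mimicking the PR's semantics: the seed is fixed at
    construction time, so split() is deterministic per instance even
    when random_state is None."""

    def __init__(self, n_splits=5, random_state=None):
        self.n_splits = n_splits
        if isinstance(random_state, (int, np.integer)):
            self.seed = int(random_state)
        else:
            # draw one concrete seed now (random_state is None here)
            self.seed = int(np.random.RandomState(random_state)
                            .randint(2**32, dtype=np.uint32))

    def split(self, X):
        indices = np.arange(len(X))
        np.random.RandomState(self.seed).shuffle(indices)
        for test in np.array_split(indices, self.n_splits):
            train = np.setdiff1d(indices, test)
            yield train, test

X = np.arange(20)
cv = TinyKFold(n_splits=4)
first = [test.tolist() for _, test in cv.split(X)]
second = [test.tolist() for _, test in cv.split(X)]
assert first == second  # same instance, same splits, even with random_state=None
```

Two separate `TinyKFold()` instances, by contrast, would almost surely shuffle differently, matching the first bullet.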

Cross validation and model selection
====================================
72 changes: 36 additions & 36 deletions sklearn/model_selection/_split.py
@@ -45,6 +45,9 @@
'check_cv']


BIG_INT = 2**32 # Numpy seeds must be between 0 and 2**32 - 1


class BaseCrossValidator(metaclass=ABCMeta):
"""Base class for all cross-validators

@@ -287,9 +290,19 @@ def __init__(self, n_splits, shuffle, random_state):
raise TypeError("shuffle must be True or False;"
" got {0}".format(shuffle))

if not shuffle and random_state is not None:  # None is the default
    raise ValueError(
        'Setting a random_state will have no effect when shuffle is '
        'False.'
    )

Member: Not sure if this should be an error.

Member (Author): We usually error when parameter combinations don't make sense, right? That particular one has always been slightly confusing to me.

Member: Yeah, with the other comment, now I see why this is very useful. But this needs a whats_new entry of its own.

Member (Author): You know what, maybe I'll make a separate PR that we can quickly review. After all, this is unrelated to the proposed changes.

self.n_splits = n_splits
self.shuffle = shuffle
self.random_state = random_state
self.random_state = random_state  # only here for repr to work
self.seed = random_state
if not isinstance(random_state, numbers.Integral):
    self.seed = check_random_state(random_state).randint(BIG_INT)

Member: What are the benefits of having the repr work? With this PR, we do not need to hold on to self.random_state.

Member (Author): CI and tests not breaking ^^ If we remove the random_state attribute, it is rendered as random_state=None (no matter the value) by the custom repr. I feel like a reasonable fix would be to print the seed attribute instead of random_state, but I didn't want to involve too many changes in this PR.
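The lines above are the core of the PR: any `random_state` is collapsed to one concrete integer seed at construction time. A minimal stand-alone sketch, using `np.random.RandomState` directly where sklearn would use `check_random_state` (and a hypothetical `normalize_seed` helper name):

```python
import numbers
import numpy as np

BIG_INT = 2**32  # numpy seeds must be in [0, 2**32 - 1]

def normalize_seed(random_state):
    """Collapse random_state to one concrete integer seed, as in __init__."""
    if isinstance(random_state, numbers.Integral):
        return int(random_state)  # integer seeds pass through unchanged
    if random_state is None:
        # check_random_state(None) would return the global RandomState;
        # a fresh OS-seeded one stands in for it here
        random_state = np.random.RandomState()
    # random_state is now a RandomState instance: consume it exactly once
    return int(random_state.randint(BIG_INT, dtype=np.uint32))

assert normalize_seed(42) == 42
# an identically-seeded rng always yields the same drawn seed
assert (normalize_seed(np.random.RandomState(0))
        == normalize_seed(np.random.RandomState(0)))
```

Because the seed is frozen here, every later `check_random_state(self.seed)` call reconstructs the same rng, which is what makes repeated `split()` calls deterministic.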


def split(self, X, y=None, groups=None):
"""Generate indices to split data into training and test set.
@@ -400,10 +413,6 @@ class KFold(_BaseKFold):
``n_samples // n_splits + 1``, other folds have size
``n_samples // n_splits``, where ``n_samples`` is the number of samples.

Randomized CV splitters may return different results for each call of
split. You can make the results identical by setting ``random_state``
to an integer.

See also
--------
StratifiedKFold
@@ -424,7 +433,7 @@ def _iter_test_indices(self, X, y=None, groups=None):
n_samples = _num_samples(X)
indices = np.arange(n_samples)
if self.shuffle:
check_random_state(self.random_state).shuffle(indices)
check_random_state(self.seed).shuffle(indices)

n_splits = self.n_splits
fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
@@ -625,7 +634,7 @@ def __init__(self, n_splits=5, shuffle=False, random_state=None):
super().__init__(n_splits, shuffle, random_state)

def _make_test_folds(self, X, y=None):
rng = check_random_state(self.random_state)
rng = check_random_state(self.seed)
y = np.asarray(y)
type_of_target_y = type_of_target(y)
allowed_target_types = ('binary', 'multiclass')
@@ -709,12 +718,6 @@ def split(self, X, y, groups=None):

test : ndarray
The testing set indices for that split.

Notes
-----
Randomized CV splitters may return different results for each call of
split. You can make the results identical by setting ``random_state``
to an integer.
"""
y = check_array(y, ensure_2d=False, dtype=None)
return super().split(X, y, groups)
@@ -1100,8 +1103,11 @@ def __init__(self, cv, n_repeats=10, random_state=None, **cvargs):

self.cv = cv
self.n_repeats = n_repeats
self.random_state = random_state
self.cvargs = cvargs
self.random_state = random_state # only here for repr to work
self.seed = random_state
if not isinstance(random_state, numbers.Integral):
self.seed = check_random_state(random_state).randint(BIG_INT)

def split(self, X, y=None, groups=None):
"""Generates indices to split data into training and test set.
@@ -1128,9 +1134,12 @@ def split(self, X, y=None, groups=None):
The testing set indices for that split.
"""
n_repeats = self.n_repeats
rng = check_random_state(self.random_state)
rng = check_random_state(self.seed)

for idx in range(n_repeats):
for _ in range(n_repeats):
    # the local rng object will be consumed exactly once in __init__
    # of self.cv, hence ensuring that each repetition yields
    # different folds.
    cv = self.cv(random_state=rng, shuffle=True,
                 **self.cvargs)
    for train_index, test_index in cv.split(X, y, groups):

Member: That's a clear change of semantics of what "random_state" means. I am not very comfortable with it, because it means that the semantics of arguments vary across the library, which is bad for usability.
@@ -1158,7 +1167,7 @@ def get_n_splits(self, X=None, y=None, groups=None):
n_splits : int
Returns the number of splitting iterations in the cross-validator.
"""
rng = check_random_state(self.random_state)
rng = check_random_state(self.seed)
cv = self.cv(random_state=rng, shuffle=True,
**self.cvargs)
return cv.get_n_splits(X, y, groups) * self.n_repeats
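The comment in `split()` above hinges on one property: because each repetition's cv consumes the shared rng exactly once in its `__init__`, the repetitions get distinct seeds while the whole sequence stays reproducible. A sketch of just that seed bookkeeping (`repeated_seeds` is a hypothetical helper, not sklearn API):

```python
import numpy as np

def repeated_seeds(random_state, n_repeats):
    """Emulate _RepeatedSplits: one shared rng, consumed once per repetition."""
    rng = np.random.RandomState(random_state)
    # each `self.cv(random_state=rng, ...)` call draws exactly one seed here
    return [int(rng.randint(2**32, dtype=np.uint32)) for _ in range(n_repeats)]

run1 = repeated_seeds(0, 5)
run2 = repeated_seeds(0, 5)
assert run1 == run2           # reproducible for a fixed random_state
assert len(set(run1)) == 5    # each repetition gets its own seed
```

This also explains why `get_n_splits` rebuilds the rng from `self.seed` rather than reusing a stored one: counting splits must not advance the stream that `split()` depends on.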
@@ -1205,12 +1214,6 @@ class RepeatedKFold(_RepeatedSplits):
TRAIN: [1 2] TEST: [0 3]
TRAIN: [0 3] TEST: [1 2]

Notes
-----
Randomized CV splitters may return different results for each call of
split. You can make the results identical by setting ``random_state``
to an integer.

See also
--------
RepeatedStratifiedKFold: Repeats Stratified K-Fold n times.
@@ -1247,22 +1250,16 @@ class RepeatedStratifiedKFold(_RepeatedSplits):
>>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
>>> y = np.array([0, 0, 1, 1])
>>> rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2,
... random_state=36851234)

Member (Author): Had to change the seed, else the repetitions would be equal (by chance).

... random_state=36851235)
>>> for train_index, test_index in rskf.split(X, y):
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
...
TRAIN: [1 2] TEST: [0 3]
TRAIN: [0 3] TEST: [1 2]
TRAIN: [1 3] TEST: [0 2]
TRAIN: [0 2] TEST: [1 3]

Notes
-----
Randomized CV splitters may return different results for each call of
split. You can make the results identical by setting ``random_state``
to an integer.
TRAIN: [1 3] TEST: [0 2]
TRAIN: [0 3] TEST: [1 2]
TRAIN: [1 2] TEST: [0 3]

See also
--------
@@ -1281,8 +1278,11 @@ def __init__(self, n_splits=10, test_size=None, train_size=None,
self.n_splits = n_splits
self.test_size = test_size
self.train_size = train_size
self.random_state = random_state
self._default_test_size = 0.1
self.random_state = random_state # only here for repr to work
self.seed = random_state
if not isinstance(random_state, numbers.Integral):
self.seed = check_random_state(random_state).randint(BIG_INT)

def split(self, X, y=None, groups=None):
"""Generate indices to split data into training and test set.
@@ -1425,7 +1425,7 @@ def _iter_indices(self, X, y=None, groups=None):
n_samples, self.test_size, self.train_size,
default_test_size=self._default_test_size)

rng = check_random_state(self.random_state)
rng = check_random_state(self.seed)
for i in range(self.n_splits):
# random partition
permutation = rng.permutation(n_samples)
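The permutation line above is the whole trick behind ShuffleSplit-style iteration: each split draws a fresh permutation from the same rng and slices it into test and train. A stand-alone sketch (`shuffle_split` is a hypothetical helper; `n_test` is assumed to be precomputed from `test_size`):

```python
import numpy as np

def shuffle_split(n_samples, n_splits, n_test, seed):
    """One fresh permutation per split; first n_test entries form the test set."""
    rng = np.random.RandomState(seed)
    for _ in range(n_splits):
        permutation = rng.permutation(n_samples)
        yield permutation[n_test:], permutation[:n_test]

splits = list(shuffle_split(10, n_splits=3, n_test=2, seed=0))
assert len(splits) == 3
for train, test in splits:
    assert len(train) == 8 and len(test) == 2
    assert not set(train) & set(test)  # train and test are disjoint
```

Under the PR, the `seed` argument here corresponds to `self.seed`, so iterating the generator twice yields identical partitions.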
@@ -1664,7 +1664,7 @@ def _iter_indices(self, X, y, groups=None):
class_indices = np.split(np.argsort(y_indices, kind='mergesort'),
np.cumsum(class_counts)[:-1])

rng = check_random_state(self.random_state)
rng = check_random_state(self.seed)

for _ in range(self.n_splits):
# if there are ties in the class-counts, we want
6 changes: 3 additions & 3 deletions sklearn/model_selection/tests/test_search.py
@@ -1220,7 +1220,7 @@ def test_search_cv_results_none_param():
X, y = [[1], [2], [3], [4], [5]], [0, 0, 0, 0, 1]
estimators = (DecisionTreeRegressor(), DecisionTreeClassifier())
est_parameters = {"random_state": [0, None]}
cv = KFold(random_state=0)
cv = KFold()

for est in estimators:
grid_search = GridSearchCV(est, est_parameters, cv=cv,
@@ -1294,7 +1294,7 @@ def test_grid_search_correct_score_results():

def test_fit_grid_point():
X, y = make_classification(random_state=0)
cv = StratifiedKFold(random_state=0)
cv = StratifiedKFold()
svc = LinearSVC(random_state=0)
scorer = make_scorer(accuracy_score)

@@ -1345,7 +1345,7 @@ def test_grid_search_with_multioutput_data():
random_state=0)

est_parameters = {"max_depth": [1, 2, 3, 4]}
cv = KFold(random_state=0)
cv = KFold()

estimators = [DecisionTreeRegressor(random_state=0),
DecisionTreeClassifier(random_state=0)]