[MRG+1] Reference glossary in random_state docstring entries in datasets module (#10732)

* Reference glossary in random_state docstring entries in datasets module
* Add note about reproducibility to random_state docstring entries
* Fix default value in random_state docstring entries
* Revert fix of flake8 'line too long' error. Deemed off topic for this PR
* Remove surrounding spaces from equals sign in docstrings
* Change all '`random_state <Glossary>`' to '`Glossary <random_state>`'
scikit-learn · Mar 24, 2018 · 3b037b0 · 3b037b0
1 parent 58c043e
commit 3b037b0
Show file tree

Hide file tree

Showing 7 changed files with 105 additions and 134 deletions.
diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
@@ -144,11 +144,10 @@ def load_files(container_path, description=None, categories=None,
         contains characters not of the given `encoding`. Passed as keyword
         argument 'errors' to bytes.decode.
 
-    random_state : int, RandomState instance or None, optional (default=0)
-        If int, random_state is the seed used by the random number generator;
-        If RandomState instance, random_state is the random number generator;
-        If None, the random number generator is the RandomState instance used
-        by `np.random`.
+    random_state : int, RandomState instance or None (default=0)
+        Determines random number generation for dataset shuffling. Pass an int
+        for reproducible output across multiple function calls.
+        See :term:`Glossary <random_state>`.
 
     Returns
     -------

diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py
@@ -57,12 +57,10 @@ def fetch_covtype(data_home=None, download_if_missing=True,
         If False, raise a IOError if the data is not locally available
         instead of trying to download the data from the source site.
 
-    random_state : int, RandomState instance or None, optional (default=None)
-        Random state for shuffling the dataset.
-        If int, random_state is the seed used by the random number generator;
-        If RandomState instance, random_state is the random number generator;
-        If None, the random number generator is the RandomState instance used
-        by `np.random`.
+    random_state : int, RandomState instance or None (default)
+        Determines random number generation for dataset shuffling. Pass an int
+        for reproducible output across multiple function calls.
+        See :term:`Glossary <random_state>`.
 
     shuffle : bool, default=False
         Whether to shuffle dataset.

diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py
@@ -139,14 +139,11 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False,
     shuffle : bool, default=False
         Whether to shuffle dataset.
 
-    random_state : int, RandomState instance or None, optional (default=None)
-        Random state for shuffling the dataset. If subset='SA', this random
-        state is also used to randomly select the small proportion of abnormal
-        samples.
-        If int, random_state is the seed used by the random number generator;
-        If RandomState instance, random_state is the random number generator;
-        If None, the random number generator is the RandomState instance used
-        by `np.random`.
+    random_state : int, RandomState instance or None (default)
+        Determines random number generation for dataset shuffling and for
+        selection of abnormal samples if `subset='SA'`. Pass an int for
+        reproducible output across multiple function calls.
+        See :term:`Glossary <random_state>`.
 
     percent10 : bool, default=True
         Whether to load only 10 percent of the data.

diff --git a/sklearn/datasets/olivetti_faces.py b/sklearn/datasets/olivetti_faces.py
@@ -64,11 +64,10 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0,
         If True the order of the dataset is shuffled to avoid having
         images of the same person grouped.
 
-    random_state : int, RandomState instance or None, optional (default=0)
-        If int, random_state is the seed used by the random number generator;
-        If RandomState instance, random_state is the random number generator;
-        If None, the random number generator is the RandomState instance used
-        by `np.random`.
+    random_state : int, RandomState instance or None (default=0)
+        Determines random number generation for dataset shuffling. Pass an int
+        for reproducible output across multiple function calls.
+        See :term:`Glossary <random_state>`.
 
     download_if_missing : optional, True by default
         If False, raise a IOError if the data is not locally available

diff --git a/sklearn/datasets/rcv1.py b/sklearn/datasets/rcv1.py
@@ -102,12 +102,10 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
         If False, raise a IOError if the data is not locally available
         instead of trying to download the data from the source site.
 
-    random_state : int, RandomState instance or None, optional (default=None)
-        Random state for shuffling the dataset.
-        If int, random_state is the seed used by the random number generator;
-        If RandomState instance, random_state is the random number generator;
-        If None, the random number generator is the RandomState instance used
-        by `np.random`.
+    random_state : int, RandomState instance or None (default)
+        Determines random number generation for dataset shuffling. Pass an int
+        for reproducible output across multiple function calls.
+        See :term:`Glossary <random_state>`.
 
     shuffle : bool, default=False
         Whether to shuffle dataset.
@@ -182,7 +180,6 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
         X = joblib.load(samples_path)
         sample_id = joblib.load(sample_id_path)
 
-
     # load target (y), categories, and sample_id_bis
     if download_if_missing and (not exists(sample_topics_path) or
                                 not exists(topics_path)):