DOC Clean up datasets loaders as part of the reorganization of the da…

…taset section (#11319) Standardize the dataset information, as part of a more general reorganization of the dataset section in the user guide; see #11083. Fixes #10555
scikit-learn · Jul 25, 2018 · 9d649c5 · 9d649c5
1 parent cf897de
commit 9d649c5
Show file tree

Hide file tree

Showing 18 changed files with 453 additions and 330 deletions.
diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst
@@ -130,6 +130,7 @@ They can be loaded using the following functions:
    fetch_covtype
    fetch_rcv1
    fetch_kddcup99
+   fetch_california_housing
 
 .. toctree::
     :maxdepth: 2
@@ -141,18 +142,21 @@ They can be loaded using the following functions:
     covtype
     rcv1
     kddcup99
+    california_housing
 
-.. include:: ./olivetti_faces.rst
+.. include:: ../../sklearn/datasets/descr/olivetti_faces.rst
 
-.. include:: ./twenty_newsgroups.rst
+.. include:: ../../sklearn/datasets/descr/twenty_newsgroups.rst
 
-.. include:: ./labeled_faces.rst
+.. include:: ../../sklearn/datasets/descr/lfw.rst
 
-.. include:: ./covtype.rst
+.. include:: ../../sklearn/datasets/descr/covtype.rst
 
-.. include:: ./rcv1.rst
+.. include:: ../../sklearn/datasets/descr/rcv1.rst
 
-.. include:: ./kddcup99.rst
+.. include:: ../../sklearn/datasets/descr/kddcup99.rst
+
+.. include:: ../../sklearn/datasets/descr/california_housing.rst
 
 .. _sample_generators:
 

diff --git a/doc/datasets/kddcup99.rst b/doc/datasets/kddcup99.rst
diff --git a/doc/datasets/rcv1.rst b/doc/datasets/rcv1.rst
diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
@@ -264,7 +264,7 @@ def load_wine(return_X_y=False):
     Features            real, positive
     =================   ==============
 
-    Read more in the :ref:`User Guide <datasets>`.
+    Read more in the :ref:`User Guide <wine_dataset>`.
 
     Parameters
     ----------
@@ -339,7 +339,7 @@ def load_iris(return_X_y=False):
     Features            real, positive
     =================   ==============
 
-    Read more in the :ref:`User Guide <datasets>`.
+    Read more in the :ref:`User Guide <iris_dataset>`.
 
     Parameters
     ----------
@@ -414,6 +414,8 @@ def load_breast_cancer(return_X_y=False):
     Features            real, positive
     =================   ==============
 
+    Read more in the :ref:`User Guide <breast_cancer_dataset>`.
+
     Parameters
     ----------
     return_X_y : boolean, default=False
@@ -498,7 +500,7 @@ def load_digits(n_class=10, return_X_y=False):
     Features             integers 0-16
     =================   ==============
 
-    Read more in the :ref:`User Guide <datasets>`.
+    Read more in the :ref:`User Guide <digits_dataset>`.
 
     Parameters
     ----------
@@ -575,7 +577,7 @@ def load_diabetes(return_X_y=False):
     Targets             integer 25 - 346
     ==============      ==================
 
-    Read more in the :ref:`User Guide <datasets>`.
+    Read more in the :ref:`User Guide <diabetes_dataset>`.
 
     Parameters
     ----------
@@ -628,6 +630,8 @@ def load_linnerud(return_X_y=False):
     Targets           integer
     ==============    ============================
 
+    Read more in the :ref:`User Guide <linnerrud_dataset>`.
+
     Parameters
     ----------
     return_X_y : boolean, default=False.
@@ -690,6 +694,8 @@ def load_boston(return_X_y=False):
     Targets             real 5. - 50.
     ==============     ==============
 
+    Read more in the :ref:`User Guide <boston_dataset>`.
+
     Parameters
     ----------
     return_X_y : boolean, default=False.
@@ -760,6 +766,8 @@ def load_sample_images():
 
     Loads both, ``china`` and ``flower``.
 
+    Read more in the :ref:`User Guide <sample_images>`.
+
     Returns
     -------
     data : Bunch
@@ -801,6 +809,8 @@ def load_sample_images():
 def load_sample_image(image_name):
     """Load the numpy array of a single sample image
 
+    Read more in the :ref:`User Guide <sample_images>`.
+
     Parameters
     -----------
     image_name : {`china.jpg`, `flower.jpg`}

diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py
@@ -21,7 +21,7 @@
 # Authors: Peter Prettenhofer
 # License: BSD 3 clause
 
-from os.path import exists
+from os.path import dirname, exists, join
 from os import makedirs, remove
 import tarfile
 
@@ -43,18 +43,21 @@
     checksum=('aaa5c9a6afe2225cc2aed2723682ae40'
               '3280c4a3695a2ddda4ffb5d8215ea681'))
 
-# Grab the module-level docstring to use as a description of the
-# dataset
-MODULE_DOCS = __doc__
-
 logger = logging.getLogger(__name__)
 
 
 def fetch_california_housing(data_home=None, download_if_missing=True,
                              return_X_y=False):
-    """Loader for the California housing dataset from StatLib.
+    """Load the California housing dataset (regression).
+
+    ==============     ==============
+    Samples total               20640
+    Dimensionality                  8
+    Features                     real
+    Target             real 0.15 - 5.
+    ==============     ==============
 
-    Read more in the :ref:`User Guide <datasets>`.
+    Read more in the :ref:`User Guide <california_housing_dataset>`.
 
     Parameters
     ----------
@@ -144,10 +147,14 @@ def fetch_california_housing(data_home=None, download_if_missing=True,
     # target in units of 100,000
     target = target / 100000.0
 
+    module_path = dirname(__file__)
+    with open(join(module_path, 'descr', 'california_housing.rst')) as dfile:
+        descr = dfile.read()
+
     if return_X_y:
         return data, target
 
     return Bunch(data=data,
                  target=target,
                  feature_names=feature_names,
-                 DESCR=MODULE_DOCS)
+                 DESCR=descr)
diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py
@@ -16,7 +16,7 @@
 
 from gzip import GzipFile
 import logging
-from os.path import exists, join
+from os.path import dirname, exists, join
 from os import remove
 
 import numpy as np
@@ -43,9 +43,18 @@
 
 def fetch_covtype(data_home=None, download_if_missing=True,
                   random_state=None, shuffle=False, return_X_y=False):
-    """Load the covertype dataset, downloading it if necessary.
+    """Load the covertype dataset (classification).
 
-    Read more in the :ref:`User Guide <datasets>`.
+    Download it if necessary.
+
+    =================   ============
+    Classes                        7
+    Samples total             581012
+    Dimensionality                54
+    Features                     int
+    =================   ============
+
+    Read more in the :ref:`User Guide <covtype_dataset>`.
 
     Parameters
     ----------
@@ -127,7 +136,11 @@ def fetch_covtype(data_home=None, download_if_missing=True,
         X = X[ind]
         y = y[ind]
 
+    module_path = dirname(__file__)
+    with open(join(module_path, 'descr', 'covtype.rst')) as rst_file:
+        fdescr = rst_file.read()
+
     if return_X_y:
         return X, y
 
-    return Bunch(data=X, target=y, DESCR=__doc__)
+    return Bunch(data=X, target=y, DESCR=fdescr)
diff --git a/sklearn/datasets/descr/california_housing.rst b/sklearn/datasets/descr/california_housing.rst
@@ -0,0 +1,40 @@
+.. _california_housing_dataset:
+
+California Housing dataset
+--------------------------
+
+**Data Set Characteristics:**
+
+    :Number of Instances: 20640
+
+    :Number of Attributes: 8 numeric, predictive attributes and the target
+
+    :Attribute Information:
+        - MedInc        median income in block
+        - HouseAge      median house age in block
+        - AveRooms      average number of rooms
+        - AveBedrms     average number of bedrooms
+        - Population    block population
+        - AveOccup      average house occupancy
+        - Latitude      house block latitude
+        - Longitude     house block longitude
+
+    :Missing Attribute Values: None
+
+This dataset was obtained from the StatLib repository.
+http://lib.stat.cmu.edu/datasets/
+
+The target variable is the median house value for California districts.
+
+This dataset was derived from the 1990 U.S. census, using one row per census
+block group. A block group is the smallest geographical unit for which the U.S.
+Census Bureau publishes sample data (a block group typically has a population
+of 600 to 3,000 people).
+
+It can be downloaded/loaded using the
+:func:`sklearn.datasets.fetch_california_housing` function.
+
+.. topic:: References
+
+    - Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
+      Statistics and Probability Letters, 33 (1997) 291-297
diff --git a/doc/datasets/covtype.rst → sklearn/datasets/descr/covtype.rst b/doc/datasets/covtype.rst → sklearn/datasets/descr/covtype.rst
@@ -1,4 +1,4 @@
-.. _covtype:
+.. _covtype_dataset:
 
 Forest covertypes
 -----------------
@@ -12,6 +12,15 @@ Each sample has 54 features, described on the
 Some of the features are boolean indicators,
 while others are discrete or continuous measurements.
 
+**Data Set Characteristics:**
+
+    =================   ============
+    Classes                        7
+    Samples total             581012
+    Dimensionality                54
+    Features                     int
+    =================   ============
+
 :func:`sklearn.datasets.fetch_covtype` will load the covertype dataset;
 it returns a dictionary-like object
 with the feature matrix in the ``data`` member