EHN Add parameter as_frame to fetch_covtype (#17491)

Co-authored-by: aliang <aliang@air> Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
scikit-learn · Jun 21, 2020 · 6ef4487 · 6ef4487
1 parent c9c1a81
commit 6ef4487
Show file tree

Hide file tree

Showing 4 changed files with 82 additions and 6 deletions.
diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
@@ -53,6 +53,11 @@ Changelog
   unless data is sparse.
   :pr:`17396` by :user:`Jiaxiang <fujiaxiang>`.
 
+- |Enhancement| :func:`datasets.fetch_covtype` now now supports the optional
+  argument `as_frame`; when it is set to True, the returned Bunch object's
+  `data` and `frame` members are pandas DataFrames, and the `target` member is
+  a pandas Series. :pr:`17491` by :user:`Alex Liang <tianchuliang>`.
+
 :mod:`sklearn.decomposition`
 ............................
 

diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py
@@ -23,6 +23,7 @@
 import joblib
 
 from . import get_data_home
+from ._base import _convert_data_dataframe
 from ._base import _fetch_remote
 from ._base import RemoteFileMetadata
 from ..utils import Bunch
@@ -41,10 +42,27 @@
 
 logger = logging.getLogger(__name__)
 
+# Column names reference:
+# https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info
+FEATURE_NAMES = ["Elevation",
+                 "Aspect",
+                 "Slope",
+                 "Horizontal_Distance_To_Hydrology",
+                 "Vertical_Distance_To_Hydrology",
+                 "Horizontal_Distance_To_Roadways",
+                 "Hillshade_9am",
+                 "Hillshade_Noon",
+                 "Hillshade_3pm",
+                 "Horizontal_Distance_To_Fire_Points"]
+FEATURE_NAMES += [f"Wilderness_Area_{i}" for i in range(4)]
+FEATURE_NAMES += [f"Soil_Type_{i}" for i in range(40)]
+TARGET_NAMES = ["Cover_Type"]
+
 
 @_deprecate_positional_args
 def fetch_covtype(*, data_home=None, download_if_missing=True,
-                  random_state=None, shuffle=False, return_X_y=False):
+                  random_state=None, shuffle=False, return_X_y=False,
+                  as_frame=False):
     """Load the covertype dataset (classification).
 
     Download it if necessary.
@@ -82,6 +100,15 @@ def fetch_covtype(*, data_home=None, download_if_missing=True,
 
         .. versionadded:: 0.20
 
+    as_frame : bool, default=False
+        If True, the data is a pandas DataFrame including columns with
+        appropriate dtypes (numeric). The target is a pandas DataFrame or
+        Series depending on the number of target columns. If `return_X_y` is
+        True, then (`data`, `target`) will be pandas DataFrames or Series as
+        described below.
+
+        .. versionadded:: 0.24
+
     Returns
     -------
     dataset : :class:`~sklearn.utils.Bunch`
@@ -93,12 +120,19 @@ def fetch_covtype(*, data_home=None, download_if_missing=True,
             Each value corresponds to one of
             the 7 forest covertypes with values
             ranging between 1 to 7.
+        frame : dataframe of shape (581012, 53)
+            Only present when `as_frame=True`. Contains `data` and `target`.
         DESCR : str
             Description of the forest covertype dataset.
+        feature_names : list
+            The names of the dataset columns
+        target_names: list
+            The names of the target columns
 
     (data, target) : tuple if ``return_X_y`` is True
 
         .. versionadded:: 0.20
+
     """
 
     data_home = get_data_home(data_home=data_home)
@@ -142,7 +176,19 @@ def fetch_covtype(*, data_home=None, download_if_missing=True,
     with open(join(module_path, 'descr', 'covtype.rst')) as rst_file:
         fdescr = rst_file.read()
 
+    frame = None
+    if as_frame:
+        frame, X, y = _convert_data_dataframe(caller_name="fetch_covtype",
+                                              data=X,
+                                              target=y,
+                                              feature_names=FEATURE_NAMES,
+                                              target_names=TARGET_NAMES)
     if return_X_y:
         return X, y
 
-    return Bunch(data=X, target=y, DESCR=fdescr)
+    return Bunch(data=X,
+                 target=y,
+                 frame=frame,
+                 target_names=TARGET_NAMES,
+                 feature_names=FEATURE_NAMES,
+                 DESCR=fdescr)
diff --git a/sklearn/datasets/descr/covtype.rst b/sklearn/datasets/descr/covtype.rst
@@ -22,7 +22,9 @@ while others are discrete or continuous measurements.
     =================   ============
 
 :func:`sklearn.datasets.fetch_covtype` will load the covertype dataset;
-it returns a dictionary-like object
+it returns a dictionary-like 'Bunch' object
 with the feature matrix in the ``data`` member
-and the target values in ``target``.
+and the target values in ``target``. If optional argument 'as_frame' is
+set to 'True', it will return ``data`` and ``target`` as pandas
+data frame, and there will be an additional member ``frame`` as well.
 The dataset will be downloaded from the web if necessary.
diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py
@@ -1,9 +1,9 @@
 """Test the covtype loader, if the data is available,
 or if specifically requested via environment variable
 (e.g. for travis cron job)."""
-
-from sklearn.datasets.tests.test_common import check_return_X_y
 from functools import partial
+import pytest
+from sklearn.datasets.tests.test_common import check_return_X_y
 
 
 def test_fetch(fetch_covtype_fxt):
@@ -23,3 +23,26 @@ def test_fetch(fetch_covtype_fxt):
     # test return_X_y option
     fetch_func = partial(fetch_covtype_fxt)
     check_return_X_y(data1, fetch_func)
+
+
+def test_fetch_asframe(fetch_covtype_fxt):
+    bunch = fetch_covtype_fxt(as_frame=True)
+    assert hasattr(bunch, 'frame')
+    frame = bunch.frame
+    assert frame.shape == (581012, 55)
+    assert bunch.data.shape == (581012, 54)
+    assert bunch.target.shape == (581012,)
+
+    column_names = set(frame.columns)
+
+    # enumerated names are added correctly
+    assert set(f"Wilderness_Area_{i}" for i in range(4)) < column_names
+    assert set(f"Soil_Type_{i}" for i in range(40)) < column_names
+
+
+def test_pandas_dependency_message(fetch_covtype_fxt,
+                                   hide_available_pandas):
+    expected_msg = ('fetch_covtype with as_frame=True'
+                    ' requires pandas')
+    with pytest.raises(ImportError, match=expected_msg):
+        fetch_covtype_fxt(as_frame=True)