From 6ef4487c66cf82da1e231aabb8a398862e02b699 Mon Sep 17 00:00:00 2001 From: Alex Liang Date: Sun, 21 Jun 2020 01:02:26 -0400 Subject: [PATCH] EHN Add parameter as_frame to fetch_covtype (#17491) Co-authored-by: aliang Co-authored-by: Thomas J. Fan --- doc/whats_new/v0.24.rst | 5 +++ sklearn/datasets/_covtype.py | 50 ++++++++++++++++++++++++-- sklearn/datasets/descr/covtype.rst | 6 ++-- sklearn/datasets/tests/test_covtype.py | 27 ++++++++++++-- 4 files changed, 82 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 6982079525f72..a479edd25342b 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -53,6 +53,11 @@ Changelog unless data is sparse. :pr:`17396` by :user:`Jiaxiang `. +- |Enhancement| :func:`datasets.fetch_covtype` now now supports the optional + argument `as_frame`; when it is set to True, the returned Bunch object's + `data` and `frame` members are pandas DataFrames, and the `target` member is + a pandas Series. :pr:`17491` by :user:`Alex Liang `. + :mod:`sklearn.decomposition` ............................ diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index beb6660193fbc..0f3093807a83a 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -23,6 +23,7 @@ import joblib from . import get_data_home +from ._base import _convert_data_dataframe from ._base import _fetch_remote from ._base import RemoteFileMetadata from ..utils import Bunch @@ -41,10 +42,27 @@ logger = logging.getLogger(__name__) +# Column names reference: +# https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info +FEATURE_NAMES = ["Elevation", + "Aspect", + "Slope", + "Horizontal_Distance_To_Hydrology", + "Vertical_Distance_To_Hydrology", + "Horizontal_Distance_To_Roadways", + "Hillshade_9am", + "Hillshade_Noon", + "Hillshade_3pm", + "Horizontal_Distance_To_Fire_Points"] +FEATURE_NAMES += [f"Wilderness_Area_{i}" for i in range(4)] +FEATURE_NAMES += [f"Soil_Type_{i}" for i in range(40)] +TARGET_NAMES = ["Cover_Type"] + @_deprecate_positional_args def fetch_covtype(*, data_home=None, download_if_missing=True, - random_state=None, shuffle=False, return_X_y=False): + random_state=None, shuffle=False, return_X_y=False, + as_frame=False): """Load the covertype dataset (classification). Download it if necessary. @@ -82,6 +100,15 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, .. versionadded:: 0.20 + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric). The target is a pandas DataFrame or + Series depending on the number of target columns. If `return_X_y` is + True, then (`data`, `target`) will be pandas DataFrames or Series as + described below. + + .. versionadded:: 0.24 + Returns ------- dataset : :class:`~sklearn.utils.Bunch` @@ -93,12 +120,19 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, Each value corresponds to one of the 7 forest covertypes with values ranging between 1 to 7. + frame : dataframe of shape (581012, 53) + Only present when `as_frame=True`. Contains `data` and `target`. DESCR : str Description of the forest covertype dataset. + feature_names : list + The names of the dataset columns + target_names: list + The names of the target columns (data, target) : tuple if ``return_X_y`` is True .. versionadded:: 0.20 + """ data_home = get_data_home(data_home=data_home) @@ -142,7 +176,19 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, with open(join(module_path, 'descr', 'covtype.rst')) as rst_file: fdescr = rst_file.read() + frame = None + if as_frame: + frame, X, y = _convert_data_dataframe(caller_name="fetch_covtype", + data=X, + target=y, + feature_names=FEATURE_NAMES, + target_names=TARGET_NAMES) if return_X_y: return X, y - return Bunch(data=X, target=y, DESCR=fdescr) + return Bunch(data=X, + target=y, + frame=frame, + target_names=TARGET_NAMES, + feature_names=FEATURE_NAMES, + DESCR=fdescr) diff --git a/sklearn/datasets/descr/covtype.rst b/sklearn/datasets/descr/covtype.rst index 4e79b5b89b9a1..0090b8e4a6b7d 100644 --- a/sklearn/datasets/descr/covtype.rst +++ b/sklearn/datasets/descr/covtype.rst @@ -22,7 +22,9 @@ while others are discrete or continuous measurements. ================= ============ :func:`sklearn.datasets.fetch_covtype` will load the covertype dataset; -it returns a dictionary-like object +it returns a dictionary-like 'Bunch' object with the feature matrix in the ``data`` member -and the target values in ``target``. +and the target values in ``target``. If optional argument 'as_frame' is +set to 'True', it will return ``data`` and ``target`` as pandas +data frame, and there will be an additional member ``frame`` as well. The dataset will be downloaded from the web if necessary. diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index d966e6c3890d0..df0989d66bb3a 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -1,9 +1,9 @@ """Test the covtype loader, if the data is available, or if specifically requested via environment variable (e.g. for travis cron job).""" - -from sklearn.datasets.tests.test_common import check_return_X_y from functools import partial +import pytest +from sklearn.datasets.tests.test_common import check_return_X_y def test_fetch(fetch_covtype_fxt): @@ -23,3 +23,26 @@ def test_fetch(fetch_covtype_fxt): # test return_X_y option fetch_func = partial(fetch_covtype_fxt) check_return_X_y(data1, fetch_func) + + +def test_fetch_asframe(fetch_covtype_fxt): + bunch = fetch_covtype_fxt(as_frame=True) + assert hasattr(bunch, 'frame') + frame = bunch.frame + assert frame.shape == (581012, 55) + assert bunch.data.shape == (581012, 54) + assert bunch.target.shape == (581012,) + + column_names = set(frame.columns) + + # enumerated names are added correctly + assert set(f"Wilderness_Area_{i}" for i in range(4)) < column_names + assert set(f"Soil_Type_{i}" for i in range(40)) < column_names + + +def test_pandas_dependency_message(fetch_covtype_fxt, + hide_available_pandas): + expected_msg = ('fetch_covtype with as_frame=True' + ' requires pandas') + with pytest.raises(ImportError, match=expected_msg): + fetch_covtype_fxt(as_frame=True)