Skip to content

Commit

Permalink
EHN Add parameter as_frame to fetch_covtype (#17491)
Browse files Browse the repository at this point in the history
Co-authored-by: aliang <aliang@air>
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
  • Loading branch information
3 people committed Jun 21, 2020
1 parent c9c1a81 commit 6ef4487
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 6 deletions.
5 changes: 5 additions & 0 deletions doc/whats_new/v0.24.rst
Expand Up @@ -53,6 +53,11 @@ Changelog
unless data is sparse.
:pr:`17396` by :user:`Jiaxiang <fujiaxiang>`.

- |Enhancement| :func:`datasets.fetch_covtype` now now supports the optional
argument `as_frame`; when it is set to True, the returned Bunch object's
`data` and `frame` members are pandas DataFrames, and the `target` member is
a pandas Series. :pr:`17491` by :user:`Alex Liang <tianchuliang>`.

:mod:`sklearn.decomposition`
............................

Expand Down
50 changes: 48 additions & 2 deletions sklearn/datasets/_covtype.py
Expand Up @@ -23,6 +23,7 @@
import joblib

from . import get_data_home
from ._base import _convert_data_dataframe
from ._base import _fetch_remote
from ._base import RemoteFileMetadata
from ..utils import Bunch
Expand All @@ -41,10 +42,27 @@

logger = logging.getLogger(__name__)

# Column names reference:
# https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info
FEATURE_NAMES = ["Elevation",
"Aspect",
"Slope",
"Horizontal_Distance_To_Hydrology",
"Vertical_Distance_To_Hydrology",
"Horizontal_Distance_To_Roadways",
"Hillshade_9am",
"Hillshade_Noon",
"Hillshade_3pm",
"Horizontal_Distance_To_Fire_Points"]
FEATURE_NAMES += [f"Wilderness_Area_{i}" for i in range(4)]
FEATURE_NAMES += [f"Soil_Type_{i}" for i in range(40)]
TARGET_NAMES = ["Cover_Type"]


@_deprecate_positional_args
def fetch_covtype(*, data_home=None, download_if_missing=True,
random_state=None, shuffle=False, return_X_y=False):
random_state=None, shuffle=False, return_X_y=False,
as_frame=False):
"""Load the covertype dataset (classification).
Download it if necessary.
Expand Down Expand Up @@ -82,6 +100,15 @@ def fetch_covtype(*, data_home=None, download_if_missing=True,
.. versionadded:: 0.20
as_frame : bool, default=False
If True, the data is a pandas DataFrame including columns with
appropriate dtypes (numeric). The target is a pandas DataFrame or
Series depending on the number of target columns. If `return_X_y` is
True, then (`data`, `target`) will be pandas DataFrames or Series as
described below.
.. versionadded:: 0.24
Returns
-------
dataset : :class:`~sklearn.utils.Bunch`
Expand All @@ -93,12 +120,19 @@ def fetch_covtype(*, data_home=None, download_if_missing=True,
Each value corresponds to one of
the 7 forest covertypes with values
ranging between 1 to 7.
frame : dataframe of shape (581012, 53)
Only present when `as_frame=True`. Contains `data` and `target`.
DESCR : str
Description of the forest covertype dataset.
feature_names : list
The names of the dataset columns
target_names: list
The names of the target columns
(data, target) : tuple if ``return_X_y`` is True
.. versionadded:: 0.20
"""

data_home = get_data_home(data_home=data_home)
Expand Down Expand Up @@ -142,7 +176,19 @@ def fetch_covtype(*, data_home=None, download_if_missing=True,
with open(join(module_path, 'descr', 'covtype.rst')) as rst_file:
fdescr = rst_file.read()

frame = None
if as_frame:
frame, X, y = _convert_data_dataframe(caller_name="fetch_covtype",
data=X,
target=y,
feature_names=FEATURE_NAMES,
target_names=TARGET_NAMES)
if return_X_y:
return X, y

return Bunch(data=X, target=y, DESCR=fdescr)
return Bunch(data=X,
target=y,
frame=frame,
target_names=TARGET_NAMES,
feature_names=FEATURE_NAMES,
DESCR=fdescr)
6 changes: 4 additions & 2 deletions sklearn/datasets/descr/covtype.rst
Expand Up @@ -22,7 +22,9 @@ while others are discrete or continuous measurements.
================= ============

:func:`sklearn.datasets.fetch_covtype` will load the covertype dataset;
it returns a dictionary-like object
it returns a dictionary-like 'Bunch' object
with the feature matrix in the ``data`` member
and the target values in ``target``.
and the target values in ``target``. If optional argument 'as_frame' is
set to 'True', it will return ``data`` and ``target`` as pandas
data frame, and there will be an additional member ``frame`` as well.
The dataset will be downloaded from the web if necessary.
27 changes: 25 additions & 2 deletions sklearn/datasets/tests/test_covtype.py
@@ -1,9 +1,9 @@
"""Test the covtype loader, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job)."""

from sklearn.datasets.tests.test_common import check_return_X_y
from functools import partial
import pytest
from sklearn.datasets.tests.test_common import check_return_X_y


def test_fetch(fetch_covtype_fxt):
Expand All @@ -23,3 +23,26 @@ def test_fetch(fetch_covtype_fxt):
# test return_X_y option
fetch_func = partial(fetch_covtype_fxt)
check_return_X_y(data1, fetch_func)


def test_fetch_asframe(fetch_covtype_fxt):
bunch = fetch_covtype_fxt(as_frame=True)
assert hasattr(bunch, 'frame')
frame = bunch.frame
assert frame.shape == (581012, 55)
assert bunch.data.shape == (581012, 54)
assert bunch.target.shape == (581012,)

column_names = set(frame.columns)

# enumerated names are added correctly
assert set(f"Wilderness_Area_{i}" for i in range(4)) < column_names
assert set(f"Soil_Type_{i}" for i in range(40)) < column_names


def test_pandas_dependency_message(fetch_covtype_fxt,
hide_available_pandas):
expected_msg = ('fetch_covtype with as_frame=True'
' requires pandas')
with pytest.raises(ImportError, match=expected_msg):
fetch_covtype_fxt(as_frame=True)

0 comments on commit 6ef4487

Please sign in to comment.