From 322ea45af919b6496ecd3dbcf25b27a70723c337 Mon Sep 17 00:00:00 2001 From: aliang Date: Sat, 6 Jun 2020 14:12:06 -0400 Subject: [PATCH 01/16] auto flake8 stuff --- sklearn/datasets/_covtype.py | 36 +++++++++++++++++++++++++- sklearn/datasets/tests/test_covtype.py | 18 ++++++++++++- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index de93b22ac4f56..d5708ab046e96 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -23,6 +23,7 @@ import joblib from . import get_data_home +from ._base import _convert_data_dataframe from ._base import _fetch_remote from ._base import RemoteFileMetadata from ..utils import Bunch @@ -44,7 +45,8 @@ @_deprecate_positional_args def fetch_covtype(*, data_home=None, download_if_missing=True, - random_state=None, shuffle=False, return_X_y=False): + random_state=None, shuffle=False, return_X_y=False, + as_frame=False): """Load the covertype dataset (classification). Download it if necessary. @@ -80,6 +82,8 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, If True, returns ``(data.data, data.target)`` instead of a Bunch object. + as_frame : boolean, default=False. + If True, returns ``pandas.DataFrame`` instead of a Bunch object .. versionadded:: 0.20 Returns @@ -98,6 +102,9 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, (data, target) : tuple if ``return_X_y`` is True + dataframe: :class: `pandas.DataFrame` + Pandas dataframe + .. 
versionadded:: 0.20 """ @@ -145,4 +152,31 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, if return_X_y: return X, y + if as_frame: + """ + Column names reference: https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info + """ + feat_cols = ["Elevation", + "Aspect", + "Slope", + "Horizontal_Distance_To_Hydrology", + "Vertical_Distance_To_Hydrology", + "Horizontal_Distance_To_Roadways", + "Hillshade_9am", + "Hillshade_Noon", + "Hillshade_3pm", + "Horizontal_Distance_To_Fire_Points"] + feat_cols += ['Wilderness_Area_'+str(i) for i in range(1,5)] + feat_cols += ['Soil_Type_'+str(i) for i in range(1,41)] + target_col = ["Cover_Type"] + + frame, X, y = _convert_data_dataframe("fetch_covtype", X, y, + feat_cols, target_col) + return Bunch(data=X, + target=y, + frame=frame, + target_names=target_col, + feature_names=feat_cols, + DESCR=fdescr) + return Bunch(data=X, target=y, DESCR=fdescr) diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index d966e6c3890d0..d66fff0940942 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -1,7 +1,7 @@ """Test the covtype loader, if the data is available, or if specifically requested via environment variable (e.g. 
for travis cron job).""" - +import pytest from sklearn.datasets.tests.test_common import check_return_X_y from functools import partial @@ -23,3 +23,19 @@ def test_fetch(fetch_covtype_fxt): # test return_X_y option fetch_func = partial(fetch_covtype_fxt) check_return_X_y(data1, fetch_func) + +def test_fetch_asframe(fetch_covtype_fxt): + pd = pytest.importorskip('pandas') + bunch = fetch_covtype_fxt(as_frame=True) + frame = bunch.frame + assert hasattr(bunch, frame) is True + assert frame.shape == (581012, 55) + assert isinstance(bunch.data, pd.DataFrame) + assert isinstance(bunch.target, pd.Series) + +def test_pandas_dependency_message(fetch_covtype_fxt, + hide_available_pandas): + expected_msg = ('fetch_covtype_fxt with as_frame=True' + ' requires pandas') + with pytest.raises(ImportError, match=expected_msg): + fetch_covtype_fxt(as_frame=True) From c5f17eb7fd9e4b6dbbe30585f36ff68946fa6a4f Mon Sep 17 00:00:00 2001 From: aliang Date: Sat, 6 Jun 2020 14:26:15 -0400 Subject: [PATCH 02/16] tests passed --- sklearn/datasets/tests/test_covtype.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index d66fff0940942..246482507230c 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -35,7 +35,7 @@ def test_fetch_asframe(fetch_covtype_fxt): def test_pandas_dependency_message(fetch_covtype_fxt, hide_available_pandas): - expected_msg = ('fetch_covtype_fxt with as_frame=True' + expected_msg = ('fetch_covtype with as_frame=True' ' requires pandas') with pytest.raises(ImportError, match=expected_msg): fetch_covtype_fxt(as_frame=True) From 9ab6f5c1ab4291c2fc15aaa2a05041b7eff70528 Mon Sep 17 00:00:00 2001 From: aliang Date: Sat, 6 Jun 2020 14:48:06 -0400 Subject: [PATCH 03/16] adjust lineendings --- sklearn/datasets/_covtype.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git 
a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index d5708ab046e96..58914bdbe66dd 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -157,15 +157,15 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, Column names reference: https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info """ feat_cols = ["Elevation", - "Aspect", - "Slope", - "Horizontal_Distance_To_Hydrology", - "Vertical_Distance_To_Hydrology", - "Horizontal_Distance_To_Roadways", - "Hillshade_9am", - "Hillshade_Noon", - "Hillshade_3pm", - "Horizontal_Distance_To_Fire_Points"] + "Aspect", + "Slope", + "Horizontal_Distance_To_Hydrology", + "Vertical_Distance_To_Hydrology", + "Horizontal_Distance_To_Roadways", + "Hillshade_9am", + "Hillshade_Noon", + "Hillshade_3pm", + "Horizontal_Distance_To_Fire_Points"] feat_cols += ['Wilderness_Area_'+str(i) for i in range(1,5)] feat_cols += ['Soil_Type_'+str(i) for i in range(1,41)] target_col = ["Cover_Type"] From 24df8fa8f314c2327b96255dea0850d6703a094d Mon Sep 17 00:00:00 2001 From: aliang Date: Sat, 6 Jun 2020 15:25:49 -0400 Subject: [PATCH 04/16] fixing flake8 linting issue --- sklearn/datasets/_covtype.py | 7 ++++--- sklearn/datasets/tests/test_covtype.py | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 58914bdbe66dd..1bcf5308474c4 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -154,7 +154,8 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, if as_frame: """ - Column names reference: https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info + Column names reference: + https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info """ feat_cols = ["Elevation", "Aspect", @@ -166,8 +167,8 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, "Hillshade_Noon", "Hillshade_3pm", 
"Horizontal_Distance_To_Fire_Points"] - feat_cols += ['Wilderness_Area_'+str(i) for i in range(1,5)] - feat_cols += ['Soil_Type_'+str(i) for i in range(1,41)] + feat_cols += ['Wilderness_Area_'+str(i) for i in range(1, 5)] + feat_cols += ['Soil_Type_'+str(i) for i in range(1, 41)] target_col = ["Cover_Type"] frame, X, y = _convert_data_dataframe("fetch_covtype", X, y, diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index 246482507230c..c0b83d2bb691e 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -24,6 +24,7 @@ def test_fetch(fetch_covtype_fxt): fetch_func = partial(fetch_covtype_fxt) check_return_X_y(data1, fetch_func) + def test_fetch_asframe(fetch_covtype_fxt): pd = pytest.importorskip('pandas') bunch = fetch_covtype_fxt(as_frame=True) @@ -33,6 +34,7 @@ def test_fetch_asframe(fetch_covtype_fxt): assert isinstance(bunch.data, pd.DataFrame) assert isinstance(bunch.target, pd.Series) + def test_pandas_dependency_message(fetch_covtype_fxt, hide_available_pandas): expected_msg = ('fetch_covtype with as_frame=True' From 5a34ebf48b0146072e647c0600ed99452789d866 Mon Sep 17 00:00:00 2001 From: aliang Date: Mon, 8 Jun 2020 20:17:10 -0400 Subject: [PATCH 05/16] adjusted API --- sklearn/datasets/_covtype.py | 48 +++++++++++++++----------- sklearn/datasets/tests/test_covtype.py | 1 + 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 1bcf5308474c4..00d4afbad6c76 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -82,8 +82,12 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, If True, returns ``(data.data, data.target)`` instead of a Bunch object. - as_frame : boolean, default=False. 
- If True, returns ``pandas.DataFrame`` instead of a Bunch object + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric). The target is + a pandas DataFrame or Series depending on the number of target columns. + If `return_X_y` is True, then (`data`, `target`) will be pandas + DataFrames or Series as described below. .. versionadded:: 0.20 Returns @@ -149,35 +153,37 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, with open(join(module_path, 'descr', 'covtype.rst')) as rst_file: fdescr = rst_file.read() - if return_X_y: - return X, y - - if as_frame: + if as_frame or return_X_y: """ Column names reference: https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info """ feat_cols = ["Elevation", - "Aspect", - "Slope", - "Horizontal_Distance_To_Hydrology", - "Vertical_Distance_To_Hydrology", - "Horizontal_Distance_To_Roadways", - "Hillshade_9am", - "Hillshade_Noon", - "Hillshade_3pm", - "Horizontal_Distance_To_Fire_Points"] + "Aspect", + "Slope", + "Horizontal_Distance_To_Hydrology", + "Vertical_Distance_To_Hydrology", + "Horizontal_Distance_To_Roadways", + "Hillshade_9am", + "Hillshade_Noon", + "Hillshade_3pm", + "Horizontal_Distance_To_Fire_Points"] feat_cols += ['Wilderness_Area_'+str(i) for i in range(1, 5)] feat_cols += ['Soil_Type_'+str(i) for i in range(1, 41)] target_col = ["Cover_Type"] frame, X, y = _convert_data_dataframe("fetch_covtype", X, y, feat_cols, target_col) - return Bunch(data=X, - target=y, - frame=frame, - target_names=target_col, - feature_names=feat_cols, - DESCR=fdescr) + + if as_frame: + return Bunch(data=X, + target=y, + frame=frame, + target_names=target_col, + feature_names=feat_cols, + DESCR=fdescr) + + if return_X_y: + return X, y return Bunch(data=X, target=y, DESCR=fdescr) diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index c0b83d2bb691e..ca26a4494b840 100644 --- 
a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -31,6 +31,7 @@ def test_fetch_asframe(fetch_covtype_fxt): frame = bunch.frame assert hasattr(bunch, frame) is True assert frame.shape == (581012, 55) + assert isinstance(bunch.frame, pd.DataFrame) assert isinstance(bunch.data, pd.DataFrame) assert isinstance(bunch.target, pd.Series) From 04feec5acabcf85a83e264256d021389d3b4f797 Mon Sep 17 00:00:00 2001 From: aliang Date: Mon, 15 Jun 2020 22:26:24 -0400 Subject: [PATCH 06/16] flake8 error --- sklearn/datasets/_covtype.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 00d4afbad6c76..ef4a08fbcee60 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -159,15 +159,15 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info """ feat_cols = ["Elevation", - "Aspect", - "Slope", - "Horizontal_Distance_To_Hydrology", - "Vertical_Distance_To_Hydrology", - "Horizontal_Distance_To_Roadways", - "Hillshade_9am", - "Hillshade_Noon", - "Hillshade_3pm", - "Horizontal_Distance_To_Fire_Points"] + "Aspect", + "Slope", + "Horizontal_Distance_To_Hydrology", + "Vertical_Distance_To_Hydrology", + "Horizontal_Distance_To_Roadways", + "Hillshade_9am", + "Hillshade_Noon", + "Hillshade_3pm", + "Horizontal_Distance_To_Fire_Points"] feat_cols += ['Wilderness_Area_'+str(i) for i in range(1, 5)] feat_cols += ['Soil_Type_'+str(i) for i in range(1, 41)] target_col = ["Cover_Type"] @@ -177,11 +177,11 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, if as_frame: return Bunch(data=X, - target=y, - frame=frame, - target_names=target_col, - feature_names=feat_cols, - DESCR=fdescr) + target=y, + frame=frame, + target_names=target_col, + feature_names=feat_cols, + DESCR=fdescr) if return_X_y: return X, y From 
f8ff4155ee7771bc26f81ccb96b67e537e394745 Mon Sep 17 00:00:00 2001 From: aliang Date: Fri, 19 Jun 2020 12:22:44 -0400 Subject: [PATCH 07/16] trailing white spaces --- sklearn/datasets/_covtype.py | 77 +++++++++++++------------- sklearn/datasets/descr/covtype.rst | 6 +- sklearn/datasets/tests/test_covtype.py | 10 +--- 3 files changed, 46 insertions(+), 47 deletions(-) diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 5ceedf68334c8..88ef4d6651748 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -42,6 +42,24 @@ logger = logging.getLogger(__name__) +""" +Column names reference: +https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info +""" +FEATURE_NAMES = ["Elevation", + "Aspect", + "Slope", + "Horizontal_Distance_To_Hydrology", + "Vertical_Distance_To_Hydrology", + "Horizontal_Distance_To_Roadways", + "Hillshade_9am", + "Hillshade_Noon", + "Hillshade_3pm", + "Horizontal_Distance_To_Fire_Points"] +FEATURE_NAMES += ['Wilderness_Area_'+str(i) for i in range(1, 5)] +FEATURE_NAMES += ['Soil_Type_'+str(i) for i in range(1, 41)] +TARGET_NAMES = ["Cover_Type"] + @_deprecate_positional_args def fetch_covtype(*, data_home=None, download_if_missing=True, @@ -88,7 +106,7 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, a pandas DataFrame or Series depending on the number of target columns. If `return_X_y` is True, then (`data`, `target`) will be pandas DataFrames or Series as described below. - .. versionadded:: 0.20 + .. versionadded:: 0.24 Returns ------- @@ -101,15 +119,16 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, Each value corresponds to one of the 7 forest covertypes with values ranging between 1 to 7. + frame : dataframe of shape (581012, 53) + Only present when `as_frame=True`. Contains `data` and `target`. DESCR : str Description of the forest covertype dataset. 
(data, target) : tuple if ``return_X_y`` is True - dataframe: :class: `pandas.DataFrame` - Pandas dataframe - .. versionadded:: 0.20 + + .. versionadded:: 0.24 """ data_home = get_data_home(data_home=data_home) @@ -153,37 +172,19 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, with open(join(module_path, 'descr', 'covtype.rst')) as rst_file: fdescr = rst_file.read() - if as_frame or return_X_y: - """ - Column names reference: - https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info - """ - feat_cols = ["Elevation", - "Aspect", - "Slope", - "Horizontal_Distance_To_Hydrology", - "Vertical_Distance_To_Hydrology", - "Horizontal_Distance_To_Roadways", - "Hillshade_9am", - "Hillshade_Noon", - "Hillshade_3pm", - "Horizontal_Distance_To_Fire_Points"] - feat_cols += ['Wilderness_Area_'+str(i) for i in range(1, 5)] - feat_cols += ['Soil_Type_'+str(i) for i in range(1, 41)] - target_col = ["Cover_Type"] - - frame, X, y = _convert_data_dataframe("fetch_covtype", X, y, - feat_cols, target_col) - - if as_frame: - return Bunch(data=X, - target=y, - frame=frame, - target_names=target_col, - feature_names=feat_cols, - DESCR=fdescr) - - if return_X_y: - return X, y - - return Bunch(data=X, target=y, DESCR=fdescr) + frame = None + if as_frame: + frame, X, y = _convert_data_dataframe(caller_name="fetch_covtype", + data=X, + target=y, + feature_names=FEATURE_NAMES, + target_names=TARGET_NAMES) + if return_X_y: + return X, y + + return Bunch(data=X, + target=y, + frame=frame, + target_names=TARGET_NAMES, + feature_names=FEATURE_NAMES, + DESCR=fdescr) diff --git a/sklearn/datasets/descr/covtype.rst b/sklearn/datasets/descr/covtype.rst index 4e79b5b89b9a1..0090b8e4a6b7d 100644 --- a/sklearn/datasets/descr/covtype.rst +++ b/sklearn/datasets/descr/covtype.rst @@ -22,7 +22,9 @@ while others are discrete or continuous measurements. 
================= ============ :func:`sklearn.datasets.fetch_covtype` will load the covertype dataset; -it returns a dictionary-like object +it returns a dictionary-like 'Bunch' object with the feature matrix in the ``data`` member -and the target values in ``target``. +and the target values in ``target``. If optional argument 'as_frame' is +set to 'True', it will return ``data`` and ``target`` as pandas +data frame, and there will be an additional member ``frame`` as well. The dataset will be downloaded from the web if necessary. diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index ca26a4494b840..cbac452104816 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -1,9 +1,9 @@ """Test the covtype loader, if the data is available, or if specifically requested via environment variable (e.g. for travis cron job).""" +from functools import partial import pytest from sklearn.datasets.tests.test_common import check_return_X_y -from functools import partial def test_fetch(fetch_covtype_fxt): @@ -25,15 +25,11 @@ def test_fetch(fetch_covtype_fxt): check_return_X_y(data1, fetch_func) -def test_fetch_asframe(fetch_covtype_fxt): - pd = pytest.importorskip('pandas') +def test_fetch_asframe_shape(fetch_covtype_fxt): bunch = fetch_covtype_fxt(as_frame=True) + assert hasattr(bunch, 'frame') frame = bunch.frame - assert hasattr(bunch, frame) is True assert frame.shape == (581012, 55) - assert isinstance(bunch.frame, pd.DataFrame) - assert isinstance(bunch.data, pd.DataFrame) - assert isinstance(bunch.target, pd.Series) def test_pandas_dependency_message(fetch_covtype_fxt, From 0134c73a66f0f86d5bbee42aa67a9ec0b99c9f42 Mon Sep 17 00:00:00 2001 From: aliang Date: Fri, 19 Jun 2020 16:12:16 -0400 Subject: [PATCH 08/16] [scipy-dev] --- sklearn/datasets/_covtype.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 
88ef4d6651748..2e1489ae20130 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -127,7 +127,6 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, (data, target) : tuple if ``return_X_y`` is True - .. versionadded:: 0.24 """ From 7e06f8f6760d9019ca42f51df27042ad0d811e1f Mon Sep 17 00:00:00 2001 From: aliang Date: Sat, 20 Jun 2020 12:01:06 -0400 Subject: [PATCH 09/16] versionadded error fixes --- sklearn/datasets/_covtype.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 2e1489ae20130..5d5fa77248c09 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -99,7 +99,7 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, return_X_y : bool, default=False If True, returns ``(data.data, data.target)`` instead of a Bunch object. - + .. versionadded:: 0.20 as_frame : bool, default=False If True, the data is a pandas DataFrame including columns with appropriate dtypes (numeric). The target is @@ -126,8 +126,8 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, (data, target) : tuple if ``return_X_y`` is True + .. versionadded:: 0.20 - .. versionadded:: 0.24 """ data_home = get_data_home(data_home=data_home) From 8fc854e5131e6f99a8c80c59cfa1992d57eade31 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sat, 20 Jun 2020 12:32:01 -0400 Subject: [PATCH 10/16] ENH Update style and starting idx --- sklearn/datasets/_covtype.py | 17 ++++++++++------- sklearn/datasets/tests/test_covtype.py | 8 +++++++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 5d5fa77248c09..422ba293dffaa 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -56,8 +56,8 @@ "Hillshade_Noon", "Hillshade_3pm", "Horizontal_Distance_To_Fire_Points"] -FEATURE_NAMES += ['Wilderness_Area_'+str(i) for i in range(1, 5)] -FEATURE_NAMES += ['Soil_Type_'+str(i) for i in range(1, 41)] +FEATURE_NAMES += [f"Wilderness_Area_{i}" for i in range(4)] +FEATURE_NAMES += [f"Soil_Type_{i}" for i in range(40)] TARGET_NAMES = ["Cover_Type"] @@ -99,13 +99,16 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, return_X_y : bool, default=False If True, returns ``(data.data, data.target)`` instead of a Bunch object. - .. versionadded:: 0.20 + + .. versionadded:: 0.20 + as_frame : bool, default=False If True, the data is a pandas DataFrame including columns with - appropriate dtypes (numeric). The target is - a pandas DataFrame or Series depending on the number of target columns. - If `return_X_y` is True, then (`data`, `target`) will be pandas - DataFrames or Series as described below. + appropriate dtypes (numeric). The target is a pandas DataFrame or + Series depending on the number of target columns. If `return_X_y` is + True, then (`data`, `target`) will be pandas DataFrames or Series as + described below. + .. 
versionadded:: 0.24 Returns diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index cbac452104816..ae9433e0f6981 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -25,12 +25,18 @@ def test_fetch(fetch_covtype_fxt): check_return_X_y(data1, fetch_func) -def test_fetch_asframe_shape(fetch_covtype_fxt): +def test_fetch_asframe(fetch_covtype_fxt): bunch = fetch_covtype_fxt(as_frame=True) assert hasattr(bunch, 'frame') frame = bunch.frame assert frame.shape == (581012, 55) + column_names = set(frame.columns) + + # enumerated names are added correctly + assert set(f"Wilderness_Area_{i}" for i in range(4)) < column_names + assert set(f"Soil_Type_{i}" for i in range(40)) < column_names + def test_pandas_dependency_message(fetch_covtype_fxt, hide_available_pandas): From 6729f9a6f101ea94e884ae411bda44a995a568b8 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 20 Jun 2020 12:33:56 -0400 Subject: [PATCH 11/16] BLD [scipy-dev] From 7b77c03ec4671e2d1292b5997918143e2b436d18 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 20 Jun 2020 13:27:34 -0400 Subject: [PATCH 12/16] DOC Use comment --- sklearn/datasets/_covtype.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 422ba293dffaa..653fe5d286247 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -42,10 +42,8 @@ logger = logging.getLogger(__name__) -""" -Column names reference: -https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info -""" +# Column names reference: +# https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info FEATURE_NAMES = ["Elevation", "Aspect", "Slope", From f9ee16235f74eba2f8e2abdf61f941015a1af683 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sat, 20 Jun 2020 13:54:44 -0400 Subject: [PATCH 13/16] DOC documents target and feature names --- sklearn/datasets/_covtype.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 653fe5d286247..0f3093807a83a 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -124,6 +124,10 @@ def fetch_covtype(*, data_home=None, download_if_missing=True, Only present when `as_frame=True`. Contains `data` and `target`. DESCR : str Description of the forest covertype dataset. + feature_names : list + The names of the dataset columns + target_names: list + The names of the target columns (data, target) : tuple if ``return_X_y`` is True From 5ecc8202817694ead947cdaccb001f3c6e19d3e2 Mon Sep 17 00:00:00 2001 From: aliang Date: Sat, 20 Jun 2020 22:09:09 -0400 Subject: [PATCH 14/16] added what's new --- doc/whats_new/v0.24.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 6982079525f72..31c548e7de0b5 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -53,6 +53,12 @@ Changelog unless data is sparse. :pr:`17396` by :user:`Jiaxiang `. +- |Enhancement| :func:`datasets.fetch_covtype` now now supports optional + argument `as_frame`; when it is set to true, the returned Bunch object's + `data` and `target` members are in pandas DataFrame format, and the Bunch + object will also have an additional `frame` member as a pandas DataFrame. + :pr:`17491` by :user:`Alex Liang `. + :mod:`sklearn.decomposition` ............................ From 335b4bfe73e2422a027065bc91d571723fecf77f Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sat, 20 Jun 2020 22:26:10 -0400 Subject: [PATCH 15/16] DOC Update whats enw --- doc/whats_new/v0.24.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 31c548e7de0b5..a479edd25342b 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -53,11 +53,10 @@ Changelog unless data is sparse. :pr:`17396` by :user:`Jiaxiang `. -- |Enhancement| :func:`datasets.fetch_covtype` now now supports optional - argument `as_frame`; when it is set to true, the returned Bunch object's - `data` and `target` members are in pandas DataFrame format, and the Bunch - object will also have an additional `frame` member as a pandas DataFrame. - :pr:`17491` by :user:`Alex Liang `. +- |Enhancement| :func:`datasets.fetch_covtype` now supports the optional + argument `as_frame`; when it is set to True, the returned Bunch object's + `data` and `frame` members are pandas DataFrames, and the `target` member is + a pandas Series. :pr:`17491` by :user:`Alex Liang `. :mod:`sklearn.decomposition` ............................ From 85718f32260bbb16b9e4aacb74f8b185ec315a81 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 20 Jun 2020 22:26:42 -0400 Subject: [PATCH 16/16] TST Adds one more test --- sklearn/datasets/tests/test_covtype.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py index ae9433e0f6981..df0989d66bb3a 100644 --- a/sklearn/datasets/tests/test_covtype.py +++ b/sklearn/datasets/tests/test_covtype.py @@ -30,6 +30,8 @@ def test_fetch_asframe(fetch_covtype_fxt): assert hasattr(bunch, 'frame') frame = bunch.frame assert frame.shape == (581012, 55) + assert bunch.data.shape == (581012, 54) + assert bunch.target.shape == (581012,) column_names = set(frame.columns)