New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG+1] Take over PR #7647 - Add a "filename" attribute to datasets that have a CSV file #9101

Merged
merged 23 commits into from Dec 4, 2017
Commits
Jump to file or symbol
Failed to load files and symbols.
+82 −21
Diff settings

Always

Just for now

@@ -136,11 +136,43 @@ learn::
<sphx_glr_auto_examples_classification_plot_digits_classification.py>` illustrates how starting
from the original problem one can shape the data for consumption in
scikit-learn.
.. topic:: Loading from external datasets
To load from an external dataset, please refer to :ref:`loading external datasets <external_datasets>`.
.. topic:: Loading from the data files

This comment has been minimized.

@jnothman

jnothman Sep 26, 2017

Member

I don't get why this belongs in the tutorial, unless it's framed as "You can also load your own data. For example, load_boston(...) just pulls in data using numpy.loadtxt::". This currently appears to be too much detail on the internals of scikit-learn.

This comment has been minimized.

@maskani-moh

maskani-moh Sep 26, 2017

Contributor

@jnothman, I agree with you, no need to mention the filename attribute in the tutorial. Too much detail for a tutorial.
Should I remove this section then?

All standard datasets which you can import with ``load_`` have underlying source files that
you can read manually (consider :func:`numpy.loadtxt` and `pandas <http://pandas.pydata.org/>`_
for analysis). The data and target can be stored in one file (e.g. iris, boston, breast_cancer) or
in several (e.g. diabetes, linnerud).
>>> from sklearn.datasets import load_boston
>>> boston = load_boston()
>>> print(boston.filename) # doctest: +SKIP
(some-path)/sklearn/datasets/data/boston_house_prices.csv
>>> from sklearn.datasets import load_diabetes
>>> diabetes = load_diabetes()
>>> print(diabetes.data_filename) # doctest: +SKIP
(some-path)/sklearn/datasets/data/diabetes_data.csv.gz
>>> print(diabetes.target_filename) # doctest: +SKIP
(some-path)/sklearn/datasets/data/diabetes_target.csv.gz
You can also read the data file directly with numpy. Consider the following example.
The Boston dataset contains 2 header lines, which is why we are going to skip them::
>>> import numpy as np
>>> boston_data = np.loadtxt(boston.filename, delimiter=",", skiprows=2)
>>> boston.data.shape # sklearn dataset
(506, 13)
>>> boston_data.shape # also contains target columns
(506, 14)
.. seealso::
`pandas.read_csv <http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html>`_
Learning and predicting
------------------------
@@ -401,4 +433,4 @@ is similarly possible for an instance to be assigned multiple labels::
In this case, the classifier is fit upon instances each assigned multiple labels.
The :class:`MultiLabelBinarizer <sklearn.preprocessing.MultiLabelBinarizer>` is
used to binarize the 2d array of multilabels to ``fit`` upon. As a result,
``predict()`` returns a 2d array with multiple predicted labels for each instance.
``predict()`` returns a 2d array with multiple predicted labels for each instance.
View
@@ -352,8 +352,9 @@ def load_iris(return_X_y=False):
Dictionary-like object, the interesting attributes are:
'data', the data to learn, 'target', the classification labels,
'target_names', the meaning of the labels, 'feature_names', the
meaning of the features, and 'DESCR', the
full description of the dataset.
meaning of the features, 'DESCR', the
full description of the dataset, 'filename' (added in version 0.19),

This comment has been minimized.

@amueller

amueller Sep 21, 2017

Member

I would put the 0.20 (because 0.19 has been released) below in a separate ..versionchanged: section. That's easier to keep track of if someone is looking for changes, and usually it's irrelevant.

the physical location of iris csv dataset.
(data, target) : tuple if ``return_X_y`` is True
@@ -373,6 +374,7 @@ def load_iris(return_X_y=False):
"""
module_path = dirname(__file__)
data, target, target_names = load_data(module_path, 'iris.csv')
iris_csv_filename = join(module_path, 'data', 'iris.csv')
with open(join(module_path, 'descr', 'iris.rst')) as rst_file:
fdescr = rst_file.read()
@@ -384,7 +386,8 @@ def load_iris(return_X_y=False):
target_names=target_names,
DESCR=fdescr,
feature_names=['sepal length (cm)', 'sepal width (cm)',
'petal length (cm)', 'petal width (cm)'])
'petal length (cm)', 'petal width (cm)'],
filename=iris_csv_filename)
def load_breast_cancer(return_X_y=False):
@@ -416,7 +419,8 @@ def load_breast_cancer(return_X_y=False):
'data', the data to learn, 'target', the classification labels,
'target_names', the meaning of the labels, 'feature_names', the
meaning of the features, and 'DESCR', the
full description of the dataset.
full description of the dataset, 'filename' (added in version 0.19),
the physical location of breast cancer csv dataset.
(data, target) : tuple if ``return_X_y`` is True
@@ -440,6 +444,7 @@ def load_breast_cancer(return_X_y=False):
"""
module_path = dirname(__file__)
data, target, target_names = load_data(module_path, 'breast_cancer.csv')
csv_filename = join(module_path, 'data', 'breast_cancer.csv')
with open(join(module_path, 'descr', 'breast_cancer.rst')) as rst_file:
fdescr = rst_file.read()
@@ -466,7 +471,8 @@ def load_breast_cancer(return_X_y=False):
return Bunch(data=data, target=target,
target_names=target_names,
DESCR=fdescr,
feature_names=feature_names)
feature_names=feature_names,
filename=csv_filename)
def load_digits(n_class=10, return_X_y=False):
@@ -573,18 +579,21 @@ def load_diabetes(return_X_y=False):
-------
data : Bunch
Dictionary-like object, the interesting attributes are:
'data', the data to learn and 'target', the regression target for each
sample.
'data', the data to learn, 'target', the regression target for each
sample, 'data_filename' (added in version 0.19), the physical location
of diabetes data csv dataset, and 'target_filename' (added in
version 0.19), the physical location of diabetes targets csv dataset.
(data, target) : tuple if ``return_X_y`` is True
.. versionadded:: 0.18
"""
module_path = dirname(__file__)
base_dir = join(module_path, 'data')
data = np.loadtxt(join(base_dir, 'diabetes_data.csv.gz'))
target = np.loadtxt(join(base_dir, 'diabetes_target.csv.gz'))
data_filename = join(base_dir, 'diabetes_data.csv.gz')
data = np.loadtxt(data_filename)
target_filename = join(base_dir, 'diabetes_target.csv.gz')
target = np.loadtxt(target_filename)
with open(join(module_path, 'descr', 'diabetes.rst')) as rst_file:
fdescr = rst_file.read()
@@ -594,7 +603,9 @@ def load_diabetes(return_X_y=False):
return Bunch(data=data, target=target, DESCR=fdescr,
feature_names=['age', 'sex', 'bmi', 'bp',
's1', 's2', 's3', 's4', 's5', 's6'])
's1', 's2', 's3', 's4', 's5', 's6'],
data_filename=data_filename,
target_filename=target_filename)
def load_linnerud(return_X_y=False):
@@ -622,21 +633,29 @@ def load_linnerud(return_X_y=False):
'targets', the two multivariate datasets, with 'data' corresponding to
the exercise and 'targets' corresponding to the physiological
measurements, as well as 'feature_names' and 'target_names'.
In addition, you will also have access to 'data_filename'
(added in version 0.19), the physical location of linnerud data csv
dataset, and 'target_filename' (added in version 0.19), the
physical location of linnerud targets csv dataset.
(data, target) : tuple if ``return_X_y`` is True
.. versionadded:: 0.18
"""
base_dir = join(dirname(__file__), 'data/')
data_filename = join(base_dir, 'linnerud_exercise.csv')
target_filename = join(base_dir, 'linnerud_physiological.csv')
# Read data
data_exercise = np.loadtxt(base_dir + 'linnerud_exercise.csv', skiprows=1)
data_physiological = np.loadtxt(base_dir + 'linnerud_physiological.csv',
skiprows=1)
data_exercise = np.loadtxt(data_filename, skiprows=1)
data_physiological = np.loadtxt(target_filename, skiprows=1)
# Read header
with open(base_dir + 'linnerud_exercise.csv') as f:
with open(data_filename) as f:
header_exercise = f.readline().split()
with open(base_dir + 'linnerud_physiological.csv') as f:
with open(target_filename) as f:
header_physiological = f.readline().split()
with open(dirname(__file__) + '/descr/linnerud.rst') as f:
descr = f.read()
@@ -646,7 +665,9 @@ def load_linnerud(return_X_y=False):
return Bunch(data=data_exercise, feature_names=header_exercise,
target=data_physiological,
target_names=header_physiological,
DESCR=descr)
DESCR=descr,
data_filename=data_filename,
target_filename=target_filename)
def load_boston(return_X_y=False):
@@ -672,7 +693,9 @@ def load_boston(return_X_y=False):
data : Bunch
Dictionary-like object, the interesting attributes are:
'data', the data to learn, 'target', the regression targets,
and 'DESCR', the full description of the dataset.
'DESCR', the full description of the dataset,
and 'filename' (added in version 0.19), the physical location
of boston csv dataset.
(data, target) : tuple if ``return_X_y`` is True
@@ -713,7 +736,8 @@ def load_boston(return_X_y=False):
target=target,
# last column is target value
feature_names=feature_names[:-1],
DESCR=descr_text)
DESCR=descr_text,
filename=data_file_name)
def load_sample_images():
@@ -189,6 +189,8 @@ def test_load_linnerud():
assert_equal(res.target.shape, (20, 3))
assert_equal(len(res.target_names), 3)
assert_true(res.DESCR)
assert_true(os.path.exists(res.data_filename))
assert_true(os.path.exists(res.target_filename))
# test return_X_y option
X_y_tuple = load_linnerud(return_X_y=True)
@@ -204,6 +206,7 @@ def test_load_iris():
assert_equal(res.target.size, 150)
assert_equal(res.target_names.size, 3)
assert_true(res.DESCR)
assert_true(os.path.exists(res.filename))
# test return_X_y option
X_y_tuple = load_iris(return_X_y=True)
@@ -234,6 +237,7 @@ def test_load_breast_cancer():
assert_equal(res.target.size, 569)
assert_equal(res.target_names.size, 2)
assert_true(res.DESCR)
assert_true(os.path.exists(res.filename))
# test return_X_y option
X_y_tuple = load_breast_cancer(return_X_y=True)
@@ -249,6 +253,7 @@ def test_load_boston():
assert_equal(res.target.size, 506)
assert_equal(res.feature_names.size, 13)
assert_true(res.DESCR)
assert_true(os.path.exists(res.filename))
# test return_X_y option
X_y_tuple = load_boston(return_X_y=True)
ProTip! Use n and p to navigate between commits in a pull request.