[MRG+1] Take over PR #7647 - Add a "filename" attribute to datasets that have a CSV file #9101

Merged (23 commits) on Dec 4, 2017
@@ -136,7 +136,7 @@ learn::
<sphx_glr_auto_examples_classification_plot_digits_classification.py>` illustrates how starting
from the original problem one can shape the data for consumption in
scikit-learn.
.. topic:: Loading from external datasets
To load from an external dataset, please refer to :ref:`loading external datasets <external_datasets>`.
@@ -401,4 +401,4 @@ is similarly possible for an instance to be assigned multiple labels::
In this case, the classifier is fit upon instances each assigned multiple labels.
The :class:`MultiLabelBinarizer <sklearn.preprocessing.MultiLabelBinarizer>` is
used to binarize the 2d array of multilabels to ``fit`` upon. As a result,
``predict()`` returns a 2d array with multiple predicted labels for each instance.
``predict()`` returns a 2d array with multiple predicted labels for each instance.
@@ -352,8 +352,9 @@ def load_iris(return_X_y=False):
Dictionary-like object, the interesting attributes are:
'data', the data to learn, 'target', the classification labels,
'target_names', the meaning of the labels, 'feature_names', the
meaning of the features, and 'DESCR', the
full description of the dataset.
meaning of the features, 'DESCR', the full description of
the dataset, and 'filename', the physical location of the
iris CSV dataset (added in version `0.20`).
(data, target) : tuple if ``return_X_y`` is True
@@ -373,6 +374,7 @@ def load_iris(return_X_y=False):
"""
module_path = dirname(__file__)
data, target, target_names = load_data(module_path, 'iris.csv')
iris_csv_filename = join(module_path, 'data', 'iris.csv')
with open(join(module_path, 'descr', 'iris.rst')) as rst_file:
fdescr = rst_file.read()
@@ -384,7 +386,8 @@ def load_iris(return_X_y=False):
target_names=target_names,
DESCR=fdescr,
feature_names=['sepal length (cm)', 'sepal width (cm)',
'petal length (cm)', 'petal width (cm)'])
'petal length (cm)', 'petal width (cm)'],
filename=iris_csv_filename)
def load_breast_cancer(return_X_y=False):
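Because 'filename' points at the same file that load_data() parses above, the raw CSV can be re-read directly. A rough sketch, assuming the bundled-CSV layout implied by the loaders (a leading header row followed by feature rows with the class index in the last column):

import csv

from sklearn.datasets import load_iris

iris = load_iris()
with open(iris.filename) as f:
    reader = csv.reader(f)
    header = next(reader)      # counts and class names (layout assumed, see note above)
    first_row = next(reader)   # one sample; last column assumed to be the class index
print(header)
print(first_row)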
@@ -415,8 +418,9 @@ def load_breast_cancer(return_X_y=False):
Dictionary-like object, the interesting attributes are:
'data', the data to learn, 'target', the classification labels,
'target_names', the meaning of the labels, 'feature_names', the
meaning of the features, and 'DESCR', the
full description of the dataset.
meaning of the features, 'DESCR', the full description of
the dataset, and 'filename', the physical location of the
breast cancer CSV dataset (added in version `0.20`).
(data, target) : tuple if ``return_X_y`` is True
@@ -440,6 +444,7 @@ def load_breast_cancer(return_X_y=False):
"""
module_path = dirname(__file__)
data, target, target_names = load_data(module_path, 'breast_cancer.csv')
csv_filename = join(module_path, 'data', 'breast_cancer.csv')
with open(join(module_path, 'descr', 'breast_cancer.rst')) as rst_file:
fdescr = rst_file.read()
@@ -466,7 +471,8 @@ def load_breast_cancer(return_X_y=False):
return Bunch(data=data, target=target,
target_names=target_names,
DESCR=fdescr,
feature_names=feature_names)
feature_names=feature_names,
filename=csv_filename)
def load_digits(n_class=10, return_X_y=False):
@@ -573,18 +579,21 @@ def load_diabetes(return_X_y=False):
-------
data : Bunch
Dictionary-like object, the interesting attributes are:
'data', the data to learn and 'target', the regression target for each
sample.
'data', the data to learn, 'target', the regression target for each
sample, 'data_filename', the physical location of the diabetes
data CSV dataset, and 'target_filename', the physical location of
the diabetes targets CSV dataset (added in version `0.20`).
(data, target) : tuple if ``return_X_y`` is True
.. versionadded:: 0.18
"""
module_path = dirname(__file__)
base_dir = join(module_path, 'data')
data = np.loadtxt(join(base_dir, 'diabetes_data.csv.gz'))
target = np.loadtxt(join(base_dir, 'diabetes_target.csv.gz'))
data_filename = join(base_dir, 'diabetes_data.csv.gz')
data = np.loadtxt(data_filename)
target_filename = join(base_dir, 'diabetes_target.csv.gz')
target = np.loadtxt(target_filename)
with open(join(module_path, 'descr', 'diabetes.rst')) as rst_file:
fdescr = rst_file.read()
@@ -594,7 +603,9 @@ def load_diabetes(return_X_y=False):
return Bunch(data=data, target=target, DESCR=fdescr,
feature_names=['age', 'sex', 'bmi', 'bp',
's1', 's2', 's3', 's4', 's5', 's6'])
's1', 's2', 's3', 's4', 's5', 's6'],
data_filename=data_filename,
target_filename=target_filename)
def load_linnerud(return_X_y=False):
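For diabetes, the two exposed paths are the gzipped CSVs that np.loadtxt reads in the hunk above, so the raw arrays can be reloaded the same way, for example to check that they match the Bunch contents. A sketch mirroring the loader code (not part of this PR):

import numpy as np

from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
raw_data = np.loadtxt(diabetes.data_filename)        # np.loadtxt reads .gz transparently
raw_target = np.loadtxt(diabetes.target_filename)
assert np.allclose(raw_data, diabetes.data)
assert np.allclose(raw_target, diabetes.target)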
@@ -622,21 +633,29 @@ def load_linnerud(return_X_y=False):
'targets', the two multivariate datasets, with 'data' corresponding to
the exercise and 'targets' corresponding to the physiological
measurements, as well as 'feature_names' and 'target_names'.
In addition, you will have access to 'data_filename', the
physical location of the linnerud data CSV dataset, and
'target_filename', the physical location of the linnerud
targets CSV dataset (added in version `0.20`).
(data, target) : tuple if ``return_X_y`` is True
.. versionadded:: 0.18
"""
base_dir = join(dirname(__file__), 'data/')
data_filename = join(base_dir, 'linnerud_exercise.csv')
target_filename = join(base_dir, 'linnerud_physiological.csv')
# Read data
data_exercise = np.loadtxt(base_dir + 'linnerud_exercise.csv', skiprows=1)
data_physiological = np.loadtxt(base_dir + 'linnerud_physiological.csv',
skiprows=1)
data_exercise = np.loadtxt(data_filename, skiprows=1)
data_physiological = np.loadtxt(target_filename, skiprows=1)
# Read header
with open(base_dir + 'linnerud_exercise.csv') as f:
with open(data_filename) as f:
header_exercise = f.readline().split()
with open(base_dir + 'linnerud_physiological.csv') as f:
with open(target_filename) as f:
header_physiological = f.readline().split()
with open(dirname(__file__) + '/descr/linnerud.rst') as f:
descr = f.read()
@@ -646,7 +665,9 @@ def load_linnerud(return_X_y=False):
return Bunch(data=data_exercise, feature_names=header_exercise,
target=data_physiological,
target_names=header_physiological,
DESCR=descr)
DESCR=descr,
data_filename=data_filename,
target_filename=target_filename)
def load_boston(return_X_y=False):
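The linnerud paths point at plain CSVs whose first line is a header, which the loader skips with skiprows=1, so they can be re-read the same way. A small sketch mirroring the code above:

import numpy as np

from sklearn.datasets import load_linnerud

linnerud = load_linnerud()
with open(linnerud.data_filename) as f:
    header = f.readline().split()            # exercise feature names, as read by the loader
exercise = np.loadtxt(linnerud.data_filename, skiprows=1)
assert list(header) == list(linnerud.feature_names)
assert exercise.shape == linnerud.data.shape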
@@ -672,7 +693,9 @@ def load_boston(return_X_y=False):
data : Bunch
Dictionary-like object, the interesting attributes are:
'data', the data to learn, 'target', the regression targets,
and 'DESCR', the full description of the dataset.
'DESCR', the full description of the dataset,
and 'filename', the physical location of the boston
CSV dataset (added in version `0.20`).
(data, target) : tuple if ``return_X_y`` is True
@@ -713,7 +736,8 @@ def load_boston(return_X_y=False):
target=target,
# last column is target value
feature_names=feature_names[:-1],
DESCR=descr_text)
DESCR=descr_text,
filename=data_file_name)
def load_sample_images():
@@ -197,6 +197,8 @@ def test_load_linnerud():
assert_equal(res.target.shape, (20, 3))
assert_equal(len(res.target_names), 3)
assert_true(res.DESCR)
assert_true(os.path.exists(res.data_filename))
assert_true(os.path.exists(res.target_filename))
# test return_X_y option
X_y_tuple = load_linnerud(return_X_y=True)
@@ -212,6 +214,7 @@ def test_load_iris():
assert_equal(res.target.size, 150)
assert_equal(res.target_names.size, 3)
assert_true(res.DESCR)
assert_true(os.path.exists(res.filename))
# test return_X_y option
X_y_tuple = load_iris(return_X_y=True)
@@ -242,6 +245,7 @@ def test_load_breast_cancer():
assert_equal(res.target.size, 569)
assert_equal(res.target_names.size, 2)
assert_true(res.DESCR)
assert_true(os.path.exists(res.filename))
# test return_X_y option
X_y_tuple = load_breast_cancer(return_X_y=True)
@@ -257,6 +261,7 @@ def test_load_boston():
assert_equal(res.target.size, 506)
assert_equal(res.feature_names.size, 13)
assert_true(res.DESCR)
assert_true(os.path.exists(res.filename))
# test return_X_y option
X_y_tuple = load_boston(return_X_y=True)
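The per-dataset checks above could also be collected into a single parametrized test; a possible follow-up sketch (not part of this diff) using pytest instead of the nose-style asserts:

import os

import pytest

from sklearn.datasets import (load_boston, load_breast_cancer, load_diabetes,
                              load_iris, load_linnerud)


@pytest.mark.parametrize('loader, path_attrs', [
    (load_iris, ['filename']),
    (load_breast_cancer, ['filename']),
    (load_boston, ['filename']),
    (load_diabetes, ['data_filename', 'target_filename']),
    (load_linnerud, ['data_filename', 'target_filename']),
])
def test_csv_filenames_exist(loader, path_attrs):
    res = loader()
    for attr in path_attrs:
        assert os.path.exists(getattr(res, attr))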