import bz2
import hashlib
import os
import pickle
import shutil
from collections import defaultdict

import arff
import requests
from scipy import sparse


def get_data_home(data_home=None, subdirectory=""):
    """Return the path of the scikit-multilearn data directory.

    This folder is used by some large dataset loaders to avoid
    downloading the data several times.

    By default the :code:`data_home` is set to a folder named
    :code:`'scikit_ml_learn_data'` in the user home folder.
    Alternatively, it can be set by the :code:`'SCIKIT_ML_LEARN_DATA'`
    environment variable or programmatically by giving an explicit
    folder path. The :code:`'~'` symbol is expanded to the user home
    folder.

    If the folder does not already exist, it is automatically created.

    Parameters
    ----------
    data_home : str (default is None)
        the path to the directory in which scikit-multilearn data sets
        should be stored; if None, the path is generated as stated above
    subdirectory : str, default ''
        subdirectory placed under the default data home folder; only used
        when neither :code:`data_home` nor the environment variable
        provides a path

    Returns
    -------
    str
        the path to the data home
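
    Examples
    --------
    A minimal sketch; the exact path depends on the platform and the
    environment, so the output shown here is only illustrative.

    >>> get_data_home()  # doctest: +SKIP
    '/home/user/scikit_ml_learn_data'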
"""
if data_home is None:
if len(subdirectory) > 0:
data_home = os.environ.get(
"SCIKIT_ML_LEARN_DATA",
os.path.join("~", "scikit_ml_learn_data", subdirectory),
)
else:
data_home = os.environ.get(
"SCIKIT_ML_LEARN_DATA", os.path.join("~", "scikit_ml_learn_data")
)
data_home = os.path.expanduser(data_home)
if not os.path.exists(data_home):
os.makedirs(data_home)
return data_home


def clear_data_home(data_home=None):
    """Delete all the content of the data home cache.

    Parameters
    ----------
    data_home : str (default is None)
        the path to the directory in which scikit-multilearn data sets
        should be stored.
    """
    data_home = get_data_home(data_home)
    shutil.rmtree(data_home)


def _get_download_base_url():
    """Returns the base URL for data sets."""
    return "http://scikit.ml/datasets/"


def available_data_sets():
    """Lists available data sets and their variants.

    Returns
    -------
    dict[(set_name, variant_name)] -> [md5, file_name]
        available data sets and their variants, keyed by
        :code:`(set_name, variant_name)` tuples; each value holds the
        md5 checksum and the file name on the server
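
    Examples
    --------
    A minimal sketch; this call requires network access, and the set and
    variant names below are only an illustration of the returned keys.

    >>> variants = available_data_sets()  # doctest: +SKIP
    >>> ('emotions', 'undivided') in variants  # doctest: +SKIP
    True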
"""
r = requests.get(_get_download_base_url() + "data.list")
if r.status_code != 200:
r.raise_for_status()
else:
raw_data_list = r.text
variant_information = defaultdict(list)
for row in raw_data_list.split("\n"):
md5, file_name = row.split(";")
set_name, variant = file_name.split(".")[0].split("-")
if (set_name, variant) in variant_information:
raise Exception(
"Data file broken, files doubled, please file bug report."
)
variant_information[(set_name, variant)] = [md5, file_name]
return variant_information


def download_dataset(set_name, variant, data_home=None):
    """Downloads a data set.

    Parameters
    ----------
    set_name : str
        name of set from :func:`available_data_sets`
    variant : str
        variant of the data set from :func:`available_data_sets`
    data_home : str (default is None)
        custom base folder for data; if None, the default is used

    Returns
    -------
    str
        path to the downloaded data set file on disk
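
    Examples
    --------
    A minimal sketch; this call requires network access, and the set and
    variant names are only an example from the server listing.

    >>> path = download_dataset('emotions', 'undivided')  # doctest: +SKIP
    >>> os.path.exists(path)  # doctest: +SKIP
    True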
"""
data_sets = available_data_sets()
if (set_name, variant) not in data_sets:
raise ValueError(
"The set {} in variant {} does not exist on server.".format(
set_name, variant
)
)
md5, name = data_sets[set_name, variant]
if data_home is None:
target_name = os.path.join(get_data_home(), name)
else:
target_name = os.path.join(data_home, name)
if os.path.exists(target_name):
if md5 == _get_md5(target_name):
print("{}:{} - exists, not redownloading".format(set_name, variant))
return target_name
else:
print(
"{}:{} - exists, but MD5 sum mismatch - redownloading".format(
set_name, variant
)
)
else:
print("{}:{} - does not exists downloading".format(set_name, variant))
# not found or broken md5
_download_single_file(name, target_name)
found_md5 = _get_md5(target_name)
if md5 != found_md5:
raise Exception(
"{}: MD5 mismatch {} vs {} - possible download error".format(
name, md5, found_md5
)
)
print("Downloaded {}-{}".format(set_name, variant))
return target_name


def load_dataset(set_name, variant, data_home=None):
    """Loads a selected variant of the given data set.

    Parameters
    ----------
    set_name : str
        name of set from :func:`available_data_sets`
    variant : str
        variant of the data set
    data_home : str (default is None)
        custom base folder for data; if None, the default is used

    Returns
    -------
    tuple
        the loaded multilabel data set variant in the scikit-multilearn
        format :code:`(X, y, feature_names, label_names)`,
        see :func:`load_dataset_dump`
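
    Examples
    --------
    A minimal sketch; the first call downloads the data, so network
    access is required, and the shapes below assume the 'emotions'
    benchmark set.

    >>> X, y, feature_names, label_names = load_dataset(
    ...     'emotions', 'undivided')  # doctest: +SKIP
    >>> X.shape, y.shape  # doctest: +SKIP
    ((593, 72), (593, 6))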
"""
path = download_dataset(set_name, variant, data_home)
if path is not None:
return load_dataset_dump(path)
return None


def load_from_arff(
    filename,
    label_count,
    label_location="end",
    input_feature_type="float",
    encode_nominal=True,
    load_sparse=False,
    return_attribute_definitions=False,
):
    """Method for loading ARFF files as numpy array.

    Parameters
    ----------
    filename : str
        path to ARFF file
    label_count : int
        number of labels in the ARFF file
    label_location : str {"start", "end"} (default is "end")
        whether the ARFF file contains labels at the beginning of the
        attributes list ("start", MEKA format)
        or at the end ("end", MULAN format)
    input_feature_type : numpy.type as string (default is "float")
        the desired type of the contents of the returned 'X' array-like;
        should be a numpy type,
        see http://docs.scipy.org/doc/numpy/user/basics.types.html
    encode_nominal : bool (default is True)
        whether to convert categorical data into numeric factors - required
        for some scikit classifiers that can't handle non-numeric
        input features
    load_sparse : bool (default is False)
        whether to read the ARFF file in the sparse format; liac-arff
        breaks if sparse reading is enabled for non-sparse ARFFs
    return_attribute_definitions : bool (default is False)
        whether to also return the definitions of the feature and label
        attributes in the data set

    Returns
    -------
    X : :class:`scipy.sparse.lil_matrix` of `input_feature_type`, shape=(n_samples, n_features)
        input feature matrix
    y : :class:`scipy.sparse.lil_matrix` of `{0, 1}`, shape=(n_samples, n_labels)
        binary indicator matrix with label assignments
    names of attributes : List[str]
        list of attribute names from ARFF file, returned only if
        :code:`return_attribute_definitions` is True
    names of labels : List[str]
        list of label attribute names from ARFF file, returned only if
        :code:`return_attribute_definitions` is True
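
    Examples
    --------
    A minimal sketch, assuming a MULAN-formatted file named
    'emotions.arff' with 6 label columns at the end; the file name and
    label count are only an example.

    >>> X, y = load_from_arff('emotions.arff', label_count=6)  # doctest: +SKIP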
"""
if not load_sparse:
arff_frame = arff.load(
open(filename, "r"), encode_nominal=encode_nominal, return_type=arff.DENSE
)
matrix = sparse.csr_matrix(arff_frame["data"], dtype=input_feature_type)
else:
arff_frame = arff.load(
open(filename, "r"), encode_nominal=encode_nominal, return_type=arff.COO
)
data = arff_frame["data"][0]
row = arff_frame["data"][1]
col = arff_frame["data"][2]
matrix = sparse.coo_matrix(
(data, (row, col)), shape=(max(row) + 1, max(col) + 1)
)
if label_location == "start":
X, y = (
matrix.tocsc()[:, label_count:].tolil(),
matrix.tocsc()[:, :label_count].astype(int).tolil(),
)
feature_names = arff_frame["attributes"][label_count:]
label_names = arff_frame["attributes"][:label_count]
elif label_location == "end":
X, y = (
matrix.tocsc()[:, :-label_count].tolil(),
matrix.tocsc()[:, -label_count:].astype(int).tolil(),
)
feature_names = arff_frame["attributes"][:-label_count]
label_names = arff_frame["attributes"][-label_count:]
else:
# unknown endian
return None
if return_attribute_definitions:
return X, y, feature_names, label_names
else:
return X, y


def save_to_arff(X, y, label_location="end", save_sparse=True, filename=None):
    """Method for dumping data to ARFF files.

    Parameters
    ----------
    X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
        input feature matrix
    y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
        binary indicator matrix with label assignments
    label_location : str {"start", "end"} (default is "end")
        whether the ARFF file will contain labels at the beginning of the
        attributes list ("start", MEKA format)
        or at the end ("end", MULAN format)
    save_sparse : bool (default is True)
        whether to save in ARFF's sparse dictionary-like format instead of
        listing all zeroes within the file; very useful in multi-label
        classification
    filename : str or None
        path to ARFF file; if None, the ARFF representation is returned as
        a string

    Returns
    -------
    str or None
        the ARFF dump string, if filename is None
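
    Examples
    --------
    A minimal sketch, assuming X and y are sparse matrices previously
    obtained from :func:`load_from_arff`; the file name is only an
    example.

    >>> arff_text = save_to_arff(X, y)  # doctest: +SKIP
    >>> save_to_arff(X, y, filename='dump.arff')  # doctest: +SKIP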
"""
X = X.todok()
y = y.todok()
x_prefix = 0
y_prefix = 0
x_attributes = [("X{}".format(i), "NUMERIC") for i in range(X.shape[1])]
y_attributes = [("y{}".format(i), [str(0), str(1)]) for i in range(y.shape[1])]
if label_location == "end":
y_prefix = X.shape[1]
relation_sign = -1
attributes = x_attributes + y_attributes
elif label_location == "start":
x_prefix = y.shape[1]
relation_sign = 1
attributes = y_attributes + x_attributes
else:
raise ValueError("Label location not in {start, end}")
if save_sparse:
data = [{} for r in range(X.shape[0])]
else:
data = [[0 for c in range(X.shape[1] + y.shape[1])] for r in range(X.shape[0])]
for keys, value in list(X.items()):
data[keys[0]][x_prefix + keys[1]] = value
for keys, value in list(y.items()):
data[keys[0]][y_prefix + keys[1]] = value
dataset = {
"description": "traindata",
"relation": "traindata: -C {}".format(y.shape[1] * relation_sign),
"attributes": attributes,
"data": data,
}
arff_data = arff.dumps(dataset)
if filename is None:
return arff_data
with open(filename, "w") as fp:
fp.write(arff_data)


def save_dataset_dump(input_space, labels, feature_names, label_names, filename=None):
    """Saves a compressed data set dump.

    Parameters
    ----------
    input_space : array-like of array-likes
        array-like of input feature vectors
    labels : array-like of binary label vectors
        array-like of labels assigned to each input vector, as a binary
        indicator vector (i.e. if the 5th position has value 1,
        then the input vector has label no. 5)
    feature_names : array-like, optional
        names of features
    label_names : array-like, optional
        names of labels
    filename : str, optional
        path to the dump file; if it does not end with .bz2, the .bz2
        extension will be appended; if None, the data dictionary is
        returned instead of being written to disk
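
    Examples
    --------
    A minimal sketch, assuming X, y and the name lists come from
    :func:`load_from_arff`; the dump file name is only an example.

    >>> save_dataset_dump(X, y, feature_names, label_names,
    ...                   filename='my-dataset')  # doctest: +SKIP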
"""
data = {
"X": input_space,
"y": labels,
"features": feature_names,
"labels": label_names,
}
if filename is not None:
if filename[-4:] != ".bz2":
filename += ".bz2"
with bz2.BZ2File(filename, "wb") as file_handle:
pickle.dump(data, file_handle)
else:
return data


def load_dataset_dump(filename):
    """Loads a compressed data set dump.

    Parameters
    ----------
    filename : str
        path to the dump file; if it does not end with .bz2, the .bz2
        extension will be appended

    Returns
    -------
    X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
        input feature matrix
    y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
        binary indicator matrix with label assignments
    names of attributes : List[str]
        list of attribute names for `X` columns
    names of labels : List[str]
        list of label names for `y` columns
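
    Examples
    --------
    A minimal sketch; the file name is only an example, and the .bz2
    extension may be omitted.

    >>> X, y, feature_names, label_names = load_dataset_dump(
    ...     'my-dataset.bz2')  # doctest: +SKIP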
"""
if not os.path.exists(filename):
raise IOError(
"File {} does not exist, use load_dataset to download file".format(filename)
)
if filename[-4:] != ".bz2":
filename += ".bz2"
with bz2.BZ2File(filename, "r") as file_handle:
data = pickle.load(file_handle)
return data["X"], data["y"], data["features"], data["labels"]


def _download_single_file(data_file_name, target_file_name, base_url=None):
    """Streams a single data file from the server to the target path."""
    base_url = base_url or _get_download_base_url()
    r = requests.get(base_url + data_file_name, stream=True)
    if r.status_code == 200:
        with open(target_file_name, "wb") as f:
            # let requests decode gzip/deflate transfer-encoding before
            # the raw stream is copied to disk
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
    else:
        r.raise_for_status()


def _get_md5(file_name):
    """Computes the md5 checksum of a file, reading it in 4 KB chunks."""
    hash_md5 = hashlib.md5()
    with open(file_name, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()