/
dataset.py
728 lines (556 loc) · 24.5 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 1 20:19:59 2017
@author: pablotempone
"""
"""
the :mod:`dataset` module defines some tools for managing datasets.
Users may use both *built-in* and user-defined datasets (see the
:ref:`getting_started` page for examples). Right now, three built-in datasets
are available:
* The `movielens-100k <http://grouplens.org/datasets/movielens/>`_ dataset.
* The `movielens-1m <http://grouplens.org/datasets/movielens/>`_ dataset.
* The `Jester <http://eigentaste.berkeley.edu/dataset/>`_ dataset 2.
Built-in datasets can all be loaded (or downloaded if you haven't already)
using the :meth:`Dataset.load_builtin` method. For each built-in dataset,
Surprise also provide predefined :class:`readers <Reader>` which are useful if
you want to use a custom dataset that has the same format as a built-in one.
Summary:
.. autosummary::
:nosignatures:
Dataset.load_builtin
Dataset.load_from_file
Dataset.load_from_folds
Dataset.folds
DatasetAutoFolds.split
Reader
Trainset
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from collections import defaultdict
from collections import namedtuple
import sys
import os
import zipfile
import itertools
import random
import numpy as np
from six.moves import input
from six.moves.urllib.request import urlretrieve
from six.moves import range
from six import iteritems
# Directory where builtin datasets are stored. For now it's in the home
# directory under .surprise_data/. May be ask user to define it?
DATASETS_DIR = os.path.expanduser('~') + '/.surprise_data/'

# A builtin dataset has:
# - an url (where to download it)
# - a path (where it is located on the filesystem once extracted)
# - the parameters of the corresponding reader (field order, scale, separator)
BuiltinDataset = namedtuple('BuiltinDataset', ['url', 'path', 'reader_params'])

# Registry of the supported built-in datasets, keyed by the name accepted by
# Dataset.load_builtin().
BUILTIN_DATASETS = {
    'ml-100k':
        BuiltinDataset(
            url='http://files.grouplens.org/datasets/movielens/ml-100k.zip',
            path=DATASETS_DIR + 'ml-100k/ml-100k/u.data',
            reader_params=dict(line_format='user item rating timestamp',
                               rating_scale=(1, 5),
                               sep='\t')
        ),
    'ml-1m':
        BuiltinDataset(
            url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
            path=DATASETS_DIR + 'ml-1m/ml-1m/ratings.dat',
            reader_params=dict(line_format='user item rating timestamp',
                               rating_scale=(1, 5),
                               sep='::')
        ),
    'jester':
        BuiltinDataset(
            url='http://eigentaste.berkeley.edu/dataset/jester_dataset_2.zip',
            path=DATASETS_DIR + 'jester/jester_ratings.dat',
            # No separator: jester lines are whitespace-separated.
            reader_params=dict(line_format='user item rating',
                               rating_scale=(-10, 10))
        )
}
class Dataset:
    """Base class for loading datasets.

    Note that you should never instantiate the :class:`Dataset` class directly
    (same goes for its derived classes), but instead use one of the three
    available methods for loading datasets."""

    def __init__(self, reader):
        # The Reader in charge of parsing rating lines.
        self.reader = reader

    @classmethod
    def load_builtin(cls, name='ml-100k'):
        """Load a built-in dataset.

        If the dataset has not already been loaded, it will be downloaded and
        saved. You will have to split your dataset using the :meth:`split
        <DatasetAutoFolds.split>` method. See an example in the :ref:`User
        Guide <load_builtin_example>`.

        Args:
            name(:obj:`string`): The name of the built-in dataset to load.
                Accepted values are 'ml-100k', 'ml-1m', and 'jester'.
                Default is 'ml-100k'.

        Returns:
            A :obj:`Dataset` object.

        Raises:
            ValueError: If the ``name`` parameter is incorrect.
        """

        if name not in BUILTIN_DATASETS:
            raise ValueError('unknown dataset ' + name +
                             '. Accepted values are ' +
                             ', '.join(BUILTIN_DATASETS.keys()) + '.')
        dataset = BUILTIN_DATASETS[name]

        # When the dataset is not on disk yet, ask the user whether it should
        # be downloaded; bail out of the process entirely on a negative answer.
        if not os.path.isfile(dataset.path):
            answered = False
            while not answered:
                print('Dataset ' + name + ' could not be found. Do you want '
                      'to download it? [Y/n] ', end='')
                choice = input().lower()
                if choice in ['yes', 'y', '', 'omg this is so nice of you!!']:
                    answered = True
                elif choice in ['no', 'n', 'hell no why would i want that?!']:
                    answered = True
                    print("Ok then, I'm out!")
                    sys.exit()
            cls._download(name, dataset)

        reader = Reader(**dataset.reader_params)
        return cls.load_from_file(file_path=dataset.path, reader=reader)

    @classmethod
    def _download(cls, name, dataset):
        # Fetch the zip archive of a built-in dataset, extract it under
        # DATASETS_DIR/<name>, and remove the temporary archive.
        if not os.path.exists(DATASETS_DIR):
            os.makedirs(DATASETS_DIR)
        print('Trying to download dataset from ' + dataset.url + '...')
        urlretrieve(dataset.url, DATASETS_DIR + 'tmp.zip')
        with zipfile.ZipFile(DATASETS_DIR + 'tmp.zip', 'r') as tmp_zip:
            tmp_zip.extractall(DATASETS_DIR + name)
        os.remove(DATASETS_DIR + 'tmp.zip')
        print('Done! Dataset', name, 'has been saved to', DATASETS_DIR +
              name)

    @classmethod
    def load_from_file(cls, file_path, reader):
        """Load a dataset from a (custom) file.

        Use this if you want to use a custom dataset and all of the ratings
        are stored in one file. You will have to split your dataset using the
        :meth:`split <DatasetAutoFolds.split>` method. See an example in the
        :ref:`User Guide <load_from_file_example>`.

        Args:
            file_path(:obj:`string`): The path to the file containing ratings.
            reader(:obj:`Reader`): A reader to read the file.
        """

        return DatasetAutoFolds(ratings_file=file_path, reader=reader)

    @classmethod
    def load_from_folds(cls, folds_files, reader):
        """Load a dataset where folds (for cross-validation) are predefined by
        some files.

        The purpose of this method is to cover a common use case where a
        dataset is already split into predefined folds, such as the
        movielens-100k dataset which defines files u1.base, u1.test, u2.base,
        u2.test, etc... It can also be used when you don't want to perform
        cross-validation but still want to specify your training and testing
        data (which comes down to 1-fold cross-validation anyway). See an
        example in the :ref:`User Guide <load_from_folds_example>`.

        Args:
            folds_files(:obj:`iterable` of :obj:`tuples`): The list of the
                folds. A fold is a tuple of the form ``(path_to_train_file,
                path_to_test_file)``.
            reader(:obj:`Reader`): A reader to read the files.
        """

        return DatasetUserFolds(folds_files=folds_files, reader=reader)

    @classmethod
    def load_from_df(cls, df, reader):
        """Load a dataset from a pandas dataframe.

        Use this if you want to use a custom dataset that is stored in a
        pandas dataframe. See the :ref:`User Guide<load_from_df_example>` for
        an example.

        Args:
            df(`Dataframe`): The dataframe containing the ratings. It must
                have three columns, corresponding to the user (raw) ids, the
                item (raw) ids, and the ratings, in this order.
            reader(:obj:`Reader`): A reader to read the file. Only the
                ``rating_scale`` field needs to be specified.
        """

        return DatasetAutoFolds(reader=reader, df=df)

    def read_ratings(self, file_name):
        """Return a list of ratings (user, item, rating, timestamp) read from
        file_name"""

        with open(os.path.expanduser(file_name)) as f:
            # Skip the reader's header lines, then parse each remaining line.
            lines = itertools.islice(f, self.reader.skip_lines, None)
            return [self.reader.parse_line(line) for line in lines]

    def folds(self):
        """Generator function to iterate over the folds of the Dataset.

        See :ref:`User Guide <iterate_over_folds>` for usage.

        Yields:
            tuple: :class:`Trainset` and testset of current fold.
        """

        for raw_train, raw_test in self.raw_folds():
            yield (self.construct_trainset(raw_train),
                   self.construct_testset(raw_test))

    def construct_trainset(self, raw_trainset):
        # Assign 0-based inner ids in order of first appearance: setdefault
        # with the current dict size is the running counter.
        raw2inner_id_users = {}
        raw2inner_id_items = {}
        ur = defaultdict(list)
        ir = defaultdict(list)

        # user raw id, item raw id, translated rating, timestamp (unused)
        for urid, irid, rating, _ in raw_trainset:
            uid = raw2inner_id_users.setdefault(urid,
                                                len(raw2inner_id_users))
            iid = raw2inner_id_items.setdefault(irid,
                                                len(raw2inner_id_items))
            ur[uid].append((iid, rating))
            ir[iid].append((uid, rating))

        return Trainset(ur,
                        ir,
                        len(ur),  # number of users
                        len(ir),  # number of items
                        len(raw_trainset),  # number of ratings
                        self.reader.rating_scale,
                        self.reader.offset,
                        raw2inner_id_users,
                        raw2inner_id_items)

    def construct_testset(self, raw_testset):
        # Drop the timestamp field, keep (raw user id, raw item id, rating).
        return [(ruid, riid, rating)
                for (ruid, riid, rating, _) in raw_testset]
class DatasetUserFolds(Dataset):
    """A derived class from :class:`Dataset` for which folds (for
    cross-validation) are predefined."""

    def __init__(self, folds_files=None, reader=None):

        Dataset.__init__(self, reader)

        self.folds_files = folds_files

        # Fail early: every file of every fold must exist on disk.
        for fold in self.folds_files:
            for f in fold:
                if not os.path.isfile(os.path.expanduser(f)):
                    raise ValueError('File ' + str(f) + ' does not exist.')

    def raw_folds(self):
        # Read each predefined (train, test) pair of files lazily.
        for train_file, test_file in self.folds_files:
            yield (self.read_ratings(train_file),
                   self.read_ratings(test_file))
class DatasetAutoFolds(Dataset):
    """A derived class from :class:`Dataset` for which folds (for
    cross-validation) are not predefined. (Or for when there are no folds at
    all)."""

    def __init__(self, ratings_file=None, reader=None, df=None):

        Dataset.__init__(self, reader)
        # Defaults used when folds() is called without a prior split().
        self.n_folds = 5
        self.shuffle = True

        if ratings_file is not None:
            self.ratings_file = ratings_file
            self.raw_ratings = self.read_ratings(self.ratings_file)
        elif df is not None:
            self.df = df
            # The dataframe carries no timestamp column; store None instead.
            self.raw_ratings = [(uid, iid, r, None)
                                for (uid, iid, r) in
                                self.df.itertuples(index=False)]
        else:
            raise ValueError('Must specify ratings file or dataframe.')

    def build_full_trainset(self):
        """Do not split the dataset into folds and just return a trainset as
        is, built from the whole dataset.

        User can then query for predictions, as shown in the :ref:`User Guide
        <train_on_whole_trainset>`.

        Returns:
            The :class:`Trainset`.
        """

        return self.construct_trainset(self.raw_ratings)

    def raw_folds(self):

        if self.shuffle:
            random.shuffle(self.raw_ratings)
            self.shuffle = False  # shuffle at most once across calls

        def k_folds(seq, n_folds):
            """Inspired from scikit learn KFold method."""

            if not 2 <= n_folds <= len(seq):
                raise ValueError('Incorrect value for n_folds.')

            # The first (len % n_folds) folds get one extra element.
            size, extra = divmod(len(seq), n_folds)
            stop = 0
            for fold_i in range(n_folds):
                start = stop
                stop += size + (1 if fold_i < extra else 0)
                yield seq[:start] + seq[stop:], seq[start:stop]

        return k_folds(self.raw_ratings, self.n_folds)

    def split(self, n_folds=5, shuffle=True):
        """Split the dataset into folds for future cross-validation.

        If you forget to call :meth:`split`, the dataset will be automatically
        shuffled and split for 5-folds cross-validation.

        You can obtain repeatable splits over your all your experiments by
        seeding the RNG: ::

            import random
            random.seed(my_seed)  # call this before you call split!

        Args:
            n_folds(:obj:`int`): The number of folds.
            shuffle(:obj:`bool`): Whether to shuffle ratings before splitting.
                If ``False``, folds will always be the same each time the
                experiment is run. Default is ``True``.
        """

        self.n_folds = n_folds
        self.shuffle = shuffle
class Reader():
    """The Reader class is used to parse a file containing ratings.

    Such a file is assumed to specify only one rating per line, and each line
    needs to respect the following structure: ::

        user ; item ; rating ; [timestamp]

    where the order of the fields and the separator (here ';') may be
    arbitrarily defined (see below). Brackets indicate that the timestamp
    field is optional.

    Args:
        name(:obj:`string`, optional): If specified, a Reader for one of the
            built-in datasets is returned and any other parameter is ignored.
            Accepted values are 'ml-100k', 'ml-1m', and 'jester'. Default
            is ``None``.
        line_format(:obj:`string`): The fields names, in the order at which
            they are encountered on a line. Default is ``'user item rating'``.
        sep(char): the separator between fields. Example : ``';'``.
        rating_scale(:obj:`tuple`, optional): The rating scale used for every
            rating. Default is ``(1, 5)``.
        skip_lines(:obj:`int`, optional): Number of lines to skip at the
            beginning of the file. Default is ``0``.
    """

    def __init__(self, name=None, line_format='user item rating', sep=None,
                 rating_scale=(1, 5), skip_lines=0):

        if name:
            # Delegate to the predefined parameters of a built-in dataset.
            try:
                params = BUILTIN_DATASETS[name].reader_params
            except KeyError:
                raise ValueError('unknown reader ' + name +
                                 '. Accepted values are ' +
                                 ', '.join(BUILTIN_DATASETS.keys()) + '.')
            self.__init__(**params)
        else:
            self.sep = sep
            self.skip_lines = skip_lines
            self.rating_scale = rating_scale

            lower_bound, higher_bound = rating_scale
            # Offset chosen so that translated ratings are strictly positive.
            self.offset = -lower_bound + 1 if lower_bound <= 0 else 0

            fields = line_format.split()
            expected = ['user', 'item', 'rating']
            self.with_timestamp = 'timestamp' in fields
            if self.with_timestamp:
                expected.append('timestamp')

            # Every field of line_format must be one of the expected names.
            if any(field not in expected for field in fields):
                raise ValueError('line_format parameter is incorrect.')

            # Position of user, item, rating (and timestamp) within a line.
            self.indexes = [fields.index(entity) for entity in expected]

    def parse_line(self, line):
        '''Parse a line.

        Ratings are translated so that they are all strictly positive.

        Args:
            line(str): The line to parse

        Returns:
            tuple: User id, item id, rating and timestamp. The timestamp is
            set to ``None`` if it does not exist.
        '''

        pieces = line.split(self.sep)
        try:
            extracted = [pieces[i].strip() for i in self.indexes]
        except IndexError:
            raise ValueError(('Impossible to parse line.' +
                              ' Check the line_format and sep parameters.'))

        if self.with_timestamp:
            uid, iid, r, timestamp = extracted
        else:
            uid, iid, r = extracted
            timestamp = None

        return uid, iid, float(r) + self.offset, timestamp
class Trainset:
    """A trainset contains all useful data that constitutes a training set.

    It is used by the :meth:`train()
    <surprise.prediction_algorithms.algo_base.AlgoBase.train>` method of every
    prediction algorithm. You should not try to build such an object on your
    own but rather use the :meth:`Dataset.folds` method or the
    :meth:`DatasetAutoFolds.build_full_trainset` method.

    Attributes:
        ur(:obj:`defaultdict` of :obj:`list`): The users ratings. This is a
            dictionary containing lists of tuples of the form ``(item_inner_id,
            rating)``. The keys are user inner ids.
        ir(:obj:`defaultdict` of :obj:`list`): The items ratings. This is a
            dictionary containing lists of tuples of the form ``(user_inner_id,
            rating)``. The keys are item inner ids.
        n_users: Total number of users :math:`|U|`.
        n_items: Total number of items :math:`|I|`.
        n_ratings: Total number of ratings :math:`|R_{train}|`.
        rating_scale(tuple): The minimum and maximal rating of the rating
            scale.
        global_mean: The mean of all ratings :math:`\\mu`.
    """

    def __init__(self, ur, ir, n_users, n_items, n_ratings, rating_scale,
                 offset, raw2inner_id_users, raw2inner_id_items):

        self.ur = ur
        self.ir = ir
        self.n_users = n_users
        self.n_items = n_items
        self.n_ratings = n_ratings
        self.rating_scale = rating_scale
        self.offset = offset
        self._raw2inner_id_users = raw2inner_id_users
        self._raw2inner_id_items = raw2inner_id_items
        self._global_mean = None
        # inner2raw dicts could be built right now (or even before) but they
        # are not always useful so we wait until we need them.
        self._inner2raw_id_users = None
        self._inner2raw_id_items = None

    def knows_user(self, uid):
        """Indicate if the user is part of the trainset.

        A user is part of the trainset if the user has at least one rating.

        Args:
            uid(int): The (inner) user id. See :ref:`this
                note<raw_inner_note>`.
        Returns:
            ``True`` if user is part of the trainset, else ``False``.
        """

        return uid in self.ur

    def knows_item(self, iid):
        """Indicate if the item is part of the trainset.

        An item is part of the trainset if the item was rated at least once.

        Args:
            iid(int): The (inner) item id. See :ref:`this
                note<raw_inner_note>`.
        Returns:
            ``True`` if item is part of the trainset, else ``False``.
        """

        return iid in self.ir

    def to_inner_uid(self, ruid):
        """Convert a **user** raw id to an inner id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            ruid(str): The user raw id.

        Returns:
            int: The user inner id.

        Raises:
            ValueError: When user is not part of the trainset.
        """

        try:
            return self._raw2inner_id_users[ruid]
        except KeyError:
            raise ValueError(('User ' + str(ruid) +
                              ' is not part of the trainset.'))

    def to_raw_uid(self, iuid):
        """Convert a **user** inner id to a raw id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            iuid(int): The user inner id.

        Returns:
            str: The user raw id.

        Raises:
            ValueError: When ``iuid`` is not an inner id.
        """

        if self._inner2raw_id_users is None:
            # Built lazily on first use, then cached.
            self._inner2raw_id_users = {inner: raw for (raw, inner) in
                                        self._raw2inner_id_users.items()}

        try:
            return self._inner2raw_id_users[iuid]
        except KeyError:
            raise ValueError((str(iuid) +
                              ' is not a valid inner id.'))

    def to_inner_iid(self, riid):
        """Convert an **item** raw id to an inner id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            riid(str): The item raw id.

        Returns:
            int: The item inner id.

        Raises:
            ValueError: When item is not part of the trainset.
        """

        try:
            return self._raw2inner_id_items[riid]
        except KeyError:
            raise ValueError(('Item ' + str(riid) +
                              ' is not part of the trainset.'))

    def to_raw_iid(self, iiid):
        """Convert an **item** inner id to a raw id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            iiid(int): The item inner id.

        Returns:
            str: The item raw id.

        Raises:
            ValueError: When ``iiid`` is not an inner id.
        """

        if self._inner2raw_id_items is None:
            # Built lazily on first use, then cached.
            self._inner2raw_id_items = {inner: raw for (raw, inner) in
                                        self._raw2inner_id_items.items()}

        try:
            return self._inner2raw_id_items[iiid]
        except KeyError:
            raise ValueError((str(iiid) +
                              ' is not a valid inner id.'))

    def all_ratings(self):
        """Generator function to iterate over all ratings.

        Yields:
            A tuple ``(uid, iid, rating)`` where ids are inner ids (see
            :ref:`this note <raw_inner_note>`).
        """

        for u, u_ratings in self.ur.items():
            for i, r in u_ratings:
                yield u, i, r

    def build_testset(self):
        """Return a list of ratings that can be used as a testset in the
        :meth:`test() <surprise.prediction_algorithms.algo_base.AlgoBase.test>`
        method.

        The ratings are all the ratings that are in the trainset, i.e. all the
        ratings returned by the :meth:`all_ratings()
        <surprise.dataset.Trainset.all_ratings>` generator. This is useful in
        cases where you want to test your algorithm on the trainset.
        """

        return [(self.to_raw_uid(u), self.to_raw_iid(i), r)
                for (u, i, r) in self.all_ratings()]

    def build_anti_testset(self):
        """Return a list of ratings that can be used as a testset in the
        :meth:`test() <surprise.prediction_algorithms.algo_base.AlgoBase.test>`
        method.

        The ratings are all the ratings that are **not** in the trainset, i.e.
        all the ratings :math:`r_{ui}` where the user :math:`u` is known, the
        item :math:`i` is known, but the rating :math:`r_{ui}` is not in the
        trainset. As :math:`r_{ui}` is unknown, it is assumed to be equal to
        the mean of all ratings :meth:`global_mean
        <surprise.dataset.Trainset.global_mean>`.
        """

        anti_testset = []
        for u in self.all_users():
            # Hoisted out of the item loop and turned into a set: rebuilding
            # this container for every item made the method accidentally
            # quadratic, and set membership is O(1) vs. list's O(n).
            user_items = set(j for (j, _) in self.ur[u])
            for i in self.all_items():
                if i not in user_items:
                    anti_testset.append((self.to_raw_uid(u),
                                         self.to_raw_iid(i),
                                         self.global_mean))
        return anti_testset

    def all_users(self):
        """Generator function to iterate over all users.

        Yields:
            Inner id of users.
        """

        return range(self.n_users)

    def all_items(self):
        """Generator function to iterate over all items.

        Yields:
            Inner id of items.
        """

        return range(self.n_items)

    @property
    def global_mean(self):
        """Return the mean of all ratings.

        It's only computed once."""
        if self._global_mean is None:
            self._global_mean = np.mean([r for (_, _, r) in
                                         self.all_ratings()])

        return self._global_mean