In [1]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Load the built-in MovieLens 100k ratings dataset.
data = Dataset.load_builtin('ml-100k')

# SVD with its default hyper-parameters is the model under evaluation.
algo = SVD()

# 5-fold cross-validation reporting RMSE and MAE. verbose=True prints the
# per-fold table; the returned results dict is the cell's displayed value
# because the call is the last expression.
cv_options = dict(measures=['RMSE', 'MAE'], cv=5, verbose=True)
cross_validate(algo, data, **cv_options)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9423  0.9397  0.9369  0.9300  0.9377  0.9373  0.0041  
MAE (testset)     0.7409  0.7424  0.7389  0.7317  0.7404  0.7389  0.0038  
Fit time          10.69   10.10   10.65   10.83   10.60   10.58   0.25    
Test time         0.39    0.35    0.32    0.34    0.30    0.34    0.03    


{'test_rmse': array([0.94226779, 0.93968873, 0.93685693, 0.93004769, 0.93774583]),
 'test_mae': array([0.74092175, 0.74242735, 0.73894886, 0.73165335, 0.74037709]),
 'fit_time': (10.69029712677002,
  10.10060429573059,
  10.65111517906189,
  10.834086418151855,
  10.600472211837769),
 'test_time': (0.3923370838165283,
  0.34967613220214844,
  0.3150451183319092,
  0.33876800537109375,
  0.29660820960998535)}

In [2]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

# Built-in MovieLens 100k ratings.
data = Dataset.load_builtin('ml-100k')

# Randomly hold out 25% of the ratings as the test set.
trainset, testset = train_test_split(data, test_size=0.25)

# Fit SVD on the training split and predict the held-out ratings in one
# chained expression (fit() hands back the fitted algorithm).
algo = SVD()
predictions = algo.fit(trainset).test(testset)

# RMSE over those predictions: printed by the call and, as the last
# expression, also shown as the cell's value.
accuracy.rmse(predictions)

RMSE: 0.9436


0.9436260979584182

In [3]:
# Train on the current trainset, then predict the current testset.
# Written as two statements; same effect as chaining fit(...).test(...).
algo.fit(trainset)
predictions = algo.test(testset)

In [4]:
from surprise import KNNBasic
from surprise import Dataset

# Basic k-NN model with default settings.
algo = KNNBasic()

# Train on every rating of the built-in ml-100k dataset: the full trainset
# is built directly, with no held-out test split in this cell.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

# fit() is the last expression, so the fitted algorithm's repr is displayed.
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1e6a84b2048>

In [5]:
# Raw user / item ids, given as strings exactly as in the ratings file.
uid, iid = '196', '302'

# Estimate the rating for this one (user, item) pair. r_ui=4 is passed
# along so it is echoed next to the estimate; verbose=True prints the result.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 4.06   {'actual_k': 40, 'was_impossible': False}


In [7]:
import os
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

# Ratings file under the user's home directory ('~' is expanded here).
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')

# A custom file needs a Reader describing each line: tab-separated fields
# in the order user, item, rating, timestamp.
reader = Reader(sep='\t', line_format='user item rating timestamp')
data = Dataset.load_from_file(file_path, reader=reader)

# Cross-validate the BaselineOnly predictor with the default measures and
# folds; the returned results dict is the cell's displayed value.
baseline = BaselineOnly()
cross_validate(baseline, data, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9447  0.9368  0.9502  0.9432  0.9461  0.9442  0.0044  
MAE (testset)     0.7493  0.7426  0.7520  0.7483  0.7499  0.7484  0.0031  
Fit time          0.33    0.33    0.34    0.36    0.34    0.34    0.01    
Test time         0.21    0.29    0.30    0.21    0.28    0.26    0.04    


{'test_rmse': array([0.94466375, 0.93684402, 0.950226  , 0.94317809, 0.94614659]),
 'test_mae': array([0.74930371, 0.74261914, 0.75201811, 0.74828497, 0.74989178]),
 'fit_time': (0.33083415031433105,
  0.33434057235717773,
  0.3365015983581543,
  0.3625752925872803,
  0.3367500305175781),
 'test_time': (0.2148122787475586,
  0.2861621379852295,
  0.3018968105316162,
  0.21378350257873535,
  0.27929234504699707)}

In [9]:
import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

# Tiny hand-written ratings table: five (item, user, rating) rows.
# Note the user ids mix integers with one string.
ratings_dict = dict(
    itemID=[1, 1, 1, 2, 2],
    userID=[9, 32, 2, 45, 'user_foo'],
    rating=[3, 2, 4, 3, 1],
)
df = pd.DataFrame(ratings_dict)

# No file is parsed here, so the Reader only carries the rating scale (1 to 5).
reader = Reader(rating_scale=(1, 5))

# The columns are handed over in the order user, item, rating.
ratings_columns = ['userID', 'itemID', 'rating']
data = Dataset.load_from_df(df[ratings_columns], reader)

# 2-fold cross-validation of the NormalPredictor; the results dict is displayed.
cross_validate(NormalPredictor(), data, cv=2)

{'test_rmse': array([1.98763616, 1.95012775]),
 'test_mae': array([1.73891565, 1.83786776]),
 'fit_time': (0.0005283355712890625, 0.0),
 'test_time': (0.0, 0.0005023479461669922)}

In [10]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import KFold

# Built-in MovieLens 100k ratings.
data = Dataset.load_builtin('ml-100k')

# Manual 3-fold cross-validation: the splitter yields one
# (trainset, testset) pair per fold.
kf = KFold(n_splits=3)

algo = SVD()

for trainset, testset in kf.split(data):
    # Refit the same SVD object on this fold's training part, then predict
    # the fold's held-out ratings.
    predictions = algo.fit(trainset).test(testset)

    # Print this fold's RMSE.
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9477
RMSE: 0.9472
RMSE: 0.9435


In [11]:
# `os` is imported here as well: this cell re-imports everything else it
# uses, and without this line it would only run if an earlier cell had
# already imported `os` into the kernel.
import os

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# Folder that holds the predefined fold files ('~' is expanded here).
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
# The two templates keep a %d placeholder that is filled with the fold number.
train_file = os.path.join(files_dir, 'u%d.base')
test_file = os.path.join(files_dir, 'u%d.test')
folds_files = [(train_file % i, test_file % i) for i in range(1, 6)]

# The dataset is built from the explicit (train, test) file pairs, and
# PredefinedKFold iterates over exactly those pairs.
data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):

    # Train and test the algorithm on this predefined fold.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9521
RMSE: 0.9378
RMSE: 0.9327
RMSE: 0.9331
RMSE: 0.9336


In [None]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

# Built-in MovieLens 100k ratings.
data = Dataset.load_builtin('ml-100k')

# Two candidate values for each of three SVD hyper-parameters,
# i.e. 2 * 2 * 2 = 8 combinations to try.
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}

# GridSearchCV receives the SVD class itself (not an instance) and scores
# every combination with 3-fold cross-validation on RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Print the best RMSE score first, then the parameter combination that
# achieved it.
for rmse_report in (gs.best_score, gs.best_params):
    print(rmse_report['rmse'])

In [None]:
# Turn the grid search's cv_results mapping into a DataFrame for inspection.
# Relies on `gs` (the fitted grid search) and `pd` from earlier cells.
cv_results = gs.cv_results
results_df = pd.DataFrame.from_dict(cv_results)