MovieLens 데이터 로드 

In [6]:
import os
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import dump

data = Dataset.load_builtin('ml-100k')

In [7]:
df = pd.DataFrame(data.raw_ratings, columns=["user","item","rate","id"])

In [8]:
df.head()

Unnamed: 0,user,item,rate,id
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


트레이닝, 모델 저장

In [5]:
import os

from surprise import SVD
from surprise import Dataset
from surprise import dump

data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

algo = SVD()
algo.train(trainset)

# Compute predictions of the 'original' algorithm.
predictions = algo.test(trainset.build_testset())

# Dump algorithm and reload it.
file_name = os.path.expanduser('dump_file')
dump.dump(file_name, algo=algo)
_, loaded_algo = dump.load(file_name)

# We now ensure that the algo is still the same by checking the predictions.
predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
assert predictions == predictions_loaded_algo
print('Predictions are the same')



Predictions are the same


정확도 계산 

In [9]:
from surprise import Dataset
from surprise import SVD
from surprise import accuracy

data = Dataset.load_builtin('ml-100k')

algo = SVD()

trainset = data.build_full_trainset()
algo.train(trainset)

testset = trainset.build_testset()
predictions = algo.test(testset)

accuracy.rmse(predictions)

RMSE: 0.6752


0.6752274273892549

모델 최적화(파라미터 튜닝)

In [11]:
import random

from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise import GridSearch

# 데이터 로드하기
# 3개의 부분 집합으로 나눠 교차 검증(k-fold)
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)

# Grid search로 최적 알고리즘 찾기.
print('Grid Search...')
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]}
grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], verbose=1)
print(grid_search.best_params)
grid_search.evaluate(data)

Grid Search...
CaseInsensitiveDefaultDict(<class 'list'>, {})
Running grid search for the following parameter combinations:
{'n_epochs': 5, 'lr_all': 0.002}
{'n_epochs': 5, 'lr_all': 0.005}
{'n_epochs': 10, 'lr_all': 0.002}
{'n_epochs': 10, 'lr_all': 0.005}




Resulsts:
{'n_epochs': 5, 'lr_all': 0.002}
{'RMSE': 0.989731535067378}
----------
{'n_epochs': 5, 'lr_all': 0.005}
{'RMSE': 0.9643148384062585}
----------
{'n_epochs': 10, 'lr_all': 0.002}
{'RMSE': 0.9692767778991715}
----------
{'n_epochs': 10, 'lr_all': 0.005}
{'RMSE': 0.9529537540782719}
----------


사용자 별 영화 추천 Example
offline 방식으로 별도 테이블을 만들어 저장하고, 해당 사용자 로그인 시 웹에서 추천 시나리오

In [12]:
from collections import defaultdict

from surprise import SVD
from surprise import Dataset

def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''
    
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
        
    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
        
        return top_n
    
    # First train an SVD algorithm on the movielens dataset.
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    algo = SVD()
    algo.train(trainset)
    
    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    
    top_n = get_top_n(predictions, n=10)
    
    # Print the recommended items for each user
    for uid, user_ratings in top_n.items():
        print(uid, [iid for (iid, _) in user_ratings])


추천성능 평가

알고리즘 평가

In [14]:
import surprise
from surprise import Dataset
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)

In [15]:
sim_options = {'name': 'msd'}
algo = surprise.KNNBasic(sim_options=sim_options)
surprise.evaluate(algo, data)

Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9896
MAE:  0.7831
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9867
MAE:  0.7795
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9880
MAE:  0.7807
------------
------------
Mean RMSE: 0.9881
Mean MAE : 0.7811
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.9896338617035874,
                             0.9866790265303086,
                             0.9880418347472695],
                            'mae': [0.7831036204455019,
                             0.779546709170688,
                             0.7807400035336455]})

In [20]:
sim_options = {'name': 'cosine'}
algo = surprise.KNNBasic(sim_options=sim_options)
surprise.evaluate(algo, data)

Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0227
MAE:  0.8099
------------
Fold 2
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0195
MAE:  0.8072
------------
Fold 3
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0203
MAE:  0.8079
------------
------------
Mean RMSE: 1.0209
Mean MAE : 0.8083
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [1.022717950802798,
                             1.0194892988458732,
                             1.020346318457278],
                            'mae': [0.809912983482156,
                             0.8072111900513804,
                             0.8078611171740263]})

In [17]:
sim_options = {'name': 'pearson'}
algo = surprise.KNNBasic(sim_options=sim_options)
surprise.evaluate(algo, data)

Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.0215
MAE:  0.8106
------------
Fold 2
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.0192
MAE:  0.8082
------------
Fold 3
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.0186
MAE:  0.8087
------------
------------
Mean RMSE: 1.0198
Mean MAE : 0.8092
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [1.0214506053163852,
                             1.0192446327060645,
                             1.0186217134648394],
                            'mae': [0.8106213588378686,
                             0.8082302671066413,
                             0.808707250563115]})

In [18]:
sim_options = {'name': 'pearson_baseline'}
algo = surprise.KNNBasic(sim_options=sim_options)
surprise.evaluate(algo, data)

Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0117
MAE:  0.8001
------------
Fold 2
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0108
MAE:  0.7994
------------
Fold 3
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0096
MAE:  0.7986
------------
------------
Mean RMSE: 1.0107
Mean MAE : 0.7994
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [1.0117155223959369,
                             1.0108372909816428,
                             1.0095889351167122],
                            'mae': [0.8001317580693539,
                             0.7993945782223404,
                             0.7985583943957283]})

In [19]:
algo = surprise.SVD()
surprise.evaluate(algo, data)

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.9431
MAE:  0.7457
------------
Fold 2
RMSE: 0.9465
MAE:  0.7475
------------
Fold 3
RMSE: 0.9485
MAE:  0.7478
------------
------------
Mean RMSE: 0.9460
Mean MAE : 0.7470
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.9431093902461771,
                             0.9465285946146799,
                             0.9484967067801241],
                            'mae': [0.7456869100862563,
                             0.7475045681730982,
                             0.7477718451391853]})