In [1]:
import surprise
# Display the installed Surprise version so the results below can be tied to a specific library release.
surprise.__version__

'1.1.2'

In [2]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import KFold, cross_validate

In [4]:
# Read the ratings CSV from the working directory, then preview the first five rows as a sanity check.
ratings = pd.read_csv(filepath_or_buffer='ratings_small.csv')
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
# Smallest rating present in the data; this is the lower bound the Reader's rating_scale must cover.
ratings['rating'].min()

0.5

In [6]:
# Largest rating present in the data; this is the upper bound the Reader's rating_scale must cover.
ratings['rating'].max()

5.0

In [7]:
# Rating range handed to Surprise; it matches the min (0.5) and max (5.0) observed in the data.
rating_bounds = (0.5, 5)
reader = Reader(rating_scale=rating_bounds)

In [8]:
# Surprise reads the frame positionally as (user id, item id, rating), so select
# exactly those three columns in that order before wrapping them in a Dataset.
rating_triples = ratings[['userId', 'movieId', 'rating']]
data = Dataset.load_from_df(rating_triples, reader=reader)
data

<surprise.dataset.DatasetAutoFolds at 0x240722e6b80>

In [9]:
# SVD matrix-factorization recommender; random_state=0 seeds the algorithm's RNG so repeated fits are comparable.
svd = SVD(random_state=0)

In [10]:
# 5-fold cross-validation of the SVD model, reporting RMSE and MAE per fold.
# A bare `cv=5` makes Surprise build a shuffling KFold with no seed, so the fold
# assignment (and therefore every reported score) changes on each run even though
# the model itself is seeded. Passing a seeded KFold makes the split reproducible.
cross_validate(
    svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=0),
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8930  0.8921  0.8996  0.8964  0.9002  0.8963  0.0033  
MAE (testset)     0.6878  0.6863  0.6920  0.6886  0.6938  0.6897  0.0028  
Fit time          1.56    1.85    1.84    1.81    1.61    1.73    0.12    
Test time         0.53    0.39    0.46    0.27    0.29    0.39    0.10    


{'test_rmse': array([0.89300653, 0.89209454, 0.89958527, 0.89643309, 0.90023682]),
 'test_mae': array([0.68777432, 0.6863112 , 0.69202785, 0.68862086, 0.69377331]),
 'fit_time': (1.5600671768188477,
  1.852130651473999,
  1.8381879329681396,
  1.8134446144104004,
  1.6096959114074707),
 'test_time': (0.533043384552002,
  0.38974452018737793,
  0.4632248878479004,
  0.2686581611633301,
  0.28885912895202637)}

교차 검증 (K-Fold 교차 검증)

예시: 100개의 데이터를 5개의 폴드(A~E)로 나누는 경우

A : 1-20
B : 21-40
C : 41-60
D : 61-80
E : 81-100

ABCD (train set), E (test set)
ABCE (train set), D (test set)
ABDE (train set), C (test set)
ACDE (train set), B (test set)
BCDE (train set), A (test set)

In [11]:
# Cross-validation above only measured accuracy; now build a trainset from every
# rating (no held-out fold) and refit the model on it for the predictions below.
trainset = data.build_full_trainset()
# fit() returns the fitted algorithm, which is why the cell echoes the SVD object.
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x240722e6be0>

In [12]:
# Ratings that user 1 actually gave, shown as ground truth to compare the predictions against.
ratings.loc[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [13]:
# Estimate user 1's rating for movie 302. No true rating is supplied,
# so the returned Prediction carries r_ui=None.
svd.predict(uid=1, iid=302)

Prediction(uid=1, iid=302, r_ui=None, est=2.7142061734434044, details={'was_impossible': False})

In [14]:
# User 1 actually rated movie 1029 as 3; passing that as r_ui puts the true
# rating next to the model's estimate in the returned Prediction.
svd.predict(uid=1, iid=1029, r_ui=3)

Prediction(uid=1, iid=1029, r_ui=3, est=2.8814455446761933, details={'was_impossible': False})

In [15]:
# Ratings that user 100 actually gave, for comparison with the prediction that follows.
ratings.loc[ratings['userId'] == 100]

Unnamed: 0,userId,movieId,rating,timestamp
15273,100,1,4.0,854193977
15274,100,3,4.0,854194024
15275,100,6,3.0,854194023
15276,100,7,3.0,854194024
15277,100,25,4.0,854193977
15278,100,32,5.0,854193977
15279,100,52,3.0,854194056
15280,100,62,3.0,854193977
15281,100,86,3.0,854194208
15282,100,88,2.0,854194208


In [16]:
# Estimate user 100's rating for movie 1029 (no true rating supplied).
svd.predict(uid=100, iid=1029)

Prediction(uid=100, iid=1029, r_ui=None, est=3.7705476478414846, details={'was_impossible': False})

### 응용
- 이미 시청한 영화와 아직 시청하지 않은 영화를 구분하고
- 이미 시청한 영화에 대한 평가 데이터를 만들어서 기존 데이터셋인 ratings에 추가하고 학습을 시킨 후
- 아직 시청하지 않은 영화에 대해 모두 평점을 예측해보고 그 중에서 가장 평점이 높은 Top 10의 영화를 추천해주는 시스템 개발 가능