In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# One-time setup: uncomment and run to install scikit-surprise from conda-forge.
# !conda install -c conda-forge scikit-surprise

In [2]:
import surprise

# Record the installed scikit-surprise version so the results below can be reproduced.
print(surprise.__version__)

1.1.3


In [3]:
# Data preparation 1: load the built-in MovieLens 100k dataset.
# If the dataset is not cached locally, Surprise asks for confirmation and downloads it.
from surprise import Dataset

# NOTE(review): `data` is not referenced again in the visible cells; the model
# further down is trained on `data3`.
data = Dataset.load_builtin("ml-100k")

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to C:\Users\Administrator.User -2023YNCQT/.surprise_data/ml-100k


In [7]:
# Data preparation 2: build a Surprise dataset from a pandas DataFrame.

from surprise import Reader

ratings_small = pd.read_csv('data-files/ml-latest-small/ratings.csv')

# Pass the user, item and rating columns (in that order) together with a
# reader that declares the 0.5-5.0 rating scale.
rating_columns = ratings_small[["userId", "movieId", "rating"]]
half_star_reader = Reader(rating_scale=(0.5, 5.0))
data2 = Dataset.load_from_df(rating_columns, half_star_reader)

In [8]:
# Data preparation 3: read the same ratings CSV directly with Surprise.

from surprise import Reader

# Comma-separated file whose first (header) line is skipped; ratings are
# declared to lie on the 0.5-5.0 scale.
csv_reader = Reader(rating_scale=(0.5, 5.0), sep=",", skip_lines=1)
data3 = Dataset.load_from_file("data-files/ml-latest-small/ratings.csv", csv_reader)

In [9]:
# Split into training and test sets: 20% of the ratings are held out, and a
# fixed random_state keeps the split reproducible.
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data3, test_size=0.2, random_state=22)

In [10]:
# Create the model: k-NN with cosine similarity.
from surprise import KNNBasic

# user_based=False selects item-item similarities instead of user-user ones.
item_cosine_options = {"name": "cosine", "user_based": False}
knn_model = KNNBasic(sim_options=item_cosine_options)

In [18]:
# Train the model on the training set; the log shows the cosine similarity
# matrix being computed during this step.
knn_model.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x258fbc116d0>

In [19]:
# Peek at the first held-out (user id, item id, rating) tuples; the raw ids are strings.
testset[:3]

[('378', '33794', 4.5), ('89', '6686', 5.0), ('264', '5816', 4.0)]

In [21]:
# Predict a single rating. The arguments are the raw user id, the raw item id
# and the known true rating, which is reported back as r_ui in the result.
knn_model.predict('378', '33794', 4.5)

Prediction(uid='378', iid='33794', r_ui=4.5, est=3.923271771308787, details={'actual_k': 39, 'was_impossible': False})

In [63]:
# Predict every rating in the test set and show the first three predictions.
predictions = knn_model.test(testset)
predictions[:3]

[Prediction(uid='378', iid='33794', r_ui=4.5, est=3.923271771308787, details={'actual_k': 39, 'was_impossible': False}),
 Prediction(uid='89', iid='6686', r_ui=5.0, est=2.85, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='264', iid='5816', r_ui=4.0, est=3.396079115385866, details={'actual_k': 39, 'was_impossible': False})]

In [64]:
# Collect (actual rating, estimate) pairs for the first five predictions.
d = [(pred.r_ui, pred.est) for pred in predictions[:5]]

# Cell output: (user id, item id, estimate) for the first ten predictions.
[(pred.uid, pred.iid, pred.est) for pred in predictions[:10]]

[('378', '33794', 3.923271771308787),
 ('89', '6686', 2.85),
 ('264', '5816', 3.396079115385866),
 ('202', '2407', 3.599858988746179),
 ('414', '63082', 2.95),
 ('517', '1848', 2.3625),
 ('216', '616', 3.499795086036361),
 ('509', '57504', 3.0375),
 ('512', '480', 3.7681562751102566),
 ('202', '780', 3.6481042080907047)]

In [38]:
# Stack [actual rating, estimate] for every prediction into a 2-column array
# and display its shape.
d = np.array([[pred.r_ui, pred.est] for pred in predictions])
d.shape

(20168, 2)

In [32]:
# First ten rows of `d`; column 0 is the actual rating, column 1 the estimate.
d[:10]

array([[4.5       , 3.92327177],
       [5.        , 2.85      ],
       [4.        , 3.39607912],
       [4.        , 3.59985899],
       [4.        , 2.95      ],
       [3.        , 2.3625    ],
       [4.        , 3.49979509],
       [3.5       , 3.0375    ],
       [4.        , 3.76815628],
       [2.        , 3.64810421]])

In [33]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Column 0 of `d` holds the actual ratings, column 1 the estimates.
actual, estimated = d[:, 0], d[:, 1]
mse = mean_squared_error(actual, estimated)

# Cell output: (MAE, MSE, RMSE).
(mean_absolute_error(actual, estimated), mse, np.sqrt(mse))

(0.7551751081911018, 0.9470712593780297, 0.9731758625130557)

In [39]:
from surprise import accuracy

# MAE, MSE and RMSE in that order. Each helper prints its own labelled line
# and returns the raw value, which is then printed as well.
for metric in (accuracy.mae, accuracy.mse, accuracy.rmse):
    print(metric(predictions=predictions))

MAE:  0.7552
0.7551751081911018
MSE: 0.9471
0.9470712593780297
RMSE: 0.9732
0.9731758625130557


array([     1,      3,      6, ..., 160836, 163937, 163981], dtype=int64)

In [72]:
# 시청하지 않은 영화 찾아서 예상 평점 산출
all_movie_id = ratings_small["movieId"].unique()
rated_movies = ratings_small[ratings_small["userId"] == 42]["movieId"].values
# not_rated_movies = [movie_id for movie_id in all_movie_id if movie_id not in rated_movies]

predictions = []
for movie_id in all_movie_id[:10]:
    if movie_id not in rated_movies:
        prediction = knn_model.predict("42", str(movie_id))
        predictions.append(prediction)

In [73]:
# Order the predictions by estimated rating, highest first. The list is
# updated in place, so `predictions` itself ends up sorted.
predictions[:] = sorted(predictions, key=lambda pred: pred.est, reverse=True)

# Keep the ten best estimates and display them.
top_10_predictions = predictions[:10]
top_10_predictions

[Prediction(uid='42', iid='151', r_ui=None, est=3.45, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='42', iid='101', r_ui=None, est=3.325, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='42', iid='70', r_ui=None, est=3.249766939947328, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='42', iid='1', r_ui=None, est=3.248958852771118, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='42', iid='157', r_ui=None, est=3.075, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='42', iid='6', r_ui=None, est=2.8248002270983084, details={'actual_k': 40, 'was_impossible': False})]

In [87]:
# Resolve the titles of the top-10 recommended movies.
movies = pd.read_csv("data-files/ml-latest-small/movies.csv")

# Build the movieId -> title lookup once instead of filtering the whole
# DataFrame for every prediction. drop_duplicates keeps the first row per id,
# matching the previous `.values[0]` behaviour.
title_by_movie_id = movies.drop_duplicates("movieId").set_index("movieId")["title"]

# Iterate over the top-10 slice (not the full prediction list) so the result
# really contains at most ten titles. Prediction.iid is the raw string id, so
# it is cast to int to match the integer movieId index.
top_10_titles = [title_by_movie_id.loc[int(p.iid)] for p in top_10_predictions]

print(top_10_titles)

    

['Rob Roy (1995)', 'Bottle Rocket (1996)', 'From Dusk Till Dawn (1996)', 'Toy Story (1995)', 'Canadian Bacon (1995)', 'Heat (1995)']
