In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import surprise

In [19]:
# Data preparation

from surprise import Dataset, Reader  # Reader configures how the rating data is interpreted

# Load the ml-latest-small rating and movie tables (paths are relative to the working directory).
ratings_small = pd.read_csv('data-files/ml-latest-small/ratings.csv')
movies_small = pd.read_csv('data-files/ml-latest-small/movies.csv')

# Hand the (user, item, rating) columns to Surprise; the rating scale is declared as 0.5 to 5.
data = Dataset.load_from_df(
    ratings_small[['userId', 'movieId', 'rating']],
    Reader(rating_scale=(0.5, 5)),
)

In [4]:
# Build a trainset from `data`, then convert that same trainset back into a list of test tuples.
# NOTE(review): `testset` is derived from `trainset` itself, so any score computed on it
# reflects fit on already-seen ratings rather than generalisation — confirm this is intended.
trainset = data.build_full_trainset()
testset = trainset.build_testset()

In [7]:
# Show the number of test tuples and the first tuple as a sample.
len(testset), testset[0]

(100836, (1, 1, 4.0))

In [9]:
# Model training (fitting)
from surprise import SVD

# random_state is pinned — presumably so repeated runs reproduce the same fit.
svd = SVD(n_factors=100, n_epochs=20, random_state=42)

# Fit on the trainset; the call is left as the last expression so the cell displays its result.
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x19470722ae0>

In [None]:
# First step towards predicting ratings for movies user 26 has not watched:
# inspect which movies user 26 has already rated.

uid_mask = ratings_small['userId'] == 26

# Shape of the rows belonging to user 26.
print(ratings_small[uid_mask].shape)

# Movies that user 26 has rated (user id / movie id pairs).
ratings_small.loc[uid_mask, ["userId", "movieId"]]


In [16]:
def select_unrated_movies(ratings, user_id):
    """Return the ids of all movies in `ratings` that `user_id` has not rated.

    Args:
        ratings: DataFrame with at least 'userId' and 'movieId' columns.
        user_id: Id of the user whose unrated movies are wanted.

    Returns:
        List of movie ids, in order of first appearance in `ratings`, for which
        no row with this `user_id` exists. A user with no rows gets every movie id.
    """
    all_movie_ids = ratings['movieId'].unique()  # every movie id, first-appearance order

    # Build the mask from the frame that was passed in (not a notebook-level global),
    # so the function works for any ratings frame and the mask always aligns with it.
    uid_mask = ratings['userId'] == user_id

    # A set gives constant-time membership tests; scanning an array for every
    # candidate movie would be quadratic.
    rated_movie_ids = set(ratings.loc[uid_mask, 'movieId'])

    return [mid for mid in all_movie_ids if mid not in rated_movie_ids]


In [21]:
# Preview the first ten movie ids user 26 has not rated, then display the movie whose id is 1.
print( select_unrated_movies(ratings_small, 26)[:10] )
movies_small[movies_small['movieId'] == 1]

[1, 3, 6, 50, 70, 101, 110, 151, 157, 163]


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [22]:
# Ask the fitted model for its estimated rating of movie 1 by user 26.
svd.predict(26, 1)

Prediction(uid=26, iid=1, r_ui=None, est=3.7843049838259457, details={'was_impossible': False})

In [66]:
# List the columns available in the movies table.
movies_small.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [75]:
def get_movie_title(movies, movie_id):
    """Look up the title of `movie_id` in the `movies` frame.

    Selects the 'title' values of the rows whose 'movieId' equals `movie_id`
    and returns the first one; indexing fails if no row matches.
    """
    matching_titles = movies.loc[movies['movieId'] == movie_id, 'title']
    return matching_titles.values[0]

def recommend_movies(ratings, movies, user_id, top_n=10, algo=None):
    """Recommend the unrated movies with the highest predicted rating for a user.

    Args:
        ratings: Ratings DataFrame; passed to select_unrated_movies to find
            the candidate movie ids for this user.
        movies: Movies DataFrame; passed to get_movie_title to resolve titles.
        user_id: Id of the user to build recommendations for.
        top_n: Maximum number of recommendations to return.
        algo: Fitted model exposing ``predict(uid, iid)`` whose result carries
            ``est`` and ``iid`` attributes. When omitted, the notebook-level
            ``svd`` model is used, which keeps existing calls working.

    Returns:
        List of ``(movie_id, title)`` tuples ordered by predicted rating,
        highest first, containing at most `top_n` entries.
    """
    if algo is None:
        algo = svd  # fall back to the model trained earlier in the notebook

    candidate_ids = select_unrated_movies(ratings, user_id)

    predictions = [algo.predict(user_id, movie_id) for movie_id in candidate_ids]
    # Highest estimated rating first.
    ranked = sorted(predictions, key=lambda p: p.est, reverse=True)

    return [(p.iid, get_movie_title(movies, p.iid)) for p in ranked[:top_n]]


In [76]:
# Recommendations for user 26, using the default top_n of 10.
recommend_movies(ratings_small, movies_small, 26)

[(1148, 'Wallace & Gromit: The Wrong Trousers (1993)'),
 (318, 'Shawshank Redemption, The (1994)'),
 (858, 'Godfather, The (1972)'),
 (1204, 'Lawrence of Arabia (1962)'),
 (1262, 'Great Escape, The (1963)'),
 (56782, 'There Will Be Blood (2007)'),
 (58559, 'Dark Knight, The (2008)'),
 (2959, 'Fight Club (1999)'),
 (1213, 'Goodfellas (1990)'),
 (750,
  'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)')]

In [80]:
# Look up the titles of the movies user 26 has rated.
uid_mask = ratings_small['userId'] == 26
rated_movie_ids = ratings_small.loc[uid_mask, "movieId"]

# One title Series per rated movie id, in the order the ratings appear.
[
    movies_small.loc[movies_small["movieId"] == movie_id, "title"]
    for movie_id in rated_movie_ids
]


[9    GoldenEye (1995)
 Name: title, dtype: object,
 32    Babe (1995)
 Name: title, dtype: object,
 43    Seven (a.k.a. Se7en) (1995)
 Name: title, dtype: object,
 123    Apollo 13 (1995)
 Name: title, dtype: object,
 126    Batman Forever (1995)
 Name: title, dtype: object,
 138    Die Hard: With a Vengeance (1995)
 Name: title, dtype: object,
 156    Net, The (1995)
 Name: title, dtype: object,
 176    Waterworld (1995)
 Name: title, dtype: object,
 192    Disclosure (1994)
 Name: title, dtype: object,
 249    Natural Born Killers (1994)
 Name: title, dtype: object,
 257    Pulp Fiction (1994)
 Name: title, dtype: object,
 260    Quiz Show (1994)
 Name: title, dtype: object,
 302    Ace Ventura: Pet Detective (1994)
 Name: title, dtype: object,
 307    Clear and Present Danger (1994)
 Name: title, dtype: object,
 314    Forrest Gump (1994)
 Name: title, dtype: object,
 337    True Lies (1994)
 Name: title, dtype: object,
 378    Cliffhanger (1993)
 Name: title, dtype: object,
 395  