In [2]:
%cd /content/drive/MyDrive/multi/0429

/content/drive/MyDrive/multi/0429


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 잠재요인 협업 필터링 - SVD 활용

In [5]:
import pandas as pd
from google.colab import files

In [6]:
# 사용자수, 영화수
ratings = pd.read_csv('./data/ratings.csv')
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [None]:
!pip install scikit-surprise

In [8]:
from surprise import SVD, Reader
from surprise.dataset import DatasetAutoFolds

reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0.5,5))
data_folds = DatasetAutoFolds('./data/ratings_noh.csv', reader=reader)

In [9]:
# 전체 데이터를 학습 데이터로 사용

trainset = data_folds.build_full_trainset()
type(trainset)

surprise.trainset.Trainset

In [10]:
# 모델 생성 및 학습

model = SVD(n_epochs=20, n_factors=50, random_state=2022)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f435ad79610>

- 사용자 Id: 9, 영화 Id: 42 (Dead Presidents (1995))

In [11]:
mdf = pd.read_csv('./data/movies.csv')
mdf.head(40).tail(5)

Unnamed: 0,movieId,title,genres
35,39,Clueless (1995),Comedy|Romance
36,40,"Cry, the Beloved Country (1995)",Drama
37,41,Richard III (1995),Drama|War
38,42,Dead Presidents (1995),Action|Crime|Drama
39,43,Restoration (1995),Drama


In [12]:
movieIds = ratings[ratings.userId == 9]['movieId']
movieIds[movieIds == 42].count()

0

In [14]:
# 사용자 9번의 42번 영화에 대한 예상 평점
uid = str(9); mid = str(42)
pred = model.predict(uid, mid, verbose=True)

user: 9          item: 42         r_ui = None   est = 3.25   {'was_impossible': False}


In [15]:
pred

Prediction(uid='9', iid='42', r_ui=None, est=3.249924377339538, details={'was_impossible': False})

- 사용자 9번이 보지 않은 영화 중에서 예상 점수가 가장 높은 top 10

In [16]:
seen_movies = ratings[ratings['userId'] == 9]['movieId'].tolist()
total_movies = mdf.movieId.tolist()
unseen_movies = [movie for movie in total_movies if movie not in seen_movies]

In [17]:
len(seen_movies), len(unseen_movies)

(46, 9696)

In [41]:
predictions = [model.predict(uid, str(mid)) for mid in unseen_movies]

In [43]:
def sortkey_est(pred):
    return pred.est

In [44]:
#예상 평점이 가장 높은 상위 10개 영화 목록 추출
predictions.sort(key=sortkey_est, reverse=True)
top_movies_ids = [int(pred.iid) for pred in predictions[:10]]
top_movies_ids

[318, 1217, 1261, 1204, 3275, 4973, 142488, 1673, 50, 898]

In [55]:
ratings[ratings['userId'] == 9]['rating']

1073    3.0
1074    3.0
1075    4.0
1076    3.0
1077    3.0
1078    4.0
1079    5.0
1080    2.0
1081    4.0
1082    5.0
1083    5.0
1084    4.0
1085    2.0
1086    3.0
1087    4.0
1088    3.0
1089    5.0
1090    3.0
1091    2.0
1092    2.0
1093    3.0
1094    3.0
1095    1.0
1096    1.0
1097    5.0
1098    4.0
1099    3.0
1100    4.0
1101    4.0
1102    4.0
1103    5.0
1104    1.0
1105    1.0
1106    3.0
1107    3.0
1108    4.0
1109    1.0
1110    4.0
1111    5.0
1112    5.0
1113    4.0
1114    1.0
1115    4.0
1116    2.0
1117    4.0
1118    2.0
Name: rating, dtype: float64

In [45]:
top_movies_ratings = [pred.est for pred in predictions[:10]]
top_movies_ratings

[4.070330794979969,
 4.063731956995097,
 4.051908410348554,
 4.0227662213503805,
 4.011500870494226,
 3.9996958730949137,
 3.9853484799628194,
 3.9798846903676015,
 3.9786251266485744,
 3.978415496006661]

In [49]:
mdf.movieId.isin([1,2])

0        True
1        True
2       False
3       False
4       False
        ...  
9737    False
9738    False
9739    False
9740    False
9741    False
Name: movieId, Length: 9742, dtype: bool

In [64]:
# top_movies_ids 안의 값의 경우 True가 된다.

top_movies_titles = [mdf[mdf['movieId'] == int(pred.iid)]['title'] for pred in predictions[:10]]

918    Ran (1985)
Name: title, dtype: object

In [81]:
mdf['title']

0                                Toy Story (1995)
1                                  Jumanji (1995)
2                         Grumpier Old Men (1995)
3                        Waiting to Exhale (1995)
4              Father of the Bride Part II (1995)
                          ...                    
9737    Black Butler: Book of the Atlantic (2017)
9738                 No Game No Life: Zero (2017)
9739                                 Flint (2017)
9740          Bungo Stray Dogs: Dead Apple (2018)
9741          Andrew Dice Clay: Dice Rules (1991)
Name: title, Length: 9742, dtype: object

In [87]:
import numpy as np

In [90]:
top_df = pd.DataFrame({
    '영화명' : top_movies_titles,
    '예상평점': top_movies_ratings,
})

top_df.set_index(np.arange(1,11), inplace=True, drop=True)
top_df.index.name = 'rank'
top_df

Unnamed: 0_level_0,영화명,예상평점
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"277 Shawshank Redemption, The (1994) Name: ...",4.070331
2,"918 Ran (1985) Name: title, dtype: object",4.063732
3,960 Evil Dead II (Dead by Dawn) (1987) Name...,4.051908
4,"906 Lawrence of Arabia (1962) Name: title, ...",4.022766
5,"2462 Boondock Saints, The (2000) Name: titl...",4.011501
6,3622 Amelie (Fabuleux destin d'Amélie Poula...,3.999696
7,"9071 Spotlight (2015) Name: title, dtype: o...",3.985348
8,"1258 Boogie Nights (1997) Name: title, dtyp...",3.979885
9,"46 Usual Suspects, The (1995) Name: title, ...",3.978625
10,"680 Philadelphia Story, The (1940) Name: ti...",3.978415
