In [191]:
import os
import pandas as pd


rating_file_path = os.getenv('HOME') +'/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep = '::', names = ratings_cols, engine = 'python')
original_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [192]:
#3점 이상만 남깁니다

ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'original_data_size: {original_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / original_data_size:.2%}')

original_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [193]:
# rating 칼럼의 이름을 count 로 바꿉니다.

ratings.rename(columns = {'rating':'count'}, inplace = True)

In [194]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.

movie_file_path = os.getenv('HOME') +'/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(movie_file_path, sep = '::', names = cols, engine = 'python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


# 1. 분석하기

In [195]:
movies['title'] = movies['title'].str.lower()

#1 영화 개수 : 3883

movies['title'].nunique()

3883

In [196]:
#2 유니크한 사용자수 : 6039
ratings.head()

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [197]:
ratings['user_id'].nunique()

6039

In [198]:
#3 가장 인기있는 영화 30개(인기순)

movie_count = ratings.groupby('movie_id')['user_id'].count()
movie_count.sort_values(ascending= False).head(30)

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: user_id, dtype: int64

# 2. 내가 선호하는 영화를 5가지 골라서 rating에 추가해 줍시다.

In [199]:
my_favorite = ['matrix, the (1999)', 'once upon a time in america (1984)', 'terminator 2: judgement day (1991)',
              'Shawshank Redemption, The (1994)', 'braveheart (1995)']

my_movielist = pd.DataFrame({'user_id': ['kyuhwan']*5,  'count':[10]*5,'timestamp' : [12345678]*5,
                             'movie_id':[2571,1227,589,318,110] })

if not ratings.isin({'user_id': ['kyuhwan']})['user_id'].any():
    ratings = ratings.append(my_movielist)
ratings.tail(10)


Unnamed: 0,user_id,movie_id,count,timestamp
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648
1000208,6040,1097,4,956715569
0,kyuhwan,2571,10,12345678
1,kyuhwan,1227,10,12345678
2,kyuhwan,589,10,12345678
3,kyuhwan,318,10,12345678
4,kyuhwan,110,10,12345678


In [200]:
merge = pd.merge(ratings, movies)
merge


Unnamed: 0,user_id,movie_id,count,timestamp,title,genre
0,1,1193,5,978300760,one flew over the cuckoo's nest (1975),Drama
1,2,1193,5,978298413,one flew over the cuckoo's nest (1975),Drama
2,12,1193,4,978220179,one flew over the cuckoo's nest (1975),Drama
3,15,1193,4,978199279,one flew over the cuckoo's nest (1975),Drama
4,17,1193,5,978158471,one flew over the cuckoo's nest (1975),Drama
...,...,...,...,...,...,...
836478,5851,3607,5,957756608,one little indian (1973),Comedy|Drama|Western
836479,5854,3026,4,958346883,slaughterhouse (1987),Horror
836480,5854,690,3,957744257,"promise, the (versprechen, das) (1994)",Romance
836481,5938,2909,4,957273353,"five wives, three secretaries and me (1998)",Documentary


In [201]:
# 고유한 유저, 타이틀, 장르를 찾아내는 코드
user_unique = merge['user_id'].unique()
title_unique = merge['title'].unique()
genre_unique = merge['genre'].unique()

# 유저, 타이틀, 장르를 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
title_to_idx = {v:k for k,v in enumerate(title_unique)}
genre_to_idx = {v:k for k,v in enumerate(genre_unique)}

print(user_to_idx['kyuhwan'])

5965


# 3. CSR matrix를 직접 만들어 봅시다.

In [202]:

# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드
# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 

temp_user_data = merge['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(merge):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    merge['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# title_to_idx을 통해 title 컬럼도 동일한 방식으로 인덱싱해 줍니다.

temp_title_data = merge['title'].map(title_to_idx.get).dropna()
if len(temp_title_data) == len(merge):
    print('title column indexing OK!!')
    merge['title'] = temp_title_data
else:
    print('title column indexing Fail!!')
    
temp_genre_data = merge['genre'].map(genre_to_idx.get).dropna()
if len(temp_genre_data) == len(merge):
    print('genre column indexing OK!')
    merge['genre'] = temp_genre_data
else:
    print('genre column indexing Fail!')

merge

user_id column indexing OK!!
title column indexing OK!!
genre column indexing OK!


Unnamed: 0,user_id,movie_id,count,timestamp,title,genre
0,0,1193,5,978300760,0,0
1,1,1193,5,978298413,0,0
2,2,1193,4,978220179,0,0
3,3,1193,4,978199279,0,0
4,4,1193,5,978158471,0,0
...,...,...,...,...,...,...
836478,1621,3607,5,957756608,3623,36
836479,3481,3026,4,958346883,3624,95
836480,3481,690,3,957744257,3625,15
836481,4159,2909,4,957273353,3626,89


In [203]:
#CSR MATRIX 만들기
from scipy.sparse import csr_matrix

num_user = merge['user_id'].nunique()
num_title = merge['title'].nunique()

csr_data = csr_matrix((merge['count'], (merge.user_id, merge.title)), shape= (num_user, num_title))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

# 4. als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시켜 봅시다.

In [204]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

#implict 에서 권장하고 있는 부분
os.environ['OPENBLAS_NUM_THREAD'] = '1'
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
os.environ['MKL_NUM_THREADS'] = '1'

#implict AlternatingLeastSquares 모델의 선언

als_model = AlternatingLeastSquares(factors = 100, regularization = 0.01, use_gpu = False, iterations = 15,
                                   dtype = np.float32)

#als 모델은 input 으로 (item X user 꼴의 matrix 를 받기 때문에 Transpose 해줍니다.)

csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [205]:
als_model.fit(csr_data_transpose)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [206]:
print(user_to_idx['kyuhwan'])

5965


#  5. 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해 보세요

In [207]:
kyuhwan, matrix  = user_to_idx['kyuhwan'], title_to_idx['matrix, the (1999)']
kyuhwan_vector, matrix_vector = als_model.user_factors[kyuhwan], als_model.item_factors[matrix]

In [208]:
kyuhwan_vector

array([-0.00437762,  0.949361  ,  0.40790305,  0.14067484, -0.74274564,
       -0.10172333, -0.3641096 ,  0.61458755,  2.07673   , -0.90981364,
       -0.18393911, -0.6301401 ,  1.1868212 , -0.2279891 , -0.5509618 ,
       -1.1676133 , -0.7594247 , -1.2146854 , -0.66797924,  0.0207701 ,
        0.85475886, -0.2372733 ,  0.06479118,  0.11871891,  0.5231868 ,
       -0.9814384 , -0.47294697, -0.9890063 , -0.5450579 , -0.40299174,
       -0.35649768,  0.0687605 , -0.10442121,  0.5412049 , -0.13442527,
       -0.23702298, -0.513793  , -0.28447077,  0.251159  , -1.11548   ,
       -0.8584993 ,  0.5684495 ,  0.5344547 ,  0.86717385, -0.16623397,
        0.26408276,  0.43226445,  0.26782557, -0.47137794, -0.41869628,
       -1.0722643 ,  0.71250564, -0.36554563,  0.17761685, -0.24887073,
        1.209872  ,  1.0677354 ,  0.12913945,  0.51737815,  1.077268  ,
        0.56298673, -0.22405262, -0.2760566 ,  0.11628606,  0.38543284,
        0.10009061,  0.3158895 , -1.0774065 ,  0.12341387, -0.94

In [209]:
matrix_vector

array([ 0.00540403,  0.01382758, -0.00817634, -0.0078498 , -0.00684362,
        0.00309747, -0.02185092,  0.02447331,  0.02955781, -0.01191077,
       -0.0022155 , -0.00499668,  0.02299614, -0.00680788, -0.01508094,
       -0.01379099,  0.00487817, -0.01387457, -0.00967547,  0.00402016,
        0.04432318,  0.02069758, -0.02423109,  0.0234874 ,  0.01670091,
       -0.03743084, -0.01061292,  0.01556541,  0.00638414, -0.00646568,
       -0.00576269, -0.01634603, -0.00278872,  0.03391941,  0.01684909,
       -0.00230579, -0.00816346, -0.00582754,  0.00312959, -0.00460658,
        0.00868998,  0.01801113,  0.01449941,  0.02080147,  0.01652444,
        0.0196895 ,  0.00422476,  0.01172778,  0.00763011, -0.01054559,
       -0.00165502,  0.00720921,  0.0138874 ,  0.01340267,  0.01025434,
        0.02506174, -0.00119501,  0.01922347,  0.00041182,  0.04547767,
       -0.00439591,  0.01565517,  0.01818176,  0.00529613, -0.00056292,
        0.02399599,  0.00814218, -0.00520699,  0.00362731, -0.00

In [210]:
# Kyuhwan 과 matrix 를 내적하는 코드

np.dot(kyuhwan_vector, matrix_vector)

0.703157

In [211]:
# 나의 Braveheart 에 대한 선호도는 어떻게 예측할지

braveheart = title_to_idx['braveheart (1995)']
braveheart_vector = als_model.item_factors[braveheart]
np.dot(kyuhwan_vector, braveheart_vector)

0.8252781

# 6. 내가 좋아하는 영화와 비슷한 영화를 추천받아 봅시다.

In [212]:
# 영화 MATRIX 와 비슷한 영화 추천받기

favorite_movie = 'matrix, the (1999)'
movie_id = title_to_idx[favorite_movie]

similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(124, 0.17165849),
 (92, 0.13242202),
 (62, 0.11625342),
 (141, 0.100910746),
 (107, 0.0959315),
 (145, 0.09440174),
 (200, 0.09367811),
 (375, 0.09123104),
 (175, 0.08721111),
 (75, 0.08122405),
 (44, 0.07078689),
 (317, 0.07061444),
 (193, 0.065104574),
 (117, 0.06492605),
 (236, 0.063416556)]

In [214]:
#이름 표시하기

idx_to_title = {v:k for k,v in title_to_idx.items()}
[idx_to_title[i[0]]for i in similar_movie]

['matrix, the (1999)',
 'terminator 2: judgment day (1991)',
 'total recall (1990)',
 'fugitive, the (1993)',
 'jurassic park (1993)',
 'fifth element, the (1997)',
 'terminator, the (1984)',
 'face/off (1997)',
 'men in black (1997)',
 'hunt for red october, the (1990)',
 'star wars: episode iv - a new hope (1977)',
 'twelve monkeys (1995)',
 'alien (1979)',
 'star wars: episode v - the empire strikes back (1980)',
 'speed (1994)']

# 7. 내가 가장 좋아할 만한 영화들을 추천받아 봅시다.

In [215]:
# 영화 추천받기

user = user_to_idx['kyuhwan']

#recommend 에서는 user * item CSR Matrix 를 받습니다.

movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items = True)
movie_recommended

[(121, 0.67501485),
 (48, 0.66143584),
 (107, 0.5863904),
 (23, 0.5243261),
 (175, 0.52059937),
 (141, 0.4857054),
 (51, 0.4561411),
 (222, 0.41491657),
 (38, 0.4061038),
 (0, 0.39747977),
 (62, 0.394471),
 (248, 0.37344927),
 (99, 0.35124275),
 (269, 0.3138939),
 (200, 0.3123231),
 (117, 0.3000265),
 (44, 0.29162908),
 (220, 0.2857719),
 (472, 0.26274955),
 (233, 0.25989893)]

In [216]:
[idx_to_title[i[0]]for i in movie_recommended]

['silence of the lambs, the (1991)',
 'saving private ryan (1998)',
 'jurassic park (1993)',
 "schindler's list (1993)",
 'men in black (1997)',
 'fugitive, the (1993)',
 'fargo (1996)',
 'pulp fiction (1994)',
 'sixth sense, the (1999)',
 "one flew over the cuckoo's nest (1975)",
 'total recall (1990)',
 'good will hunting (1997)',
 'american beauty (1999)',
 'goodfellas (1990)',
 'terminator, the (1984)',
 'star wars: episode v - the empire strikes back (1980)',
 'star wars: episode iv - a new hope (1977)',
 'seven (se7en) (1995)',
 'sling blade (1996)',
 'usual suspects, the (1995)']

In [217]:
#이 추천에 기여한 정도 (matrix 의 경우)

matrix = title_to_idx['matrix, the (1999)']
explain = als_model.explain(user, csr_data, itemid = matrix)

In [218]:
[(idx_to_title[i[0]], i[1]) for i in explain [1]]

[('matrix, the (1999)', 0.4323049067077551),
 ('terminator 2: judgment day (1991)', 0.23834276154668294),
 ('braveheart (1995)', 0.020623943050777534),
 ('shawshank redemption, the (1994)', 0.014476792926721117),
 ('once upon a time in america (1984)', -0.015451711413143197)]

우선 처음에 merge 를 할때, ratings 에 미리 추가를 해준뒤, merge 를 해야 데이터가 온전하게 합쳐진다는것을 알았습니다.
장르로도 해보았으나, 별점으로 추천받는것이 조금더 정확하다는것을 알았습니다.
추천용 알고리즘 모델 또한 편리하고 잘 작동해서 흥미로웠습니다. 모델 학습 하이퍼파라미터 같은 경우엔, 정확도가 다소 만족스러워서 따로 변경하진 않았습니다.