In [175]:
import numpy as np
import scipy
import implicit

In [176]:
#Load data
import os
import pandas as pd
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [177]:
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [178]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [179]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [180]:
ratings['movie_id']

0          1193
1           661
2           914
3          3408
4          2355
           ... 
1000203    1090
1000205    1094
1000206     562
1000207    1096
1000208    1097
Name: movie_id, Length: 836478, dtype: int64

In [181]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [182]:
#ratings에 있는 유니크한 영화 개수
ratings['movie_id'].nunique()

3628

In [183]:
#ratings에 있는 유니크한 사용자 수
ratings['user_id'].nunique()

6039

In [184]:
#가장 인기 있는 영화 30개(인기순)
movie_count = ratings.groupby('movie_id')['user_id'].count()
movie_count.sort_values(ascending=False).head(30)

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: user_id, dtype: int64

In [185]:
movies[movies['movie_id'] == 2858]

Unnamed: 0,movie_id,title,genre
2789,2858,American Beauty (1999),Comedy|Drama


In [186]:
#내가 선호하는 영화를 5가지 골라서 ratings에 추가해 줍시다.
#난 Comedy를 좋아하니까 Comedy를 추가해보자.

movies[movies['genre'] == 'Comedy'].head()

Unnamed: 0,movie_id,title,genre
4,5,Father of the Bride Part II (1995),Comedy
18,19,Ace Ventura: When Nature Calls (1995),Comedy
37,38,It Takes Two (1995),Comedy
51,52,Mighty Aphrodite (1995),Comedy
62,63,Don't Be a Menace to South Central While Drink...,Comedy


In [187]:
ratings.sort_values('user_id')
#6040번이 끝인걸 확인하였다.

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
29,1,745,3,978824268
30,1,2294,4,978824291
31,1,3186,4,978300019
32,1,1566,4,978824330
...,...,...,...,...
999984,6040,2575,4,997453762
999982,6040,2571,4,997454126
999981,6040,1912,3,964828542
999989,6040,2580,4,956705056


In [188]:
#user_id를 6041번으로 상위 5개의 comedy를 새로 append 하였다.
my_favorite = [5 , 19 , 38 , 52 , 63]
my_playlist = pd.DataFrame({'user_id': [6041]*5, 'movie_id': my_favorite, 'counts':[5] * 5, 'timestamp' : 997454126})

if not ratings.isin({'user_id':[6041]})['user_id'].any():  # user_id에 'zimin'이라는 데이터가 없다면
    ratings = ratings.append(my_playlist)                           # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다. 

ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648
1000208,6040,1097,4,956715569
0,6041,5,5,997454126
1,6041,19,5,997454126
2,6041,38,5,997454126
3,6041,52,5,997454126
4,6041,63,5,997454126


확인해보니까 결측값이 되게 많아서 그대로 csr Matrix을 만들면 행열이 안맞아 에러를 뱉었다.
기존 df가 정수 인덱스라도 다시 결측값 없이 인덱싱을 해주기로 한다.

In [189]:
user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()

user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [190]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드
# dictionary 자료형의 get 함수는 https://wikidocs.net/16 을 참고하세요.

# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거합니다. 
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = ratings['movie_id'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print('moivie column indexing OK!!')
    ratings['movie_id'] = temp_movie_data
else:
    print('movie column indexing Fail!!')

ratings

user_id column indexing OK!!
moivie column indexing OK!!


Unnamed: 0,user_id,movie_id,counts,timestamp
0,0,0,5,978300760
1,0,1,3,978302109
2,0,2,3,978301968
3,0,3,4,978300275
4,0,4,5,978824291
...,...,...,...,...
0,6039,1180,5,997454126
1,6039,2010,5,997454126
2,6039,3360,5,997454126
3,6039,1128,5,997454126


In [191]:
ratings.shape

(836483, 4)

In [192]:
ratings.sort_values('movie_id')

Unnamed: 0,user_id,movie_id,counts,timestamp
0,0,0,5,978300760
856020,5138,0,5,962058076
660393,3979,0,5,965624695
951080,5745,0,4,958353428
464940,2865,0,4,972910138
...,...,...,...,...
970914,5849,3623,5,957756608
971469,5852,3624,4,958346883
971564,5852,3625,3,957744257
983062,5936,3626,4,957273353


In [193]:
# CSR matrix를 직접 만들어 봅시다.
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()
print(num_user)
print(num_movie)

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.movie_id)), shape= (num_user, num_movie))
csr_data

6040
3628


<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [194]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [195]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [196]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [197]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [198]:
#내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해 보세요.
#it takes two의 idx는 38번이기 때문에 38번과의 벡터곱을 확인하였다.
user, it_takes_two = user_to_idx[6038], movie_to_idx[38]
user_vector, it_takes_two_vector = als_model.user_factors[user], als_model.item_factors[it_takes_two]

In [199]:
user_vector

array([-0.16850623, -0.18789962, -1.2057465 ,  0.51447386,  0.60502666,
        0.18457592,  0.81718713,  0.29250893,  0.7398317 , -0.4829902 ,
        0.00990408,  0.70587176, -0.7445383 , -0.26444772,  0.6849598 ,
        0.55741125,  0.42357835, -0.51467186, -0.9735714 , -1.4536824 ,
       -0.98252815,  1.210671  , -0.3655335 , -1.1841981 , -0.80467004,
       -1.0489236 , -0.21128747,  1.5849822 ,  0.7769933 , -0.01956581,
       -0.43050382,  0.9132917 , -0.9932005 , -0.33097953, -1.8343803 ,
       -0.18701136,  0.87183726,  0.789687  , -0.34908143,  0.8212344 ,
        0.43158594,  0.77680093, -1.066318  ,  0.00225695,  0.6277764 ,
       -0.68833715, -1.0234978 ,  0.3409388 ,  0.89158046,  0.26637942,
       -0.8054822 ,  0.72548336,  0.42471746,  0.7716937 ,  0.9491682 ,
       -0.21456781, -0.97776306, -0.06812582,  0.3459741 , -0.20857951,
        0.84937197,  0.60668397, -0.16377318,  1.2699746 ,  0.45801488,
       -0.9581873 , -0.17682607,  0.76088494,  0.06497323,  0.90

In [200]:
it_takes_two_vector

array([-1.95812620e-03,  5.42778336e-03,  6.70106756e-03,  4.46551619e-03,
        5.39142219e-03,  1.93900021e-03,  3.92749644e-04,  4.82109934e-03,
        2.58274074e-03,  4.90046572e-03,  1.60060427e-03,  9.58284305e-04,
        5.99665043e-04,  3.92280705e-03,  1.96360541e-03,  7.20625604e-03,
       -9.20236344e-04,  6.81068795e-03,  5.15136681e-03, -1.18388314e-04,
        3.91148543e-03,  4.35682572e-03, -2.16389727e-03,  1.06281147e-03,
       -2.45718914e-03, -1.61253683e-05,  4.40499978e-04,  4.33412334e-03,
       -5.59308100e-03,  3.07768839e-03, -1.95686193e-03,  1.66167680e-03,
        1.67060655e-03,  4.36183857e-03,  2.11560610e-03,  1.37095235e-03,
        5.70323644e-03, -1.49123307e-05, -4.48742928e-03,  4.43809200e-03,
        7.91033672e-04,  3.76931787e-03,  2.96469266e-03,  5.67135913e-03,
        1.96705153e-03, -3.71031230e-04,  3.85870878e-03,  2.74260552e-03,
        1.21653802e-03,  8.93709541e-04,  5.25989290e-03,  2.57428759e-03,
        6.87314896e-04, -

In [201]:
np.dot(user_vector, it_takes_two_vector)

0.0005712671

1에 가깝게 나왔다! validation은 진행하지 않았지만, train데이터에서는 만족스런 결과가 나온 것 같다.

In [202]:
#비슷한 영화 찾기 : Comedy가 주제인 63번 idx를 대입해보았다.
favorite_artist = 63
artist_id = movie_to_idx[favorite_artist]
similar_movie = als_model.similar_items(artist_id, N=15)
similar_movie

[(2066, 0.9999998),
 (1075, 0.8130117),
 (1033, 0.8033338),
 (748, 0.77734864),
 (2418, 0.7767045),
 (2481, 0.7760277),
 (1641, 0.7735698),
 (2729, 0.76953304),
 (2489, 0.7662233),
 (3194, 0.7563747),
 (3071, 0.75476736),
 (3261, 0.7491186),
 (1014, 0.7491091),
 (167, 0.74009883),
 (1520, 0.7375308)]

In [203]:
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movie]

[63, 65, 1468, 88, 325, 2060, 413, 174, 255, 833, 1887, 1439, 2195, 1431, 2027]

In [204]:
def get_similar_movie(movie_name : str):
    movie_id = movie_to_idx[movie_name]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    return similar_movie

In [205]:
get_similar_movie(53)

[53, 665, 645, 214, 3816, 670, 264, 1860, 652, 2630]

In [206]:
movies[movies['movie_id'] == 400]

Unnamed: 0,movie_id,title,genre
396,400,Homage (1995),Drama


In [207]:
movies[movies['movie_id'] == 1654]

Unnamed: 0,movie_id,title,genre
1608,1654,FairyTale: A True Story (1997),Children's|Drama|Fantasy


In [208]:
movies[movies['movie_id'] == 104]

Unnamed: 0,movie_id,title,genre
102,104,Happy Gilmore (1996),Comedy


In [209]:
movies[movies['movie_id'] == 357]

Unnamed: 0,movie_id,title,genre
353,357,Four Weddings and a Funeral (1994),Comedy|Romance


In [210]:
movies[movies['movie_id'] == 934]

Unnamed: 0,movie_id,title,genre
922,934,Father of the Bride (1950),Comedy


Comedy가 주제인 영화를 대상으로 비슷한 영화 상위 5개의 주제를 확인해본 결과, 다른 주제들이 추천되는 것을 확인 할 수 있다.
이는 저 영화들이 Comedy의 내용이 어느정도 있지만 다른 주제가 더 강해 해당 주제로 나타내진걸수도 있고, 크게 Comedy를 좋아하는 사람이라도, Comedy만 주구장창 보지 않는 것으로 해석 할 수 있을 것 같다. 딱히 매니아층이 없다고 해야하나
아니면 모델이 잘못됬거나...

In [211]:
#내가 좋아하는 영화와 비슷한 영화를 추천받아 봅시다.
user = user_to_idx[6038]
# recommend에서는 user*item CSR Matrix를 받습니다.
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(29, 0.6395985),
 (22, 0.5197422),
 (474, 0.47722834),
 (21, 0.45376083),
 (109, 0.4169619),
 (573, 0.39740908),
 (1348, 0.388946),
 (488, 0.38740167),
 (540, 0.38536698),
 (186, 0.38448322),
 (693, 0.38068163),
 (126, 0.3761547),
 (7, 0.36913657),
 (40, 0.35372272),
 (110, 0.33848006),
 (322, 0.3326108),
 (1471, 0.3312518),
 (50, 0.32915694),
 (1144, 0.32182074),
 (163, 0.32001674)]

In [212]:
movies[movies['movie_id'] == 1143]

Unnamed: 0,movie_id,title,genre
1127,1143,Three Lives and Only One Death (1996),Comedy


In [213]:
movies[movies['movie_id'] == 83]

Unnamed: 0,movie_id,title,genre
82,83,Once Upon a Time... When We Were Colored (1995),Drama


In [214]:
movies[movies['movie_id'] == 1412]

Unnamed: 0,movie_id,title,genre
1389,1412,Some Mother's Son (1996),Drama


In [215]:
movies[movies['movie_id'] == 1176]

Unnamed: 0,movie_id,title,genre
1160,1176,"Double Life of Veronique, The (La Double Vie d...",Drama


In [216]:
movies[movies['movie_id'] == 169]

Unnamed: 0,movie_id,title,genre
167,169,Free Willy 2: The Adventure Home (1995),Adventure|Children's|Drama


나에게 추천되는 5개의 영화를 살펴보았을때도, Comedy가 1등이긴 하지만, 2,3,4,5등의 주제는 Comedy가 아니였다. 아마도 genre 이외에 데이터셋에 다른 요소들이 다른 주제로 결정 지었을 가능성이 크다.