## MovieLens data로 영화 추천하기 

ratings(평점)를 시청 횟수로 해석  
ratings < 3 일 경우 영화를 선호하지 않는다고 가정하고 제외

In [1]:
import numpy as np
import os
import scipy
import implicit
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import pandas as pd

# Report the versions of the libraries this notebook runs against.
for _lib in (np, scipy, implicit):
    print(_lib.__version__)

1.21.4
1.7.1
0.4.8


## Load data, preprocessing

### Rating data

In [2]:
# Load the MovieLens-1M ratings table ('::'-separated, no header row).
# expanduser('~') replaces os.getenv('HOME'): getenv returns None when HOME is
# unset, which would make the string concatenation raise TypeError.
rating_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'ratings.dat'
)
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
# The file has no header, so column names are supplied explicitly.
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding="ISO-8859-1")
# Row count before any filtering; the filtering cell reads this name (spelling kept as-is).
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Keep only rows whose rating is 3 or higher.
keep_mask = ratings['ratings'] >= 3
ratings = ratings[keep_mask]
filtered_data_size = len(ratings)

# Report how much of the data survived the filter.
remaining_ratio = filtered_data_size / orginal_data_size
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {remaining_ratio:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# Rename the 'ratings' column to 'counts'.
ratings = ratings.rename(columns={'ratings': 'counts'})
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [5]:
# Display the ratings DataFrame.
ratings

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


### Title, Genre data load

In [6]:
# Load the movie metadata so that titles can be shown.
# expanduser('~') replaces os.getenv('HOME'): getenv returns None when HOME is
# unset, which would make the string concatenation raise TypeError.
movie_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'movies.dat'
)
cols = ['movie_id', 'title', 'genre']
# The file has no header, so column names are supplied explicitly.
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Show the movies row whose title is exactly 'Batman Forever (1995)'.
movies[movies['title']=='Batman Forever (1995)']

Unnamed: 0,movie_id,title,genre
151,153,Batman Forever (1995),Action|Adventure|Comedy|Crime


In [8]:
# Show every movie whose genre string is exactly 'Action|Adventure|Comedy|Crime'.
movies[movies['genre']=='Action|Adventure|Comedy|Crime']

Unnamed: 0,movie_id,title,genre
151,153,Batman Forever (1995),Action|Adventure|Comedy|Crime
1356,1377,Batman Returns (1992),Action|Adventure|Comedy|Crime
3035,3104,Midnight Run (1988),Action|Adventure|Comedy|Crime


In [9]:
# Number of distinct movies and distinct users present in ratings
ratings['movie_id'].nunique(),ratings['user_id'].nunique()

(3628, 6039)

In [10]:
# The 30 most popular movies, ranked by how many rating rows each has.
cnt = ratings.groupby('movie_id')['user_id'].count()
top30 = cnt.sort_values(ascending=False).head(30)

# Build the movie_id -> title lookup once instead of scanning the whole
# `movies` frame with a boolean mask on every loop iteration.
# drop_duplicates keeps the first row per id, matching the former `.values[0]`.
title_by_movie_id = movies.drop_duplicates('movie_id').set_index('movie_id')['title']

for i, k in zip(top30.index, top30.values):
    print(title_by_movie_id.loc[i], k)


American Beauty (1999) 3211
Star Wars: Episode IV - A New Hope (1977) 2910
Star Wars: Episode V - The Empire Strikes Back (1980) 2885
Star Wars: Episode VI - Return of the Jedi (1983) 2716
Saving Private Ryan (1998) 2561
Terminator 2: Judgment Day (1991) 2509
Silence of the Lambs, The (1991) 2498
Raiders of the Lost Ark (1981) 2473
Back to the Future (1985) 2460
Matrix, The (1999) 2434
Jurassic Park (1993) 2413
Sixth Sense, The (1999) 2385
Fargo (1996) 2371
Braveheart (1995) 2314
Men in Black (1997) 2297
Schindler's List (1993) 2257
Princess Bride, The (1987) 2252
Shakespeare in Love (1998) 2213
L.A. Confidential (1997) 2210
Shawshank Redemption, The (1994) 2194
Godfather, The (1972) 2167
Groundhog Day (1993) 2121
E.T. the Extra-Terrestrial (1982) 2102
Being John Malkovich (1999) 2066
Ghostbusters (1984) 2051
Pulp Fiction (1994) 2030
Forrest Gump (1994) 2022
Terminator, The (1984) 2019
Toy Story (1995) 2000
Fugitive, The (1993) 1941


In [11]:
# Display the index of top30, i.e. the movie_id of each of the 30 entries.
top30.index

Int64Index([2858,  260, 1196, 1210, 2028,  589,  593, 1198, 1270, 2571,  480,
            2762,  608,  110, 1580,  527, 1197, 2396, 1617,  318,  858, 1265,
            1097, 2997, 2716,  296,  356, 1240,    1,  457],
           dtype='int64', name='movie_id')

In [12]:
# Summary statistics of how many rating rows (movies) each user has
user_count = ratings.groupby('user_id')['movie_id'].count()
user_count.describe()

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: movie_id, dtype: float64

In [13]:
# Summary statistics of each user's median `counts` value
user_median = ratings.groupby('user_id')['counts'].median()
user_median.describe()

count    6039.000000
mean        4.055970
std         0.432143
min         3.000000
25%         4.000000
50%         4.000000
75%         4.000000
max         5.000000
Name: counts, dtype: float64

### 내가 선호하는 영화 5개 ratings에 추가하기

In [14]:
# Pick five favourite movies by their exact titles.
my_favorite_movie = ['Men in Black (1997)' , 'Back to the Future (1985)' ,'Sixth Sense, The (1999)',
                    'E.T. the Extra-Terrestrial (1982)','Batman Forever (1995)']
# Resolve each title to its movie_id; `.values[0]` raises IndexError if a title is absent from `movies`.
my_favorite = [movies[movies['title'] == name]['movie_id'].values[0] for name in my_favorite_movie]

# Assume a new user A gave each of these movies a count of 5.
# Series.max() is vectorised, unlike the builtin max() which iterates row by row.
A = ratings['user_id'].max() + 1
n_favorites = len(my_favorite)
my_playlist = pd.DataFrame({'user_id': [A] * n_favorites, 'movie_id': my_favorite, 'counts': [5] * n_favorites})

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with default arguments produces the same frame (original index labels kept,
# columns missing from my_playlist such as 'timestamp' filled with NaN).
# NOTE: A is derived from the current maximum user_id, so this guard is always
# true; re-running the cell adds another synthetic user.
if not (ratings['user_id'] == A).any():
    ratings = pd.concat([ratings, my_playlist])
ratings.tail(10)       # Check that the rows were appended.

Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6040,1090,3,956715518.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,6041,1580,5,
1,6041,1270,5,
2,6041,2762,5,
3,6041,1097,5,
4,6041,153,5,


In [15]:
# Attach movie metadata to every rating row, then keep only the columns used for modelling.
# 'movie_id' is the only column the two frames share, so it is the join key.
col = ['user_id', 'movie_id', 'title', 'counts']
total = movies.merge(ratings, on='movie_id')
total = total[col]
total

Unnamed: 0,user_id,movie_id,title,counts
0,1,1,Toy Story (1995),5
1,6,1,Toy Story (1995),4
2,8,1,Toy Story (1995),4
3,9,1,Toy Story (1995),5
4,10,1,Toy Story (1995),5
...,...,...,...,...
836478,5682,3952,"Contender, The (2000)",3
836479,5812,3952,"Contender, The (2000)",4
836480,5831,3952,"Contender, The (2000)",3
836481,5837,3952,"Contender, The (2000)",4


### CSR matrix

In [16]:
# Assign each distinct user id / title a 0-based position in order of first appearance.
user_unique = total['user_id'].unique()
title_unique = total['title'].unique()

user_to_idx = {user: position for position, user in enumerate(user_unique)}
title_to_idx = {title: position for position, title in enumerate(title_unique)}

In [17]:
# Position assigned to user id 6041 in user_to_idx.
user_to_idx[6041]

5193

In [56]:
# Peek at the first seven (title, index) pairs of the mapping.
for pair in list(title_to_idx.items())[:7]:
    print(pair)

('Toy Story (1995)', 0)
('Jumanji (1995)', 1)
('Grumpier Old Men (1995)', 2)
('Waiting to Exhale (1995)', 3)
('Father of the Bride Part II (1995)', 4)
('Heat (1995)', 5)
('Sabrina (1995)', 6)


In [19]:
# Replace raw user ids with their positions; only applied when every row mapped
# (dropna would shorten the series if any id were missing from the mapping).
temp_user_data = total['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) != len(total):
    print('user_id column indexing Fail!!')
else:
    print('user_id column indexing OK!!')
    total['user_id'] = temp_user_data

# Same procedure for titles.
temp_title_data = total['title'].map(title_to_idx.get).dropna()
if len(temp_title_data) != len(total):
    print('title column indexing Fail!!')
else:
    print('title column indexing OK!!')
    total['title'] = temp_title_data
total

user_id column indexing OK!!
title column indexing OK!!


Unnamed: 0,user_id,movie_id,title,counts
0,0,1,0,5
1,1,1,0,4
2,2,1,0,4
3,3,1,0,5
4,4,1,0,5
...,...,...,...,...
836478,2696,3952,3627,3
836479,2161,3952,3627,4
836480,1931,3952,3627,3
836481,2162,3952,3627,4


In [20]:
# Matrix dimensions: one row per distinct user index, one column per distinct title index.
num_user = total['user_id'].nunique()
num_title = total['title'].nunique()

print(f' # of unique users : {num_user} ')
print(f' # of unique movies : {num_title} ')

# Build the sparse matrix from (value, (row, column)) triplets.
row_idx = total['user_id']
col_idx = total['title']
csr_data = csr_matrix((total['counts'], (row_idx, col_idx)), shape=(num_user, num_title))
csr_data

 # of unique users : 6040 
 # of unique movies : 3628 


<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

### AlternatingLeastSquares 훈련

In [21]:
# Settings recommended by the implicit library; unrelated to what the model learns.
# NOTE(review): numpy and implicit are imported in the first cell, before these
# variables are set — confirm the BLAS backend still honours them at this point,
# or move these assignments ahead of the imports.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

# Declare the implicit AlternatingLeastSquares model.
als_model = AlternatingLeastSquares(
    factors=400,
    regularization=0.01,
    use_gpu=False,
    iterations=100,
    dtype=np.float32,
)

# The ALS model takes an (item x user) matrix as input, so transpose the (user x item) data.
csr_data_transpose = csr_data.transpose()

# Train the model.
als_model.fit(csr_data_transpose)

  0%|          | 0/100 [00:00<?, ?it/s]

In [22]:
# Fetch the learned factor vectors for user id 6041 and for 'Batman Forever (1995)'.
my_index = user_to_idx[6041]
matrix_index = title_to_idx['Batman Forever (1995)']
my_vector = als_model.user_factors[my_index]
matrix_vector = als_model.item_factors[matrix_index]

In [23]:
# Display the user factor vector fetched for user id 6041.
my_vector

array([ 0.09993128, -0.08839921, -0.3111683 ,  0.15907115,  0.2619438 ,
        0.08184379,  0.05914115, -0.12925299, -0.30305237,  0.00862933,
        0.15628414, -0.08848069, -0.18151495, -0.24334416,  0.13928089,
        0.11886246,  0.09866893,  0.09911662,  0.07537232, -0.02197685,
        0.14883973,  0.09379678,  0.12723489, -0.15085089,  0.08545969,
        0.0327779 , -0.07027385, -0.04827826,  0.31686544,  0.32400146,
       -0.05045943, -0.19854288, -0.27425268,  0.19589289, -0.13774608,
       -0.2077292 ,  0.29093224,  0.10136276,  0.1882746 ,  0.24601652,
        0.0813208 ,  0.04476741,  0.40148175,  0.10179494, -0.11515841,
        0.0504953 ,  0.1483344 ,  0.00127208, -0.0174896 , -0.29255328,
       -0.07046296, -0.2604008 , -0.05686668,  0.06729379,  0.09342004,
        0.08644832, -0.0382722 ,  0.10634198, -0.04869567, -0.29247594,
       -0.40277055, -0.19790247,  0.04464735,  0.4285053 , -0.27070093,
        0.06956217, -0.0014123 , -0.03834092, -0.19473729,  0.02

In [24]:
# Display the item factor vector fetched for 'Batman Forever (1995)'.
matrix_vector

array([ 0.03958609,  0.00787926, -0.02885679,  0.00462308,  0.0089391 ,
        0.00049481,  0.01326965,  0.01895494, -0.01519008, -0.02315713,
       -0.01905125, -0.00332799, -0.02458555, -0.01544077, -0.01288255,
        0.03322029, -0.00932579, -0.00505351,  0.00969823,  0.00657097,
        0.01740834,  0.03074496,  0.01064942, -0.03572135,  0.03037043,
        0.02438041,  0.00090942,  0.01753327,  0.02861254,  0.03292083,
        0.01952961,  0.00700525, -0.01539102,  0.0072593 , -0.01488146,
       -0.02458254,  0.00507273,  0.02944311,  0.01958509, -0.00855321,
        0.00038363, -0.00124458,  0.00169099,  0.02951275, -0.02199455,
        0.00828011,  0.0078014 ,  0.01960475,  0.02099241, -0.01095318,
       -0.02136962,  0.0074155 ,  0.01862469,  0.01110194,  0.02937974,
        0.00561243, -0.01034121,  0.00935025, -0.00664274, -0.00746652,
       -0.00598264,  0.0015738 , -0.00542581,  0.04606044, -0.02171431,
        0.00017176, -0.00672594, -0.00947547,  0.00497545,  0.04

In [25]:
# Dot product of the user vector and the item vector for Batman Forever (1995)
np.dot(my_vector, matrix_vector)

0.54911697

In [26]:
# Score for a movie that is not among the five favourites: 'Big Green, The (1995)'.
# The index is looked up by title rather than hard-coded as 52: positions in
# title_to_idx follow the order of first appearance in `total`, so a literal
# index silently points at a different movie whenever the data changes.
big_green_index = title_to_idx['Big Green, The (1995)']
BigGreen_vector = als_model.item_factors[big_green_index]
np.dot(my_vector, BigGreen_vector)

0.02088853

### 내가 좋아하는 영화와 비슷한거 추천받기

In [27]:
# Reverse mapping: position -> title.
idx_to_title = {index: title for title, index in title_to_idx.items()}

favorite_movie = title_to_idx['Batman Forever (1995)']

# Ask the model for the 15 items most similar to the chosen movie; each hit is
# read as (item index, score) and the index is translated back to a title.
tmp1 = als_model.similar_items(favorite_movie, N=15)
tmp2 = [(idx_to_title[hit[0]], hit[1]) for hit in tmp1]
similar_df = pd.DataFrame(tmp2, columns=['title', 'similarity'])
similar_df

Unnamed: 0,title,similarity
0,Batman Forever (1995),1.0
1,Batman & Robin (1997),0.635159
2,Batman Returns (1992),0.569527
3,Dick Tracy (1990),0.327803
4,Beverly Hills Cop III (1994),0.310447
5,"Flintstones, The (1994)",0.307313
6,Mortal Kombat: Annihilation (1997),0.284237
7,Batman (1989),0.283343
8,Pollyanna (1960),0.275332
9,Kids of the Round Table (1995),0.274816


In [28]:
# Show the movies row whose title is exactly 'Batman Forever (1995)' (to compare genres with the similar items).
movies[movies['title']=='Batman Forever (1995)']

Unnamed: 0,movie_id,title,genre
151,153,Batman Forever (1995),Action|Adventure|Comedy|Crime


In [29]:
# Show every movie whose genre string is exactly 'Action|Adventure|Comedy|Crime'.
movies[movies['genre']=='Action|Adventure|Comedy|Crime']

Unnamed: 0,movie_id,title,genre
151,153,Batman Forever (1995),Action|Adventure|Comedy|Crime
1356,1377,Batman Returns (1992),Action|Adventure|Comedy|Crime
3035,3104,Midnight Run (1988),Action|Adventure|Comedy|Crime


### 내가 좋아할 만한거 추천받기

In [30]:
# Request 15 recommendations for user id 6041, passing the user x item matrix
# with filter_already_liked_items=True.
user = user_to_idx[6041]

tmp1 = als_model.recommend(user, csr_data, N=15, filter_already_liked_items=True)

# Each hit is read as (item index, score); translate the index back to a title.
tmp2 = [(idx_to_title[hit[0]], hit[1]) for hit in tmp1]
recommend_df = pd.DataFrame(tmp2, columns=['title', 'recommend_score'])
recommend_df

Unnamed: 0,title,recommend_score
0,Batman Returns (1992),0.292337
1,Batman & Robin (1997),0.22695
2,"Thing, The (1982)",0.157031
3,Batman (1989),0.156332
4,Breakfast at Tiffany's (1961),0.137722
5,Mars Attacks! (1996),0.121132
6,"Abyss, The (1989)",0.120819
7,M*A*S*H (1970),0.119869
8,Hackers (1995),0.118209
9,Contact (1997),0.116534


## 회고록

- np.dot(my_vector, matrix_vector) 수치가 너무 낮았다. factors와 iterations를 증가시키면 나아진다고 해서 직접 실험해 보았다.  
1) factor: 100 iter: 30 dot : 0.2027  
2) factor: 200 iter: 30 dot : 0.3628  
3) factor: 200 iter: 100 dot : 0.3480  
4) factor: 400 iter: 100 dot : 0.5491  
RESULT: .54까지 확인했으니, 그만 알아보도록 하자. --- factor 증가에 따라 수치가 유의미하게 상승함. 반면 iter만 늘린 경우(2 → 3)는 수치가 거의 변하지 않았다(0.3628 → 0.3480).

하나의 영화를 기반으로 한 추천이나 유저 기반 추천 리스트를 보면, 시리즈물이 먼저 추천되고 그다음에 비슷해 보이는 영화들이 나온다. 다만 유저 기반 추천의 경우 factor가 늘어남에 따라 score가 낮아지는 현상을 보였다. 시리즈물조차 0.2 정도의 수치를 나타내는 것을 볼 때, score는 '절대적 의미'보다 '상대적 의미'가 더 큰 것 같다.