# [EX_15]RECOMMENDATA_IU

### 개요
* MF(Matrix Factorization) 모델 학습 방법을 토대로 내가 좋아할 만한 영화를 추천하는 시스템을 제작해 봄.

### 목차

* STEP 0. 환경설정
* STEP 1. 데이터 준비 및 전처리
* STEP 2. 데이터 분석
* STEP 3. 선호영화 ratings에 추가
* STEP 4. CSR Matrix 만들기
* STEP 5. 모델 구성 및 훈련
* STEP 6. 나의 선호도 파악
* STEP 7. 나의 선호도와 비슷한 영화 추천
* STEP 8. 내가 좋아할 만한 영화 추천

* 루브릭
* 회고


## STEP 0. 환경설정

In [1]:
!pip install implicit 
import numpy as np
import pandas as pd
import os

import scipy
import implicit
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares



## STEP 1. 데이터 불러오기 및 전처리

In [2]:
# Load the MovieLens-1M ratings file.

# Colab alternative:
# rating_file_path='/content/drive/MyDrive/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
# expanduser('~') still resolves the home directory when $HOME is unset,
# whereas os.getenv('HOME') + '...' raises TypeError in that case.
rating_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'ratings.dat'
)
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
# The file has no header row and uses '::' as delimiter; a multi-character
# separator needs the python parsing engine.
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding="ISO-8859-1")
# Row count before any filtering (spelling kept: later cells use this name).
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Keep only the rows whose rating is 3 or higher, then report how much
# of the original data survived.
keep_mask = ratings['ratings'] >= 3
ratings = ratings[keep_mask]
filtered_data_size = len(ratings)
remaining_ratio = filtered_data_size / orginal_data_size

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {remaining_ratio:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# Rename the 'ratings' column to 'counts'.
# The result is rebound instead of using inplace=True: `ratings` is the output
# of a boolean filter at this point, and in-place mutation of such a frame can
# trigger pandas' SettingWithCopyWarning.
ratings = ratings.rename(columns={'ratings': 'counts'})

ratings

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [5]:
# Load the movie metadata so that titles can be shown instead of ids.
# Colab alternative:
# movie_file_path='/content/drive/MyDrive/aiffel/recommendata_iu/data/ml-1m/movies.dat'

# expanduser('~') still resolves the home directory when $HOME is unset,
# whereas os.getenv('HOME') + '...' raises TypeError in that case.
movie_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'movies.dat'
)
cols = ['movie_id', 'title', 'genre']
# Headerless, '::'-delimited file; the multi-character separator needs the
# python parsing engine.
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## STEP 2. 데이터 분석

* ratings에 있는 유니크한 영화 개수
* ratings에 있는 유니크한 사용자 수
* 가장 인기 있는 영화 30개(인기순)

In [6]:
# Number of distinct movies that appear in ratings.
ratings.movie_id.nunique()

3628

In [7]:
# Number of distinct users that appear in ratings.
ratings.user_id.nunique()

6039

In [8]:
# The 30 most popular movies, ranked by how many rating rows each one has.
movie_count = ratings.groupby('movie_id')['user_id'].count()
movie_top30 = movie_count.sort_values(ascending=False).head(30)

# Print every top-30 title followed by its row count.
for top_id, n_rows in movie_top30.items():
    title_matches = movies.loc[movies['movie_id'] == top_id, 'title']
    print(title_matches.values[0], n_rows)

American Beauty (1999) 3211
Star Wars: Episode IV - A New Hope (1977) 2910
Star Wars: Episode V - The Empire Strikes Back (1980) 2885
Star Wars: Episode VI - Return of the Jedi (1983) 2716
Saving Private Ryan (1998) 2561
Terminator 2: Judgment Day (1991) 2509
Silence of the Lambs, The (1991) 2498
Raiders of the Lost Ark (1981) 2473
Back to the Future (1985) 2460
Matrix, The (1999) 2434
Jurassic Park (1993) 2413
Sixth Sense, The (1999) 2385
Fargo (1996) 2371
Braveheart (1995) 2314
Men in Black (1997) 2297
Schindler's List (1993) 2257
Princess Bride, The (1987) 2252
Shakespeare in Love (1998) 2213
L.A. Confidential (1997) 2210
Shawshank Redemption, The (1994) 2194
Godfather, The (1972) 2167
Groundhog Day (1993) 2121
E.T. the Extra-Terrestrial (1982) 2102
Being John Malkovich (1999) 2066
Ghostbusters (1984) 2051
Pulp Fiction (1994) 2030
Forrest Gump (1994) 2022
Terminator, The (1984) 2019
Toy Story (1995) 2000
Fugitive, The (1993) 1941


## STEP 3. 선호 영화 ratings에 추가

In [9]:
# Movies I like.
my_favorite_movies = [
    'Shakespeare in Love (1998)',
    'Saving Private Ryan (1998)',
    'Forrest Gump (1994)',
    'Jurassic Park (1993)',
    'Pulp Fiction (1994)',
]
# Translate each title into its movie_id (IndexError if a title is missing).
my_favorite = [movies[movies['title'] == name]['movie_id'].values[0] for name in my_favorite_movies]

# Register myself (sammy) as a brand-new user whose id is one past the current
# maximum, and give every favorite movie a count of 5.
sammy = max(ratings['user_id']) + 1
n_favorites = len(my_favorite)
my_playlist = pd.DataFrame({
    'user_id': [sammy] * n_favorites,
    'movie_id': my_favorite,
    'counts': [5] * n_favorites,
})

# NOTE(review): `sammy` is always max(user_id) + 1, so it can never already be
# present and this guard does not prevent duplicates; re-running the cell adds
# another new user. Later cells hard-code 6041, so run this cell exactly once.
if not (ratings['user_id'] == sammy).any():
    # DataFrame.append was removed in pandas 2.x; pd.concat is the replacement.
    # my_playlist has no 'timestamp' column, so those rows get NaN there.
    ratings = pd.concat([ratings, my_playlist])
ratings.tail(10)       # check that the new rows were added

Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6040,1090,3,956715518.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,6041,2396,5,
1,6041,2028,5,
2,6041,356,5,
3,6041,480,5,
4,6041,296,5,


## STEP 4. CSR 만들기


In [10]:
# Build the user x movie interaction matrix in CSR format.
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

# Raw user/movie ids are used directly as row/column indices and no explicit
# shape is given, so the matrix shape is inferred from the largest ids rather
# than taken from num_user / num_movie.
stored_values = ratings['counts']
row_col_index = (ratings['user_id'], ratings['movie_id'])
csr_data = csr_matrix((stored_values, row_col_index))
csr_data

<6042x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

## STEP 5. 모델 구성 및 학습

In [11]:
# Thread/BLAS settings recommended by the implicit library; they are unrelated
# to the modelling itself.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

# Declare the implicit AlternatingLeastSquares model.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    use_gpu=False,
    iterations=15,
    dtype=np.float32,
)


In [12]:
# The ALS model used here takes an (item x user) matrix as input, so the
# (user x item) matrix is transposed before training.
# NOTE(review): the expected orientation may depend on the installed implicit
# version - confirm against the release in use.
csr_data_transpose = csr_data.transpose()
csr_data_transpose

<3953x6042 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [13]:
# Sanity check of the transposed matrix dimensions: (movies, users).
csr_data_transpose.shape

(3953, 6042)

In [14]:
# Train the model on the transposed (item x user) matrix.
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

## STEP 6. 모델이 예측한 나의 선호도 파악

In [15]:
# Convert a movie title into its movie id.
def movie_name_to_id(name):
    """Return the movie_id of the movie whose title equals *name*.

    The lookup is an exact string match against ``movies['title']``; when
    several rows match, the first one is returned.

    Raises:
        IndexError: if no movie has that title. The exception type matches
            what the bare ``.values[0]`` lookup raised, but the message now
            names the missing title.
    """
    matches = movies.loc[movies['title'] == name, 'movie_id']
    if matches.empty:
        raise IndexError(f'no movie titled {name!r} found in movies')
    return matches.values[0]

In [16]:
# Fetch the learned latent vectors: mine (user 6041) and the one of a movie
# I like.
forrest_gump_id = movie_name_to_id('Forrest Gump (1994)')
my_vector = als_model.user_factors[6041]
favorite_movie_vector = als_model.item_factors[forrest_gump_id]

In [17]:
# Latent factor vector the model learned for user 6041.
my_vector

array([ 0.1623443 , -0.07401556,  0.8240018 , -0.6871601 , -0.04592803,
        0.3423583 ,  0.2267575 ,  0.64966327,  0.18826693, -0.17605864,
       -0.41462815,  1.1102207 , -0.4886768 ,  0.23516387,  1.1871564 ,
       -0.28246757, -0.8502416 , -0.05152407, -0.33376202, -0.20172164,
       -0.50900745,  0.06268447, -0.5305398 , -0.12482007,  0.23337884,
       -0.29522663, -0.87237966, -0.4872223 ,  0.72182083,  0.2568479 ,
       -0.6855538 ,  0.49680376, -0.35669345,  1.1372882 , -0.40675902,
        0.5444049 ,  0.38243684,  0.5493091 ,  0.6741336 , -0.6564884 ,
       -1.0132062 , -0.05601373,  0.17710997,  0.40218547, -0.30773413,
       -0.14729759,  0.18626754, -0.03466138,  0.74686956,  0.21454585,
       -0.18021126,  0.2709526 , -0.2704668 , -0.30240917, -0.09221961,
        0.31588495,  0.58604914, -0.51854223, -0.1308269 ,  0.19419686,
        0.7710362 , -0.46195918, -0.34056166, -0.32727638, -0.24556333,
       -0.97880894, -0.07605082, -0.08660269,  0.36677867,  0.17

In [18]:
# Latent factor vector of a movie I like ('Forrest Gump (1994)').
favorite_movie_vector

array([ 0.00117883,  0.00045915,  0.01572439, -0.0002318 ,  0.01184276,
        0.00932063,  0.02768881,  0.03932397,  0.00316039,  0.00316417,
       -0.02987609,  0.01443362, -0.01685936,  0.02382503,  0.03943511,
        0.01873153, -0.02215546, -0.00220382, -0.02546836,  0.00996889,
       -0.00580163,  0.03431116,  0.03956668, -0.00253962, -0.0032042 ,
        0.02800803, -0.03829354, -0.00887888,  0.01999032, -0.00946937,
       -0.01039126, -0.00522293, -0.02566945,  0.06808751,  0.01257515,
        0.01822375,  0.01788115,  0.01862646,  0.03102417, -0.02919525,
       -0.02346634,  0.0216224 ,  0.00693724,  0.04644457,  0.01714611,
       -0.02775792, -0.01151172, -0.00895393,  0.02374102, -0.00126695,
        0.03012776, -0.00279449,  0.00095082,  0.00720371,  0.01217381,
        0.00106349, -0.0004722 ,  0.00834859, -0.02600373,  0.0354967 ,
        0.01107146, -0.00199127,  0.01352325, -0.01730545, -0.01507779,
       -0.01938126,  0.01094178,  0.02289118,  0.02396724,  0.01

In [19]:
# Given a movie title, return how strongly the model recommends it.
def score_movie(movie_name, user_id=6041):
    """Return the model's preference score of a user for a movie.

    Args:
        movie_name: Movie title, resolved to an id via ``movie_name_to_id``.
        user_id: Index into ``als_model.user_factors``. Defaults to 6041,
            the user this notebook scores for, so existing one-argument
            calls behave as before.

    Returns:
        The dot product of the user's and the movie's latent factor vectors.
    """
    user_vector = als_model.user_factors[user_id]
    movie_vector = als_model.item_factors[movie_name_to_id(movie_name)]
    return np.dot(user_vector, movie_vector)

In [20]:
# Print my predicted score for every movie in the popularity top 30.
for top_id in movie_top30.index:
    top_title = movies.loc[movies['movie_id'] == top_id, 'title'].values[0]
    print(top_title, " :", score_movie(top_title))

American Beauty (1999)  : 0.5725535
Star Wars: Episode IV - A New Hope (1977)  : 0.20161559
Star Wars: Episode V - The Empire Strikes Back (1980)  : 0.19414464
Star Wars: Episode VI - Return of the Jedi (1983)  : 0.31211582
Saving Private Ryan (1998)  : 0.64457166
Terminator 2: Judgment Day (1991)  : 0.3510162
Silence of the Lambs, The (1991)  : 0.41825268
Raiders of the Lost Ark (1981)  : 0.09789445
Back to the Future (1985)  : 0.16998282
Matrix, The (1999)  : 0.28748482
Jurassic Park (1993)  : 0.4832255
Sixth Sense, The (1999)  : 0.31981602
Fargo (1996)  : 0.40934247
Braveheart (1995)  : 0.51483834
Men in Black (1997)  : 0.35479596
Schindler's List (1993)  : 0.5304626
Princess Bride, The (1987)  : 0.099569336
Shakespeare in Love (1998)  : 0.4842227
L.A. Confidential (1997)  : 0.29624772
Shawshank Redemption, The (1994)  : 0.41647035
Godfather, The (1972)  : 0.068565644
Groundhog Day (1993)  : 0.39360207
E.T. the Extra-Terrestrial (1982)  : 0.19025218
Being John Malkovich (1999)  : 0.

## STEP 7. 선호 영화와 비슷한 영화 추천

In [21]:
# Find movies similar to one I like - the query movie here is Braveheart.
favorite_movie = 'Braveheart (1995)'
movie_id = movie_name_to_id(favorite_movie)
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(110, 0.99999994),
 (2028, 0.6156363),
 (589, 0.45457485),
 (480, 0.42287278),
 (527, 0.37709224),
 (1580, 0.34008333),
 (1610, 0.33923015),
 (1233, 0.3315871),
 (989, 0.33058628),
 (2858, 0.3092162),
 (1408, 0.30691686),
 (1259, 0.30197212),
 (266, 0.3011503),
 (3753, 0.30077717),
 (318, 0.29726377)]

In [22]:
# Print the title and similarity score of every similar movie.
for sim_id, similarity in similar_movie:
    sim_title = movies.loc[movies['movie_id'] == sim_id, 'title'].values[0]
    print(sim_title, ' :', similarity)

Braveheart (1995)  : 0.99999994
Saving Private Ryan (1998)  : 0.6156363
Terminator 2: Judgment Day (1991)  : 0.45457485
Jurassic Park (1993)  : 0.42287278
Schindler's List (1993)  : 0.37709224
Men in Black (1997)  : 0.34008333
Hunt for Red October, The (1990)  : 0.33923015
Boat, The (Das Boot) (1981)  : 0.3315871
Schlafes Bruder (Brother of Sleep) (1995)  : 0.33058628
American Beauty (1999)  : 0.3092162
Last of the Mohicans, The (1992)  : 0.30691686
Stand by Me (1986)  : 0.30197212
Legends of the Fall (1994)  : 0.3011503
Patriot, The (2000)  : 0.30077717
Shawshank Redemption, The (1994)  : 0.29726377


In [23]:
# Show title and genre of the similar movies (rows come in `movies` order).
similar_ids = [sim_id for sim_id, _ in similar_movie]
movies.loc[movies['movie_id'].isin(similar_ids), ['title', 'genre']]

Unnamed: 0,title,genre
108,Braveheart (1995),Action|Drama|War
263,Legends of the Fall (1994),Drama|Romance|War|Western
315,"Shawshank Redemption, The (1994)",Drama
476,Jurassic Park (1993),Action|Adventure|Sci-Fi
523,Schindler's List (1993),Drama|War
585,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
977,Schlafes Bruder (Brother of Sleep) (1995),Drama
1214,"Boat, The (Das Boot) (1981)",Action|Drama|War
1239,Stand by Me (1986),Adventure|Comedy|Drama
1385,"Last of the Mohicans, The (1992)",Action|Romance|War


## STEP 8. 내가 좋아할 영화들 추천

In [24]:
# Recommend movies that user 6041 is likely to enjoy.
# NOTE(review): filter_already_liked_items=True presumably excludes movies the
# user already has in csr_data - confirm against the implicit documentation.
user = 6041
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(2858, 0.5725536),
 (527, 0.5304626),
 (110, 0.51483834),
 (593, 0.41825268),
 (318, 0.41647035),
 (608, 0.40934247),
 (1265, 0.393602),
 (1580, 0.35479593),
 (589, 0.35101616),
 (2762, 0.31981602),
 (1210, 0.31211585),
 (1617, 0.29624775),
 (2571, 0.28748482),
 (1233, 0.2856917),
 (1213, 0.26914805),
 (1704, 0.26002482),
 (2997, 0.24599585),
 (2628, 0.24523069),
 (1, 0.24372476),
 (1721, 0.23824495)]

In [25]:
# Print the title and predicted score of every recommended movie.
for rec_id, rec_score in movie_recommended:
    rec_title = movies.loc[movies['movie_id'] == rec_id, 'title'].values[0]
    print(rec_title, ' :', rec_score)

American Beauty (1999)  : 0.5725536
Schindler's List (1993)  : 0.5304626
Braveheart (1995)  : 0.51483834
Silence of the Lambs, The (1991)  : 0.41825268
Shawshank Redemption, The (1994)  : 0.41647035
Fargo (1996)  : 0.40934247
Groundhog Day (1993)  : 0.393602
Men in Black (1997)  : 0.35479593
Terminator 2: Judgment Day (1991)  : 0.35101616
Sixth Sense, The (1999)  : 0.31981602
Star Wars: Episode VI - Return of the Jedi (1983)  : 0.31211585
L.A. Confidential (1997)  : 0.29624775
Matrix, The (1999)  : 0.28748482
Boat, The (Das Boot) (1981)  : 0.2856917
GoodFellas (1990)  : 0.26914805
Good Will Hunting (1997)  : 0.26002482
Being John Malkovich (1999)  : 0.24599585
Star Wars: Episode I - The Phantom Menace (1999)  : 0.24523069
Toy Story (1995)  : 0.24372476
Titanic (1997)  : 0.23824495


In [26]:
# Show title and genre of the recommended movies (rows come in `movies` order).
recommended_ids = [rec_id for rec_id, _ in movie_recommended]
movies.loc[movies['movie_id'].isin(recommended_ids), ['title', 'genre']]

Unnamed: 0,title,genre
0,Toy Story (1995),Animation|Children's|Comedy
108,Braveheart (1995),Action|Drama|War
315,"Shawshank Redemption, The (1994)",Drama
523,Schindler's List (1993),Drama|War
585,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
589,"Silence of the Lambs, The (1991)",Drama|Thriller
604,Fargo (1996),Crime|Drama|Thriller
1192,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War
1195,GoodFellas (1990),Crime|Drama
1214,"Boat, The (Das Boot) (1981)",Action|Drama|War


## [ 루브릭 ]
1. CSR matrix가 정상적으로 만들어졌다.
* 사용자와 아이템 개수를 바탕으로 정확한 사이즈로 만들었다.<br>

2. MF 모델이 정상적으로 훈련되어 그럴듯한 추천이 이루어졌다.
* 사용자와 아이템 벡터 내적수치가 의미있게 형성되었다.<br>

3. 비슷한 영화 찾기와 유저에게 추천하기의 과정이 정상적으로 진행되었다.

## [ 회고 ]

* 코랩에서 과제를 진행하였는데 도저히 원인을 알 수 없는 오류가 발생해 많은 시간을 허비하였음. 오늘에서야 조원인 호원님이 코랩과 LMS에 설치된 implicit의 버전 차이 문제라는 것을 알려줘서, LMS로 코드를 옮겨 겨우 시간 내에 마무리할 수 있었음.
<br><br>
* 추천시스템에서 암묵적 평가의 활용에 대해 이해하게 되었음
<br>
* MF에 대해 이해하고 활용하는 방법에 대해 알게 되었는데 제대로 활용하기 위해서는 좀 더 다양한 적용을 통해 경험을 축적할 필요가 있음
* CSR Matrix를 활용한 추천 시스템에 대해 이해하게 되었음.