<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-loading" data-toc-modified-id="Data-loading-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data loading</a></span></li><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Preprocessing</a></span></li><li><span><a href="#데이터-분석" data-toc-modified-id="데이터-분석-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>데이터 분석</a></span></li><li><span><a href="#모델에-활용하기위한-전처리" data-toc-modified-id="모델에-활용하기위한-전처리-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>모델에 활용하기위한 전처리</a></span></li><li><span><a href="#모델설계(matrix-factorization)" data-toc-modified-id="모델설계(matrix-factorization)-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>모델설계(matrix factorization)</a></span></li><li><span><a href="#MF-모델-학습하기" data-toc-modified-id="MF-모델-학습하기-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>MF 모델 학습하기</a></span></li><li><span><a href="#영화-추천받기" data-toc-modified-id="영화-추천받기-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>영화 추천받기</a></span></li><li><span><a href="#회고" data-toc-modified-id="회고-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>회고</a></span></li></ul></div>

# Data loading 

In [30]:
import numpy as np
import scipy
import implicit
import pandas as pd
import os

print(np.__version__)
print(scipy.__version__)
print(implicit.__version__)

1.21.4
1.7.1
0.4.8


In [46]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


# Preprocessing

In [47]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [48]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [49]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [50]:
ratings.head()

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [51]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


# 데이터 분석

In [52]:
data = pd.merge(ratings, movies, on='movie_id')
data

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...
836473,5851,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western
836474,5854,3026,4,958346883,Slaughterhouse (1987),Horror
836475,5854,690,3,957744257,"Promise, The (Versprechen, Das) (1994)",Romance
836476,5938,2909,4,957273353,"Five Wives, Three Secretaries and Me (1998)",Documentary


In [53]:
# 소문자로 바꾸기
data['title'] = data['title'].str.lower()
data['genre'] = data['genre'].str.lower()
data

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
0,1,1193,5,978300760,one flew over the cuckoo's nest (1975),drama
1,2,1193,5,978298413,one flew over the cuckoo's nest (1975),drama
2,12,1193,4,978220179,one flew over the cuckoo's nest (1975),drama
3,15,1193,4,978199279,one flew over the cuckoo's nest (1975),drama
4,17,1193,5,978158471,one flew over the cuckoo's nest (1975),drama
...,...,...,...,...,...,...
836473,5851,3607,5,957756608,one little indian (1973),comedy|drama|western
836474,5854,3026,4,958346883,slaughterhouse (1987),horror
836475,5854,690,3,957744257,"promise, the (versprechen, das) (1994)",romance
836476,5938,2909,4,957273353,"five wives, three secretaries and me (1998)",documentary


In [54]:
# 유저 수
data['user_id'].nunique() #num of unique를 출력

6039

In [55]:
# 영화수
data['movie_id'].nunique()

3628

In [56]:
# 유저별 영화대한 통계
user_count = data.groupby('user_id')['movie_id'].count()
user_count.describe()

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: movie_id, dtype: float64

In [57]:
# 인기많은 영화
movie_count = data.groupby('title')['user_id'].count()
movie_count.sort_values(ascending=False).tail(30)

title
fall time (1995)                                                   1
baby, the (1973)                                                   1
mascara (1999)                                                     1
open season (1996)                                                 1
eaten alive (1976)                                                 1
fausto (1993)                                                      1
project moon base (1953)                                           1
ten benny (1997)                                                   1
telling you (1998)                                                 1
resurrection man (1998)                                            1
retro puppetmaster (1999)                                          1
outside ozona (1998)                                               1
death in brunswick (1991)                                          1
last resort (1994)                                                 1
criminal lovers (les amants 

In [58]:
my_favorite = ['matrix, the (1999)' , 'back stage (2000)' ,'eaten alive (1976)' ,'tokyo fist (1995)' ,'beauty (1998)']
my_playlist = pd.DataFrame({'user_id': ['5949']*5, 'title': my_favorite, 'counts':[30]*5})

if not data.isin({'user_id':['5949']})['user_id'].any():  # user_id에 'zimin'이라는 데이터가 없다면
    data = data.append(my_playlist)                           # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다. 

data.tail(10) 

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
836473,5851,3607.0,5,957756600.0,one little indian (1973),comedy|drama|western
836474,5854,3026.0,4,958346900.0,slaughterhouse (1987),horror
836475,5854,690.0,3,957744300.0,"promise, the (versprechen, das) (1994)",romance
836476,5938,2909.0,4,957273400.0,"five wives, three secretaries and me (1998)",documentary
836477,5948,1360.0,5,1016564000.0,identification of a woman (identificazione di ...,drama
0,5949,,30,,"matrix, the (1999)",
1,5949,,30,,back stage (2000),
2,5949,,30,,eaten alive (1976),
3,5949,,30,,tokyo fist (1995),
4,5949,,30,,beauty (1998),


# 모델에 활용하기위한 전처리

In [61]:
# 고유한 유저, 영화를 찾아내는 코드
user_unique = data['user_id'].unique()
movie_unique = data['title'].unique()
print(len(user_unique))
print(len(movie_unique))

# unique 유저, 아티스트를 찾아내 v:k(indexing:user)를 맵핑하는 구문
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}
user_to_idx
movie_to_idx

6040
3628


{"one flew over the cuckoo's nest (1975)": 0,
 'james and the giant peach (1996)': 1,
 'my fair lady (1964)': 2,
 'erin brockovich (2000)': 3,
 "bug's life, a (1998)": 4,
 'princess bride, the (1987)': 5,
 'ben-hur (1959)': 6,
 'christmas story, a (1983)': 7,
 'snow white and the seven dwarfs (1937)': 8,
 'wizard of oz, the (1939)': 9,
 'beauty and the beast (1991)': 10,
 'gigi (1958)': 11,
 'miracle on 34th street (1947)': 12,
 "ferris bueller's day off (1986)": 13,
 'sound of music, the (1965)': 14,
 'airplane! (1980)': 15,
 'tarzan (1999)': 16,
 'bambi (1942)': 17,
 'awakenings (1990)': 18,
 'big (1988)': 19,
 'pleasantville (1998)': 20,
 'wallace & gromit: the best of aardman animation (1996)': 21,
 'back to the future (1985)': 22,
 "schindler's list (1993)": 23,
 'meet joe black (1998)': 24,
 'pocahontas (1995)': 25,
 'e.t. the extra-terrestrial (1982)': 26,
 'titanic (1997)': 27,
 'ponette (1996)': 28,
 'close shave, a (1995)': 29,
 'antz (1998)': 30,
 'girl, interrupted (1999)':

In [62]:
# 인덱싱이 잘 되었는지 확인
print(user_to_idx['5949'])     
print(movie_to_idx['matrix, the (1999)'])

6039
124


In [64]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드--> 복잡한 user_id와 movie를 인덱싱값으로 바꾸는 코드
# get(): Key로 Value얻기(get)
# map(f, iterable)은 함수(f)와 반복 가능한 데이터를 입력으로 받는다. map 함수는 입력받은 데이터의 각 요소에 함수 f를 적용한 결과를 리턴하는 함수이다.
# map()함수를 왜 쓰는지 확인필요. --> https://codinglevelup.tistory.com/83

temp_user_data = data['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(data):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    data['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# movie_to_idx을 통해 movie의 'title' 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = data['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(data):
    print('movie column indexing OK!!')
    data['title'] = temp_movie_data
else:
    print('movie column indexing Fail!!')

data

user_id column indexing OK!!
movie column indexing OK!!


Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
0,0,1193.0,5,978300760.0,0,drama
1,1,1193.0,5,978298413.0,0,drama
2,2,1193.0,4,978220179.0,0,drama
3,3,1193.0,4,978199279.0,0,drama
4,4,1193.0,5,978158471.0,0,drama
...,...,...,...,...,...,...
0,6039,,30,,124,
1,6039,,30,,3480,
2,6039,,30,,3370,
3,6039,,30,,3441,


# 모델설계(matrix factorization)

In [65]:
from scipy.sparse import csr_matrix

num_user = data['user_id'].nunique()
num_movie = data['title'].nunique()

print(num_user)
print(num_movie)

csr_data = csr_matrix((data.counts, (data.user_id, data.title)), shape= (num_user, num_movie))
csr_data

# csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])

6040
3628


<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

# MF 모델 학습하기

In [67]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [68]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [69]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [70]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [71]:
SiHyeok, matrix = user_to_idx['5949'], movie_to_idx['matrix, the (1999)']
SiHyeok_vector, matrix_vector = als_model.user_factors[SiHyeok], als_model.item_factors[matrix]

In [72]:
SiHyeok_vector

array([ 0.32618383,  0.19303799,  0.32433218,  0.09497146,  0.60978365,
       -0.6598016 ,  1.7227674 ,  0.21951482, -0.17593342, -0.59955764,
       -0.0715293 ,  0.15206252, -0.3426157 ,  0.09569794,  0.6997509 ,
        1.0230776 , -0.82238346,  1.289791  , -1.4906485 ,  0.49464568,
       -0.1973952 ,  2.2219403 , -0.8553256 ,  1.1447054 ,  0.41198897,
       -1.444769  , -1.7540474 ,  0.34912118, -0.15432236,  0.55038625,
        0.56565475, -1.2053916 ,  0.7175467 ,  1.2500508 ,  1.3924603 ,
       -0.25133678,  0.55669683, -1.206602  , -1.4697223 , -1.8347267 ,
        0.5398969 , -0.6134697 ,  0.0285067 , -0.3233812 ,  1.8209544 ,
       -0.46179253,  1.5177987 ,  0.82245713, -1.7226928 ,  0.03663704,
        0.5646951 , -1.6740274 ,  1.7690079 , -0.14171869,  1.9723451 ,
        2.2792697 , -0.85445154,  0.9316445 , -0.09802068,  1.7872363 ,
       -0.19456878, -0.3261038 , -0.17512625,  0.09810306, -1.724945  ,
        0.45459795, -1.3925108 , -1.9736452 ,  1.2626199 ,  0.32

In [73]:
matrix_vector

array([ 0.01847467, -0.00887456,  0.01196309,  0.02674105,  0.00017046,
       -0.01798126,  0.02820741,  0.01215879, -0.00048436,  0.0139112 ,
        0.01119352,  0.01681052,  0.00659255,  0.00616379, -0.00458947,
       -0.00080127,  0.00873523,  0.03659083, -0.0157169 ,  0.01547554,
        0.00409913,  0.02625578,  0.01172511, -0.00169533,  0.01133032,
       -0.0020144 , -0.00671136,  0.02372355, -0.00957421, -0.00352102,
        0.00869877, -0.02809072,  0.00606648,  0.01300998,  0.00249483,
       -0.00920174, -0.01792815,  0.01108343,  0.00458632, -0.00527933,
        0.00313761, -0.00511267, -0.0188315 , -0.00906956,  0.04149845,
       -0.00176747,  0.00575252, -0.007669  , -0.00458829,  0.03935952,
       -0.00590109,  0.00775053,  0.0125361 , -0.03189727,  0.0242193 ,
        0.03023531, -0.00581784,  0.01890697,  0.00865569,  0.02516286,
       -0.00187069,  0.00216497, -0.00361784,  0.02786204,  0.00846371,
        0.0200375 ,  0.00518239, -0.01527593,  0.02167612,  0.02

In [74]:
# SiHyeok과 matrix를 내적하는 코드 -> 선호하는 영화
np.dot(SiHyeok_vector, matrix_vector)

0.9510372

In [76]:
# 선호하지 않는 영화
backstage = artist_to_idx['back stage (2000)']
backstage_vector = als_model.item_factors[backstage]
np.dot(SiHyeok_vector, backstage_vector)

0.30393395

# 영화 추천받기

In [77]:
favorite_movie = 'matrix, the (1999)'
movie_id = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(124, 1.0000001),
 (92, 0.76482785),
 (62, 0.63607883),
 (141, 0.5825625),
 (200, 0.57684237),
 (145, 0.57560116),
 (375, 0.5370542),
 (107, 0.5124092),
 (317, 0.48930123),
 (175, 0.48214877),
 (3441, 0.45190972),
 (3370, 0.44953355),
 (3386, 0.44005257),
 (3480, 0.43735176),
 (44, 0.43381098)]

In [78]:
#movie_to_idx 를 뒤집어, index로부터 movie 이름을 얻는 dict를 생성합니다. 
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movie]

['matrix, the (1999)',
 'terminator 2: judgment day (1991)',
 'total recall (1990)',
 'fugitive, the (1993)',
 'terminator, the (1984)',
 'fifth element, the (1997)',
 'face/off (1997)',
 'jurassic park (1993)',
 'twelve monkeys (1995)',
 'men in black (1997)',
 'tokyo fist (1995)',
 'eaten alive (1976)',
 'beauty (1998)',
 'back stage (2000)',
 'star wars: episode iv - a new hope (1977)']

# 회고
    ALS는 하나의 행렬의 수렴을 끝내고 다른 하나의 행렬의 수렴을 진행한다는 점에서 가장 큰 차이를 보입니다.
    (기존의 SGD가 두 개의 행렬(User Latent, Item Latent)을 동시에 최적화하는 방법)
    예로, 초기 아이템, 사용자행렬을 초기화 후, -->
    아이템행렬을 고정하고 사용자 행렬을 최적화, -->
    사용자행렬을 고정하고 아이템행렬을 최적화, -->
    위절차를 반복하면서 행렬의 최적화값이 달라집니다. 
    해당과정을 반복해가면서 두 행렬 모두 수렴의 근접한 값을 찾아주는게 ALS알고리즘의 핵심임을 공부하는 노드 였습니다. 
    
    최초 선택된 5개의 my_favorit 리스트에 따라 결과값이 다르게 나오게되기 때문에 직접적으로 영화제목을 입력받기( 물어보기) 보다, 질문을 통해 favorit을 유추한 뒤 상관관계를 도출한다면 더욱 정확한 값을 얻을 수 있을 듯 하다.   
    
    ref)
    1. https://eda-ai-lab.tistory.com/529
    1. https://codinglevelup.tistory.com/83

    