In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from sklearn.metrics import mean_squared_error

In [2]:
def get_rmse(R, P, Q, non_zeros) :
    """Return the RMSE between observed ratings and their factorized predictions.

    Parameters
    ----------
    R : numpy.ndarray
        Rating matrix, indexed as ``R[row, col]``.
    P, Q : numpy.ndarray
        Factor matrices; the prediction for ``(row, col)`` is the dot product
        of ``P[row]`` and ``Q[col]``.
    non_zeros : sequence
        One entry per observed rating. Only the first two items of each entry
        (row index, column index) are read; any further items are ignored.

    Returns
    -------
    numpy.float64
        Root mean squared error over the listed positions only.

    Raises
    ------
    ValueError
        If ``non_zeros`` is empty (the error is undefined without observations).
    """
    if len(non_zeros) == 0:
        raise ValueError("non_zeros is empty: RMSE is undefined without observed ratings")

    row_idx = [entry[0] for entry in non_zeros]
    col_idx = [entry[1] for entry in non_zeros]

    observed = R[row_idx, col_idx]

    # Only the observed cells are compared, so take the row-wise dot products
    # P[row] . Q[col] directly instead of materializing the dense P @ Q.T.
    predicted = np.sum(P[row_idx] * Q[col_idx], axis=1)

    return np.sqrt(np.mean((observed - predicted) ** 2))

In [3]:
def matrix_factorization(R, K, steps = 200, learning_rate = 0.01, r_lambda = 0.01, random_state = 32) :
    """Factorize a rating matrix into P (users x K) and Q (items x K) with SGD.

    Parameters
    ----------
    R : numpy.ndarray
        2-D rating matrix. Only cells with ``R[i, j] > 0`` are treated as
        observed; NaN cells compare False and are therefore skipped.
    K : int
        Number of latent factors.
    steps : int, default 200
        Number of full passes over the observed ratings.
    learning_rate : float, default 0.01
        SGD step size.
    r_lambda : float, default 0.01
        L2 regularization coefficient.
    random_state : int or None, default 32
        Seed for the private random generator used to initialize P and Q.
        The default reproduces the values previously obtained with
        ``np.random.seed(32)``; ``None`` gives a non-deterministic start.

    Returns
    -------
    (P, Q) : tuple of numpy.ndarray
        ``P @ Q.T`` is the predicted rating matrix.
    """
    # The progress bar is cosmetic: use tqdm's environment-aware front end
    # (the deprecated `tqdm_notebook` alias emits a warning) and fall back to
    # plain iteration when tqdm is not installed.
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    num_users, num_items = R.shape

    # A private RandomState yields the same stream as seeding the global
    # generator, without clobbering the caller's global RNG state.
    rng = np.random.RandomState(random_state)
    P = rng.normal(scale=1./K, size=(num_users, K))
    Q = rng.normal(scale=1./K, size=(num_items, K))

    # Collect (row, col, rating) for every cell with R > 0, in row-major order.
    with np.errstate(invalid='ignore'):
        obs_rows, obs_cols = np.nonzero(R > 0)
    non_zeros = list(zip(obs_rows.tolist(), obs_cols.tolist(), R[obs_rows, obs_cols].tolist()))

    # Stochastic gradient descent over the observed ratings.
    for step in progress(range(steps)):
        for i, j, r in non_zeros:
            # Error between the actual rating and the current prediction.
            eij = r - np.dot(P[i, :], Q[j, :].T)

            # Regularized update. Q's update deliberately sees the freshly
            # updated P row, exactly as before, so results are unchanged.
            P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])

        # RMSE is only needed for the progress report, so compute it only then.
        if (step % 10) == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print(" ### iteration step : ", step, "rmse : ", rmse)

    return P, Q
    
            
        

In [4]:
# Load the MovieLens movies and ratings tables (movies first, then ratings).
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in ('./ml_latest_small/movies.csv', './ml_latest_small/ratings.csv')
)

In [5]:
# Inspect the loaded movies table.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [6]:
# Inspect the loaded ratings table.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [7]:
# Keep only the columns used downstream. Selecting by name (rather than
# dropping "the last column" by position) keeps this cell idempotent:
# re-running it can no longer strip `rating` once `timestamp` is already gone.
ratings = ratings[['userId', 'movieId', 'rating']]

In [8]:
# Check the ratings table after the previous cell removed a column.
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [9]:
# Reshape the long ratings table into a user x movie matrix
# (rows: userId, columns: movieId, cells: rating).
ratings_matrix = pd.pivot_table(
    ratings,
    values='rating',
    index='userId',
    columns='movieId',
)
ratings_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [10]:
# User-item matrix: R (the original matrix)

# Join ratings with movies on movieId to bring in the `title` column.
rating_movies = ratings.merge(movies, on='movieId')


In [11]:
# Rebuild the user x movie matrix with movie titles as the column labels.
ratings_matrix = pd.pivot_table(
    rating_movies,
    values='rating',
    index='userId',
    columns='title',
)
ratings_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# (number of users, number of distinct titles) in the rating matrix.
ratings_matrix.shape

(610, 9719)

In [13]:
%%time

# 경사하강법 이용, 행렬 분해

P, Q = matrix_factorization(ratings_matrix.values, K = 50)
# K : 잠재요인
# stpes = 200 (default) 200번 반복수행
# learning_rate : 학습률 (0.01 default)
# r_lambda : 규제 계수 (0.01 default)

pred_matrix = np.dot(P, Q.T)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step in tqdm_notebook(range(steps)) :


  0%|          | 0/200 [00:00<?, ?it/s]

 ### iteration step :  0 rmse :  2.905044129579142
 ### iteration step :  10 rmse :  0.732370917493066
 ### iteration step :  20 rmse :  0.5100363944903933
 ### iteration step :  30 rmse :  0.3699361125363289
 ### iteration step :  40 rmse :  0.2932843562797861
 ### iteration step :  50 rmse :  0.24954823074373997
 ### iteration step :  60 rmse :  0.22276405528840862
 ### iteration step :  70 rmse :  0.20508651520119386
 ### iteration step :  80 rmse :  0.19263955489343895
 ### iteration step :  90 rmse :  0.18341898417277466
 ### iteration step :  100 rmse :  0.1763180745587397
 ### iteration step :  110 rmse :  0.17068341643363832
 ### iteration step :  120 rmse :  0.1661057371742
 ### iteration step :  130 rmse :  0.1623156142546809
 ### iteration step :  140 rmse :  0.15912824311817744
 ### iteration step :  150 rmse :  0.15641247796930127
 ### iteration step :  160 rmse :  0.15407257484283515
 ### iteration step :  170 rmse :  0.1520369387715121
 ### iteration step :  180 rmse :  

In [16]:
# >> The RMSE kept decreasing as gradient descent progressed.

# Wrap the predicted ratings in a DataFrame labelled like ratings_matrix.
ratings_pred_matrix = pd.DataFrame(
    pred_matrix,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)

In [17]:
# Inspect the predicted rating matrix.
ratings_pred_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.273963,3.968860,3.502277,4.632358,4.294274,1.254869,3.464004,2.205209,3.948687,4.113900,...,1.368422,3.810297,3.712173,2.813051,2.790791,3.974396,3.714110,2.211467,3.975657,0.821587
2,3.148575,3.554164,3.178554,4.162415,4.091128,1.285358,3.447296,1.966200,3.697637,3.606388,...,0.999770,4.056075,3.326419,2.637834,2.309137,4.143337,2.524310,1.829345,4.663548,0.718622
3,2.205413,1.824068,1.640913,2.220948,2.171137,0.659704,1.279714,1.451641,2.688755,1.851174,...,0.671087,2.570023,2.378504,1.796954,1.326315,1.765659,0.912099,1.074076,0.469818,0.333047
4,2.519857,3.198844,2.794729,3.395215,3.174492,0.771050,2.925133,1.863895,1.848199,2.807664,...,1.146045,3.392645,2.339116,2.042912,1.890475,2.494369,0.813936,1.193126,4.367883,0.584764
5,2.361774,3.447209,2.993070,3.670116,3.811139,1.023205,2.135356,1.831460,1.142699,3.040245,...,0.852795,4.107850,2.954328,2.063017,1.943803,4.484902,1.872358,1.486261,2.772045,0.603944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.050533,3.362204,2.977000,3.823673,3.654721,1.011492,2.280782,2.252809,3.672362,3.484556,...,1.068492,3.414527,3.350380,2.355789,2.341239,3.381446,2.481038,1.618138,3.702078,0.639345
607,2.548289,3.273925,3.048512,4.115281,3.985602,1.176656,3.791175,1.829326,5.047535,3.298244,...,1.024577,3.431741,3.471967,2.277821,2.324876,4.147333,2.894677,2.001582,4.444307,0.613166
608,2.376587,3.017929,2.697119,3.813240,3.556881,0.996095,2.412342,1.707802,3.858543,3.099463,...,0.776110,3.411364,3.038113,2.108248,2.110086,4.494525,3.522987,1.829911,3.090396,0.645649
609,2.681859,3.197889,2.827162,3.780976,3.635769,1.056599,3.044660,1.761487,2.932878,3.251109,...,0.974382,3.292826,3.007079,2.221044,2.141424,3.583822,2.914998,1.616827,3.325348,0.630064


In [18]:
# Shape of the predicted matrix, for comparison with ratings_matrix.shape.
ratings_pred_matrix.shape

(610, 9719)

In [19]:
# Shape of the original rating matrix, for comparison with the predicted one.
ratings_matrix.shape

(610, 9719)

In [20]:
# First ten users of the predicted rating matrix.
ratings_pred_matrix.head(10)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.273963,3.96886,3.502277,4.632358,4.294274,1.254869,3.464004,2.205209,3.948687,4.1139,...,1.368422,3.810297,3.712173,2.813051,2.790791,3.974396,3.71411,2.211467,3.975657,0.821587
2,3.148575,3.554164,3.178554,4.162415,4.091128,1.285358,3.447296,1.9662,3.697637,3.606388,...,0.99977,4.056075,3.326419,2.637834,2.309137,4.143337,2.52431,1.829345,4.663548,0.718622
3,2.205413,1.824068,1.640913,2.220948,2.171137,0.659704,1.279714,1.451641,2.688755,1.851174,...,0.671087,2.570023,2.378504,1.796954,1.326315,1.765659,0.912099,1.074076,0.469818,0.333047
4,2.519857,3.198844,2.794729,3.395215,3.174492,0.77105,2.925133,1.863895,1.848199,2.807664,...,1.146045,3.392645,2.339116,2.042912,1.890475,2.494369,0.813936,1.193126,4.367883,0.584764
5,2.361774,3.447209,2.99307,3.670116,3.811139,1.023205,2.135356,1.83146,1.142699,3.040245,...,0.852795,4.10785,2.954328,2.063017,1.943803,4.484902,1.872358,1.486261,2.772045,0.603944
6,2.409541,3.09308,2.955488,3.686453,3.484939,1.102389,3.774179,1.763484,3.999174,3.098129,...,1.028544,3.373852,2.82259,2.162152,1.980726,3.255319,2.438597,1.694335,3.966651,0.666793
7,2.025231,3.169534,2.779727,3.674555,3.724892,0.839802,1.975337,1.727208,2.082577,2.752331,...,0.681835,3.481439,2.427917,1.901725,1.767718,2.745958,2.03715,1.569366,3.196775,0.536124
8,2.229575,2.991344,2.583305,3.367429,3.26797,0.871226,2.572229,1.850839,3.056741,2.995813,...,0.71666,2.986955,3.272067,1.987693,1.949978,4.395795,2.584269,1.7244,3.221496,0.541917
9,2.699281,3.469003,3.091729,3.617322,3.273078,0.969142,2.614965,1.992483,4.201491,3.099495,...,1.196121,3.455241,2.832457,2.304098,2.139509,2.725868,1.063035,1.900494,2.717043,0.544619
10,2.17041,2.511963,2.359557,3.426292,3.496949,1.230205,3.883781,1.36931,5.083879,2.932611,...,0.828698,3.520232,2.899237,1.846301,1.711622,2.103183,1.548736,1.438828,2.996695,0.500477


In [21]:
# First ten users of the actual rating matrix, to compare with the predictions.
ratings_matrix.head(10)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,1.0,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [22]:
# Recommend to user 9 the movies they have not seen yet (unseen_movies), highest predicted rating first

def get_unseen_movies(ratings_matrix, userId) :
    """Return the movies the given user has not rated yet.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        User x movie rating table; rows are labelled by user id and columns
        by movie.
    userId
        Row label of the user to look up.

    Returns
    -------
    list
        Column labels of ``ratings_matrix``, in column order, for which the
        user's rating is not greater than 0 (this includes NaN entries).

    Raises
    ------
    KeyError
        Propagated from ``.loc`` when ``userId`` is not a row label.
    """
    # All ratings of this user as a Series indexed by the column labels.
    user_rating = ratings_matrix.loc[userId, :]

    # A rating greater than 0 marks a movie as already seen. A set makes the
    # membership test below O(1) instead of scanning a list per movie.
    already_seen = set(user_rating[user_rating > 0].index.tolist())

    # Everything not already seen, in the original column order.
    return [movie for movie in ratings_matrix.columns.tolist() if movie not in already_seen]


In [23]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n = 10) :
    """Return the ``top_n`` highest predicted ratings among ``unseen_list``.

    Looks up row ``userId`` of ``pred_df`` restricted to the columns in
    ``unseen_list`` and returns the leading ``top_n`` entries after sorting
    the predicted ratings in descending order.
    """
    candidate_scores = pred_df.loc[userId, unseen_list]
    ranked_scores = candidate_scores.sort_values(ascending=False)
    return ranked_scores.iloc[:top_n]

In [26]:
# Titles user 9 has not rated yet.
unseen_list = get_unseen_movies(ratings_matrix, userId=9)

# Top-10 recommendations for user 9 from the latent-factor predictions.
recomm_movies = recomm_movie_by_userid(
    ratings_pred_matrix,
    userId=9,
    unseen_list=unseen_list,
    top_n=10,
)
recomm_movies

title
This Is Spinal Tap (1984)                                5.535472
Monty Python and the Holy Grail (1975)                   5.509462
Star Wars: Episode V - The Empire Strikes Back (1980)    5.264972
Yojimbo (1961)                                           5.240109
Lord of the Rings: The Return of the King, The (2003)    5.228942
Blade Runner (1982)                                      5.218429
Gosford Park (2001)                                      5.183485
Toy Story 3 (2010)                                       5.044006
Spirited Away (Sen to Chihiro no kamikakushi) (2001)     5.036169
Austin Powers: The Spy Who Shagged Me (1999)             5.035353
Name: 9, dtype: float64

In [27]:
# Show the recommended titles and their predicted scores as a DataFrame.
recomm_movies.to_frame(name='pred_score')

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
This Is Spinal Tap (1984),5.535472
Monty Python and the Holy Grail (1975),5.509462
Star Wars: Episode V - The Empire Strikes Back (1980),5.264972
Yojimbo (1961),5.240109
"Lord of the Rings: The Return of the King, The (2003)",5.228942
Blade Runner (1982),5.218429
Gosford Park (2001),5.183485
Toy Story 3 (2010),5.044006
Spirited Away (Sen to Chihiro no kamikakushi) (2001),5.036169
Austin Powers: The Spy Who Shagged Me (1999),5.035353


In [None]:
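# Insight
# Result of recommending movies with MF (matrix factorization) based latent-factor collaborative filtering:
# the recommendations were read as mostly SF-style movies with a dark mood.
# NOTE(review): the top-10 shown above also contains comedies and animation
# (e.g. This Is Spinal Tap, Toy Story 3), so this reading should be re-checked.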
