In [1]:
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [2]:
def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between observed ratings and their factorized predictions.

    Only the positions listed in ``non_zeros`` are compared; every other cell
    of ``R`` is ignored. The prediction for position ``(i, j)`` is the dot
    product of row ``i`` of ``P`` and row ``j`` of ``Q`` (i.e. entry ``(i, j)``
    of ``P @ Q.T``), so ``P`` and ``Q`` must share their second dimension.

    Parameters
    ----------
    R : numpy.ndarray
        2-D rating matrix, indexed as ``R[row, col]``.
    P : numpy.ndarray
        Row-side latent factors; one row per row of ``R``.
    Q : numpy.ndarray
        Column-side latent factors; one row per column of ``R``.
    non_zeros : sequence of tuples
        Each entry starts with ``(row_index, col_index, ...)``; any further
        items in the tuple are ignored.

    Returns
    -------
    numpy.float64
        Root mean squared error over the listed positions. NaN values in the
        inputs propagate to the result.

    Raises
    ------
    ValueError
        If ``non_zeros`` is empty (the RMSE is undefined without any
        observed rating).
    """
    if len(non_zeros) == 0:
        raise ValueError(
            "non_zeros is empty: RMSE is undefined without observed ratings"
        )

    # Row / column positions of the observed ratings.
    rows = np.array([entry[0] for entry in non_zeros], dtype=np.intp)
    cols = np.array([entry[1] for entry in non_zeros], dtype=np.intp)

    observed = R[rows, cols]
    # Predict only the observed cells instead of materializing the whole
    # P @ Q.T matrix: pair each user's factor row with its item's factor row.
    predicted = np.sum(P[rows] * Q[cols], axis=1)

    return np.sqrt(np.mean((observed - predicted) ** 2))

In [3]:
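# Matrix Factorization
#############################################################
## R: original user-item rating matrix                     ##
## K: number of latent factor dimensions                   ##
## steps: number of SGD iterations                         ##
## learning_rate: learning rate                            ##
## r_lambda: L2 regularization coefficient                 ##
#############################################################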

def mf(R, K, steps=200, learning_rate=0.01, r_lambda=0.01):
    """Factorize a rating matrix into two latent-factor matrices with SGD.

    Only strictly positive entries of ``R`` are treated as observed ratings;
    everything else is ignored during training. The RMSE over the observed
    entries is printed every 10th step (starting at step 0).

    Parameters
    ----------
    R : array-like, 2-D
        User-item rating matrix (rows are users, columns are items).
    K : int
        Number of latent factors.
    steps : int, default 200
        Number of full passes over the observed ratings.
    learning_rate : float, default 0.01
        SGD step size.
    r_lambda : float, default 0.01
        L2 regularization coefficient.

    Returns
    -------
    (P, Q) : tuple of numpy.ndarray
        ``P`` has shape ``(num_users, K)`` and ``Q`` has shape
        ``(num_items, K)``; ``np.dot(P, Q.T)`` is the predicted rating matrix.
    """
    R = np.asarray(R)
    num_users, num_items = R.shape

    # 1. Initialize P and Q with normally distributed random values.
    # NOTE: this reseeds numpy's *global* RNG, so every call starts from the
    # same initial factors (and resets the global random state for callers).
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # Collect (row, col, rating) for every entry with R > 0. np.nonzero
    # returns positions in row-major order, i.e. the same visiting order as a
    # nested "for each user, for each item" loop.
    rows, cols = np.nonzero(R > 0)
    non_zeros = list(zip(rows.tolist(), cols.tolist(), R[rows, cols].tolist()))

    # Repeatedly update P and Q with SGD.
    for step in range(steps):
        for i, j, r in non_zeros:
            # Error between the actual rating and the current prediction.
            eij = r - np.dot(P[i, :], Q[j, :].T)
            # SGD update with L2 regularization.
            # NOTE(review): the Q update below uses the row of P that was
            # just updated on the previous line (sequential update), not the
            # value P had when eij was computed. Kept as-is to preserve the
            # existing numerical results.
            P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])

        # Only pay for the RMSE computation on the steps that report it.
        if (step % 10) == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print("### iteration step: ", step, " rmse: ", rmse)

    return P, Q

In [4]:
def get_unseen(ratings_matrix, userId):
    """Return the column labels the given user has not rated.

    A column counts as "seen" when the user's value in it is greater than 0;
    every other column (0, negative, or NaN) is reported as unseen.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        Rating table with one row per user and one column per item.
    userId : label
        Row label of the user, looked up with ``.loc``.

    Returns
    -------
    list
        Unseen column labels, in the column order of ``ratings_matrix``.
    """
    user_rating = ratings_matrix.loc[userId, :]

    # A set makes each membership check O(1) instead of scanning a list.
    already_seen = set(user_rating[user_rating > 0].index.tolist())

    return [w for w in ratings_matrix.columns.tolist() if w not in already_seen]

In [5]:
def recomm_webtoon_by_userid(pred_df, userId, unseen_list, top_n=5):
    """Return the ``top_n`` highest predicted scores among a user's unseen items.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Predicted scores, one row per user and one column per item.
    userId : label
        Row label of the user, looked up with ``.loc``.
    unseen_list : list
        Column labels to consider as candidates.
    top_n : int, default 5
        Number of recommendations to keep.

    Returns
    -------
    pandas.Series
        Candidate scores sorted in descending order, truncated to ``top_n``.
    """
    candidate_scores = pred_df.loc[userId, unseen_list]
    ranked_scores = candidate_scores.sort_values(ascending=False)
    return ranked_scores[:top_n]

In [6]:
# Load the raw ratings table from "ratings.csv" (path is relative to the
# notebook's working directory).
ratings = pd.read_csv("ratings.csv")
# Preview the first three rows.
ratings.head(3)

Unnamed: 0,타임스탬프,신의 탑,참교육,뷰티풀 군바리,윈드브레이커,백수세끼,파이게임,장씨세가 호위무사,여신강림,용사가 돌아왔다,...,스퍼맨 : 전하지 못한 이야기,광장,조조코믹스,싸움독학,이번 생도 잘 부탁해,약한영웅,열렙전사,수희0(tngmlek0),입학용병,투신전생기
0,2021/07/27 3:21:00 오후 GMT+9,10.0,,6.0,,8.0,10.0,,4.0,,...,,,9.0,,,7.0,,,,
1,2021/07/27 3:30:48 오후 GMT+9,,,,,5.0,,,5.0,,...,,,,,,,,,,
2,2021/07/27 3:30:55 오후 GMT+9,8.0,,,,,,,,,...,,,,5.0,,,,,,


In [7]:
# Drop the timestamp column and replace missing ratings with 0.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# timestamp column is already gone and a plain drop would raise KeyError.
ratings = ratings.drop(columns=['타임스탬프'], errors='ignore').fillna(0)
ratings.head(1)

Unnamed: 0,신의 탑,참교육,뷰티풀 군바리,윈드브레이커,백수세끼,파이게임,장씨세가 호위무사,여신강림,용사가 돌아왔다,바른연애 길잡이,...,스퍼맨 : 전하지 못한 이야기,광장,조조코믹스,싸움독학,이번 생도 잘 부탁해,약한영웅,열렙전사,수희0(tngmlek0),입학용병,투신전생기
0,10.0,0.0,6.0,0.0,8.0,10.0,0.0,4.0,0.0,0.0,...,0.0,0.0,9.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0


In [8]:
# Factorize the rating matrix into user factors (P) and item factors (Q).
P, Q = mf(
    ratings.values,
    K=4,
    steps=200,
    learning_rate=0.006,
    r_lambda=0.001,
)
# Predicted score for every (user, item) pair.
pred_matrix = P @ Q.T

### iteration step:  0  rmse:  6.763892932551004
### iteration step:  10  rmse:  1.6986321372903388
### iteration step:  20  rmse:  1.489588969723149
### iteration step:  30  rmse:  1.3717364447811426
### iteration step:  40  rmse:  1.3151451030259622
### iteration step:  50  rmse:  1.2845275063985668
### iteration step:  60  rmse:  1.265021491040341
### iteration step:  70  rmse:  1.2519731874953068
### iteration step:  80  rmse:  1.2430730584542689
### iteration step:  90  rmse:  1.2368347508306596
### iteration step:  100  rmse:  1.2323186655130467
### iteration step:  110  rmse:  1.228944119402739
### iteration step:  120  rmse:  1.2263452919008409
### iteration step:  130  rmse:  1.2242845374151379
### iteration step:  140  rmse:  1.2226046484735118
### iteration step:  150  rmse:  1.2212012972737882
### iteration step:  160  rmse:  1.2200056225059233
### iteration step:  170  rmse:  1.2189725598107395
### iteration step:  180  rmse:  1.2180728068956266
### iteration step:  190  r

In [9]:
# Wrap the predicted scores in a DataFrame that reuses the row and column
# labels of the ratings table, so predictions can be looked up by label.
ratings_pred_matrix = pd.DataFrame(
    pred_matrix,
    index=ratings.index,
    columns=ratings.columns,
)
ratings_pred_matrix.head(3)

Unnamed: 0,신의 탑,참교육,뷰티풀 군바리,윈드브레이커,백수세끼,파이게임,장씨세가 호위무사,여신강림,용사가 돌아왔다,바른연애 길잡이,...,스퍼맨 : 전하지 못한 이야기,광장,조조코믹스,싸움독학,이번 생도 잘 부탁해,약한영웅,열렙전사,수희0(tngmlek0),입학용병,투신전생기
0,9.002822,9.008763,6.802688,7.727397,8.114246,8.958739,9.091928,5.514802,6.966448,9.999285,...,9.36185,9.849547,8.329095,9.308434,10.900554,8.2809,10.017284,5.88711,6.907969,7.695349
1,4.278897,2.924872,2.324738,5.488561,5.055677,5.718598,6.832511,5.260298,5.005189,9.879727,...,5.979696,2.595333,5.314778,4.038708,8.22699,3.11638,4.768583,3.979361,4.052209,4.264629
2,7.198446,5.403979,7.492877,8.136884,5.649603,7.986575,7.704453,5.678548,5.283009,10.07648,...,7.0188,6.405112,7.544284,6.586547,9.800814,6.427944,6.020101,5.328815,4.911277,4.867445


In [10]:
# Recommendations for the user with row label 0
unseen_list = get_unseen(ratings, 0)
recomm = recomm_webtoon_by_userid(ratings_pred_matrix, 0, unseen_list, top_n=5)
# Present the scores as a one-column table named 'pred_score'.
recomm = recomm.to_frame(name='pred_score')
recomm

Unnamed: 0,pred_score
모죠의 일지,10.977816
하루만 네가 되고 싶어,10.930903
이번 생도 잘 부탁해,10.900554
전지적 독자 시점,10.871865
더 복서,10.340099


In [11]:
# Recommendations for the user with row label 4
unseen_list = get_unseen(ratings, 4)
recomm = recomm_webtoon_by_userid(ratings_pred_matrix, 4, unseen_list, top_n=5)
# Present the scores as a one-column table named 'pred_score'.
recomm = recomm.to_frame(name='pred_score')
recomm

Unnamed: 0,pred_score
엽총소년,9.870882
모죠의 일지,9.478645
1초,9.290608
뷰티풀 군바리,9.059761
프리드로우,8.792247


In [12]:
# Recommendations for the user with row label 7
unseen_list = get_unseen(ratings, 7)
recomm = recomm_webtoon_by_userid(ratings_pred_matrix, 7, unseen_list, top_n=5)
# Present the scores as a one-column table named 'pred_score'.
recomm = recomm.to_frame(name='pred_score')
recomm

Unnamed: 0,pred_score
하루만 네가 되고 싶어,11.383347
엽총소년,10.221704
튜토리얼 탑의 고인물,10.022909
블랙 위도우,9.657193
개를 낳았다,9.519472


In [13]:
# Recommendations for the user with row label 1
unseen_list = get_unseen(ratings, 1)
recomm = recomm_webtoon_by_userid(ratings_pred_matrix, 1, unseen_list, top_n=5)
# Present the scores as a one-column table named 'pred_score'.
recomm = recomm.to_frame(name='pred_score')
recomm

Unnamed: 0,pred_score
하루만 네가 되고 싶어,10.103052
1초,9.563942
모죠의 일지,9.038453
개를 낳았다,8.673515
이번 생도 잘 부탁해,8.22699


In [14]:
# Recommendations for the user with row label 9
unseen_list = get_unseen(ratings, 9)
recomm = recomm_webtoon_by_userid(ratings_pred_matrix, 9, unseen_list, top_n=5)
# Present the scores as a one-column table named 'pred_score'.
recomm = recomm.to_frame(name='pred_score')
recomm

Unnamed: 0,pred_score
바른연애 길잡이,8.707863
모죠의 일지,8.013259
개를 낳았다,7.793373
전지적 독자 시점,7.410536
엽총소년,7.00372
