In [140]:
# 행렬분해, 경사하강법(딥러닝) 을 이용한 알고리즘

# 행렬분해 - matrix를 두 개의 서로 다른 행렬로 분해하는 것. 
#            분해된 행렬은 원래 행렬의 내재된 다른 의미를 갖는 행렬로 변환됨.
# 
# 경사하강법 - 함수의 값을 최소화하는 파라미터들을 찾는 방식
#                f(x) = w1x1 + w2x2 + ... => f(x)를 최소화하는 w1, w2 ...를 찾는 방법
#                f(x)를 w1으로 미분한 값을 w1에 더해서 업데이트..
#                f(x)를 w2로 미분한 값을 w2에 더해서 업데이트..

In [141]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [142]:
# 경사하강법을 이용한 행렬분해

In [143]:
# 사용자 영화 평점 매트릭스

R = np.array(
[
    [4, np.nan, np.nan, 2, np.nan],
     [np.nan, 5, np.nan, 3, 1],
     [np.nan, np.nan, 3, 4, 4],
     [5, 2, 1, 2, np.nan],
]
)
R

array([[ 4., nan, nan,  2., nan],
       [nan,  5., nan,  3.,  1.],
       [nan, nan,  3.,  4.,  4.],
       [ 5.,  2.,  1.,  2., nan]])

In [144]:
num_users, num_items = R.shape
print(num_users, num_items)  # 이용자수, 영화수

4 5


In [145]:
k = 3

In [146]:
# a*b = c 
# a의 row원소수, b의 칼럼 수가 같아야..
# c의 shape은 (a row num, b col num)

In [147]:
# R = P * Q.T

In [148]:
np.random.normal(size=(10, 10))

array([[-0.61803685, -2.03720123, -1.94258918, -2.50644065, -2.11416392,
        -0.41163916,  1.27852808, -0.44222928,  0.32352735, -0.10999149],
       [ 0.00854895, -0.16819884, -0.17418034,  0.4611641 , -1.17598267,
         1.01012718,  0.92001793, -0.19505734,  0.80539342, -0.70134443],
       [-0.53722302,  0.15626385, -0.19022103, -0.44873803, -0.67244804,
        -0.55749472,  0.93916874, -1.94332341,  0.35249436, -0.23643695],
       [ 0.7278135 ,  0.51507361, -2.78253447,  0.58464661,  0.32427424,
         0.02186284, -0.46867382,  0.85328122, -0.41302931,  1.83471763],
       [ 0.56438286,  2.13782807, -0.785534  , -1.75592564,  0.7147896 ,
         0.85270406,  0.0353601 , -1.53879325, -0.44789518,  0.61798553],
       [-0.18417633, -0.11598519, -0.17545897, -0.93391466, -0.53302033,
        -1.42655542,  1.76795995, -0.47537288,  0.47761018, -1.02188594],
       [ 0.79452824, -1.87316098,  0.92061512, -0.03536792,  2.11060505,
        -1.30653407,  0.07638048,  0.36723181

In [149]:
np.random.seed(1)

P = np.random.normal(scale=1/k, size=(num_users, k))
Q = np.random.normal(scale=1/k, size=(num_items, k))

In [171]:
P

array([[ 0.96176346,  0.47294178, -0.77523415],
       [-0.02715877,  0.84126576, -2.46684544],
       [ 2.38443343,  0.1145889 , -1.19297505],
       [ 0.58938647,  1.09569803, -1.06290344]])

In [172]:
Q.T

array([[ 1.61880439, -0.7857943 ,  1.04170897,  1.07217002,  1.43467753],
       [ 1.51366549,  0.45367747, -0.05649678,  0.12022894, -0.21006598],
       [-2.2158992 , -1.85456412, -0.42739712, -1.17909076, -0.49396305]])

In [166]:
np.dot(P, Q.T)

array([[3.99062329, 0.89653623, 1.30649077, 2.00210666, 1.66340846],
       [6.69571106, 4.97792757, 0.97850229, 2.98066034, 1.0028451 ],
       [6.67689303, 0.39076095, 2.98728588, 3.9769208 , 3.98610743],
       [4.96790858, 2.00517956, 1.00634763, 2.01691675, 1.14044567]])

In [153]:
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    error = 0
    full_pred_matrix = np.dot(P, Q.T)  # 예측행렬
    
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]  # non zero data 행 인덱스값
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]  # non zero data 열 인덱스값
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind] # 실제 행렬의 non zero data 실제값
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind] # 예상 행렬의 non zero data 실제값
      
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros) # 실제 행렬값과 예상행렬값의 차이 계산
    rmse = np.sqrt(mse)
    
    return rmse  

In [154]:
non_zeros = [(i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0]
non_zeros

[(0, 0, 4.0),
 (0, 3, 2.0),
 (1, 1, 5.0),
 (1, 3, 3.0),
 (1, 4, 1.0),
 (2, 2, 3.0),
 (2, 3, 4.0),
 (2, 4, 4.0),
 (3, 0, 5.0),
 (3, 1, 2.0),
 (3, 2, 1.0),
 (3, 3, 2.0)]

In [155]:
# 경사하강법

In [156]:
steps = 1000
learning_rate = 0.01
r_lambda = 0.01

In [157]:
steps=1000
learning_rate=0.01
r_lambda=0.01  # 규제값 크기

for step in range(steps):
    for i, j, r in non_zeros:  # 실제 행렬에서 값이 있는 인덱스, 칼럽, 값을 가져온다
        eij = r - np.dot(P[i, :], Q[j, :].T)    # P : 사용자행 벡터, Q : 아이템행 벡터, 실제값과 예상행렬의 값과의 차이
        P[i,:] = P[i,:] + learning_rate*(eij * Q[j, :] - r_lambda*P[i,:])
        Q[j,:] = Q[j,:] + learning_rate*(eij * P[i, :] - r_lambda*Q[j,:])

    rmse = get_rmse(R, P, Q, non_zeros)  # non_zeros - 원본행렬에서 0값이 아닌 위치인덱스와 값
    
    if (step % 50) == 0 :
        print("### iteration step : ", step," rmse : ", rmse)

### iteration step :  0  rmse :  3.2388050277987723
### iteration step :  50  rmse :  0.4876723101369648
### iteration step :  100  rmse :  0.1564340384819247
### iteration step :  150  rmse :  0.07455141311978046
### iteration step :  200  rmse :  0.04325226798579314
### iteration step :  250  rmse :  0.029248328780878973
### iteration step :  300  rmse :  0.022621116143829466
### iteration step :  350  rmse :  0.019493636196525135
### iteration step :  400  rmse :  0.018022719092132704
### iteration step :  450  rmse :  0.01731968595344266
### iteration step :  500  rmse :  0.016973657887570753
### iteration step :  550  rmse :  0.016796804595895633
### iteration step :  600  rmse :  0.01670132290188466
### iteration step :  650  rmse :  0.01664473691247669
### iteration step :  700  rmse :  0.016605910068210026
### iteration step :  750  rmse :  0.016574200475705
### iteration step :  800  rmse :  0.01654431582921597
### iteration step :  850  rmse :  0.01651375177473524
### iterati

In [158]:
pred_matrix = np.dot(P, Q.T)  # P 와 Q 가 업데이트 되었다.
print('예측 행렬:\n', np.round(pred_matrix, 3))

예측 행렬:
 [[3.991 0.897 1.306 2.002 1.663]
 [6.696 4.978 0.979 2.981 1.003]
 [6.677 0.391 2.987 3.977 3.986]
 [4.968 2.005 1.006 2.017 1.14 ]]


In [159]:
rray([[ 4., nan, nan,  2., nan],
       [nan,  5., nan,  3.,  1.],
       [nan, nan,  3.,  4.,  4.],
       [ 5.,  2.,  1.,  2., nan]])

NameError: name 'rray' is not defined

In [None]:
P

In [None]:
Q

In [None]:
import pickle

In [None]:
with open('ratings_matrix.pickle', 'rb')  as f:
    ratings_matrix = pickle.load(f)

In [None]:
ratings_matrix

In [None]:
def get_rmse(R, P, Q, non_zeros):
    error = 0
    full_pred_matrix = np.dot(P, Q.T)
    
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]  # 실제행렬값
    
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]  # 예축행렬값
      
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    rmse = np.sqrt(mse)
    
    return rmse

In [None]:
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda = 0.01):
    
    num_users, num_items = R.shape
    
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    break_count = 0
       
    non_zeros = [ (i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0 ]
   
    for step in range(steps):
        for i, j, r in non_zeros:
            eij = r - np.dot(P[i, :], Q[j, :].T)
            P[i,:] = P[i,:] + learning_rate*(eij * Q[j, :] - r_lambda*P[i,:])
            Q[j,:] = Q[j,:] + learning_rate*(eij * P[i, :] - r_lambda*Q[j,:])
       
        rmse = get_rmse(R, P, Q, non_zeros)
        if (step % 10) == 0 :
            print("### iteration step : ", step," rmse : ", rmse)
            
    return P, Q

In [None]:
P, Q = matrix_factorization(ratings_matrix.values, K=50, steps=200, learning_rate=0.01, r_lambda = 0.01)
pred_matrix = np.dot(P, Q.T)

In [None]:
ratings_matrix.shape

In [None]:
pred_matrix.shape

In [None]:
ratings_pred_matrix = pd.DataFrame(pred_matrix, index = ratings_matrix.index, columns = ratings_matrix.columns)
ratings_pred_matrix.head()

In [None]:
ratings_matrix.head()

In [None]:
user_id = 9
top_n = 50

ratings_pred_matrix.loc[user_id].sort_values(ascending=False)[:top_n]

In [201]:
x = np.array([[1, 2], [2, 1]])
y = np.array([[2, 1 ,2], [1, 1, 2]])
np.dot(x, y)

array([[4, 3, 6],
       [5, 3, 6]])

In [184]:
np.dot(P, Q.T)

array([[3.99062329, 0.89653623, 1.30649077, 2.00210666, 1.66340846],
       [6.69571106, 4.97792757, 0.97850229, 2.98066034, 1.0028451 ],
       [6.67689303, 0.39076095, 2.98728588, 3.9769208 , 3.98610743],
       [4.96790858, 2.00517956, 1.00634763, 2.01691675, 1.14044567]])

In [186]:
P.shape

(4, 3)

In [187]:
Q.T.shape

(3, 5)