- R : M x n  행렬  <br>$$U\sum V^T$$

- 축소된 데이터를 활용하여 해당 사용자가 평가하지 않은 item에 대한 값을 예측하여 채워넣는 방식
- RMSE ( R 과 특이값분해 수식 간의 차이)를 통해 평가하고 개선

In [9]:
import numpy as np
from sklearn.metrics import mean_squared_error
import dataset

data = dataset.Dataset()
train = data.train
test = data.test
# R = np.array([[4, np.NaN, np.NaN, 2, np.NaN],
#               [np.NaN, 5, np.NaN, 3, 1],
#               [np.NaN, np.NaN, 3, 4, 4],
#               [5, 2, 1, 2, np.NaN]])

train

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [2]:
num_users, num_items = train.shape

In [3]:
P = np.random.normal(scale=1.0/3, size=(num_users, 2))
Q = np.random.normal(scale=1.0/3, size=(num_items, 2))
P, Q.T

(array([[ 0.67562969,  0.00440368],
        [ 0.23826787, -0.08284665],
        [ 0.35091276,  0.07118863],
        ...,
        [ 0.3124937 , -0.10394678],
        [ 0.45936153,  0.10838712],
        [-0.10685868, -0.08314122]]),
 array([[-0.04946614, -0.25048229, -0.57601247, ...,  0.1524334 ,
         -0.28223697, -0.12594917],
        [ 0.32539987, -0.74600221,  0.04396141, ...,  0.07833713,
         -0.20062239, -0.13682047]]))

In [7]:
from timeit import default_timer as Timer


# 실제 R 행렬과 예측 행렬의 오차를 구하는 함수
def calculate_rmse(R, P, Q, non_zeros):
    error = 0

    full_pred_matrix = np.dot(P, Q.T)

    # 여기서 non_zeros는 아래 함수에서 확인할 수 있다.
    x_non_zero_ind = [non_zeros[0] for non_zeros in non_zeros]
    y_non_zero_ind = [non_zeros[1] for non_zeros in non_zeros]

    # 원 행렬 R에서 0이 아닌 값들만 추출한다.
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]

    # 예측 행렬에서 원 행렬 R에서 0이 아닌 위치의 값들만 추출하여 저장한다.
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]

    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    rmse = np.sqrt(mse)

    return rmse


def matrix_factorization(R, K, epochs=200, learning_rate=0.01, r_lambda=0.01):
    num_users, num_items = R.shape

    np.random.seed(1)
    P = np.random.normal(scale=1.0/K, size=(num_users, K))
    Q = np.random.normal(scale=1.0/K, size=(num_items, K))

    # R>0인 행 위치, 열 위치, 값을 non_zeros 리스트에 저장한다.
    non_zeros = [ (i, j, R[i, j]) for i in range(num_users)
                  for j in range(num_items) if R[i, j] > 0 ]

    # SGD 기법으로 P, Q 매트릭스를 업데이트 함
    start = Timer()
    for epoch in range(epochs):
        for i, j, r in non_zeros:
            # 잔차 구함
            eij = r - np.dot(P[i, :], Q[j, :].T)

            # Regulation을 반영한 SGD 업데이터 적용
            P[i, :] = P[i, :] + learning_rate*(eij * Q[j, :] - r_lambda*P[i, :])
            Q[j, :] = Q[j, :] + learning_rate*(eij * P[i, :] - r_lambda*Q[j, :])

        rmse = calculate_rmse(R, P, Q, non_zeros)
        print("iteration: {0}, rmse: {1:4f}".format(epoch+1, rmse))
    print("time : %.4f sec" % (Timer() - start) )
    return P, Q

P, Q = matrix_factorization(train, K=3)

iteration: 1, rmse: 2.458689
iteration: 2, rmse: 1.167944
iteration: 3, rmse: 1.025404
iteration: 4, rmse: 0.982318
iteration: 5, rmse: 0.958670
iteration: 6, rmse: 0.942885
iteration: 7, rmse: 0.931050
iteration: 8, rmse: 0.921704
iteration: 9, rmse: 0.914135
iteration: 10, rmse: 0.907888
iteration: 11, rmse: 0.902647
iteration: 12, rmse: 0.898183
iteration: 13, rmse: 0.894333
iteration: 14, rmse: 0.890978
iteration: 15, rmse: 0.888027
iteration: 16, rmse: 0.885413
iteration: 17, rmse: 0.883081
iteration: 18, rmse: 0.880988
iteration: 19, rmse: 0.879102
iteration: 20, rmse: 0.877394
iteration: 21, rmse: 0.875840
iteration: 22, rmse: 0.874423
iteration: 23, rmse: 0.873125
iteration: 24, rmse: 0.871933
iteration: 25, rmse: 0.870835
iteration: 26, rmse: 0.869821
iteration: 27, rmse: 0.868883
iteration: 28, rmse: 0.868012
iteration: 29, rmse: 0.867202
iteration: 30, rmse: 0.866448
iteration: 31, rmse: 0.865745
iteration: 32, rmse: 0.865087
iteration: 33, rmse: 0.864471
iteration: 34, rmse

In [8]:
pred_matrix = np.dot(P, Q.T)
print(pred_matrix)

[[3.67347365 3.06902771 2.85075282 ... 1.26976909 3.52520884 3.10706821]
 [3.91728923 3.24960057 2.39614183 ... 1.59080193 3.60576889 3.02842483]
 [3.25980842 2.84034745 3.06312458 ... 0.90169318 3.02154634 2.8093167 ]
 ...
 [4.07678897 3.47331502 3.18299862 ... 1.38236724 3.75745869 3.33137258]
 [4.55635861 4.12289109 3.43730526 ... 1.51653082 3.58418456 3.20871579]
 [3.53604488 3.28832294 4.08346133 ... 0.64506516 3.02850401 3.04551248]]
