In [1]:
import numpy as np


class MatrixFactorization():
    def __init__(self, R, k, learning_rate, reg_param, epochs, verbose=False):
        """
        :param R: rating matrix
        :param k: latent parameter
        :param learning_rate: alpha on weight update
        :param reg_param: beta on weight update
        :param epochs: training epochs
        :param verbose: print status
        """

        self._R = R
        self._num_users, self._num_items = R.shape
        self._k = k
        self._learning_rate = learning_rate
        self._reg_param = reg_param
        self._epochs = epochs
        self._verbose = verbose


    def fit(self):
        """
        training Matrix Factorization : Update matrix latent weight and bias

        참고: self._b에 대한 설명
        - global bias: input R에서 평가가 매겨진 rating의 평균값을 global bias로 사용
        - 정규화 기능. 최종 rating에 음수가 들어가는 것 대신 latent feature에 음수가 포함되도록 해줌.

        :return: training_process
        """

        # init latent features
        self._P = np.random.normal(size=(self._num_users, self._k))
        self._Q = np.random.normal(size=(self._num_items, self._k))

        # init biases
        self._b_P = np.zeros(self._num_users)
        self._b_Q = np.zeros(self._num_items)
        self._b = np.mean(self._R[np.where(self._R != 0)])

        # train while epochs
        self._training_process = []
        for epoch in range(self._epochs):

            # rating이 존재하는 index를 기준으로 training
            for i in range(self._num_users):
                for j in range(self._num_items):
                    if self._R[i, j] > 0:
                        self.gradient_descent(i, j, self._R[i, j])
            cost = self.cost()
            self._training_process.append((epoch, cost))

            # print status
            if self._verbose == True and ((epoch + 1) % 10 == 0):
                print("Iteration: %d ; cost = %.4f" % (epoch + 1, cost))


    def cost(self):
        """
        compute root mean square error
        :return: rmse cost
        """

        # xi, yi: R[xi, yi]는 nonzero인 value를 의미한다.
        # 참고: http://codepractice.tistory.com/90
        xi, yi = self._R.nonzero()
        predicted = self.get_complete_matrix()
        cost = 0
        for x, y in zip(xi, yi):
            cost += pow(self._R[x, y] - predicted[x, y], 2)
        return np.sqrt(cost) / len(xi)


    def gradient(self, error, i, j):
        """
        gradient of latent feature for GD

        :param error: rating - prediction error
        :param i: user index
        :param j: item index
        :return: gradient of latent feature tuple
        """

        dp = (error * self._Q[j, :]) - (self._reg_param * self._P[i, :])
        dq = (error * self._P[i, :]) - (self._reg_param * self._Q[j, :])
        return dp, dq


    def gradient_descent(self, i, j, rating):
        """
        graident descent function

        :param i: user index of matrix
        :param j: item index of matrix
        :param rating: rating of (i,j)
        """

        # get error
        prediction = self.get_prediction(i, j)
        error = rating - prediction

        # update biases
        self._b_P[i] += self._learning_rate * (error - self._reg_param * self._b_P[i])
        self._b_Q[j] += self._learning_rate * (error - self._reg_param * self._b_Q[j])

        # update latent feature
        dp, dq = self.gradient(error, i, j)
        self._P[i, :] += self._learning_rate * dp
        self._Q[j, :] += self._learning_rate * dq


    def get_prediction(self, i, j):
        """
        get predicted rating: user_i, item_j
        :return: prediction of r_ij
        """
        return self._b + self._b_P[i] + self._b_Q[j] + self._P[i, :].dot(self._Q[j, :].T)


    def get_complete_matrix(self):
        """
        computer complete matrix PXQ + P.bias + Q.bias + global bias

        - PXQ 행렬에 b_P[:, np.newaxis]를 더하는 것은 각 열마다 bias를 더해주는 것
        - b_Q[np.newaxis:, ]를 더하는 것은 각 행마다 bias를 더해주는 것
        - b를 더하는 것은 각 element마다 bias를 더해주는 것

        - newaxis: 차원을 추가해줌. 1차원인 Latent들로 2차원의 R에 행/열 단위 연산을 해주기위해 차원을 추가하는 것.

        :return: complete matrix R^
        """
        return self._b + self._b_P[:, np.newaxis] + self._b_Q[np.newaxis:, ] + self._P.dot(self._Q.T)


    def print_results(self):
        """
        print fit results
        """

        print("User Latent P:")
        print(self._P)
        print("Item Latent Q:")
        print(self._Q.T)
        print("P x Q:")
        print(self._P.dot(self._Q.T))
        print("bias:")
        print(self._b)
        print("User Latent bias:")
        print(self._b_P)
        print("Item Latent bias:")
        print(self._b_Q)
        print("Final R matrix:")
        print(self.get_complete_matrix())
        print("Final RMSE:")
        print(self._training_process[self._epochs-1][1])

In [2]:
import numpy as np
import pandas as pd

ratings_data = pd.read_csv('C:\\Users\\multicampus\\BigdataPrac\\data_ratings.dat', sep='::', names=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python')

In [3]:
del ratings_data['timestamp']

In [4]:
ratings_data

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [5]:
R = ratings_data.set_index(['user_id', 'movie_id']).unstack()

In [28]:
R_copy = ratings_data.set_index(['user_id', 'movie_id']).unstack()

In [111]:
user_cnt = R.shape[0]
movie_cnt = R.shape[1]
print(user_cnt, movie_cnt)

6040 3706


In [34]:
# R_copy

In [6]:
R = R.fillna(0)

In [7]:
R = R.values

In [8]:
factorizer = MatrixFactorization(R, k=100, learning_rate=0.01, reg_param=0.01, epochs=200, verbose=True)

In [9]:
%%time

factorizer.fit()

Iteration: 10 ; cost = 0.0008
Iteration: 20 ; cost = 0.0007
Iteration: 30 ; cost = 0.0006
Iteration: 40 ; cost = 0.0006
Iteration: 50 ; cost = 0.0005
Iteration: 60 ; cost = 0.0005
Iteration: 70 ; cost = 0.0005
Iteration: 80 ; cost = 0.0005
Iteration: 90 ; cost = 0.0005
Iteration: 100 ; cost = 0.0004
Iteration: 110 ; cost = 0.0004
Iteration: 120 ; cost = 0.0004
Iteration: 130 ; cost = 0.0004
Iteration: 140 ; cost = 0.0004
Iteration: 150 ; cost = 0.0004
Iteration: 160 ; cost = 0.0004
Iteration: 170 ; cost = 0.0004
Iteration: 180 ; cost = 0.0004
Iteration: 190 ; cost = 0.0004
Iteration: 200 ; cost = 0.0004
Wall time: 1h 30min 39s


In [10]:
factorizer.print_results()

User Latent P:
[[ 0.07411271  0.65265781  0.15297425 ... -0.26981768  0.4293396
  -0.22440656]
 [-0.11946675  0.01752697  0.07997298 ...  0.23062905 -0.56224651
  -0.07343287]
 [ 0.94488104  0.05780644  0.12441565 ...  0.65964921 -0.58166792
   0.45438793]
 ...
 [-0.10124685 -0.37163765  0.41706112 ... -0.54483068 -1.06956702
   0.05810076]
 [ 0.12965354  0.2291794   0.1246555  ... -0.05472163  0.21782902
  -0.02974145]
 [ 0.01532384  0.2043657  -0.01232908 ... -0.05430694 -0.03434418
  -0.16395204]]
Item Latent Q:
[[ 0.15750456 -0.16970222  0.0451356  ...  0.12523857 -0.31840572
  -0.17043988]
 [ 0.00624281  0.13546461 -0.47722879 ...  0.29569755 -0.50575226
   0.08161755]
 [ 0.20284788 -0.02262183  0.67987306 ... -0.0884303  -0.02171559
   0.00608657]
 ...
 [-0.12499009  0.09978715  0.08065375 ...  0.28058859 -0.48937023
  -0.16660872]
 [-0.09493838  0.57884383  0.00726823 ... -0.12744537 -0.69047766
   0.34948615]
 [-0.11023807 -0.35994384 -0.7251919  ...  0.05693977 -0.37311892
   

In [11]:
factorizer.get_complete_matrix()

array([[ 4.75554696,  2.03381364,  4.21392188, ...,  3.33319789,
         3.69645722,  5.39586915],
       [ 3.33819677,  2.73019851,  2.92524088, ...,  7.42435586,
         5.12155964,  3.62691949],
       [ 4.1258043 ,  1.96779165,  3.44032126, ...,  4.2437758 ,
        -0.51237948,  3.00041842],
       ...,
       [ 4.70722919,  0.24529419,  2.25393965, ...,  2.36193579,
         4.38597198,  3.8190227 ],
       [ 4.34584649,  3.42423284,  3.70352119, ...,  4.06186727,
         3.00661404,  4.46684637],
       [ 3.79491395,  2.38392036,  1.6943491 , ...,  3.58165858,
         3.38964733,  3.23603841]])

In [12]:
factorizer.get_complete_matrix().shape

(6040, 3706)

In [16]:
# np.savetxt('matrix_factorization_50.csv', factorizer.get_complete_matrix(), delimiter=',')

In [17]:
# np.savetxt('matrix_factorization_100.csv', factorizer.get_complete_matrix(), delimiter=',')

In [18]:
# np.savetxt('matrix_factorization_50_200.csv', factorizer.get_complete_matrix(), delimiter=',')

In [20]:
np.savetxt('matrix_factorization_100_200.csv', factorizer.get_complete_matrix(), delimiter=',')

In [130]:
R_hat = factorizer.get_complete_matrix()

In [131]:
R_hat = pd.DataFrame(R_hat)

In [178]:
R_hat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
0,4.755547,2.033814,4.213922,3.394411,3.475659,3.930883,3.125256,4.148436,3.552106,3.234247,...,2.428989,2.722257,1.391781,3.819924,2.236778,3.379428,3.411944,3.333198,3.696457,5.395869
1,3.338197,2.730199,2.925241,3.710608,2.146505,4.466167,4.600624,3.333255,2.852481,2.788075,...,4.163182,-1.587802,2.753048,1.912184,0.369491,3.976831,2.479168,7.424356,5.121560,3.626919
2,4.125804,1.967792,3.440321,1.016051,3.048603,4.638581,3.880037,2.977824,2.051205,4.302249,...,2.746396,0.924237,-0.559637,0.148119,1.857750,3.741147,2.069795,4.243776,-0.512379,3.000418
3,5.246838,4.282581,3.744931,3.252937,5.215523,3.975849,5.673020,3.621159,7.427757,5.159601,...,2.177891,3.999873,-3.225461,2.720257,5.876501,7.159135,7.514503,6.735638,7.660950,3.250176
4,3.700610,2.634328,1.275880,1.380742,0.584958,2.484456,0.567401,2.134120,1.900765,2.119476,...,2.307924,1.093768,1.385918,0.104831,1.921875,2.104504,3.375815,3.694726,1.342892,2.152543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,3.985724,2.810716,2.605302,2.423510,2.999838,3.197214,1.965671,2.379652,1.467432,2.793067,...,2.507704,3.055538,0.855658,1.794564,4.380316,3.426443,4.304635,3.433445,3.917901,3.015373
6036,4.056405,3.457767,3.621037,3.409232,3.410353,3.970278,3.223135,3.361843,3.277914,2.674236,...,3.109017,3.343487,4.175677,2.017482,3.148660,3.622418,2.954660,3.232084,4.042181,3.810769
6037,4.707229,0.245294,2.253940,0.345641,1.881514,3.814959,5.092170,5.449143,1.605070,3.822454,...,3.128218,-2.082497,-1.265217,-2.368749,5.252848,1.576240,2.284901,2.361936,4.385972,3.819023
6038,4.345846,3.424233,3.703521,2.921033,3.293632,3.985757,4.136858,2.572844,3.595326,3.245987,...,3.686758,-0.927041,0.675161,2.534309,2.869840,3.359357,4.095073,4.061867,3.006614,4.466846


In [179]:
# index로 읽을 때는 user_id에서 1 빼서
user_rec_list = pd.DataFrame()
user_rec_list['user_id'] = np.arange(user_cnt+1)
user_rec_list['movie_ids'] = R_copy.rating.apply(lambda row : np.array(row.dropna(axis=0).index), axis=1)

In [180]:
user_rec_list = user_rec_list.drop(0,axis=0)

In [181]:
user_rec_list = user_rec_list.set_index("user_id")

In [182]:
user_rec_list.movie_ids[1]

array([   1,   48,  150,  260,  527,  531,  588,  594,  595,  608,  661,
        720,  745,  783,  914,  919,  938, 1022, 1028, 1029, 1035, 1097,
       1193, 1197, 1207, 1246, 1270, 1287, 1545, 1566, 1721, 1836, 1907,
       1961, 1962, 2018, 2028, 2294, 2321, 2340, 2355, 2398, 2687, 2692,
       2762, 2791, 2797, 2804, 2918, 3105, 3114, 3186, 3408], dtype=int64)

In [None]:
# np.array(R_hat.iloc[0].sort_values(ascending=False).index)

In [183]:
%%time
from collections import deque

for i in range(user_cnt):
    movie_all = R_hat.iloc[i].to_dict()
#     print(len(movie_all))
    user_movie = user_rec_list.movie_ids[i+1]
    
    for m in user_movie:
        if m in movie_all.keys():
            del movie_all[m]
    
    user_rec_list.movie_ids[i+1] = deque()
    movie_all = sorted(movie_all.items(), key=lambda x:x[1], reverse=True)
    movie_all = movie_all[:100]
    
    for item in movie_all:
        user_rec_list.movie_ids[i+1].append(item[0])

Wall time: 27.8 s


In [184]:
user_rec_list

Unnamed: 0_level_0,movie_ids
user_id,Unnamed: 1_level_1
1,"[3658, 36, 161, 1266, 718, 219, 125, 390, 732,..."
2,"[1359, 2688, 2035, 650, 2421, 3361, 3363, 1349..."
3,"[3109, 1077, 2394, 1220, 864, 3644, 1975, 2165..."
4,"[922, 2481, 791, 3069, 720, 2706, 3424, 693, 1..."
5,"[682, 1626, 1056, 937, 2032, 3021, 636, 1001, ..."
...,...
6036,"[809, 820, 1221, 2387, 974, 918, 3009, 1616, 2..."
6037,"[1691, 592, 775, 1026, 3124, 2787, 546, 828, 2..."
6038,"[2924, 2868, 791, 3062, 3089, 3144, 748, 1062,..."
6039,"[565, 3557, 2982, 1448, 2843, 1663, 650, 2293,..."


In [118]:
# R_copy.rating.apply(lambda row : np.array(row.dropna(axis=0).index), axis=1)

In [101]:
# R_copy.rating.iloc[0].dropna().index.values

In [119]:
# R_copy.rating.apply(lambda row : np.array(row.dropna(axis=0).index), axis=1)

In [89]:
aa = R_copy.rating.iloc[0].dropna().index.values

In [138]:
R_hat.iloc[0].sort_values(ascending=False).to_dict
# R_hat.apply(lambda row : np.array(row), axis=1)

<bound method Series.to_dict of 3658    11.860628
36      10.967847
161     10.302221
1266    10.073938
718      9.955167
          ...    
1723    -1.915755
186     -2.295102
3536    -2.396051
1287    -2.596076
1710    -2.696872
Name: 0, Length: 3706, dtype: float64>

In [None]:
R_hat.

In [92]:
# 차집합
np.setdiff1d(bb, aa)

array([   0,    2,    3, ..., 3703, 3704, 3705], dtype=int64)

In [None]:
# user_id에 맞게 저장하기
# {user_id : [], user_id : [] , ... }


In [185]:
# for movie_id in np.nditer(R_copy.rating.iloc[0].dropna().index.values):
#     print(movie_id)

In [48]:
# np.argwhere(np.isnan(x))
np.argwhere(np.isnan(R_copy.iloc[0]))[:, 0]

array([   1,    2,    3, ..., 3703, 3704, 3705], dtype=int64)