### 4.1 Matrix Factorization (MF) 기반 추천

### 메모리 기반 vs 모델 기반
- 메모리 기반 추천 시스템은 개별 사용자의 데이터에 집중한다.
- 모델 기반 추천 시스템은 전체 데이터를 고려하여 모델을 구성하므로, 소수의 사용자와 소수의 영화에 대해서만 특정한 평가 패턴이 있는 경우(약한 신호)도 잘 잡아낸다.

#### Matrix Factorization(MF)방식의 원리  
user를 index로, item(영화)을 columns로 가지는 평점 데이터프레임(행렬 $R$)을 생각해 보자.  
이 행렬을 사용자행렬($P$)과 아이템행렬($Q$)로 쪼개어 분석하는 것이 MF방식이다. 즉, $R \approx \hat{R} = P Q^{T}$ 로 근사한다.  
P는 각 사용자의 특성을 나타내는 K개의 요인의 값으로 이루어진 행렬, Q는 각 아이템의 특성을 나타내는 K개의 요인의 값으로 이루어진 행렬이다.  
간단한 예로 K가 2(잠재요인이 2개)인 영화의 예를 살펴보겠다. 실제로 잠재특성은 훨씬 복잡하지만 이해의 편의를 위한 것이다.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

- 사용자 요인 P

In [2]:
# User latent-factor matrix P: one row per user, one column per latent factor.
user_lf_p = pd.DataFrame(
    data=np.array([
        [-0.43, 0.21],
        [0.31, 0.92],
        [0.69, -0.03],
        [0.46, -0.30],
    ]),
    index=['Bob', 'Sue', 'Mary', 'Alice'],
    columns=['액션-드라마', '판타지-사실주의'],
)
user_lf_p

Unnamed: 0,액션-드라마,판타지-사실주의
Bob,-0.43,0.21
Sue,0.31,0.92
Mary,0.69,-0.03
Alice,0.46,-0.3


- 아이템 요인 Q

In [3]:
# Item latent-factor matrix Q: one row per movie, same factor columns as P.
item_lf_q = pd.DataFrame(
    data=np.array([
        [0.31, 0.60],
        [0.61, -0.82],
        [-0.38, -0.61],
        [-0.79, 0.08],
    ]),
    index=['기생충', '겨울왕국', '부산행', '백두산'],
    columns=['액션-드라마', '판타지-사실주의'],
)
item_lf_q

Unnamed: 0,액션-드라마,판타지-사실주의
기생충,0.31,0.6
겨울왕국,0.61,-0.82
부산행,-0.38,-0.61
백두산,-0.79,0.08


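    위의 예에서 첫 번째 요인(액션-드라마)은 드라마에 가까울수록 1, 액션에 가까울수록 -1에 가까운 값을 가진다. 두 번째 요인(판타지-사실주의)도 같은 방식으로 사실주의에 가까울수록 1, 판타지에 가까울수록 -1에 가까운 값을 가진다. 이렇게 각 유저나 영화의 특성을 수치로 나타낼 수 있다.
    예를 들어 Alice(0.46, -0.30)는 드라마와 판타지를 선호하며, 같은 성향을 가진 겨울왕국(0.61, -0.82)이 이에 해당되는 것을 알 수 있다. 부산행(-0.38, -0.61)도 판타지의 성향이 있으나  
    액션에 가까우므로 제외되었다.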

In [4]:
# Predicted score matrix P @ Q^T: rows follow the users of P, columns the movies of Q.
movie_pqt = user_lf_p.to_numpy() @ item_lf_q.T.to_numpy()
movie_pqt

array([[-0.0073, -0.4345,  0.0353,  0.3565],
       [ 0.6481, -0.5653, -0.679 , -0.1713],
       [ 0.1959,  0.4455, -0.2439, -0.5475],
       [-0.0374,  0.5266,  0.0082, -0.3874]])

In [5]:
# Re-attach the user labels (rows) and movie labels (columns) to the raw score array.
movie_pqt = pd.DataFrame(
    data=movie_pqt,
    index=user_lf_p.index,
    columns=item_lf_q.index,
)

- 각 사용자별 추천 영화와 예측 평점은 다음과 같다.

In [6]:
# For every user (row), the movie label whose predicted score is the largest.
movie_pqt.idxmax(axis=1)

Bob       백두산
Sue       기생충
Mary     겨울왕국
Alice    겨울왕국
dtype: object

In [7]:
# Display the full user x movie table of predicted scores.
movie_pqt

Unnamed: 0,기생충,겨울왕국,부산행,백두산
Bob,-0.0073,-0.4345,0.0353,0.3565
Sue,0.6481,-0.5653,-0.679,-0.1713
Mary,0.1959,0.4455,-0.2439,-0.5475
Alice,-0.0374,0.5266,0.0082,-0.3874


### 4.2 SGD(Stochastic Gradient Descent: 확률적 경사하강법)를 이용한 MF 알고리즘

In [8]:
# Column names for the tab-separated, header-less ratings file.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, encoding='latin-1')
# Keep only the (user, movie, rating) triple, cast to int; the timestamp is dropped.
ratings = ratings.loc[:, ['user_id', 'movie_id', 'rating']].astype(int)

- Matrix Factorization Class

In [9]:
class MF():
    """Biased matrix factorization trained with stochastic gradient descent.

    The rating of user ``i`` for item ``j`` is predicted as
    ``b + b_u[i] + b_d[j] + P[i] . Q[j]``.  Entries of the rating matrix
    that are equal to 0 are treated as "not rated" and are ignored both
    for training and for the RMSE.
    """

    def __init__(self, ratings, K, alpha, beta, iterations, verbose=True):
        """Store the rating matrix and the hyper-parameters.

        Args:
            ratings: 2-D array-like (users x items); it is copied into
                ``self.R``.  0 marks a missing rating.
            K: number of latent factors (columns of P and Q).
            alpha: step size used by every SGD update.
            beta: weight of the shrinkage (regularization) term in the
                SGD updates.
            iterations: number of passes over the training samples.
            verbose: when true, print the train RMSE every 10 iterations.
        """
        self.R = np.array(ratings)
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.verbose = verbose

    def rmse(self):
        """Return the root mean squared error over all non-zero ratings.

        Side effect: ``self.predictions`` and ``self.errors`` are replaced
        by arrays holding, for every non-zero entry of ``self.R``, the
        predicted rating and ``actual - predicted``.
        """
        # Indices of the observed (non-zero) ratings.
        xs, ys = self.R.nonzero()
        # Same formula as get_prediction(), evaluated for all observed
        # entries at once instead of one Python call per entry.
        self.predictions = (
            self.b
            + self.b_u[xs]
            + self.b_d[ys]
            + np.sum(self.P[xs] * self.Q[ys], axis=1)
        )
        self.errors = self.R[xs, ys] - self.predictions
        return np.sqrt(np.mean(self.errors**2))

    def train(self):
        """Initialize the parameters and run ``self.iterations`` SGD passes.

        Returns:
            A list of ``(iteration, train_rmse)`` tuples, one per pass,
            with ``iteration`` starting at 1.
        """
        # P and Q start as normal random numbers with mean 0 and
        # standard deviation 1/K.
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # User and item biases start at 0; b is the mean of the observed ratings.
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])

        # Training samples: (user index, item index, rating) for every
        # observed rating.
        rows, columns = self.R.nonzero()
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

        # RMSE after each pass, so the improvement can be inspected later.
        training_process = []
        for i in range(self.iterations):
            # Visit the samples in a different random order on every pass.
            np.random.shuffle(self.samples)
            self.sgd()
            # P, Q, b_u and b_d changed, so recompute the train RMSE.
            rmse = self.rmse()
            training_process.append((i+1, rmse))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.4f " % (i+1, rmse))
        return training_process

    def get_prediction(self, i, j):
        """Return the predicted rating of user index ``i`` for item index ``j``."""
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :])
        return prediction

    def sgd(self):
        """Run one SGD pass over ``self.samples``, updating P, Q and the biases."""
        for i, j, r in self.samples:
            # Error of the current model on this sample.
            e = r - self.get_prediction(i, j)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])

            # Both factor updates must use the values from *before* this
            # step; keep a copy of P[i] so the Q[j] update does not see
            # the row that was just modified.
            p_i = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * p_i)
            self.Q[j, :] += self.alpha * (e * p_i - self.beta * self.Q[j, :])

- 전체 데이터 사용 MF

In [10]:
# Full user x movie rating matrix; user/movie pairs without a rating become 0.
R_temp = (
    ratings
    .pivot(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
)

In [11]:
# Fit the factorization on every available rating (no hold-out set here).
mf = MF(
    R_temp,
    K=30,
    alpha=0.001,
    beta=0.002,
    iterations=100,
    verbose=True,
)
train_process = mf.train()

Iteration: 10 ; Train RMSE = 0.9580 
Iteration: 20 ; Train RMSE = 0.9367 
Iteration: 30 ; Train RMSE = 0.9272 
Iteration: 40 ; Train RMSE = 0.9213 
Iteration: 50 ; Train RMSE = 0.9164 
Iteration: 60 ; Train RMSE = 0.9113 
Iteration: 70 ; Train RMSE = 0.9044 
Iteration: 80 ; Train RMSE = 0.8942 
Iteration: 90 ; Train RMSE = 0.8797 
Iteration: 100 ; Train RMSE = 0.8614 


In [12]:
from sklearn.utils import shuffle

# Fraction of the rating rows that goes to the training part.
TRAIN_SIZE = 0.75
# Shuffle the rows with a fixed seed, then cut at the TRAIN_SIZE boundary.
ratings = shuffle(ratings, random_state=1)
cutoff = int(len(ratings) * TRAIN_SIZE)
# NOTE(review): ratings_train is not used in the visible part of the notebook.
ratings_train, ratings_test = ratings.iloc[:cutoff], ratings.iloc[cutoff:]
# Convert ratings to a full user x movie matrix (missing ratings become 0).
ratings = ratings.pivot(values='rating', index='user_id', columns='movie_id').fillna(0)
# ratings를 full matrix로 변환한다.

In [13]:
# Display the pivoted user x movie rating matrix.
ratings

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
class NEW_MF():
    """Biased matrix factorization (SGD) with id mapping and a held-out test set.

    The rating of user index ``i`` for item index ``j`` is predicted as
    ``b + b_u[i] + b_d[j] + P[i] . Q[j]``.  Entries of the rating matrix
    equal to 0 are treated as "not rated".  Ratings registered through
    ``set_test`` are removed from the training matrix and only used to
    report the test RMSE.
    """

    def __init__(self, ratings, K, alpha, beta, iterations, verbose=True):
        """Store the rating matrix, the id <-> index maps and the hyper-parameters.

        Args:
            ratings: pandas DataFrame (users as index, items as columns);
                its values are copied into ``self.R``.  0 marks a missing
                rating.
            K: number of latent factors (columns of P and Q).
            alpha: step size used by every SGD update.
            beta: weight of the shrinkage (regularization) term in the
                SGD updates.
            iterations: number of passes over the training samples.
            verbose: when true, print train/test RMSE every 10 iterations.
        """
        self.R = np.array(ratings)
        # Item id (column label) <-> column position in R.
        self.index_item_id = dict(enumerate(ratings.columns))
        self.item_id_index = {one_id: i for i, one_id in self.index_item_id.items()}
        # User id (index label) <-> row position in R.  The labels are read
        # from the index directly; no transpose of the matrix is needed.
        self.index_user_id = dict(enumerate(ratings.index))
        self.user_id_index = {one_id: i for i, one_id in self.index_user_id.items()}
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.verbose = verbose
        # No held-out ratings until set_test() is called.
        self.test_set = []

    def _predict_many(self, xs, ys):
        """Return predictions for paired index arrays ``xs`` (users) and ``ys`` (items)."""
        return (
            self.b
            + self.b_u[xs]
            + self.b_d[ys]
            + np.sum(self.P[xs] * self.Q[ys], axis=1)
        )

    def rmse(self):
        """Return the RMSE over all non-zero entries of the training matrix.

        Side effect: ``self.predictions`` and ``self.errors`` are replaced
        by arrays with the predicted rating and ``actual - predicted`` for
        every non-zero entry of ``self.R``.
        """
        xs, ys = self.R.nonzero()
        self.predictions = self._predict_many(xs, ys)
        self.errors = self.R[xs, ys] - self.predictions
        return np.sqrt(np.mean(self.errors**2))

    def get_prediction(self, i, j):
        """Return the predicted rating of user index ``i`` for item index ``j``."""
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :])
        return prediction

    def sgd(self):
        """Run one SGD pass over ``self.samples``, updating P, Q and the biases."""
        for i, j, r in self.samples:
            # Error of the current model on this sample.
            e = r - self.get_prediction(i, j)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])

            # Both factor updates must use the values from *before* this
            # step; keep a copy of P[i] so the Q[j] update does not see
            # the row that was just modified.
            p_i = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * p_i)
            self.Q[j, :] += self.alpha * (e * p_i - self.beta * self.Q[j, :])

    def set_test(self, ratings_test):
        """Register held-out ratings and remove them from the training matrix.

        Args:
            ratings_test: DataFrame whose first three columns are, by
                position, user id, item id and rating.

        Returns:
            The list of ``[user_index, item_index, rating]`` entries, which
            is also stored as ``self.test_set``.

        Raises:
            KeyError: if a user id or item id is not present in the rating
                matrix given to the constructor.
        """
        # Pull each column out once instead of one scalar .iloc call per cell.
        user_ids = ratings_test.iloc[:, 0].to_numpy()
        item_ids = ratings_test.iloc[:, 1].to_numpy()
        values = ratings_test.iloc[:, 2].to_numpy()

        test_set = []
        for user_id, item_id, z in zip(user_ids, item_ids, values):
            x = self.user_id_index[user_id]
            y = self.item_id_index[item_id]
            test_set.append([x, y, z])
            # The model is trained from R, so a held-out rating must be
            # erased (set to 0 = "not rated") there.
            self.R[x, y] = 0
        self.test_set = test_set
        return test_set

    def test_rmse(self):
        """Return the RMSE over ``self.test_set``, or NaN when it is empty."""
        if not self.test_set:
            # Nothing was held out: there is no test error to report.
            return float("nan")
        xs, ys, zs = zip(*self.test_set)
        xs = np.asarray(xs, dtype=np.intp)
        ys = np.asarray(ys, dtype=np.intp)
        residuals = np.asarray(zs, dtype=float) - self._predict_many(xs, ys)
        return np.sqrt(np.mean(residuals**2))

    def test(self):
        """Initialize the parameters, train, and track train/test RMSE.

        Returns:
            A list of ``(iteration, train_rmse, test_rmse)`` tuples, one
            per pass, with ``iteration`` starting at 1.
        """
        # P and Q start as normal random numbers with mean 0 and
        # standard deviation 1/K.
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # User and item biases start at 0; b is the mean of the observed ratings.
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])

        # Training samples: (user index, item index, rating) for every
        # rating still present in R.
        rows, columns = self.R.nonzero()
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

        training_process = []
        for i in range(self.iterations):
            # Visit the samples in a different random order on every pass.
            np.random.shuffle(self.samples)
            self.sgd()
            rmse1 = self.rmse()
            rmse2 = self.test_rmse()
            training_process.append((i+1, rmse1, rmse2))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.4f ; Test RMSE = %.4f" % (i+1, rmse1, rmse2))
        return training_process

    def get_one_prediction(self, user_id, item_id):
        """Return the predicted rating for the given user id and item id (labels, not positions)."""
        return self.get_prediction(self.user_id_index[user_id], self.item_id_index[item_id])

    def full_prediction(self):
        """Return the full (users x items) matrix of predicted ratings."""
        return self.b + self.b_u[:, np.newaxis] + self.b_d[np.newaxis, :] + self.P.dot(self.Q.T)

In [15]:
# Re-read the raw ratings: `ratings` was overwritten by the pivoted matrix above.
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, encoding='latin-1')
# Keep only the (user, movie, rating) triple, cast to int.
ratings = ratings.loc[:, ['user_id', 'movie_id', 'rating']].astype(int)

In [16]:
# Full user x movie rating matrix; user/movie pairs without a rating become 0.
R_temp = (
    ratings
    .pivot(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
)
mf = NEW_MF(
    R_temp,
    K=30,
    alpha=0.001,
    beta=0.02,
    iterations=100,
    verbose=True,
)
# Register the held-out rows (they are zeroed inside the model's matrix), then train.
test_set = mf.set_test(ratings_test)
result = mf.test()

Iteration: 10 ; Train RMSE = 0.9659 ; Test RMSE = 0.9834
Iteration: 20 ; Train RMSE = 0.9410 ; Test RMSE = 0.9645
Iteration: 30 ; Train RMSE = 0.9298 ; Test RMSE = 0.9567
Iteration: 40 ; Train RMSE = 0.9231 ; Test RMSE = 0.9524
Iteration: 50 ; Train RMSE = 0.9183 ; Test RMSE = 0.9498
Iteration: 60 ; Train RMSE = 0.9145 ; Test RMSE = 0.9479
Iteration: 70 ; Train RMSE = 0.9108 ; Test RMSE = 0.9466
Iteration: 80 ; Train RMSE = 0.9068 ; Test RMSE = 0.9454
Iteration: 90 ; Train RMSE = 0.9021 ; Test RMSE = 0.9441
Iteration: 100 ; Train RMSE = 0.8959 ; Test RMSE = 0.9425


In [17]:
# Printing predictions
print(mf.full_prediction())
print(mf.get_one_prediction(1, 2))

[[3.79007103 3.39251954 3.07727542 ... 3.33614419 3.50401006 3.43075281]
 [3.92238783 3.49327401 3.16119464 ... 3.43670578 3.55562911 3.55101185]
 [3.34510174 2.88696539 2.56720044 ... 2.81462818 2.93463325 2.93942045]
 ...
 [4.23843903 3.77186417 3.45094058 ... 3.70550808 3.81425317 3.83911717]
 [4.33655315 3.89445676 3.57477393 ... 3.83245879 3.96083046 3.95783339]
 [3.73912763 3.34258342 2.99596948 ... 3.27987594 3.42591935 3.40701989]]
3.3925195366200676
