### 4.3 MF의 최적 파라미터 찾기

iterations가 일정 수를 넘어가면 train set에 지나치게 맞춰져서 과적합이생긴다. 잠재요인의 수인 K가 지나치게 커지는 경우도 과적합이 발생한다.

In [1]:
import numpy as np
import pandas as pd

r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', names=r_cols,  sep='\t',encoding='latin-1')
ratings = ratings[['user_id', 'movie_id', 'rating']].astype(int)            # timestamp 제거

- train test 분리

In [2]:
from sklearn.utils import shuffle
TRAIN_SIZE = 0.75
ratings = shuffle(ratings, random_state=1)
cutoff = int(TRAIN_SIZE * len(ratings))
ratings_train = ratings.iloc[:cutoff]
ratings_test = ratings.iloc[cutoff:]

- TRAIN과 TEST데이터에 대한 새로운 MF 클래스 

In [12]:
class NEW_MF():
    def __init__(self, ratings, K, alpha, beta, iterations, verbose=True):
        self.R = np.array(ratings)
##### >>>>> (2) user_id, item_id를 R의 index와 매핑하기 위한 dictionary 생성
        item_id_index = []
        index_item_id = []
        for i, one_id in enumerate(ratings):
            item_id_index.append([one_id, i])
            index_item_id.append([i, one_id])
        self.item_id_index = dict(item_id_index)
        self.index_item_id = dict(index_item_id)        
        user_id_index = []
        index_user_id = []
        for i, one_id in enumerate(ratings.T):
            user_id_index.append([one_id, i])
            index_user_id.append([i, one_id])
        self.user_id_index = dict(user_id_index)
        self.index_user_id = dict(index_user_id)
#### <<<<< (2)
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.verbose = verbose

    # train set의 RMSE 계산
    def rmse(self):
        xs, ys = self.R.nonzero()
        self.predictions = []
        self.errors = []
        for x, y in zip(xs, ys):
            prediction = self.get_prediction(x, y)
            self.predictions.append(prediction)
            self.errors.append(self.R[x, y] - prediction)
        self.predictions = np.array(self.predictions)
        self.errors = np.array(self.errors)
        return np.sqrt(np.mean(self.errors**2))

    # 유저(i)와 아이템(j)의 Ratings
    def get_prediction(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    # P 와 Q matrix를 최적화하기 위한 확률적 경사 하강법
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_prediction(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])

            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])

##### >>>>> (3)
    # Test set을 선정
    def set_test(self, ratings_test):
        test_set = []
        for i in range(len(ratings_test)):      # test 데이터에 있는 각 데이터에 대해서
            x = self.user_id_index[ratings_test.iloc[i, 0]]
            y = self.item_id_index[ratings_test.iloc[i, 1]]
            z = ratings_test.iloc[i, 2]
            test_set.append([x, y, z])
            self.R[x, y] = 0                    # Setting test set ratings to 0
        self.test_set = test_set
        return test_set                         # Return test set

    # Test set의 RMSE 계산
    def test_rmse(self):
        error = 0
        for one_set in self.test_set:
            predicted = self.get_prediction(one_set[0], one_set[1])
            error += pow(one_set[2] - predicted, 2)
        return np.sqrt(error/len(self.test_set))

    # Training 하면서 test set의 정확도를 계산
    def test(self):
        # user-feature 와 item-feature matrix 설정
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # 사용자 평가 경향 설정
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])

        # 평점이 있는 요소의 인덱스와 평점을 리스트로 만들어서 samples에 저장
        rows, columns = self.R.nonzero()
        self.samples = [(i, j, self.R[i,j]) for i, j in zip(rows, columns)]

        # 주어진 반복 횟수에 따라 확률적경사하강법 수행
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse1 = self.rmse()
            rmse2 = self.test_rmse()
            training_process.append((i+1, rmse1, rmse2))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.4f ; Test RMSE = %.4f" % (i+1, rmse1, rmse2))
        return training_process

    # 주어진 유저와 영화에 대한 평가 예측
    def get_one_prediction(self, user_id, item_id):
        return self.get_prediction(self.user_id_index[user_id], self.item_id_index[item_id])

    # Full 유저-영화 평가 matrix
    def full_prediction(self):
        return self.b + self.b_u[:,np.newaxis] + self.b_d[np.newaxis,:] + self.P.dot(self.Q.T)

- 최적의 K값 찾기

In [13]:
results = []
index = []
for K in range(50, 261, 10):
    print('K =', K)
    R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
    mf = NEW_MF(R_temp, K=K, alpha=0.001, beta=0.02, iterations=300, verbose=True)
    test_set = mf.set_test(ratings_test)
    result = mf.test()
    index.append(K)
    results.append(result)

K = 50
Iteration: 10 ; Train RMSE = 0.9662 ; Test RMSE = 0.9834
Iteration: 20 ; Train RMSE = 0.9415 ; Test RMSE = 0.9645
Iteration: 30 ; Train RMSE = 0.9305 ; Test RMSE = 0.9566
Iteration: 40 ; Train RMSE = 0.9241 ; Test RMSE = 0.9524
Iteration: 50 ; Train RMSE = 0.9198 ; Test RMSE = 0.9497
Iteration: 60 ; Train RMSE = 0.9165 ; Test RMSE = 0.9479
Iteration: 70 ; Train RMSE = 0.9135 ; Test RMSE = 0.9466
Iteration: 80 ; Train RMSE = 0.9106 ; Test RMSE = 0.9455
Iteration: 90 ; Train RMSE = 0.9072 ; Test RMSE = 0.9444
Iteration: 100 ; Train RMSE = 0.9028 ; Test RMSE = 0.9431
Iteration: 110 ; Train RMSE = 0.8969 ; Test RMSE = 0.9413
Iteration: 120 ; Train RMSE = 0.8889 ; Test RMSE = 0.9388
Iteration: 130 ; Train RMSE = 0.8785 ; Test RMSE = 0.9357
Iteration: 140 ; Train RMSE = 0.8657 ; Test RMSE = 0.9321
Iteration: 150 ; Train RMSE = 0.8508 ; Test RMSE = 0.9284
Iteration: 160 ; Train RMSE = 0.8343 ; Test RMSE = 0.9250
Iteration: 170 ; Train RMSE = 0.8163 ; Test RMSE = 0.9221
Iteration: 180 ;

Iteration: 230 ; Train RMSE = 0.7144 ; Test RMSE = 0.9127
Iteration: 240 ; Train RMSE = 0.6907 ; Test RMSE = 0.9131
Iteration: 250 ; Train RMSE = 0.6669 ; Test RMSE = 0.9141
Iteration: 260 ; Train RMSE = 0.6434 ; Test RMSE = 0.9155
Iteration: 270 ; Train RMSE = 0.6202 ; Test RMSE = 0.9174
Iteration: 280 ; Train RMSE = 0.5977 ; Test RMSE = 0.9195
Iteration: 290 ; Train RMSE = 0.5758 ; Test RMSE = 0.9218
Iteration: 300 ; Train RMSE = 0.5549 ; Test RMSE = 0.9243
K = 100
Iteration: 10 ; Train RMSE = 0.9663 ; Test RMSE = 0.9834
Iteration: 20 ; Train RMSE = 0.9418 ; Test RMSE = 0.9644
Iteration: 30 ; Train RMSE = 0.9310 ; Test RMSE = 0.9566
Iteration: 40 ; Train RMSE = 0.9249 ; Test RMSE = 0.9523
Iteration: 50 ; Train RMSE = 0.9208 ; Test RMSE = 0.9497
Iteration: 60 ; Train RMSE = 0.9178 ; Test RMSE = 0.9479
Iteration: 70 ; Train RMSE = 0.9154 ; Test RMSE = 0.9467
Iteration: 80 ; Train RMSE = 0.9132 ; Test RMSE = 0.9456
Iteration: 90 ; Train RMSE = 0.9107 ; Test RMSE = 0.9447
Iteration: 100 

Iteration: 150 ; Train RMSE = 0.8789 ; Test RMSE = 0.9333
Iteration: 160 ; Train RMSE = 0.8666 ; Test RMSE = 0.9294
Iteration: 170 ; Train RMSE = 0.8527 ; Test RMSE = 0.9256
Iteration: 180 ; Train RMSE = 0.8372 ; Test RMSE = 0.9220
Iteration: 190 ; Train RMSE = 0.8203 ; Test RMSE = 0.9188
Iteration: 200 ; Train RMSE = 0.8017 ; Test RMSE = 0.9160
Iteration: 210 ; Train RMSE = 0.7815 ; Test RMSE = 0.9136
Iteration: 220 ; Train RMSE = 0.7599 ; Test RMSE = 0.9118
Iteration: 230 ; Train RMSE = 0.7371 ; Test RMSE = 0.9106
Iteration: 240 ; Train RMSE = 0.7134 ; Test RMSE = 0.9100
Iteration: 250 ; Train RMSE = 0.6892 ; Test RMSE = 0.9100
Iteration: 260 ; Train RMSE = 0.6648 ; Test RMSE = 0.9106
Iteration: 270 ; Train RMSE = 0.6405 ; Test RMSE = 0.9117
Iteration: 280 ; Train RMSE = 0.6165 ; Test RMSE = 0.9132
Iteration: 290 ; Train RMSE = 0.5931 ; Test RMSE = 0.9150
Iteration: 300 ; Train RMSE = 0.5703 ; Test RMSE = 0.9169
K = 150
Iteration: 10 ; Train RMSE = 0.9664 ; Test RMSE = 0.9834
Iterati

Iteration: 70 ; Train RMSE = 0.9165 ; Test RMSE = 0.9468
Iteration: 80 ; Train RMSE = 0.9147 ; Test RMSE = 0.9459
Iteration: 90 ; Train RMSE = 0.9130 ; Test RMSE = 0.9451
Iteration: 100 ; Train RMSE = 0.9111 ; Test RMSE = 0.9444
Iteration: 110 ; Train RMSE = 0.9088 ; Test RMSE = 0.9435
Iteration: 120 ; Train RMSE = 0.9056 ; Test RMSE = 0.9423
Iteration: 130 ; Train RMSE = 0.9011 ; Test RMSE = 0.9406
Iteration: 140 ; Train RMSE = 0.8947 ; Test RMSE = 0.9381
Iteration: 150 ; Train RMSE = 0.8861 ; Test RMSE = 0.9349
Iteration: 160 ; Train RMSE = 0.8754 ; Test RMSE = 0.9312
Iteration: 170 ; Train RMSE = 0.8629 ; Test RMSE = 0.9274
Iteration: 180 ; Train RMSE = 0.8490 ; Test RMSE = 0.9239
Iteration: 190 ; Train RMSE = 0.8336 ; Test RMSE = 0.9207
Iteration: 200 ; Train RMSE = 0.8165 ; Test RMSE = 0.9177
Iteration: 210 ; Train RMSE = 0.7978 ; Test RMSE = 0.9151
Iteration: 220 ; Train RMSE = 0.7775 ; Test RMSE = 0.9129
Iteration: 230 ; Train RMSE = 0.7558 ; Test RMSE = 0.9113
Iteration: 240 ; 

Iteration: 290 ; Train RMSE = 0.6271 ; Test RMSE = 0.9114
Iteration: 300 ; Train RMSE = 0.6032 ; Test RMSE = 0.9129
K = 240
Iteration: 10 ; Train RMSE = 0.9664 ; Test RMSE = 0.9834
Iteration: 20 ; Train RMSE = 0.9420 ; Test RMSE = 0.9645
Iteration: 30 ; Train RMSE = 0.9314 ; Test RMSE = 0.9566
Iteration: 40 ; Train RMSE = 0.9253 ; Test RMSE = 0.9523
Iteration: 50 ; Train RMSE = 0.9215 ; Test RMSE = 0.9497
Iteration: 60 ; Train RMSE = 0.9188 ; Test RMSE = 0.9480
Iteration: 70 ; Train RMSE = 0.9167 ; Test RMSE = 0.9468
Iteration: 80 ; Train RMSE = 0.9150 ; Test RMSE = 0.9459
Iteration: 90 ; Train RMSE = 0.9134 ; Test RMSE = 0.9452
Iteration: 100 ; Train RMSE = 0.9117 ; Test RMSE = 0.9445
Iteration: 110 ; Train RMSE = 0.9097 ; Test RMSE = 0.9437
Iteration: 120 ; Train RMSE = 0.9070 ; Test RMSE = 0.9427
Iteration: 130 ; Train RMSE = 0.9031 ; Test RMSE = 0.9411
Iteration: 140 ; Train RMSE = 0.8975 ; Test RMSE = 0.9390
Iteration: 150 ; Train RMSE = 0.8899 ; Test RMSE = 0.9360
Iteration: 160 

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-b941c30ce08d>", line 8, in <module>
    result = mf.test()
  File "<ipython-input-12-cc39d683a8c0>", line 97, in test
    self.sgd()
  File "<ipython-input-12-cc39d683a8c0>", line 48, in sgd
    prediction = self.get_prediction(i, j)
  File "<ipython-input-12-cc39d683a8c0>", line 42, in get_prediction
    prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2061, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above e

TypeError: object of type 'NoneType' has no len()

- 최적의 iterations 값 찾기

In [None]:
summary = []
for i in range(len(results)):
    RMSE = []
    for result in results[i]:
        RMSE.append(result[2])
    min = np.min(RMSE)
    j = RMSE.index(min)
    summary.append([index[i], j+1, RMSE[j]])

- 그래프 그리기

In [None]:
import matplotlib.pyplot as plt
plt.plot(index, [x[2] for x in summary])
plt.ylim(0.89, 0.94)
plt.xlabel('K')
plt.ylabel('RMSE')
plt.show()