<a href="https://colab.research.google.com/github/soaeng/recomSys/blob/main/MF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📌 SGD를 사용한 MF 기본 알고리즘

In [None]:
import os
import numpy as np
import pandas as pd

# Build the path to the tab-separated ratings file 'u.data'.
base_src = 'drive/MyDrive/RecoSys/Data'
u_data_src = os.path.join(base_src, 'u.data')

# Column names are supplied explicitly through `names`.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    u_data_src,
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# Keep only the first three columns (everything but the timestamp) as ints.
ratings = ratings[r_cols[:3]].astype(int)

In [None]:
# @title MF 클래스화

class MF():
  """Biased matrix factorization trained with stochastic gradient descent.

  The rating of user ``i`` for item ``j`` is predicted as
  ``b + b_u[i] + b_d[j] + P[i] . Q[j]``. Entries of the rating matrix that
  are equal to 0 are treated as "not rated": they are never used for
  training or for the RMSE.
  """

  def __init__(self, ratings, hyper_params):
    """Store the rating matrix and the hyper-parameters.

    Args:
      ratings: 2-D (users x items) rating table; it is converted with
        ``np.array``. A value of 0 marks a missing rating.
      hyper_params: dict with the keys 'K' (number of latent factors),
        'alpha' (SGD step size), 'beta' (regularization weight),
        'iterations' (number of passes over the observed ratings) and
        'verbose' (print the train RMSE every 10th iteration).
    """
    self.R = np.array(ratings)
    self.num_users, self.num_items = np.shape(self.R)
    self.K = hyper_params['K']
    self.alpha = hyper_params['alpha']
    self.beta = hyper_params['beta']
    self.iterations = hyper_params['iterations']
    self.verbose = hyper_params['verbose']

  def rmse(self):
    """Return the RMSE over all observed (non-zero) entries of ``self.R``.

    Side effect: ``self.predictions`` and ``self.errors`` are replaced by
    1-D arrays holding the prediction and the error (actual - predicted)
    of every observed entry, in the order given by ``self.R.nonzero()``.
    """
    # Indices of the entries of self.R that hold a rating.
    xs, ys = self.R.nonzero()

    # Same formula as get_prediction(), evaluated for all observed
    # (user, item) pairs at once instead of one Python call per rating.
    self.predictions = (
        self.b
        + self.b_u[xs]
        + self.b_d[ys]
        + (self.P[xs] * self.Q[ys]).sum(axis=1)
    )
    self.errors = self.R[xs, ys] - self.predictions

    return np.sqrt(np.mean(self.errors ** 2))

  def train(self):
    """Initialize the model parameters and run SGD.

    Returns:
      A list of ``(iteration, train_rmse)`` tuples, one per iteration,
      with ``iteration`` starting at 1.
    """
    # Latent factors start as small random values; biases start at zero.
    self.P = np.random.normal(scale = 1. / self.K, size = (self.num_users, self.K))
    self.Q = np.random.normal(scale = 1. / self.K, size = (self.num_items, self.K))
    self.b_u = np.zeros(self.num_users)
    self.b_d = np.zeros(self.num_items)
    # Global bias: mean of all observed ratings.
    self.b = np.mean(self.R[self.R.nonzero()])

    # One (user index, item index, rating) sample per observed rating.
    rows, columns = self.R.nonzero()
    self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

    training_process = []

    for i in range(self.iterations):
      # Visit the samples in a fresh random order on every pass.
      np.random.shuffle(self.samples)
      self.sgd()
      rmse = self.rmse()
      training_process.append((i + 1, rmse))
      if self.verbose and (i + 1) % 10 == 0:
        print('Iteration: %d; train RMSE = %.4f' %(i+1, rmse))

    return training_process

  def get_prediction(self, i, j):
    """Return the predicted rating of user index ``i`` for item index ``j``."""
    return self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :])

  def sgd(self):
    """Run one SGD pass over ``self.samples``, updating biases, P and Q."""
    for i, j, r in self.samples:
      # Error between the actual rating and the current prediction.
      e = r - self.get_prediction(i, j)

      # User and item bias updates.
      self.b_u[i] += self.alpha * (e - (self.beta * self.b_u[i]))
      self.b_d[j] += self.alpha * (e - (self.beta * self.b_d[j]))

      # Compute both factor steps from the *current* P[i] and Q[j] before
      # applying either, so the Q step does not see the already-updated P.
      p_step = self.alpha * ((e * self.Q[j, :]) - (self.beta * self.P[i, :]))
      q_step = self.alpha * ((e * self.P[i, :]) - (self.beta * self.Q[j, :]))
      self.P[i, :] += p_step
      self.Q[j, :] += q_step


In [None]:
# @title MF 적용 결과

# Reshape the long (user, movie, rating) table into a user x movie matrix;
# pairs that have no rating are filled with 0.
R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating')
R_temp = R_temp.fillna(0)

hyper_params = dict(
    K=30,
    alpha=0.001,
    beta=0.02,
    iterations=100,
    verbose=True,
)

# Fit the model; train_process holds one (iteration, train RMSE) pair per pass.
mf = MF(R_temp, hyper_params)
train_process = mf.train()

Iteration: 10; train RMSE = 0.9585
Iteration: 20; train RMSE = 0.9374
Iteration: 30; train RMSE = 0.9281
Iteration: 40; train RMSE = 0.9225
Iteration: 50; train RMSE = 0.9183
Iteration: 60; train RMSE = 0.9143
Iteration: 70; train RMSE = 0.9096
Iteration: 80; train RMSE = 0.9030
Iteration: 90; train RMSE = 0.8937
Iteration: 100; train RMSE = 0.8816


# 📌 train/test 분리 MF 알고리즘


In [32]:
import os
import numpy as np
import pandas as pd

# Build the path to the tab-separated ratings file 'u.data'.
base_src = 'drive/MyDrive/RecoSys/Data'
u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    u_data_src,
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)
# Drop the timestamp column and cast the remaining columns to int.
ratings = ratings[r_cols[:3]].astype(int)

# Train / test split.
from sklearn.utils import shuffle
TRAIN_SIZE = 0.75
# Rows are (user, movie, rating); shuffle them with a fixed seed so the
# split is reproducible, then cut at the TRAIN_SIZE fraction.
ratings = shuffle(ratings, random_state=2021)
cutoff = int(len(ratings) * TRAIN_SIZE)
ratings_train, ratings_test = ratings.iloc[:cutoff], ratings.iloc[cutoff:]

class NEW_MF():
  """Biased matrix factorization (SGD) with a held-out test set.

  The rating of user ``i`` for item ``j`` is predicted as
  ``b + b_u[i] + b_d[j] + P[i] . Q[j]``. Entries of ``self.R`` equal to 0
  are treated as "not rated". Test ratings registered with ``set_test`` are
  zeroed in ``self.R`` so they are excluded from training.
  """

  def __init__(self, ratings, hyper_params):
    """Store the rating matrix, the hyper-parameters and the id mappings.

    Args:
      ratings: users x items table whose row labels (``ratings.index``)
        are user ids and whose column labels (``ratings.columns``) are
        item ids, e.g. a pivoted pandas DataFrame. 0 marks a missing
        rating. The values are copied into ``self.R`` with ``np.array``.
      hyper_params: dict with the keys 'K' (number of latent factors),
        'alpha' (SGD step size), 'beta' (regularization weight),
        'iterations' (number of passes over the observed ratings) and
        'verbose' (print the RMSEs every 10th iteration).
    """
    self.R = np.array(ratings)
    self.num_users, self.num_items = np.shape(self.R)
    self.K = hyper_params['K']
    self.alpha = hyper_params['alpha']
    self.beta = hyper_params['beta']
    self.iterations = hyper_params['iterations']
    self.verbose = hyper_params['verbose']

    # Item id <-> column position in self.R.
    self.item_id_index = {one_id: i for i, one_id in enumerate(ratings.columns)}
    self.index_item_id = dict(enumerate(ratings.columns))

    # User id <-> row position in self.R. The row labels are read directly
    # instead of transposing the whole table just to iterate over them.
    self.user_id_index = {one_id: i for i, one_id in enumerate(ratings.index)}
    self.index_user_id = dict(enumerate(ratings.index))

    # Empty until set_test() is called, so test()/test_rmse() are usable
    # (reporting NaN) even without a registered test set.
    self.test_set = []

  def rmse(self):
    """Return the RMSE over all observed (non-zero) entries of ``self.R``.

    Side effect: ``self.predictions`` and ``self.errors`` are replaced by
    1-D arrays holding the prediction and the error (actual - predicted)
    of every observed entry, in the order given by ``self.R.nonzero()``.
    """
    xs, ys = self.R.nonzero()

    # Same formula as get_prediction(), evaluated for all observed
    # (user, item) pairs at once instead of one Python call per rating.
    self.predictions = (
        self.b
        + self.b_u[xs]
        + self.b_d[ys]
        + (self.P[xs] * self.Q[ys]).sum(axis=1)
    )
    self.errors = self.R[xs, ys] - self.predictions

    return np.sqrt(np.mean(self.errors ** 2))

  def sgd(self):
    """Run one SGD pass over ``self.samples``, updating biases, P and Q."""
    for i, j, r in self.samples:
      # Error between the actual rating and the current prediction.
      e = r - self.get_prediction(i, j)

      # User and item bias updates.
      self.b_u[i] += self.alpha * (e - (self.beta * self.b_u[i]))
      self.b_d[j] += self.alpha * (e - (self.beta * self.b_d[j]))

      # Compute both factor steps from the *current* P[i] and Q[j] before
      # applying either, so the Q step does not see the already-updated P.
      p_step = self.alpha * ((e * self.Q[j, :]) - (self.beta * self.P[i, :]))
      q_step = self.alpha * ((e * self.P[i, :]) - (self.beta * self.Q[j, :]))
      self.P[i, :] += p_step
      self.Q[j, :] += q_step

  def get_prediction(self, i, j):
    """Return the predicted rating of user index ``i`` for item index ``j``."""
    return self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :])

  def set_test(self, ratings_test):
    """Register the test ratings and remove them from the training matrix.

    Args:
      ratings_test: DataFrame whose first three columns are, in order,
        user id, item id and rating.

    Returns:
      The test set as a list of ``[user_index, item_index, rating]``
      entries; it is also stored in ``self.test_set``.

    Raises:
      KeyError: if a user id or item id is not present in the rating
        table given to the constructor.
    """
    user_ids = ratings_test.iloc[:, 0].to_numpy()
    item_ids = ratings_test.iloc[:, 1].to_numpy()
    values = ratings_test.iloc[:, 2].to_numpy()

    test_set = []
    for user_id, item_id, z in zip(user_ids, item_ids, values):
      x = self.user_id_index[user_id]
      y = self.item_id_index[item_id]
      test_set.append([x, y, z])
      # Zero the entry so it counts as "not rated" during training.
      self.R[x, y] = 0
    self.test_set = test_set
    return test_set

  def test_rmse(self):
    """Return the RMSE over ``self.test_set`` (NaN if the set is empty)."""
    if not self.test_set:
      return np.nan
    error = 0
    for x, y, actual in self.test_set:
      predicted = self.get_prediction(x, y)
      # Accumulate the squared error.
      error += pow(actual - predicted, 2)
    return np.sqrt(error/len(self.test_set))

  def test(self):
    """Initialize the parameters, run SGD and track train/test RMSE.

    Returns:
      A list of ``(iteration, train_rmse, test_rmse)`` tuples, one per
      iteration, with ``iteration`` starting at 1.
    """
    # Latent factors start as small random values; biases start at zero.
    self.P = np.random.normal(scale = 1./self.K, size = (self.num_users, self.K))
    self.Q = np.random.normal(scale = 1./self.K, size = (self.num_items, self.K))
    self.b_u = np.zeros(self.num_users)
    self.b_d = np.zeros(self.num_items)
    # Global bias: mean of all observed (training) ratings.
    self.b = np.mean(self.R[self.R.nonzero()])

    # One (user index, item index, rating) sample per observed rating.
    rows, columns = self.R.nonzero()
    self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]
    training_process = []
    for i in range(self.iterations):
      # Visit the samples in a fresh random order on every pass.
      np.random.shuffle(self.samples)
      self.sgd()
      rmse1 = self.rmse()       # training RMSE
      rmse2 = self.test_rmse()  # test RMSE
      training_process.append((i+1, rmse1, rmse2))
      if self.verbose and (i+1) % 10 == 0:
        print('Itertion: %d: Train RMSE = %.4f; Test RMSE = %.4f' % (i+1, rmse1, rmse2))
    return training_process

  def get_one_prediction(self, user_id, item_id):
    """Return the predicted rating for an original user id and item id."""
    return self.get_prediction(self.user_id_index[user_id], self.item_id_index[item_id])

  def full_prediction(self):
    """Return the (num_users x num_items) matrix of all predicted ratings."""
    return self.b + self.b_u[:, np.newaxis] + self.b_d[np.newaxis, :] + self.P.dot(self.Q.T)


In [34]:
# Reshape the long (user, movie, rating) table into a user x movie matrix;
# pairs that have no rating are filled with 0.
R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating')
R_temp = R_temp.fillna(0)

hyper_params = dict(
    K=30,
    alpha=0.001,
    beta=0.02,
    iterations=100,
    verbose=True,
)

# Build the model, register the held-out ratings (set_test also zeroes them
# in the model's matrix), then train while tracking train and test RMSE.
mf = NEW_MF(R_temp, hyper_params)
test_set = mf.set_test(ratings_test)
result = mf.test()

Itertion: 10: Train RMSE = 0.9666; Test RMSE = 0.9807
Itertion: 20: Train RMSE = 0.9412; Test RMSE = 0.9622
Itertion: 30: Train RMSE = 0.9298; Test RMSE = 0.9551
Itertion: 40: Train RMSE = 0.9229; Test RMSE = 0.9514
Itertion: 50: Train RMSE = 0.9180; Test RMSE = 0.9492
Itertion: 60: Train RMSE = 0.9141; Test RMSE = 0.9477
Itertion: 70: Train RMSE = 0.9104; Test RMSE = 0.9466
Itertion: 80: Train RMSE = 0.9064; Test RMSE = 0.9455
Itertion: 90: Train RMSE = 0.9016; Test RMSE = 0.9443
Itertion: 100: Train RMSE = 0.8954; Test RMSE = 0.9427


In [28]:
# Print the full matrix of predicted ratings returned by mf.full_prediction().
print(mf.full_prediction())

[[3.84448849 3.34655001 3.01282037 ... 3.34464347 3.49498793 3.45017661]
 [3.77589185 3.2698036  2.900525   ... 3.26310705 3.38426143 3.34139232]
 [3.42320799 2.88844024 2.51299584 ... 2.87665867 2.99527565 2.95602068]
 ...
 [4.15560831 3.61088308 3.22257134 ... 3.58340108 3.69098826 3.68371235]
 [4.2873094  3.78036089 3.40794359 ... 3.76237229 3.8796475  3.85337402]
 [3.82807395 3.32011843 2.89875605 ... 3.28523877 3.4329342  3.38456292]]


In [35]:
# Print the predicted rating for user id 1 and item id 2 (original ids, which
# get_one_prediction maps to matrix positions).
print(mf.get_one_prediction(1, 2))

3.378895826999353
