# 1. Matrix Factorization with SGD

In [12]:
import pandas as pd
import numpy as np

In [13]:
# User table: pipe-separated file 'u.user', indexed by user_id.
# NOTE(review): file names suggest the MovieLens 100K layout - confirm.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'u.user', names=u_cols, sep='|', encoding='latin-1'
).set_index('user_id')

# Movie table: pipe-separated file 'u.item', indexed by movie_id.
# The columns after 'IMDB URL' are named after genres.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv(
    'u.item', names=i_cols, sep='|', encoding='latin-1'
).set_index('movie_id')

# Rating table: tab-separated file 'u.data'. The timestamp column is read
# but dropped; the three remaining columns are cast to int.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', names=r_cols, sep='\t', encoding='latin-1')
ratings = ratings[r_cols[:3]].astype(int)

In [14]:
class MF():
  """Biased matrix factorization fitted with stochastic gradient descent.

  A rating for (user i, item j) is predicted as
  ``b + b_u[i] + b_d[j] + P[i] . Q[j]``. Entries of the rating matrix that
  are exactly 0 are treated as "not rated": they are skipped both when
  training and when computing the RMSE.
  """

  def __init__(self,ratings,hyper_params):
    """Store the rating matrix and the hyper-parameters.

    Args:
      ratings: 2-D array-like (users x items); converted with ``np.array``.
      hyper_params: mapping with the keys
        'K' (number of latent factors), 'alpha' (SGD step size),
        'beta' (regularization coefficient), 'iterations' (number of
        passes over the observed ratings) and 'verbose' (print the RMSE
        every 10 iterations when truthy).
    """
    self.R = np.array(ratings)
    self.num_users,self.num_items = np.shape(self.R)
    self.K = hyper_params['K']
    self.alpha = hyper_params['alpha']
    self.beta = hyper_params['beta']
    self.iterations = hyper_params['iterations']
    self.verbose = hyper_params['verbose']

  def rmse(self):
    """Return the RMSE over all non-zero entries of ``self.R``.

    Side effect: ``self.predictions`` and ``self.errors`` are replaced by
    1-D arrays aligned with ``self.R.nonzero()``.
    """
    xs,ys = self.R.nonzero()
    # One matrix product replaces a Python-level loop over every rating;
    # the temporary has the same shape as self.R.
    latent = self.P.dot(self.Q.T)[xs,ys]
    self.predictions = self.b + self.b_u[xs] + self.b_d[ys] + latent
    self.errors = self.R[xs,ys] - self.predictions

    return np.sqrt(np.mean(self.errors**2))

  def train(self):
    """Initialise the parameters and run ``self.iterations`` SGD epochs.

    P and Q are drawn from a normal distribution with scale 1/K, the
    biases start at zero and the global bias is the mean of the non-zero
    ratings.

    Returns:
      list of ``(iteration, train_rmse)`` tuples, iteration starting at 1.
    """
    self.P = np.random.normal(scale=1./self.K,
                              size=(self.num_users,self.K))
    self.Q = np.random.normal(scale=1./self.K,
                              size=(self.num_items,self.K))

    self.b_u = np.zeros(self.num_users)
    self.b_d = np.zeros(self.num_items)
    self.b = np.mean(self.R[self.R.nonzero()])

    rows,columns = self.R.nonzero()
    self.samples = list(zip(rows,columns,self.R[rows,columns]))

    training_process = []
    for i in range(self.iterations):
      # Visit the observed ratings in a new random order every epoch.
      np.random.shuffle(self.samples)
      self.sgd()
      rmse = self.rmse()
      training_process.append((i+1,rmse))
      if self.verbose and (i+1) % 10 == 0:
        print('Iteration : %d ; train RMSE = %.4f'%(i+1,rmse))
    return training_process

  def get_prediction(self,i,j):
    """Return the predicted rating for user row ``i`` and item column ``j``."""
    prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i,:].dot(self.Q[j,:].T)
    return prediction

  def sgd(self):
    """Run one SGD pass over ``self.samples`` (tuples of row, column, rating)."""
    for i,j,r in self.samples:
      e = r - self.get_prediction(i,j)

      self.b_u[i] += self.alpha * (e - (self.beta * self.b_u[i]))
      self.b_d[j] += self.alpha * (e - (self.beta * self.b_d[j]))

      # Keep the pre-step user vector: the item update must use the same
      # P[i] that produced `e`, not the value that was just modified.
      p_i = self.P[i,:].copy()
      self.P[i,:] += self.alpha * ((e * self.Q[j,:]) - (self.beta * p_i))
      self.Q[j,:] += self.alpha * ((e * p_i) - (self.beta * self.Q[j,:]))

# Users x movies rating matrix; pairs without a rating are filled with 0,
# which MF treats as "not rated".
R_temp = ratings.pivot(
    index='user_id', columns='movie_id', values='rating'
).fillna(0)

hyper_params = dict(
    K=30,
    alpha=0.001,
    beta=0.02,
    iterations=100,
    verbose=True,
)

# Fit on every available rating and keep the per-iteration RMSE history.
mf = MF(R_temp, hyper_params)
train_process = mf.train()

Iteration : 10 ; train RMSE = 0.9585
Iteration : 20 ; train RMSE = 0.9374
Iteration : 30 ; train RMSE = 0.9281
Iteration : 40 ; train RMSE = 0.9226
Iteration : 50 ; train RMSE = 0.9186
Iteration : 60 ; train RMSE = 0.9149
Iteration : 70 ; train RMSE = 0.9105
Iteration : 80 ; train RMSE = 0.9047
Iteration : 90 ; train RMSE = 0.8964
Iteration : 100 ; train RMSE = 0.8851


---
# 2. train/test 분리 MF

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
# Fraction of the shuffled ratings that goes into the training part.
TRAIN_SIZE = 0.8

# Shuffle with a fixed seed so the split is reproducible, then cut the
# shuffled rows at the TRAIN_SIZE boundary.
ratings = shuffle(ratings, random_state=2025)
cutoff = int(len(ratings) * TRAIN_SIZE)
ratings_train, ratings_test = ratings.iloc[:cutoff], ratings.iloc[cutoff:]

In [16]:
class NEW_MF():
  """Biased matrix factorization (SGD) with a held-out test set.

  A rating for (user i, item j) is predicted as
  ``b + b_u[i] + b_d[j] + P[i] . Q[j]``. Entries of ``self.R`` that are
  exactly 0 are treated as "not rated". ``set_test`` moves ratings out of
  ``self.R`` into ``self.test_set`` so they are excluded from training and
  can be scored with ``test_rmse``.
  """

  def __init__(self,ratings,hyper_params):
    """Store the rating matrix, hyper-parameters and id <-> index maps.

    Args:
      ratings: pandas DataFrame whose index labels are user ids and whose
        column labels are item ids.
      hyper_params: mapping with the keys 'K', 'alpha', 'beta',
        'iterations' and 'verbose'.
    """
    self.R = np.array(ratings)
    self.num_users,self.num_items = np.shape(self.R)
    # Parameters controlling the factorization
    self.K = hyper_params['K'] # K : number of latent factors
    self.alpha = hyper_params['alpha'] # alpha : learning rate
    self.beta = hyper_params['beta'] # beta : regularization coefficient
    self.iterations = hyper_params['iterations'] # iterations : number of SGD passes
    self.verbose = hyper_params['verbose'] # verbose : print progress during training

    # Column labels are item ids, index labels are user ids; map both
    # directions between an id and its position in self.R.
    self.item_id_index = {one_id: i for i,one_id in enumerate(ratings.columns)}
    self.index_item_id = {i: one_id for i,one_id in enumerate(ratings.columns)}
    self.user_id_index = {one_id: i for i,one_id in enumerate(ratings.index)}
    self.index_user_id = {i: one_id for i,one_id in enumerate(ratings.index)}

    # Empty until set_test() is called, so test()/test_rmse() also work
    # without a held-out set.
    self.test_set = []

  def rmse(self):
    """Return the training RMSE over the non-zero entries of ``self.R``.

    Side effect: ``self.predictions`` and ``self.errors`` are replaced by
    1-D arrays aligned with ``self.R.nonzero()``.
    """
    xs, ys = self.R.nonzero() # indices of the entries that carry a rating
    self.predictions = self.full_prediction()[xs,ys]
    self.errors = self.R[xs,ys] - self.predictions
    return np.sqrt(np.mean(self.errors**2))

  def sgd(self):
    """Run one SGD pass over ``self.samples`` (tuples of row, column, rating)."""
    for i,j,r in self.samples:
      # user i, item j
      e = r - self.get_prediction(i,j) # prediction error

      # update the user bias
      self.b_u[i] += self.alpha * (e - (self.beta * self.b_u[i]))
      # update the item bias
      self.b_d[j] += self.alpha * (e - (self.beta * self.b_d[j]))

      # Keep the pre-step user vector: the item update must use the same
      # P[i] that produced `e`, not the value that was just modified.
      p_i = self.P[i,:].copy()
      self.P[i,:] += self.alpha * ((e * self.Q[j,:]) - (self.beta * p_i))
      self.Q[j,:] += self.alpha * ((e * p_i) - (self.beta * self.Q[j,:]))

  def get_prediction(self,i,j):
    """Return the predicted rating for user row ``i`` and item column ``j``."""
    prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i,:].dot(self.Q[j,:].T)
    return prediction

  # Select the test set
  def set_test(self,ratings_test):
    """Hold out the given ratings as the test set.

    Args:
      ratings_test: DataFrame whose first three columns are, by position,
        user id, item id and rating.

    Returns:
      list of ``[user_index, item_index, rating]``; also stored in
      ``self.test_set``. The matching cells of ``self.R`` are set to 0.

    Raises:
      KeyError: if a user id or item id is not present in the rating
        matrix passed to the constructor.
    """
    test_set = []
    held_out = zip(ratings_test.iloc[:,0],
                   ratings_test.iloc[:,1],
                   ratings_test.iloc[:,2])
    for user_id,item_id,rating in held_out:
      x = self.user_id_index[user_id]
      y = self.item_id_index[item_id]
      test_set.append([x,y,rating])
      # Remove the rating from the training matrix.
      self.R[x,y] = 0
    self.test_set = test_set
    return test_set

  # Test set RMSE
  def test_rmse(self):
    """Return the RMSE over ``self.test_set``, or NaN when it is empty."""
    if not self.test_set:
      return np.nan
    xs = [entry[0] for entry in self.test_set]
    ys = [entry[1] for entry in self.test_set]
    actual = np.array([entry[2] for entry in self.test_set],dtype=float)
    predicted = self.full_prediction()[xs,ys]
    return np.sqrt(np.mean((actual - predicted)**2))

  def test(self):
    """Train the model and track train/test RMSE after every iteration.

    P and Q are drawn from a normal distribution with scale 1/K, the
    biases start at zero and the global bias is the mean of the non-zero
    entries of ``self.R``.

    Returns:
      list of ``(iteration, train_rmse, test_rmse)`` tuples, iteration
      starting at 1.
    """
    self.P = np.random.normal(scale=1./self.K,
                              size=(self.num_users,self.K))
    self.Q = np.random.normal(scale=1./self.K,
                              size=(self.num_items,self.K))

    self.b_u = np.zeros(self.num_users)
    self.b_d = np.zeros(self.num_items)
    self.b = np.mean(self.R[self.R.nonzero()])

    rows,columns = self.R.nonzero()
    self.samples = list(zip(rows,columns,self.R[rows,columns]))

    training_process = []
    for i in range(self.iterations):
      # Visit the observed ratings in a new random order every epoch.
      np.random.shuffle(self.samples)
      self.sgd()
      rmse1 = self.rmse()
      rmse2 = self.test_rmse()
      training_process.append((i+1,rmse1,rmse2))
      if self.verbose and (i+1) % 10 == 0:
        print('Iteration : %d ; Train RMSE = %.4f ; Test RMSE = %.4f'% (i+1 ,rmse1,rmse2))
    return training_process

  def train(self):
    """Alias of ``test()``, which is the method that fits the model."""
    return self.test()

  def get_one_prediction(self,user_id,item_id):
    """Return the predicted rating for the given user id and item id."""
    return self.get_prediction(self.user_id_index[user_id],
                               self.item_id_index[item_id])

  def full_prediction(self):
    """Return the users x items matrix of predicted ratings."""
    return self.b + self.b_u[:,np.newaxis] + self.b_d[np.newaxis,:] + self.P.dot(self.Q.T)

In [17]:
# Users x movies rating matrix built from the shuffled ratings table;
# user/movie pairs without a rating are filled with 0.
R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

In [18]:
# Hyper-parameters handed to NEW_MF: latent factor count, learning rate,
# regularization coefficient, number of SGD passes, progress printing.
hyper_params = dict(
    K=30,
    alpha=0.001,
    beta=0.02,
    iterations=100,
    verbose=True,
)

In [19]:
# Build the model on the full matrix, hold the test ratings out of it,
# then train while recording train and test RMSE per iteration.
mf = NEW_MF(ratings=R_temp, hyper_params=hyper_params)
test_set = mf.set_test(ratings_test=ratings_test)
result = mf.test()

Iteration : 10 ; Train RMSE = 0.9669 ; Test RMSE = 0.9722
Iteration : 20 ; Train RMSE = 0.9430 ; Test RMSE = 0.9523
Iteration : 30 ; Train RMSE = 0.9322 ; Test RMSE = 0.9443
Iteration : 40 ; Train RMSE = 0.9257 ; Test RMSE = 0.9400
Iteration : 50 ; Train RMSE = 0.9211 ; Test RMSE = 0.9374
Iteration : 60 ; Train RMSE = 0.9173 ; Test RMSE = 0.9358
Iteration : 70 ; Train RMSE = 0.9136 ; Test RMSE = 0.9345
Iteration : 80 ; Train RMSE = 0.9095 ; Test RMSE = 0.9333
Iteration : 90 ; Train RMSE = 0.9043 ; Test RMSE = 0.9320
Iteration : 100 ; Train RMSE = 0.8973 ; Test RMSE = 0.9302


---
# 3. MF의 최적 파라미터 찾기

In [23]:
# Search for the best number of latent factors K.
# results[n] is the training history returned for the K stored in index[n].
results = []
index = []

R_temp = ratings.pivot(
    index='user_id', columns='movie_id', values='rating'
).fillna(0)

for K in range(50, 251, 25):
  print(f'K : {K}')
  hyper_params = dict(K=K, alpha=0.001, beta=0.02, iterations=30, verbose=True)
  # A new model per K; the same test ratings are held out every time.
  mf = NEW_MF(R_temp, hyper_params)
  test_set = mf.set_test(ratings_test)
  result = mf.test()
  index.append(K)
  results.append(result)

K : 50
Iteration : 10 ; Train RMSE = 0.9671 ; Test RMSE = 0.9721
Iteration : 20 ; Train RMSE = 0.9434 ; Test RMSE = 0.9522
Iteration : 30 ; Train RMSE = 0.9329 ; Test RMSE = 0.9442
K : 75
Iteration : 10 ; Train RMSE = 0.9672 ; Test RMSE = 0.9721
Iteration : 20 ; Train RMSE = 0.9436 ; Test RMSE = 0.9522
Iteration : 30 ; Train RMSE = 0.9332 ; Test RMSE = 0.9442
K : 100
Iteration : 10 ; Train RMSE = 0.9673 ; Test RMSE = 0.9721
Iteration : 20 ; Train RMSE = 0.9438 ; Test RMSE = 0.9522
Iteration : 30 ; Train RMSE = 0.9334 ; Test RMSE = 0.9442
K : 125
Iteration : 10 ; Train RMSE = 0.9673 ; Test RMSE = 0.9721
Iteration : 20 ; Train RMSE = 0.9438 ; Test RMSE = 0.9522
Iteration : 30 ; Train RMSE = 0.9335 ; Test RMSE = 0.9442
K : 150
Iteration : 10 ; Train RMSE = 0.9674 ; Test RMSE = 0.9721
Iteration : 20 ; Train RMSE = 0.9439 ; Test RMSE = 0.9522
Iteration : 30 ; Train RMSE = 0.9336 ; Test RMSE = 0.9442
K : 175
Iteration : 10 ; Train RMSE = 0.9674 ; Test RMSE = 0.9721
Iteration : 20 ; Train RMS

In [25]:
# For every K tried, find the iteration with the lowest test RMSE.
# Each summary row is [K, best iteration (1-based), test RMSE at that iteration].
summary = []
for k_value, history in zip(index, results):
  # Third field of every history entry is the test RMSE.
  RMSE = [entry[2] for entry in history]
  # np.argmin returns the first minimum, like list.index(min_value) did,
  # without rebinding the builtin `min` at notebook scope.
  j = int(np.argmin(RMSE))
  summary.append([k_value, j+1, RMSE[j]])

In [46]:
import plotly.express as px
# Turn the summary rows into a frame: column 0 = K, column 2 = best test RMSE.
summary = pd.DataFrame(summary)
# NOTE(review): the y-axis range is hard-coded; values outside
# [0.944, 0.945] will not be visible - confirm it matches the results.
px.line(summary, x=0, y=2, range_y=[0.944, 0.945])