# 하이브리드 추천 시스템

In [6]:
import os
# csv 파일에서 불러오기
import pandas as pd
import random 
import numpy as np

from sklearn.model_selection import train_test_split

base_src = ''
u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src, sep='\t',
                      names=r_cols, encoding='latin-1')

ratings_train, ratings_test = train_test_split(
    ratings,
    test_size=0.2,
    shuffle=True,
    random_state=2022
)

def RMSE2(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred)) ** 2))


def recommender_1(recom_list):
    recommendations = []
    for pair in recom_list:
        recommendations.append(random.random() * 4 + 1)
    return np.array(recommendations)


def recommender_2(recom_list):
    recommendations = []
    for pair in recom_list:
        recommendations.append(random.random() * 4 + 1)
    return np.array(recommendations)

In [10]:
weight = [0.8, 0.2]
recom_list = np.array(ratings_test)
predictions_1 = recommender_1(recom_list)
predictions_2 = recommender_2(recom_list)

predictions = predictions_1 * weight[0] + predictions_2 * weight[1]
RMSE2(recom_list[:,2], predictions)

1.573964211577989

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import os
import numpy as np
import pandas as pd


class NEW_MF():
  def __init__(self, ratings, hyper_params):
    self.R = np.array(ratings)
    # 사용자 수 (num_users) 와 아이템 수(num_items)를 받아온다
    self.num_users, self.num_items = np.shape(self.R)
    # 아래는 MF weight 조절을 위한 하이퍼파라미터이다
    # K : 잠재요인(latent factor)의 수
    self.K = hyper_params['K']
    # alpha 학습률
    self.alpha = hyper_params['alpha']
    # beta  정규화 계수
    self.beta = hyper_params['beta']
    # iterations SGD 계산을 할 때의 반복 횟수
    self.iterations = hyper_params['iterations']
    # verbose SGD 학습 과정을 중간중간에 출력할 것인지에 대한 여부
    self.verbose = hyper_params['verbose']

    item_id_index = []
    index_item_id = []
    for i, one_id in enumerate(ratings):
      item_id_index.append([one_id, i])
      index_item_id.append([i, one_id])
    self.item_id_index = dict(item_id_index)
    self.index_item_id = dict(index_item_id)

    user_id_index = []
    index_user_id = []
    for i, one_id in enumerate(ratings.T):
      user_id_index.append([one_id, i])
      index_user_id.append([i, one_id])
    self.user_id_index = dict(user_id_index)
    self.index_user_id = dict(index_user_id)    


  def rmse(self):
    # self.R 에서 평점이 있는 요소의 인덱스를 가져온다 
    xs, ys = self.R.nonzero()
    # prediction 과 error를 담을 리스트 변수 초기화 
    self.predictions = []
    self.errors = [] 
    # 평점이 있는 요소 (사용자 x, 아이템 y) 각각에 대해서 아래의 코드를 실행한다
    for x, y in zip(xs, ys):
      # 사용자 x, 아이템 y 에 대해 평점 예측치를 get_prediction() 함수를 사용해서 계산한다.
      prediction = self.get_prediction(x, y)
      # 예측값을 예측값 리스트에 추가한다 
      self.predictions.append(prediction)
      # 실제값 과 예측값의 차이를 계산해서 오차값 리스트에 추가한다
      self.errors.append(self.R[x,y] - prediction)
    # 예측값 리스트와 오차값 리스트를 numpy array 형태로 변환한다.
    self.prediction = np.array(self.predictions)
    self.errors = np.array(self.errors)
    # error를 활용해서 RMSE 도출 
    return np.sqrt(np.mean(self.errors ** 2))

  def sgd(self):
    for i, j, r in self.samples:
      # 사용자 i, 아이템 j 에 대한 평저 예측치 계산
      prediction = self.get_prediction(i, j)
      # 실제 평점과 비교한 오차 계산
      e = (r-prediction)

      # 사용자 평가 경향 계산 및 업데이트
      self.b_u[i] += self.alpha * (e - (self.beta * self.b_u[i]))
      # 아이템 평가 경향 계산 및 업데이트
      self.b_d[j] += self.alpha * (e - (self.beta * self.b_d[j]))

      # P 행렬 계산 및 업데이트
      self.P[i, :] += self.alpha * ((e * self.Q[j,:]) - (self.beta * self.P[i, :]))
      # Q 행렬 계산 및 업데이트
      self.Q[j, :] += self.alpha * ((e * self.P[i,:]) - (self.beta * self.Q[j, :]))
  
  def get_prediction(self, i, j):
    # 사용자 i, 아이템 j에 대한 평점 예측치를 앞에서 배웠던 식을 이용해서 구한다 
    prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i,:].dot(self.Q[j,].T)
    return prediction
 
  #  Test set 선정
  def set_test(self, ratings_test):
    test_set = []
    for i in range(len(ratings_test)):
      x = self.user_id_index[ratings_test.iloc[i, 0]]
      y = self.item_id_index[ratings_test.iloc[i, 1]]
      z = ratings_test.iloc[i, 2]
      test_set.append([x, y, z])
      self.R[x, y] = 0
    self.test_set = test_set 
    return test_set

  # Test set RMSE 계산 
  def test_rmse(self):
    error = 0
    for one_set in self.test_set:
      predicted = self.get_prediction(one_set[0], one_set[1])
      # e => e^2
      error += pow(one_set[2] - predicted, 2)
    return np.sqrt(error/len(self.test_set))

  def test(self):
    self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
    self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

    self.b_u = np.zeros(self.num_users)
    self.b_d = np.zeros(self.num_items)
    self.b = np.mean(self.R[self.R.nonzero()])

    rows, columns = self.R.nonzero()
    self.samples = [(i,j,self.R[i,j]) for i, j in zip(rows, columns)]

    training_process = []
    for i in range(self.iterations):
      np.random.shuffle(self.samples)
      self.sgd()
      rmse1 = self.rmse()
      rmse2 = self.test_rmse()
      training_process.append((i+1, rmse1, rmse2))
      if self.verbose:
        if (i+1) % 10 == 0:
          print('Iteration : %d ; train RMSE = %.4f ; TEST RMSE = %.4f' % (i+1, rmse1, rmse2))
    return training_process

  def get_one_prediction(self, user_id, item_id):
    return self.get_prediction(self.user_id_index[user_id],
                               self.item_id_index[item_id])
    
  def full_prediction(self):
    return self.b + self.b_u[:,np.newaxis] + self.b_d[np.newaxis,:] + self.P.dot(self.Q.T)

base_src = ''
u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src, sep='\t',
                      names=r_cols, encoding='latin-1')

R_temp = ratings.pivot(index='user_id',
                       columns='movie_id',
                       values='rating').fillna(0)

hyper_params = {
    'K' : 30,
    'alpha' : 0.001,
    'beta' : 0.02,
    'iterations' : 100,
    'verbose': True
}


mf = NEW_MF(R_temp, hyper_params)
train_set = mf.set_test(ratings_test)
result = mf.test()

########################################################

ratings_matrix = ratings_train.pivot(index='user_id', columns='movie_id', values='rating')

### 사용자 평가 경향을 고려한 함수 
rating_mean = ratings_matrix.mean(axis=1)
rating_bias = (ratings_matrix.T - rating_mean).T

# 코사인 유사도를 구하기 위해 rating 값을 복사하고 계산 시 NaN 값 에러 대비를 위해 결측치 0으로 대체
matrix_dummy = ratings_matrix.copy().fillna(0)
# 모든 사용자간 코사인 유사도 구함
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
# 필요한 값 조회를 위해 인덱스 및 컬럼명 지정 
user_similarity = pd.DataFrame(user_similarity,
                               index=ratings_matrix.index,
                               columns=ratings_matrix.index)

### 사용자 평가 경향을 고려한 함수 
def CF_knn_bias(user_id, movie_id, neighbor_size=0):
  if movie_id in rating_bias.columns:
    sim_scores = user_similarity[user_id].copy()
    movie_ratings = rating_bias[movie_id].copy()
    none_rating_idx = movie_ratings[movie_ratings.isnull()].index
    movie_ratings = movie_ratings.drop(none_rating_idx)
    sim_scores = sim_scores.drop(none_rating_idx)

    if neighbor_size == 0:
      prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
      prediction = prediction + rating_mean[user_id]
    
    else:
      if len(sim_scores) > 1:
        neighbor_size = min(neighbor_size, len(sim_scores))
        sim_scores = np.array(sim_scores)
        movie_ratings = np.array(movie_ratings)
        user_idx = np.argsort(sim_scores)
        sim_scores = sim_scores[user_idx][-neighbor_size:]
        movie_ratings = movie_ratings[user_idx][-neighbor_size:]
        prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        prediction = prediction + rating_mean[user_id]
      else:
        prediction = rating_mean[user_id]
  else:
    prediction = rating_mean[user_id] 
  return prediction 

def RMSE2(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred)) ** 2))


Iteration : 10 ; train RMSE = 0.9655 ; TEST RMSE = 0.9755
Iteration : 20 ; train RMSE = 0.9414 ; TEST RMSE = 0.9569
Iteration : 30 ; train RMSE = 0.9305 ; TEST RMSE = 0.9496
Iteration : 40 ; train RMSE = 0.9240 ; TEST RMSE = 0.9459
Iteration : 50 ; train RMSE = 0.9193 ; TEST RMSE = 0.9436
Iteration : 60 ; train RMSE = 0.9154 ; TEST RMSE = 0.9422
Iteration : 70 ; train RMSE = 0.9116 ; TEST RMSE = 0.9409
Iteration : 80 ; train RMSE = 0.9072 ; TEST RMSE = 0.9397
Iteration : 90 ; train RMSE = 0.9016 ; TEST RMSE = 0.9381
Iteration : 100 ; train RMSE = 0.8941 ; TEST RMSE = 0.9360


In [19]:
# Hybrid 추천 알고리즘
def recommender_1(recom_list, mf):
  recommendations = np.array([
    mf.get_one_prediction(user, movie) for (user, movie) in recom_list
  ])
  return recommendations


def recommender_2(recom_list, neighbor_size = 0):
  recommendations = np.array([
    CF_knn_bias(user, movie, neighbor_size) for (user, movie) in recom_list
  ])
  return recommendations

In [26]:
recom_list = np.array(ratings_test.iloc[:, [0, 1]])

predictions_1 = recommender_1(recom_list, mf)
predictions_2 = recommender_2(recom_list, 30)

In [27]:
print('reco 1 :' , RMSE2(ratings_test.iloc[:,2], predictions_1))
print('reco 2 :' , RMSE2(ratings_test.iloc[:,2], predictions_2))

weight = [0.8, 0.2]
predictions = predictions_1 * weight[0] + predictions_2 * weight[1]

print('reco 2 :' , RMSE2(ratings_test.iloc[:,2], predictions))

reco 1 : 0.936006922579507
reco 2 : 0.9415889228345732
reco 2 : 0.9328302761320403


In [33]:
result = []
weight_rate = []
for i in np.arange(0, 1, 0.1):
    weight = [i, 1.0-i]
    predictions = predictions_1 * weight[0] + predictions_2 * weight[1]
    print('reco :' , weight[0], weight[1], RMSE2(ratings_test.iloc[:,2], predictions))
    result.append(RMSE2(ratings_test.iloc[:,2], predictions))
    weight_rate.append(weight)
print(min(result))
index_min = result.index(min(result))
print(weight_rate[index_min])


reco : 0.0 1.0 0.9415889228345732
reco : 0.1 0.9 0.9386283529898654
reco : 0.2 0.8 0.9361948346158432
reco : 0.30000000000000004 0.7 0.9342924860981852
reco : 0.4 0.6 0.9329245568055822
reco : 0.5 0.5 0.9320933996630806
reco : 0.6000000000000001 0.3999999999999999 0.9318004510587498
reco : 0.7000000000000001 0.29999999999999993 0.9320462184807804
reco : 0.8 0.19999999999999996 0.9328302761320403
reco : 0.9 0.09999999999999998 0.9341512686102061
0.9318004510587498
[0.6000000000000001, 0.3999999999999999]
