In [12]:
import pandas as pd
import numpy as np

# Load the ratings CSV and keep only the three columns used downstream.
# NOTE(review): in the displayed rows the 'user_id' values look like product
# ASINs (B00...) and the 'item_id' values like reviewer IDs (A...); the two
# labels may be swapped in the CSV -- confirm against the data source.
ratings = pd.read_csv(
    '/home/sjkim/recommendSystem/finalproject/Amazon_ratings.csv',
    encoding='latin-1',
)[['user_id', 'item_id', 'rating']]
ratings

Unnamed: 0,user_id,item_id,rating
0,B00191WVF6,A0955928C2RRWOWZN7UC,4.0
1,B005WY3TMA,A0955928C2RRWOWZN7UC,4.0
2,B0090XWU8S,A0955928C2RRWOWZN7UC,4.0
3,B00FXYTLIK,A0955928C2RRWOWZN7UC,4.0
4,B00HMZG3YS,A0955928C2RRWOWZN7UC,4.0
...,...,...,...
99737,B009AYLDSU,AZZYW4YOE1B6E,5.0
99738,B00E055H5O,AZZYW4YOE1B6E,4.0
99739,B00E8HGWIK,AZZYW4YOE1B6E,5.0
99740,B00M58CMTM,AZZYW4YOE1B6E,5.0


In [13]:
# user_id and item_id are not consecutive values, so replace each with the
# integer code of its category.
ratings['user_id'] = ratings['user_id'].astype('category').cat.codes
ratings['item_id'] = ratings['item_id'].astype('category').cat.codes

In [14]:
# Display the re-encoded frame: both id columns should now hold integer codes.
ratings

Unnamed: 0,user_id,item_id,rating
0,608,0,4.0
1,1488,0,4.0
2,1781,0,4.0
3,2439,0,4.0
4,2534,0,4.0
...,...,...,...
99737,1833,6367,5.0
99738,2279,6367,4.0
99739,2305,6367,5.0
99740,2901,6367,5.0


In [15]:
# Train/test split: shuffle all rows with a fixed seed, then cut at TRAIN_SIZE.
from sklearn.utils import shuffle
TRAIN_SIZE = 0.7
ratings = shuffle(ratings, random_state=12)
cutoff = int(len(ratings) * TRAIN_SIZE)
# First `cutoff` shuffled rows train the model; the remainder is held out.
ratings_train, ratings_test = ratings.iloc[:cutoff], ratings.iloc[cutoff:]

In [16]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten
from tensorflow.keras.layers import Dense, Concatenate, Activation
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adamax
from tensorflow.keras.callbacks import EarlyStopping

# RMSE measure (passed to model.compile as both the loss and the metric)
def RMSE(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    The squared differences are averaged over all elements (no axis is
    passed to `tf.reduce_mean`), so the result is a single scalar tensor.
    """
    residual = y_true - y_pred
    mean_squared = tf.reduce_mean(tf.square(residual))
    return tf.sqrt(mean_squared)

In [17]:
# Hyperparameters and dataset-level constants
K = 200                             # Number of latent factors
reg = 0.0001                        # L2 regularization penalty on the embeddings
mu = ratings_train.rating.mean()    # Global mean rating, computed on the training split only
# Embedding table sizes. The ids were replaced by category codes earlier, so
# (assuming no missing ids) the distinct-value count already covers every
# index; the extra +1 is kept so the table sizes stay as in the original model.
N = len(set(ratings.user_id)) + 1   # Number of distinct users + 1
M = len(set(ratings.item_id)) + 1   # Number of distinct items + 1

# Keras model
user = Input(shape=(1, ))                                               # User input
item = Input(shape=(1, ))                                               # Item input
P_embedding = Embedding(N, K, embeddings_regularizer=l2(reg))(user)     # (batch, 1, K)
Q_embedding = Embedding(M, K, embeddings_regularizer=l2(reg))(item)     # (batch, 1, K)
user_bias = Embedding(N, 1, embeddings_regularizer=l2(reg))(user)       # User bias term (batch, 1, 1)
item_bias = Embedding(M, 1, embeddings_regularizer=l2(reg))(item)       # Item bias term (batch, 1, 1)

# Concatenate layers: instead of a dot product, flatten each embedding and
# concatenate them, giving 2K + 2 (= 402) features per example.
P_embedding = Flatten()(P_embedding)                                    # (batch, K)
Q_embedding = Flatten()(Q_embedding)                                    # (batch, K)
user_bias = Flatten()(user_bias)                                        # (batch, 1)
item_bias = Flatten()(item_bias)                                        # (batch, 1)
R = Concatenate()([P_embedding, Q_embedding, user_bias, item_bias])     # (batch, 2K + 2)

# Neural network
R = Dense(2048)(R)
R = Activation('linear')(R)         # 'linear' is the identity activation

# Adding more layers
R = Dense(1024)(R)
# NOTE(review): the string name 'LeakyReLU' is resolved by Keras at build time;
# whether it is accepted depends on the installed Keras version -- confirm
# before upgrading.
R = Activation('LeakyReLU')(R)
R = Dense(512)(R)
R = Activation('linear')(R)

R = Dense(1)(R)                     # Single output: predicted rating minus mu

# Stop training once val_loss has gone 5 epochs without improving.
# NOTE(review): restore_best_weights is left at its default, so the model keeps
# the weights of the last epoch run rather than the best one -- confirm intended.
es = EarlyStopping(monitor='val_loss', patience=5)

model = Model(inputs=[user, item], outputs=R)
model.compile(
  loss=RMSE,
  # `learning_rate` replaces the deprecated `lr` keyword, which triggers a
  # deprecation warning and is not accepted by newer Keras releases.
  optimizer=SGD(learning_rate=0.004, momentum=0.9),
  #optimizer=Adamax(learning_rate=0.0005),
  metrics=[RMSE],
)
model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, 1, 200)       741400      ['input_3[0][0]']                
                                                                                                  
 embedding_5 (Embedding)        (None, 1, 200)       1273800     ['input_4[0][0]']                
                                                                                            

  super().__init__(name, **kwargs)


In [18]:
# Model fitting
# Targets are centred on the training mean `mu`, so the network learns the
# deviation from the global average rating.
# NOTE(review): the held-out test split is also passed as validation data, so
# the EarlyStopping callback `es` picks the stopping epoch using test data --
# confirm this is intended.
result = model.fit(
  x=[ratings_train['user_id'].to_numpy(), ratings_train['item_id'].to_numpy()],
  y=ratings_train['rating'].to_numpy() - mu,
  batch_size=128,
  epochs=100,
  callbacks=[es],
  validation_data=(
    [ratings_test['user_id'].to_numpy(), ratings_test['item_id'].to_numpy()],
    ratings_test['rating'].to_numpy() - mu,
  ),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100


In [19]:
# Plot RMSE
#import matplotlib.pyplot as plt
#plt.plot(result.history['RMSE'], label="Train RMSE")
#plt.plot(result.history['val_RMSE'], label="Test RMSE")
#plt.xlabel('epoch')
#plt.ylabel('RMSE')
#plt.legend()
#plt.show()

# Prediction: compare the first six test rows with the model output.
user_ids = ratings_test['user_id'].to_numpy()[:6]
item_ids = ratings_test['item_id'].to_numpy()[:6]
# The model was fitted on (rating - mu), so add mu back to get ratings.
predictions = model.predict([user_ids, item_ids]) + mu
print("Actuals: \n", ratings_test.iloc[:6])
print()
print("Predictions: \n", predictions)

# Function that computes accuracy (RMSE) on plain NumPy data
def RMSE2(y_true, y_pred):
    """Return the root-mean-squared error between two sets of values.

    Both inputs are converted to float arrays and flattened to 1-D before
    subtracting. Without the flattening, an (n,) array of actuals and an
    (n, 1) array of predictions would broadcast into an (n, n) matrix of
    differences and silently produce a wrong score.

    Args:
        y_true: array-like of actual values.
        y_pred: array-like of predicted values, with the same number of
            elements as `y_true`.

    Returns:
        The RMSE as a NumPy float.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    return np.sqrt(np.mean((actual - predicted) ** 2))

# Score the whole test split: predict, add mu back, flatten, compare to actuals.
user_ids = ratings_test['user_id'].to_numpy()
item_ids = ratings_test['item_id'].to_numpy()
y_pred = (model.predict([user_ids, item_ids]) + mu).ravel(order='C')
y_true = ratings_test['rating'].to_numpy(copy=True)

print(RMSE2(y_true, y_pred))


Actuals: 
        user_id  item_id  rating
37390     1793     2387     5.0
63306     3460     4044     4.0
82194      764     5235     5.0
95469     2844     6081     4.0
45393     2053     2896     5.0
65916     2885     4212     5.0

Predictions: 
 [[5.0772386]
 [4.6553493]
 [4.4984326]
 [4.435581 ]
 [4.9539905]
 [4.135649 ]]
0.9637800102369048
