In [13]:
import pandas as pd
import numpy as np

In [14]:
# Load the ratings table (the CSV's first column becomes the row index) and
# keep only the four columns used in the rest of the notebook.
# NOTE(review): in the displayed head, user_id values start with 'B0' and
# item_id values with 'A0' -- confirm the column labels are not swapped upstream.
RATINGS_CSV = '/home/sjkim/recommendSystem/finalproject/change_df.csv'
KEEP_COLUMNS = ['user_id', 'item_id', 'rating', 'compound_score']
ratings = pd.read_csv(RATINGS_CSV, index_col=0)[KEEP_COLUMNS]
ratings.head()

Unnamed: 0,user_id,item_id,rating,compound_score
0,B00191WVF6,A0955928C2RRWOWZN7UC,4.0,0.0
1,B005WY3TMA,A0955928C2RRWOWZN7UC,4.0,0.4404
2,B0090XWU8S,A0955928C2RRWOWZN7UC,4.0,0.0
3,B00FXYTLIK,A0955928C2RRWOWZN7UC,4.0,0.7264
4,B00HMZG3YS,A0955928C2RRWOWZN7UC,4.0,0.6369


In [15]:
# user_id and item_id are not consecutive values, so replace each of them with
# the integer code of its pandas category.
for id_column in ('user_id', 'item_id'):
    ratings[id_column] = ratings[id_column].astype('category').cat.codes
ratings

Unnamed: 0,user_id,item_id,rating,compound_score
0,608,0,4.0,0.0000
1,1488,0,4.0,0.4404
2,1781,0,4.0,0.0000
3,2439,0,4.0,0.7264
4,2534,0,4.0,0.6369
...,...,...,...,...
99737,1833,6367,5.0,0.9798
99738,2279,6367,4.0,0.8402
99739,2305,6367,5.0,0.9940
99740,2901,6367,5.0,0.9856


In [16]:
from sklearn.utils import shuffle

# Shuffle all rows with a fixed seed so the split is reproducible.
df = shuffle(ratings, random_state=12)

# Positional 70/30 split: the first 70% of shuffled rows train, the rest test.
cutoff = int(0.7 * len(df))
train_df, test_df = df.iloc[:cutoff], df.iloc[cutoff:]

In [17]:
# Preview the first five rows of the shuffled training split.
train_df.head(5)

Unnamed: 0,user_id,item_id,rating,compound_score
24793,1846,1577,5.0,0.8546
14890,2635,937,2.0,0.0258
32933,284,2106,5.0,0.4588
14756,2271,928,5.0,0.7717
81169,2943,5172,5.0,0.0


In [18]:
# Mean sentiment score (compound_score) per item, computed from the training
# split only. The column is selected *before* aggregating so that the mean is
# not also computed for the unrelated columns (e.g. the user_id codes) and so
# the cell does not depend on every other column being numeric.
# Result: a one-column DataFrame indexed by item_id.
mean_sentiment_score = (
    train_df.groupby('item_id')['compound_score']
    .mean()
    .to_frame()
)
mean_sentiment_score

Unnamed: 0_level_0,compound_score
item_id,Unnamed: 1_level_1
0,0.285811
1,0.789224
2,0.373988
3,0.056814
4,0.363329
...,...
6363,0.142800
6364,0.936531
6365,0.731400
6366,0.000000


In [19]:
# Remove the per-review sentiment score (compound_score) from the test split
# so it cannot be used as a feature at evaluation time (cheating prevention).
test_df = test_df.drop(columns=['compound_score'])

In [20]:
# Attach the training-split per-item mean sentiment score to the test split
# (cheating prevention). This is an inner join on item_id, so only test rows
# whose item_id also appears in mean_sentiment_score are kept.
test_df = mean_sentiment_score.merge(test_df, on='item_id', how='inner')
test_df.head()

Unnamed: 0,item_id,compound_score,user_id,rating
0,0,0.285811,2534,4.0
1,0,0.285811,3279,4.0
2,0,0.285811,1488,4.0
3,1,0.789224,3401,5.0
4,1,0.789224,1892,5.0


### Neural MF

In [21]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten
from tensorflow.keras.layers import Dense, Concatenate, Activation
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adamax
from tensorflow.keras.callbacks import EarlyStopping

# Root-mean-squared error, used as both the training loss and the metric.
def RMSE(y_true, y_pred):
    """Return the root of the mean squared difference between two tensors.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values.

    Returns:
        A scalar tensor: sqrt(mean((y_true - y_pred) ** 2)) over all elements.
    """
    residual = y_true - y_pred
    mean_squared_error = tf.reduce_mean(tf.square(residual))
    return tf.sqrt(mean_squared_error)

In [22]:
# Hyperparameters and dataset-derived constants
K = 200                             # number of latent factors
reg = 0.0001                        # L2 regularization penalty on the embeddings
mu = train_df.rating.mean()         # mean rating over the training split
# Embedding table sizes: number of distinct ids in `ratings` plus one.
N = len(set(ratings.user_id)) + 1   # rows in the user embedding tables
M = len(set(ratings.item_id)) + 1   # rows in the item embedding tables


# Keras model: one integer id per sample for the user and for the item
user = Input(shape=(1, ))                                               # user id input
item = Input(shape=(1, ))                                               # item id input
P_embedding = Embedding(N, K, embeddings_regularizer=l2(reg))(user)     # (batch, 1, K)
Q_embedding = Embedding(M, K, embeddings_regularizer=l2(reg))(item)     # (batch, 1, K)
user_bias = Embedding(N, 1, embeddings_regularizer=l2(reg))(user)       # user bias term (batch, 1, 1)
item_bias = Embedding(M, 1, embeddings_regularizer=l2(reg))(item)       # item bias term (batch, 1, 1)

# Flatten the embeddings so they can be concatenated (instead of taking a dot product)
P_embedding = Flatten()(P_embedding)                                    # (batch, K)
Q_embedding = Flatten()(Q_embedding)                                    # (batch, K)
user_bias = Flatten()(user_bias)                                        # (batch, 1)
item_bias = Flatten()(item_bias)                                        # (batch, 1)

# Sentiment feature (compound_score) is fed in directly and concatenated with
# the embeddings and biases. The earlier 4-way Concatenate was immediately
# overwritten by this one and never reached the output, so it is not built.
senti = Input(shape=(1, ))
R = Concatenate()([P_embedding, Q_embedding, user_bias, item_bias, senti])  # (batch, 2K + 3)

# Neural network. Activation('linear') is the identity, so the LeakyReLU layer
# below is the only non-linearity in the stack.
R = Dense(2048)(R)
R = Activation('linear')(R)

# Additional hidden layers
R = Dense(1024)(R)
# Use the LeakyReLU layer explicitly (with its default negative slope) rather
# than Activation('LeakyReLU'), which relies on a class-name string lookup.
R = tf.keras.layers.LeakyReLU()(R)
R = Dense(512)(R)
R = Activation('linear')(R)

R = Dense(1)(R)

# Stop training once val_loss has not improved for 10 consecutive epochs.
es = EarlyStopping(monitor='val_loss', patience=10)

model = Model(inputs=[user, item, senti], outputs=R)
model.compile(
  loss=RMSE,
  # `learning_rate` replaces the deprecated `lr` keyword.
  optimizer=SGD(learning_rate=0.004, momentum=0.9),
  metrics=[RMSE],
)
model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_5 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, 1, 200)       741400      ['input_4[0][0]']                
                                                                                                  
 embedding_5 (Embedding)        (None, 1, 200)       1273800     ['input_5[0][0]']                
                                                                                            

  super().__init__(name, **kwargs)


In [23]:
# Train on mean-centred ratings: `mu` is subtracted from both the training and
# the validation targets, so predictions must have `mu` added back afterwards.
# NOTE(review): test_df is passed as validation data and the `es` callback
# monitors val_loss, so the stopping epoch is chosen on the same rows that are
# later reported as the test RMSE -- consider a separate validation split.
train_inputs = [
    train_df.user_id.values,
    train_df.item_id.values,
    train_df.compound_score.values,
]
valid_inputs = [
    test_df.user_id.values,
    test_df.item_id.values,
    test_df.compound_score.values,
]
result = model.fit(
    train_inputs,
    train_df.rating.values - mu,
    batch_size=128,
    epochs=100,
    validation_data=(valid_inputs, test_df.rating.values - mu),
    callbacks=[es],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100


In [25]:
# Plot RMSE
#import matplotlib.pyplot as plt
#plt.plot(result.history['RMSE'], label="Train RMSE")
#plt.plot(result.history['val_RMSE'], label="Test RMSE")
#plt.xlabel('epoch')
#plt.ylabel('RMSE')
#plt.legend()
#plt.show()

# Predict ratings for the first six test rows and show them next to the actual
# rows; `mu` is added back because the model was trained on centred ratings.
sample = test_df[0:6]
user_ids = sample.user_id.values
item_ids = sample.item_id.values
compound_scores = sample.compound_score.values
predictions = mu + model.predict([user_ids, item_ids, compound_scores])
print("Actuals: \n", sample)
print()
print("Predictions: \n", predictions)

# Function that computes the accuracy measure (RMSE) with NumPy
def RMSE2(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Both inputs are converted to float arrays and flattened before being
    compared. This keeps a column vector of predictions with shape (n, 1)
    from broadcasting against a 1-D target of shape (n,) into an (n, n)
    matrix, and keeps unsigned-integer inputs from wrapping on subtraction.

    Args:
        y_true: array-like of actual values.
        y_pred: array-like of predicted values with the same number of
            elements as ``y_true`` (a single value is broadcast).

    Returns:
        The RMSE as a NumPy float.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    return np.sqrt(np.mean((actual - predicted) ** 2))

# Score the whole test split: predict, add `mu` back, flatten the predictions
# to one dimension and print the RMSE against the actual ratings.
user_ids = test_df.user_id.values
item_ids = test_df.item_id.values
compound_scores = test_df.compound_score.values
y_pred = (model.predict([user_ids, item_ids, compound_scores]) + mu).ravel()
y_true = test_df.rating.to_numpy()

print(RMSE2(y_true, y_pred))


Actuals: 
    item_id  compound_score  user_id  rating
0        0        0.285811     2534     4.0
1        0        0.285811     3279     4.0
2        0        0.285811     1488     4.0
3        1        0.789224     3401     5.0
4        1        0.789224     1892     5.0
5        1        0.789224     3299     5.0

Predictions: 
 [[4.191158 ]
 [4.304524 ]
 [4.38154  ]
 [5.2234406]
 [5.119597 ]
 [5.0305696]]
0.9838923047901873
