In [59]:
# Context-aware hybrid 

# 필요한 모듈 모두 불러오기
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Add, Multiply, Concatenate, Embedding
from tensorflow.keras.layers import Dropout, Activation, Flatten
from tensorflow.keras.optimizers import SGD, Adam, Adamax


In [60]:
# csv 파일 불러오기
ratings = pd.read_csv('/home/sjkim/recommendSystem/finalproject/change_df.csv', encoding='latin-1', index_col=0)
ratings['rating'] = ratings['rating'].astype(int)
ratings['votes'] = ratings['votes'].astype(int)
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'])

# 요일 추가하기
ratings['day'] = ratings['timestamp'].dt.day_name()
# 월 추가하기
ratings['month'] = ratings['timestamp'].dt.month_name()

# 필요없는 컬럼 삭제
ratings = ratings.drop(columns = ['timestamp','text', 'neg_score', 'neu_score', 'pos_score'])
ratings

Unnamed: 0,item_id,user_id,rating,votes,years,compound_score,day,month
0,A0955928C2RRWOWZN7UC,B00191WVF6,4,0,2017,0.0000,Friday,February
1,A0955928C2RRWOWZN7UC,B005WY3TMA,4,0,2015,0.4404,Sunday,June
2,A0955928C2RRWOWZN7UC,B0090XWU8S,4,0,2017,0.0000,Saturday,April
3,A0955928C2RRWOWZN7UC,B00FXYTLIK,4,0,2015,0.7264,Friday,July
4,A0955928C2RRWOWZN7UC,B00HMZG3YS,4,0,2015,0.6369,Friday,July
...,...,...,...,...,...,...,...,...
99737,AZZYW4YOE1B6E,B009AYLDSU,5,2,2013,0.9798,Tuesday,December
99738,AZZYW4YOE1B6E,B00E055H5O,4,0,2015,0.8402,Monday,May
99739,AZZYW4YOE1B6E,B00E8HGWIK,5,0,2013,0.9940,Sunday,December
99740,AZZYW4YOE1B6E,B00M58CMTM,5,4,2014,0.9856,Wednesday,October


In [61]:
def scorechange(n):
  if n<=0 :
    r = 'neg'
  else : 
    r = 'pos'
  return r

def votechange(n):
  if n==0:
    r = 'not voted'
  else :
    r = 'voted'
  return r

ratings['votes'] = ratings['votes'].apply(votechange)
ratings.rename(columns = {'compound_score' : 'sentiment'}, inplace = True)
ratings['sentiment'] = ratings['sentiment'].apply(scorechange)

In [62]:
# 요일변수 더비변수로 만들기
daydummies = ratings.copy()
daydummies = daydummies.drop(['user_id', 'item_id', 'rating', 'votes', 'years', 'sentiment', 'month'], axis=1)
daydummies = pd.get_dummies(daydummies, columns=['day'])

# 월변수 더비변수로 만들기
monthdummies = ratings.copy()
monthdummies = monthdummies.drop(['user_id', 'item_id', 'rating', 'votes', 'years', 'sentiment', 'day'], axis=1)
monthdummies = pd.get_dummies(monthdummies, columns=['month'])

In [63]:
# item_id + 요일변수들만 넣은 데이터 프레임
days = ratings.join(daydummies, how='left')
days = days[['item_id', 'day_Monday', 'day_Tuesday', 'day_Wednesday', 'day_Thursday', 'day_Friday', 'day_Saturday', 'day_Sunday']]

# item_id + 월변수들만 넣은 데이터 프레임
months = ratings.join(monthdummies, how='left')
months = months[['item_id', 'month_January', 'month_February', 'month_March', 'month_April', 'month_May', 'month_June', 'month_July', 'month_August', 'month_September', 'month_October', 'month_November', 'month_December']]

In [64]:
# 제품 가장 많이 팔린 요일 구하기
a = days.groupby('item_id').sum(1).values.argmax(axis=1)
b = days.groupby('item_id').sum(1)
b['day'] = a
days_new = b['day'].to_frame()

# 제품 가장 많이 팔린 달 구하기
c = months.groupby('item_id').sum(1).values.argmax(axis=1)
d = months.groupby('item_id').sum(1)
d['month'] = c
months_new = d['month'].to_frame()

def get_day_name(n):
    if n == 0:
        d = 'Monday'
    elif n == 1:
        d = 'Tuesday'
    elif n == 2:
        d = 'Wednesday'
    elif n == 3:
        d = 'Thursday'
    elif n == 4:
        d = 'Friday'
    elif n == 5:
        d = 'Saturday'
    else:
        d = 'Sunday'
    return d

def get_month_name(n):
    if n == 0:
        m = 'January'
    elif n == 1:
        m = 'February'
    elif n == 2:
        m = 'March'
    elif n == 3:
        m = 'April'
    elif n == 4:
        m = 'May'
    elif n == 5:
        m = 'June'
    elif n == 6:
        m = 'July'
    elif n == 7:
        m = 'August'
    elif n == 8:
        m = 'September'
    elif n == 9:
        m = 'October'
    elif n == 10:
        m = 'November'
    else:
        m = 'December'
    return m

days_new['day'] = days_new['day'].apply(get_day_name)
months_new['month'] = months_new['month'].apply(get_month_name)

In [65]:
# 다시 one-hot encoding해주기
daydummies2 = days_new.copy()
daydummies2 = pd.get_dummies(daydummies2, columns=['day'])

# 다시 one-hot encoding해주기
monthdummies2 = months_new.copy()
monthdummies2 = pd.get_dummies(monthdummies2, columns=['month'])

### FM 변수 인코딩

In [66]:
# User encoding
user_dict = {}
for i in set(ratings['user_id']):
    user_dict[i] = len(user_dict)
n_user = len(user_dict)

# Item encoding
item_dict = {}
start_point = n_user
for i in set(ratings['item_id']):
    item_dict[i] = start_point + len(item_dict)
n_item = len(item_dict)
start_point += n_item

# vote encoding
vote_dict = {}
for i in set(ratings['votes']):
    vote_dict[i] = start_point + len(vote_dict)
n_vote = len(vote_dict)
start_point += n_vote

#year encoding
year_dict = {}
for i in set(ratings['years']):
    year_dict[i] = start_point + len(year_dict)
n_year = len(year_dict)
start_point += n_year

# sentiment encoding
sentiment_dict = {}
for i in set(ratings['sentiment']):
    sentiment_dict[i] = start_point + len(sentiment_dict)
n_sentiment = len(sentiment_dict)
start_point += n_sentiment

In [67]:
x = shuffle(ratings)

num_x = start_point             # Total number of x
num_x

10097

In [68]:
w0 = np.mean(x['rating'])
def generate_x(x) :
    # Generate X data
    data = []
    y = []
    w0 = np.mean(x['rating'])
    for i in range(len(x)):
        case = x.iloc[i]
        x_index = []
        x_value = []
        x_index.append(user_dict[case['user_id']])     # User id encoding
        x_value.append(1.)
        x_index.append(item_dict[case['item_id']])    # item id encoding
        x_value.append(1.)
        x_index.append(vote_dict[case['votes']])   # vote encoding
        x_value.append(1.)
        x_index.append(year_dict[case['years']])       # year encoding
        x_value.append(1.)
        x_index.append(sentiment_dict[case['sentiment']]) #sentiment encoding
        x_value.append(1.)
        #x_index.append(day_dict[case['day']]) #day encoding
        #x_value.append(1.)

        data.append([x_index, x_value])
        y.append(case['rating'] - w0)
        if (i % 10000) == 0:
            print('Encoding ', i, ' cases...')

    
    return data, y 

### 데이터 분리

In [69]:
N = len(set(ratings.user_id)) + 1      # Number of users
M = len(set(ratings.item_id)) + 1      # Number of items

# train test 분리
TRAIN_SIZE = 0.7
ratings = shuffle(ratings)
cutoff = int(TRAIN_SIZE * len(ratings))
ratings_train = ratings.iloc[:cutoff]
ratings_test = ratings.iloc[cutoff:]
context_train = pd.merge(ratings_train, daydummies2, on='item_id')      # Adding context variables
context_train = pd.merge(context_train, monthdummies2, on='item_id')
context_train = context_train.drop(['item_id', 'user_id', 'rating', 'votes', 'years', 'sentiment', 'day', 'month'], axis=1)
context_test = pd.merge(ratings_test, daydummies2, on='item_id')
context_test = pd.merge(context_test, monthdummies2, on='item_id')
context_test = context_test.drop(['user_id', 'item_id', 'rating', 'votes', 'years', 'sentiment', 'day', 'month'], axis=1)

ratings = ratings.pivot(index = 'user_id', columns ='item_id', values = 'rating').fillna(0)

In [70]:
train_data, train_y = generate_x(ratings_train) 

Encoding  0  cases...
Encoding  10000  cases...
Encoding  20000  cases...
Encoding  30000  cases...
Encoding  40000  cases...
Encoding  50000  cases...
Encoding  60000  cases...


In [71]:
test_data, test_y = generate_x(ratings_test) 

Encoding  0  cases...
Encoding  10000  cases...
Encoding  20000  cases...


## Hybrid

In [72]:
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

In [73]:
# 첫번째 모델 코드 넣기
class FM():
    def __init__(self, N, K, train_data, test_data, train_y, test_y, alpha, beta, iterations=100, tolerance=0.005, l2_reg=True, verbose=True):
        self.K = K          # Number of latent factors
        self.N = N          # Number of x (variables)
        # self.n_cases = len(data)            # N of observations
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.l2_reg = l2_reg
        self.tolerance = tolerance
        self.verbose = verbose
        # w 초기화
        self.w = np.random.normal(scale=1./self.N, size=(self.N))
        # v 초기화
        self.v = np.random.normal(scale=1./self.K, size=(self.N, self.K))
        # Train/Test 분리
        self.train_x = train_data
        self.test_x = test_data
        self.train_y = train_y
        self.test_y = test_y
   
    def test(self):                                     # Training 하면서 RMSE 계산 
        # SGD를 iterations 숫자만큼 수행
        best_RMSE = 10000
        best_iteration = 0
        training_process = []
        best_pred = []
        for i in range(self.iterations):
            rmse1 = self.sgd(self.train_x, self.train_y)                # SGD & Train RMSE 계산
            y_pred, rmse2 = self.test_rmse(self.test_x, self.test_y)    # Test RMSE 계산    
            training_process.append((i, rmse1, rmse2))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.6f ; Test RMSE = %.6f" % (i+1, rmse1, rmse2))
            if best_RMSE > rmse2:                       # New best record
                best_RMSE = rmse2
                best_iteration = i
                best_pred = y_pred
            elif (rmse2 - best_RMSE) > self.tolerance:  # RMSE is increasing over tolerance
                break
        print(best_iteration, best_RMSE)
        return training_process, best_pred, best_RMSE
        
    # w, v 업데이트를 위한 Stochastic gradient descent 
    def sgd(self, x_data, y_data):
        y_pred = []
        for data, y in zip(x_data, y_data):
            x_idx = data[0]
            x_0 = np.array(data[1])     # xi axis=0 [1, 2, 3]
            x_1 = x_0.reshape(-1, 1)    # xi axis=1 [[1], [2], [3]]
    
            # biases
            bias_score = np.sum(self.w[x_idx] * x_0)
            
            # score 계산
            vx = self.v[x_idx] * (x_1)          # v matrix * x
            sum_vx = np.sum(vx, axis=0)         # sigma(vx)
            sum_vx_2 = np.sum(vx * vx, axis=0)  # ( v matrix * x )의 제곱
            latent_score = 0.5 * np.sum(np.square(sum_vx) - sum_vx_2)

            # 예측값 계산
            y_hat = bias_score + latent_score
            y_pred.append(y_hat)
            error = y - y_hat
            # w, v 업데이트
            if self.l2_reg:     # regularization이 있는 경우
                self.w[x_idx] += error * self.alpha * (x_0 - self.beta * self.w[x_idx])
                self.v[x_idx] += error * self.alpha * ((x_1) * sum(vx) - (vx * x_1) - self.beta * self.v[x_idx])
            else:               # regularization이 없는 경우
                self.w[x_idx] += error * self.alpha * x_0
                self.v[x_idx] += error * self.alpha * ((x_1) * sum(vx) - (vx * x_1))
        return RMSE(y_data, y_pred)
            
    def test_rmse(self, x_data, y_data):
        y_pred = []
        for data , y in zip(x_data, y_data):
            y_hat = self.predict(data[0], data[1])
            y_pred.append(y_hat)
        return np.array(y_pred), RMSE(y_data, y_pred)

    def predict(self, idx, x):
        # idx = self.user_id_index[idx]
        # x = self.item_id_index[x]
        x_0 = np.array(x)
        x_1 = x_0.reshape(-1, 1)

        # biases
        bias_score = np.sum(self.w[idx] * x_0)

        # score 계산
        vx = self.v[idx] * (x_1)
        sum_vx = np.sum(vx, axis=0)
        sum_vx_2 = np.sum(vx * vx, axis=0)
        latent_score = 0.5 * np.sum(np.square(sum_vx) - sum_vx_2)

        # 예측값 계산
        y_hat = bias_score + latent_score
        return y_hat

    def predict_one(self, user_id, item_id):
        x_idx = np.array([user_dict[user_id], item_dict[item_id]])
        x_data = np.array([1, 1])
        return self.predict(x_idx, x_data) + w0
# FM
K = 260
fm1 = FM(num_x, K, train_data, test_data, train_y, test_y, alpha=0.0001, beta=0.007, iterations=900, tolerance=0.0005)
training_process, result0, rmse = fm1.test()  
# print('FM RMSE:' , rmse)

Iteration: 10 ; Train RMSE = 0.962119 ; Test RMSE = 0.963537
Iteration: 20 ; Train RMSE = 0.946699 ; Test RMSE = 0.952122
Iteration: 30 ; Train RMSE = 0.937488 ; Test RMSE = 0.946686
Iteration: 40 ; Train RMSE = 0.926146 ; Test RMSE = 0.940024
Iteration: 50 ; Train RMSE = 0.910598 ; Test RMSE = 0.931214
Iteration: 60 ; Train RMSE = 0.890165 ; Test RMSE = 0.920540
Iteration: 70 ; Train RMSE = 0.866397 ; Test RMSE = 0.909659
Iteration: 80 ; Train RMSE = 0.842373 ; Test RMSE = 0.900701
Iteration: 90 ; Train RMSE = 0.820964 ; Test RMSE = 0.895040
Iteration: 100 ; Train RMSE = 0.803398 ; Test RMSE = 0.892708
Iteration: 110 ; Train RMSE = 0.789166 ; Test RMSE = 0.892799
103 0.8925152955860238


In [74]:
# 두번째 모델 Best-seller 코드 
def bestseller(user_id, item_id):
    train_mean = ratings_train.groupby(['item_id'])['rating'].mean()
    try:
        rating = train_mean[item_id]
    except:
        rating = 4.4
    return rating

### 상황변수 고려한 추천

In [75]:
# Context-aware recommendation ###########################################################################
def recommender0(recomm_list, mf):
    recommendations = []
    for i in range(len(recomm_list)):
        recommendations.append(fm1.predict_one(recomm_list[i,1], recomm_list[i,0]))
    return np.array(recommendations)

def recommender1(recomm_list): # bs
    id_pairs = zip(recomm_list[:, 1], recomm_list[:, 0])
    recommendations = np.array([bestseller(user, item) for (user, item) in id_pairs])
    return recommendations

# RMSE 계산을 위한 함수
def RMSE2(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

recomm_list = np.array(ratings_train.iloc[:, [0, 1]])       # Data for training context-DL model
train0 = recommender0(recomm_list, fm1)
train1 = recommender1(recomm_list)

recomm_list = np.array(ratings_test.iloc[:, [0, 1]])        # Data for testing context-DL model
test0 = recommender0(recomm_list, fm1)
test1 = recommender1(recomm_list)


In [76]:
# Context variable을 사용한 추천엔진 결합 ##################################################################

# Defining RMSE measure
def RMSE(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

recomm0 = Input(shape=(1,))                                           # User input
recomm1 = Input(shape=(1,))                                           # Item input
context = Input(shape=(19,))
r0_layer = Dense(19)(recomm0)                                          # Recommender 1
r0_layer = Activation('linear')(r0_layer)
r0_layer = Flatten()(r0_layer)
r1_layer = Dense(19)(recomm1)                                          # Recommender 2
r1_layer = Activation('linear')(r1_layer)
r1_layer = Flatten()(r1_layer)
context_layer = Embedding(19,1)(context)
context_layer = Dense(1)(context_layer)                               # Context variables
context_layer = Activation('softmax')(context_layer)
context_layer = Flatten()(context_layer)

R = Concatenate()([r0_layer, r1_layer])
interaction0_layer = Multiply()([r0_layer, context_layer])
interaction1_layer = Multiply()([r1_layer, context_layer])

# Neural network
R = Dense(1024)(R)
R = Activation('linear')(R)
#R = Dropout(0.001)(R)

R = Concatenate()([R, context_layer, interaction0_layer, interaction1_layer])

# Adding more layers
R = Dense(512)(R)
R = Activation('tanh')(R)
R = Dropout(0.02)(R)

R = Dense(256)(R)
R = Activation('tanh')(R)
R = Dropout(0.02)(R)

R = Dense(1)(R)

model = Model(inputs=[recomm0, recomm1, context], outputs=R)
model.compile(
  loss=RMSE,
  #optimizer=SGD(lr=0.001, momentum=0.95),
  optimizer=Adamax(lr=0.0000002),
  metrics=['mean_squared_error', RMSE]
)

from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 10, mode = 'auto')

result = model.fit(
  x=[train0, train1, context_train],
  y=ratings_train.rating.values.astype(np.float64),
  epochs=100,
  batch_size=32,
  callbacks = [early_stopping],
  validation_data=(
    [test0, test1, context_test],
    ratings_test.rating.values.astype(np.float64)
  ),
)


2022-12-04 19:26:00.384870: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-12-04 19:26:00.384897: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-12-04 19:26:00.385312: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super().__init__(nam

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100


In [77]:
# Prediction
predictions = np.ravel(model.predict([test0, test1, context_test]), order='C')
predictions[predictions > 5] = 5
predictions[predictions < 1] = 1
print(RMSE2(np.array(ratings_test['rating']), predictions))   # 상황변수
print(RMSE2(np.array(ratings_test['rating']), test0))         # fm
print(RMSE2(np.array(ratings_test['rating']), test1))         # bs


0.9450920364758103
0.9739438535196879
0.9552511026108002


# rmse 3번

In [1]:
context1 = 0.9450920364758103
context2 = 0.948125174919919
context3 = 0.9426092006574679

print((context1+context2+context3) / 3)

0.9452754706843991
