In [18]:
import numpy as np
import pandas as pd
import os

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

#surprise
from surprise import SVD, accuracy
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV

In [13]:
data_path = 'movie_data/interaction_data.csv'
_num_recommendations=10
_viewer_threshold=5
item_id_var = 'ITEM_ID'
user_id_var = 'USER_ID'
event = 'EVENT_TYPE'
timestamp = 'TIMESTAMP'

In [73]:
data = pd.read_csv(data_path)
data[item_id_var] = data[item_id_var].astype(str)
data[user_id_var] = data[user_id_var].astype(str)

#interaction 데이터셋 생성
interaction_df=data[[item_id_var, user_id_var, event, timestamp]]
if data[event].dtypes != float:
    interaction_df.loc[:, event] = 1



#user 데이터셋 생성
user_df = data[[user_id_var]].drop_duplicates(user_id_var).reset_index(drop=True)

#item 데이터셋 생성
item_df = data[[item_id_var]].drop_duplicates(item_id_var).reset_index(drop=True)

In [74]:
reader = Reader(rating_scale = (0.5, 5))
df = Dataset.load_from_df(data[[user_id_var, item_id_var, event]], 
                          reader = reader)
train, test = train_test_split(df, test_size = 0.25, random_state=42)

In [75]:
algo = SVD()
algo.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1d31e46f348>

In [76]:
predictions = algo.test(test)
accuracy.rmse(predictions)

RMSE: 0.8549


0.854936306904799

In [None]:
# cross_validate(algo, df, measures=['RMSE', 'MAE'], cv = 5, verbose=True)
# from surprise.model_selection import cross_validate
# from surprise.model_selection import GridSearchCV

# param_grid = {'n_epochs' : [20, 40], 'n_factors': [50, 100, 200]}
# grid = GridSearchCV(SVD, param_grid = param_grid, measures = ['rmse', 'mae'], cv=3)
# grid.fit(df)
# print(grid.best_score['rmse'])
# print(grid.best_params['rmse'])

In [77]:
USER_ID = '11'
predictions = [[algo.predict(USER_ID, ITEM_ID).uid, algo.predict(USER_ID, ITEM_ID).iid, algo.predict(USER_ID, ITEM_ID).est] 
               for ITEM_ID in item_df[item_id_var].values]
preds_df = pd.DataFrame(predictions, columns = [user_id_var, item_id_var, 'predictions'])

In [93]:
def intersect(a, b):
    return list(set(a) & set(b))

#USER_ID별 precision, recall 구하기
def precision_recall(preds_df, interaction_df, item_df, USER_ID, num_recommendations):
    predictions = [[algo.predict(USER_ID, ITEM_ID).uid, algo.predict(USER_ID, ITEM_ID).iid, algo.predict(USER_ID, ITEM_ID).est] 
               for ITEM_ID in item_df[item_id_var].values]
    preds_df = pd.DataFrame(predictions, columns = [user_id_var, item_id_var, 'predictions'])
    
    #상위 예측치 데이터프레임 추출
    sorted_predictions = preds_df.sort_values('predictions', ascending=False).head(num_recommendations)
    
    #유저 데이터 추출
    user_data = interaction_df[interaction_df[user_id_var]==USER_ID]
    
    #유저 데이터에서 유저가 본 아이템 추출
    user_item_list = user_data[item_id_var].tolist()

    #추천 아이템 추출
    recommended_total_list = sorted_predictions[item_id_var].tolist()
    
    #순수추천 아이템 추출
    recommended_list = list(set(recommended_total_list) - set(user_item_list))

    precision = len(intersect(user_item_list, recommended_item_list))/ len(recommended_item_list)
    recall = len(intersect(user_item_list, recommended_item_list)) / len(user_item_list)
    
    return precision, recall, user_item_list, recommended_list

In [94]:
precision, recall, user_item_list, recommended_list = precision_recall(preds_df, item_df, interaction_df, USER_ID, num_recommendations=10)

In [86]:
def perf_metric(preds_df, interaction_df, item_df, user_df,  num_recommendations):
    
    user_item_df = pd.DataFrame()
    recommendations_df = pd.DataFrame()
    dict_list = []
    for user_id in tqdm.tqdm(interaction_df[user_id_var].unique()):
        precision, recall, user_item_list, recommended_list = precision_recall(preds_df, interaction_df, item_df
                                                                               USER_ID = user_id, num_recommendations)
        dict_ = {}
        dict_.update({"userID" : user_id})
        dict_.update({f"precision@{num_recommendations}" : precision})
        dict_.update({f"recall@{num_recommendations}" : recall})
        dict_list.append(dict_)
    accuracy_df = pd.DataFrame(dict_list)
    return accuracy_df

In [87]:
import tqdm

In [88]:
accuracy_df = perf_metric(preds_df, item_df, interaction_df, num_recommendations=_num_recommendations)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [16:57<00:00,  9.83it/s]


In [None]:
accuracy_df

In [None]:
print('TASK : 사용자별 상품 추천')
print('분석 알고리즘 : 상품-사용자 연계 추천')
print(f'사용자 변수: {user_id_var} ({interaction_df[user_id_var].nunique()})')
print(f'사용자 변수: {item_id_var} ({interaction_df[item_id_var].nunique()})')
print(f'모델 훈련 개수 : {len(interaction_df)}개')
print(f'모델 검증 개수 : {int(len(interaction_df)*0.1)}개') #본사업에서 따로 구현 X
print('검증 기준 : 모든 사용자의 최근 사용(Timestamp) 내역')
print('성능 지표 : Precision@K, Recall@K')
print(f"Precision@K : {np.round(np.mean(accuracy_df['precision@10']),3)}")
print(f"Recall@K    : {np.round(np.mean(accuracy_df['recall@10']),3)}")

In [72]:
USER_ID = 1
def make_recommendations(preds_df, interaction_df, item_df, USER_ID):
    #USER_ID 별 상위 ITEM 추천 목록 생성
    sorted_predictions = preds_df.sort_values('predictions', ascending=False)
    sorted_predictions = pd.DataFrame(sorted_predictions.reset_index())
    sorted_predictions.columns = ['ITEM_ID', 'Predictions']

    #이미 USER_ID가 선정했던 ITEM 확인
    user_data = interaction_df[interaction_df[user_id_var]==USER_ID]
    user_item = item_df[item_df[item_id_var].isin(user_data[item_id_var].tolist())]
    #print('User {0} has already viewed or buyed {1} items'.format(USER_ID, user_item.shape[0]))

    #상위 추천 ITEM중 이미 USER_ID가 선정했던 ITEM 제외 후 추천 목록 추출
    recommended_items = sorted_predictions.index.tolist()
    recommendations = item_df[~item_df[item_id_var].isin(user_item[item_id_var])].merge(sorted_predictions, how='inner',on=item_id_var)
    recommendations = recommendations.sort_values('Predictions', ascending=False).head(_num_recommendations)

    return user_item.reset_index(drop=True), recommendations.reset_index(drop=True)

user_item, recommendations = make_recommendations(preds_df, interaction_df, item_df, USER_ID)

ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat