In [17]:
import numpy as np
import pandas as pd
import os
import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

#surprise
from surprise import SVD, accuracy, SVDpp, KNNWithMeans, BaselineOnly
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV

In [2]:
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
#파라미터 설정
data_path = 'storage/shop_data/interaction_data.csv'
_num_recommendations=10
item_id_var = 'ITEM_ID'
user_id_var = 'USER_ID'
event = 'EVENT_TYPE'
timestamp = 'TIMESTAMP'
user_var = ['USER_NAME','AGE','GENDER']
item_var = ['ITEM_NAME','CATEGORY_L1','STYLE','ITEM_DESCRIPTION','PRICE']

In [4]:
# data_path = 'storage/movie_data/interaction_data.csv'
# _num_recommendations=10
# item_id_var = 'ITEM_ID'
# user_id_var = 'USER_ID'
# event = 'EVENT_TYPE'
# timestamp = 'TIMESTAMP'
# user_var = []
# item_var = ['ITEM_NAME', 'CATEGORY_L1']

In [5]:
data = pd.read_csv(data_path)
data[item_id_var] = data[item_id_var].astype(str)
data[user_id_var] = data[user_id_var].astype(str)

#user 데이터셋 생성
user_df = data[[user_id_var]+user_var].drop_duplicates(user_id_var).reset_index(drop=True)

#item 데이터셋 생성
item_df = data[[item_id_var]+item_var].drop_duplicates(item_id_var).reset_index(drop=True)

#interaction 데이터셋 생성
interaction_df=data[[item_id_var, user_id_var, event, timestamp]]
if data[event].dtypes != float:
    interaction_df.loc[:, event] = 5

rating_min = 0.5
rating_max = np.max(interaction_df[event].values)

In [6]:
rating_max

5

In [19]:
reader = Reader(rating_scale = (0, rating_max))
df = Dataset.load_from_df(interaction_df[[user_id_var, item_id_var, event]], 
                          reader = reader)
train, test = train_test_split(df, test_size = 0.2, random_state=42)

In [20]:
# # 최적화할 파라미터들을 딕셔너리 형태로 지정.
# param_grid = {'n_epochs': [20, 40, 60], 'n_factors': [50, 100, 200] }
 
# # CV를 3개 폴드 세트로 지정, 성능 평가는 rmse, mse 로 수행 하도록 GridSearchCV 구성
# gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
# gs.fit(df)
 
# # 최고 RMSE Evaluation 점수와 그때의 하이퍼 파라미터
# print(gs.best_score['rmse'])
# print(gs.best_params['rmse'])

In [22]:
algorithms = [BaselineOnly(), KNNWithMeans(), SVD(), SVDpp()]
results = []
for algo in algorithms:

    algo.fit(train)
    predictions = algo.test(test)
    results.append(accuracy.rmse(predictions))

Estimating biases using als...
RMSE: 0.0000
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.0000
RMSE: 0.0160
RMSE: 0.0089


In [23]:
results

[0.0, 0.0, 0.015997136286369444, 0.00887851542644517]

In [9]:
predictions = algo.test(test)
accuracy.rmse(predictions)

RMSE: 0.0160


0.01597006220637567

In [11]:
def get_predictions(x):
    result = algo.predict(x[user_id_var], x[item_id_var]).est
    return result

def intersect(a, b):
    return list(set(a) & set(b))
    
#아이템별 precision@k와 recall@k의 값을 알려줌
def precision_recall(preds_df, item_df, interaction_df, num, user_id):
    
    #유저 아이디 df
    user_id_df = interaction_df[interaction_df[user_id_var]==user_id]
    
    #유저 아이디가 본 아이텤 df
    user_item_df = item_df[item_df[item_id_var].isin(user_id_df[item_id_var].tolist())]
    user_item_list = user_item_df[item_id_var].tolist()
    
    #총 추천 아이템 추출
    sorted_user_predictions = preds_df.sort_values('predictions',ascending=False)
    
    recommend_total_list_df = sorted_user_predictions.head(num)
    recommend_total_list = recommend_total_list_df[item_id_var].tolist()
    
    #순수 추천 아이템 리스트
    recommend_df_list = sorted_user_predictions[~sorted_user_predictions[item_id_var].isin(user_item_list)].head(num)[item_id_var].values.tolist()
    recommend_df = item_df[item_df[item_id_var].isin(recommend_df_list)]
    
    #지표 산정
    precision = len(intersect(user_item_list, recommend_total_list))/ len(recommend_total_list)
    recall = len(intersect(user_item_list, recommend_total_list)) / len(user_item_list)
    
    return precision, recall, user_item_df, recommend_df


def perf_metric(user_df, item_df, interaction_df, num):
    
    user_item_dfs = pd.DataFrame()
    recommend_dfs = pd.DataFrame()
    dict_list = []
    for USER_ID in tqdm.tqdm(user_df[user_id_var].values):
    #for USER_ID in tqdm.tqdm(['1']): #,'2','3','4']):
        preds_df = item_df.copy()
        preds_df.loc[:, user_id_var] = USER_ID
        #preds_df 계산
        preds_df.loc[:, 'predictions'] = preds_df.apply(get_predictions, axis=1)
        precision, recall, user_item_df, recommend_df = precision_recall(preds_df, item_df, interaction_df, num, USER_ID)
        dict_ = {}
        dict_.update({"userID" : USER_ID})
        dict_.update({f"precision@{num}" : precision})
        dict_.update({f"recall@{num}" : recall})
        dict_list.append(dict_)
        
        #이미 user가 선정했던 item 확인
        user_item_df = user_item_df.copy()
        user_item_df.loc[:, user_id_var] = USER_ID
        user_item_dfs= pd.concat([user_item_dfs, user_item_df], axis=0)
    
    
        #상위 추천 item 중 이미 user가 선정했던 item 제외 후 추천 목록 추출
        recommend_df = recommend_df.copy()
        recommend_df.loc[:, user_id_var] = USER_ID
        recommend_dfs = pd.concat([recommend_dfs, recommend_df], axis=0)
    accuracy_df = pd.DataFrame(dict_list)
    return accuracy_df, user_item_dfs, recommend_dfs
accuracy_df, user_item_df, recommendations_df = perf_metric(user_df, item_df, interaction_df, num = 10) 

100%|██████████████████████████████████████████████████████████████████████████████| 5250/5250 [05:54<00:00, 14.82it/s]


In [12]:
print(user_item_df.shape)
print(recommendations_df.shape)

(67343, 7)
(52500, 7)


In [13]:
print('TASK : 사용자별 상품 추천')
print('분석 알고리즘 : 상품-사용자 연계 추천')
print(f'사용자 변수: {user_id_var} ({interaction_df[user_id_var].nunique()})')
print(f'사용자 변수: {item_id_var} ({interaction_df[item_id_var].nunique()})')
print(f'모델 훈련 개수 : {len(interaction_df)}개')
print(f'모델 검증 개수 : {int(len(interaction_df)*0.1)}개') #본사업에서 따로 구현 X
print('검증 기준 : 모든 사용자의 최근 사용(Timestamp) 내역')
print('성능 지표 : Precision@K, Recall@K')
print(f"Precision@K : {np.round(np.mean(accuracy_df['precision@10']),3)}")
print(f"Recall@K    : {np.round(np.mean(accuracy_df['recall@10']),3)}")

TASK : 사용자별 상품 추천
분석 알고리즘 : 상품-사용자 연계 추천
사용자 변수: USER_ID (5250)
사용자 변수: ITEM_ID (2449)
모델 훈련 개수 : 587719개
모델 검증 개수 : 58771개
검증 기준 : 모든 사용자의 최근 사용(Timestamp) 내역
성능 지표 : Precision@K, Recall@K
Precision@K : 0.006
Recall@K    : 0.005
