In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

#surprise
from surprise import SVD, accuracy
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV

In [2]:
data_path = 'storage/movie_data/interaction_data.csv'
_num_recommendations=10
_viewer_threshold=5
item_id_var = 'ITEM_ID'
user_id_var = 'USER_ID'
event = 'EVENT_TYPE'
timestamp = 'TIMESTAMP'

In [3]:
data = pd.read_csv(data_path)
data[item_id_var] = data[item_id_var].astype(str)
data[user_id_var] = data[user_id_var].astype(str)

#interaction 데이터셋 생성
interaction_df=data[[item_id_var, user_id_var, event, timestamp]]
if data[event].dtypes != float:
    interaction_df.loc[:, event] = 1

#user 데이터셋 생성
user_df = data[[user_id_var]].drop_duplicates(user_id_var).reset_index(drop=True)

#item 데이터셋 생성
item_df = data[[item_id_var]].drop_duplicates(item_id_var).reset_index(drop=True)

In [60]:
interaction_df.dtypes

ITEM_ID         object
USER_ID         object
EVENT_TYPE     float64
TIMESTAMP       object
predictions    float64
dtype: object

In [4]:
reader = Reader(rating_scale = (0.5, 5))
df = Dataset.load_from_df(data[[user_id_var, item_id_var, event]], 
                          reader = reader)
train, test = train_test_split(df, test_size = 0.25, random_state=42)

In [5]:
algo = SVD()
algo.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x28d60abb208>

In [6]:
predictions = algo.test(test)
accuracy.rmse(predictions)

RMSE: 0.8543


0.8543272521921764

In [None]:
# cross_validate(algo, df, measures=['RMSE', 'MAE'], cv = 5, verbose=True)
# from surprise.model_selection import cross_validate
# from surprise.model_selection import GridSearchCV

# param_grid = {'n_epochs' : [20, 40], 'n_factors': [50, 100, 200]}
# grid = GridSearchCV(SVD, param_grid = param_grid, measures = ['rmse', 'mae'], cv=3)
# grid.fit(df)
# print(grid.best_score['rmse'])
# print(grid.best_params['rmse'])

In [7]:
def get_predictions(x):
    result = algo.predict(x[user_id_var], x[item_id_var]).est
    return result

In [8]:
interaction_df['predictions'] = interaction_df.apply(get_predictions, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [9]:
interaction_df

Unnamed: 0,ITEM_ID,USER_ID,EVENT_TYPE,TIMESTAMP,predictions
0,307,1,3.5,2009-10-27 21:00:21,3.945435
1,481,1,3.5,2009-10-27 21:04:16,3.108798
2,1091,1,1.5,2009-10-27 21:04:31,2.377546
3,1257,1,4.5,2009-10-27 21:04:20,3.876572
4,1449,1,4.5,2009-10-27 21:01:04,3.968619
...,...,...,...,...,...
994877,183843,10000,4.0,2018-09-23 10:59:17,3.380715
994878,183869,10000,4.0,2018-08-25 22:48:12,3.204030
994879,184721,10000,4.0,2018-08-10 20:47:47,3.180112
994880,185029,10000,2.5,2018-08-10 17:34:01,3.365124


In [89]:
preds_df = pd.DataFrame(columns = [item_id_var, user_id_var])
for iid, uid in zip(item_df[item_id_var].unique(), user_df[user_id_var].unique()):
    print(iid, uid)
    preds_df.loc[:, item_id_var] = iid
    preds_df.loc[:, user_id_var] = uid
    break

307 1


In [None]:
pred_list_df.head()

In [128]:
#아이템별 precision@k와 recall@k의 값을 알려줌
def precision_recall(preds_df, item_df, interaction_df, num, user_id):

    def intersect(a, b):
        return list(set(a) & set(b))
    
    
    #유저 아이디 df
    user_id_df = interaction_df[interaction_df[user_id_var]==user_id]
    
    #유저 아이디가 본 아이텤 df
    user_item_df = item_df[item_df[item_id_var].isin(user_id_df[item_id_var].tolist())]
    user_item_list = user_item_df[item_id_var].tolist()
    
    #추천 아이템 추출
    sorted_user_predictions = preds_df.sort_values('predictions',ascending=False)
    #sorted_user_predictions.columns = [item_id_var, 'predictions']
    
    recommend_total_list_df = sorted_user_predictions.head(num)
    recommend_total_list = recommend_total_list_df[item_id_var].tolist()
    
    #순수 추천 아이템 리스트
    recommend_df = item_df[~item_df[item_id_var].isin(user_item_list)]
    recommend_df = recommend_df.merge(sorted_user_predictions, how='inner', on=item_id_var).sort_values('predictions', ascending=False).head(num)
    
   
    
    #지표 산정
    precision = len(intersect(user_item_list, recommend_total_list))/ len(recommend_total_list)
    recall = len(intersect(user_item_list, recommend_total_list)) / len(user_item_list)
    
    return precision, recall, user_item_df, recommend_df


def perf_metric(user_df, item_df, interaction_df, num):
    
    user_item_dfs = pd.DataFrame()
    recommend_dfs = pd.DataFrame()
    dict_list = []
    for USER_ID in tqdm.tqdm(user_df[user_id_var].values):
    #for USER_ID in tqdm.tqdm(['1','2','3','4']):
        preds_df = item_df.copy()
        preds_df.loc[:, user_id_var] = USER_ID
        preds_df.loc[:, 'predictions'] = preds_df.apply(get_predictions, axis=1)
        #preds_df = interaction_df[interaction_df[user_id_var]==USER_ID][[user_id_var, item_id_var, 'predictions']]
        precision, recall, user_item_df, recommend_df = precision_recall(preds_df, item_df, interaction_df, num, USER_ID)
        dict_ = {}
        dict_.update({"userID" : USER_ID})
        dict_.update({f"precision@{num}" : precision})
        dict_.update({f"recall@{num}" : recall})
        dict_list.append(dict_)
        
        #이미 user가 선정했던 item 확인
        user_item_df.loc[:, user_id_var] = USER_ID
        user_item_dfs= pd.concat([user_item_dfs, user_item_df], axis=0)
    
    
        #상위 추천 item 중 이미 user가 선정했던 item 제외 후 추천 목록 추출
        recommend_df.loc[:, user_id_var] = USER_ID
        recommend_dfs = pd.concat([recommend_dfs, recommend_df], axis=0)
    accuracy_df = pd.DataFrame(dict_list)
    return accuracy_df, user_item_dfs, recommend_dfs

In [129]:
accuracy_df, user_item_df, recommendations_df = perf_metric(user_df, item_df, interaction_df, num = 10) 

  0%|                                                                              | 5/10000 [00:09<5:14:35,  1.89s/it]


KeyboardInterrupt: 

In [123]:
accuracy_df

Unnamed: 0,userID,precision@10,recall@10
0,1,0.0,0.0
1,2,0.0,0.0
2,3,0.0,0.0
3,4,1.0,0.013587


In [78]:
preds_df = interaction_df[interaction_df[user_id_var]=='1'][[user_id_var, item_id_var, 'predictions']]
sorted_user_predictions = preds_df.sort_values('predictions',ascending=False)
recommend_df = item_df[~item_df[item_id_var].isin(['307', '481', '1091', '1257', '1449', '1590', '1591', '2134', '2478', '2840', '2986', '3020', '3424', '3698', '3826', '3893'])]
#recommend_df = pd.merge(recommend_df, sorted_user_predictions, how='inner', on=item_id_var)

In [101]:
interaction_df[interaction_df[user_id_var]=='1']

Unnamed: 0,ITEM_ID,USER_ID,EVENT_TYPE,TIMESTAMP,predictions
0,307,1,3.5,2009-10-27 21:00:21,3.945435
1,481,1,3.5,2009-10-27 21:04:16,3.108798
2,1091,1,1.5,2009-10-27 21:04:31,2.377546
3,1257,1,4.5,2009-10-27 21:04:20,3.876572
4,1449,1,4.5,2009-10-27 21:01:04,3.968619
5,1590,1,2.5,2009-10-27 21:00:36,3.042392
6,1591,1,1.5,2009-10-27 21:04:35,2.430086
7,2134,1,4.5,2009-10-27 21:04:24,3.633599
8,2478,1,4.0,2009-10-27 21:00:39,2.96921
9,2840,1,3.0,2009-10-27 21:05:00,3.104363


In [110]:
len(item_df)

22011

In [17]:
#아이템별 precision@k와 recall@k의 값을 알려줌
def precision_recall(preds_df, item_df, interaction_df, num, user_id):

    def intersect(a, b):
        return list(set(a) & set(b))
    
    
    #유저 아이디 df
    user_id_df = interaction_df[interaction_df[user_id_var]==user_id]
    
    #유저 아이디가 본 아이텤 df
    user_item_df = item_df[item_df[item_id_var].isin(user_id_df[item_id_var].tolist())]
    user_item_list = user_item_df[item_id_var].tolist()
    
    #추천 아이템 추출
    sorted_user_predictions = pd.DataFrame(preds_df.loc[user_id, :].sort_values(ascending=False).reset_index())
    sorted_user_predictions.columns = [item_id_var, 'Predictions']
    
    recommend_total_list_df = sorted_user_predictions.head(10)
    recommend_total_list = recommend_total_list_df[item_id_var].tolist()
    
    #순수 추천 아이템 리스트

    recommend_df = item_df[~item_df[item_id_var].isin(user_item_df[item_id_var])].merge(sorted_user_predictions, how='inner',on=item_id_var).sort_values('Predictions', ascending=False).head(10)
    
    
    #지표 산정
    precision = len(intersect(user_item_list, recommend_total_list))/ len(recommend_total_list)
    recall = len(intersect(user_item_list, recommend_total_list)) / len(user_item_list)
    
    return precision, recall, user_item_df, recommend_df

In [None]:
import tqdm
def perf_metric(interaction_df, item_df, num_recommendations):
    
    user_item_df = pd.DataFrame()
    recommendations_df = pd.DataFrame()
    dict_list = []
    
    for USER_ID in tqdm.tqdm(interaction_df[user_id_var].unique()):
    #for USER_ID in tqdm.tqdm(['1','2','3']):
#         predictions = [[algo.predict(USER_ID, ITEM_ID).uid, algo.predict(USER_ID, ITEM_ID).iid, algo.predict(USER_ID, ITEM_ID).est] 
#                         for ITEM_ID in item_df[item_id_var].values]
#         preds_df = pd.DataFrame(predictions, columns = [user_id_var, item_id_var, 'predictions'])
        
        
        preds_df = interaction_df[interaction_df[user_id_var]==USER_ID][[user_id_var, item_id_var, 'predictions']]
        print(preds_df)                          
        #preision_recall
        precision, recall, user_item_list, recommended_list = precision_recall(preds_df, interaction_df, item_df, num_recommendations, user_id = USER_ID)
        dict_ = {}
        dict_.update({"userID" : USER_ID})
        dict_.update({f"precision@{num_recommendations}" : precision})
        dict_.update({f"recall@{num_recommendations}" : recall})
        dict_list.append(dict_)
        
        #user_item_df 만들기
        user_item = item_df[item_df[item_id_var].isin(user_item_list)]
        user_item.loc[: , user_id_var] = USER_ID
        user_item_df = pd.concat([user_item_df, user_item], axis=0)
        print(USER_ID)
        print(type(USER_ID))
        #recommendations_df 만들기
        recommendations = item_df[item_df[item_id_var].isin(recommended_list)]
        print(recommendations)
        recommendations.loc[:, user_id_var] = USER_ID
        recommendations_df = pd.concat([recommendations_df, recommendations], axis =0)
    
    accuracy_df = pd.DataFrame(dict_list)
    
    return accuracy_df, user_item_df, recommendations_df

In [None]:
accuracy_df

In [None]:
print('TASK : 사용자별 상품 추천')
print('분석 알고리즘 : 상품-사용자 연계 추천')
print(f'사용자 변수: {user_id_var} ({interaction_df[user_id_var].nunique()})')
print(f'사용자 변수: {item_id_var} ({interaction_df[item_id_var].nunique()})')
print(f'모델 훈련 개수 : {len(interaction_df)}개')
print(f'모델 검증 개수 : {int(len(interaction_df)*0.1)}개') #본사업에서 따로 구현 X
print('검증 기준 : 모든 사용자의 최근 사용(Timestamp) 내역')
print('성능 지표 : Precision@K, Recall@K')
print(f"Precision@K : {np.round(np.mean(accuracy_df['precision@10']),3)}")
print(f"Recall@K    : {np.round(np.mean(accuracy_df['recall@10']),3)}")

In [72]:
USER_ID = 1
def make_recommendations(preds_df, interaction_df, item_df, USER_ID):
    #USER_ID 별 상위 ITEM 추천 목록 생성
    sorted_predictions = preds_df.sort_values('predictions', ascending=False)
    sorted_predictions = pd.DataFrame(sorted_predictions.reset_index())
    sorted_predictions.columns = ['ITEM_ID', 'Predictions']

    #이미 USER_ID가 선정했던 ITEM 확인
    user_data = interaction_df[interaction_df[user_id_var]==USER_ID]
    user_item = item_df[item_df[item_id_var].isin(user_data[item_id_var].tolist())]
    #print('User {0} has already viewed or buyed {1} items'.format(USER_ID, user_item.shape[0]))

    #상위 추천 ITEM중 이미 USER_ID가 선정했던 ITEM 제외 후 추천 목록 추출
    recommended_items = sorted_predictions.index.tolist()
    recommendations = item_df[~item_df[item_id_var].isin(user_item[item_id_var])].merge(sorted_predictions, how='inner',on=item_id_var)
    recommendations = recommendations.sort_values('Predictions', ascending=False).head(_num_recommendations)

    return user_item.reset_index(drop=True), recommendations.reset_index(drop=True)

user_item, recommendations = make_recommendations(preds_df, interaction_df, item_df, USER_ID)

ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat