## CF - KNN

In [41]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings(action='ignore')

In [42]:
# 전처리 완료된 data 불러오기 
path = './data/'
ratings_cc = pd.read_csv(os.path.join(path, 'overall_rating_completed.csv'), encoding='utf-8', index_col=0)
ratings_caddie = pd.read_csv(os.path.join(path, 'caddie_rating_completed.csv'), encoding='utf-8', index_col=0)
ratings_course = pd.read_csv(os.path.join(path, 'course_rating_completed.csv'), encoding='utf-8', index_col=0)
ratings_price = pd.read_csv(os.path.join(path, 'price_rating_completed.csv'), encoding='utf-8', index_col=0)
ratings_facility = pd.read_csv(os.path.join(path, 'facility_rating_completed.csv'), encoding='utf-8', index_col=0)

In [43]:
# 데이터 확인
ratings_cc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24767 entries, 0 to 24766
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id_num    24767 non-null  int64  
 1   cc_num    24767 non-null  int64  
 2   cc_score  24767 non-null  float64
 3   cc_name   24767 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 967.5+ KB


In [44]:
# null 체크
ratings_cc.isnull().sum()

id_num      0
cc_num      0
cc_score    0
cc_name     0
dtype: int64

In [45]:
# pivot_table 사용해서 행렬 변환(중복 평점 시 최대값 적용)
ratings_cc_matrix = ratings_cc.pivot_table('cc_score', index='id_num', columns='cc_name', aggfunc='max')
ratings_caddie_matrix = ratings_caddie.pivot_table('caddie', index='id_num', columns='cc_name', aggfunc='max')
ratings_course_matrix = ratings_course.pivot_table('course', index='id_num', columns='cc_name', aggfunc='max')
ratings_price_matrix = ratings_price.pivot_table('price', index='id_num', columns='cc_name', aggfunc='max')
ratings_facility_matrix = ratings_facility.pivot_table('facility', index='id_num', columns='cc_name', aggfunc='max')

In [46]:
# 카테고리 별 반복 작업을 위한 변수 지정 
feature_list = ['cc', 'caddie', 'course', 'price', 'facility']
matrices_list = ['ratings_'+ feature_list[i] + '_matrix' for i in range(len(feature_list))]

In [47]:
# 리스트 확인
for i in range(len(feature_list)):
    print(feature_list[i], matrices_list[i])

cc ratings_cc_matrix
caddie ratings_caddie_matrix
course ratings_course_matrix
price ratings_price_matrix
facility ratings_facility_matrix


In [48]:
# NaN 값을 모두 0 으로 변환
for i in matrices_list:
    locals()[i] = locals()[i].fillna(0)

# 하나만 확인
ratings_facility_matrix

cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# 아이템-사용자 행렬로 transpose 한다.
for i in matrices_list:
    locals()[i+'_T'] = locals()[i].T

# 하나만 확인
ratings_price_matrix_T

id_num,1,2,3,4,5,6,7,8,9,10,...,6099,6100,6101,6102,6103,6104,6105,6106,6107,6108
cc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
360도,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H1 CLUB(구 덕평),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가산노블리스,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가평베네스트,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
한양파인(P9),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
한원,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
해솔리아,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
화성(P9),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 코사인 유사도

In [50]:
# 카테고리 별 골프장들의 코사인 유사도 산출
for i in range(len(feature_list)):
    locals()['item_sim_'+feature_list[i]] = cosine_similarity(locals()[matrices_list[i]+'_T'], locals()[matrices_list[i]+'_T'])
    locals()['item_sim_'+feature_list[i]+'_df'] = pd.DataFrame(data=locals()['item_sim_'+feature_list[i]], index=locals()[matrices_list[i]].columns, columns=locals()[matrices_list[i]].columns)

# 하나만 확인
item_sim_cc_df

cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
cc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
360도,1.000000,0.129687,0.082993,0.0,0.0,0.080971,0.0,0.076143,0.090617,0.051383,...,0.120631,0.110415,0.021936,0.036654,0.019916,0.014859,0.062807,0.069845,0.000000,0.021751
88,0.129687,1.000000,0.122176,0.0,0.0,0.169984,0.0,0.045810,0.157761,0.090107,...,0.062318,0.140730,0.041519,0.158343,0.029805,0.048273,0.054948,0.031612,0.026383,0.045506
H1 CLUB(구 덕평),0.082993,0.122176,1.000000,0.0,0.0,0.105661,0.0,0.096778,0.121914,0.105270,...,0.085277,0.091027,0.054037,0.121180,0.043032,0.065602,0.054267,0.025507,0.037024,0.015205
가산노블리스,0.000000,0.000000,0.000000,1.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
가평베네스트,0.000000,0.000000,0.000000,0.0,1.0,0.000000,0.0,0.000000,0.000000,0.000000,...,0.076402,0.040577,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
한양파인(P9),0.014859,0.048273,0.065602,0.0,0.0,0.076820,0.0,0.000000,0.054888,0.032802,...,0.057544,0.046259,0.038333,0.014473,0.026326,1.000000,0.063062,0.000000,0.015925,0.041807
한원,0.062807,0.054948,0.054267,0.0,0.0,0.065480,0.0,0.000000,0.022148,0.042085,...,0.019738,0.029165,0.101461,0.058668,0.000000,0.063062,1.000000,0.043069,0.055504,0.000000
해솔리아,0.069845,0.031612,0.025507,0.0,0.0,0.040993,0.0,0.000000,0.089218,0.059541,...,0.027695,0.042083,0.044180,0.052363,0.000000,0.000000,0.043069,1.000000,0.020826,0.000000
화성(P9),0.000000,0.026383,0.037024,0.0,0.0,0.000000,0.0,0.000000,0.045073,0.033961,...,0.035454,0.053755,0.014254,0.024154,0.049247,0.015925,0.055504,0.020826,1.000000,0.000000


In [51]:
# 카테고리 별 가평베네스트와 유사한 골프장 5개 확인해보기
for i in feature_list:
    print(f'{i} 평점 기준 유사 골프장 5개')
    print(locals()['item_sim_'+i+'_df']['가평베네스트'].sort_values(ascending=False)[1:6])
    print(" ")

cc 평점 기준 유사 골프장 5개
cc_name
파인크리크      0.196455
일동레이크      0.168177
프리스틴밸리     0.085060
사우스스프링스    0.084510
자유         0.082303
Name: 가평베네스트, dtype: float64
 
caddie 평점 기준 유사 골프장 5개
cc_name
파인크리크      0.205499
일동레이크      0.194331
사우스스프링스    0.092324
크리스탈밸리     0.087407
프리스틴밸리     0.087065
Name: 가평베네스트, dtype: float64
 
course 평점 기준 유사 골프장 5개
cc_name
파인크리크      0.217597
일동레이크      0.152388
사우스스프링스    0.090699
자유         0.089586
프리스틴밸리     0.086920
Name: 가평베네스트, dtype: float64
 
price 평점 기준 유사 골프장 5개
cc_name
파인크리크     0.170716
일동레이크     0.164122
자유        0.090551
크리스탈밸리    0.086915
필로스       0.084335
Name: 가평베네스트, dtype: float64
 
facility 평점 기준 유사 골프장 5개
cc_name
파인크리크      0.173913
일동레이크      0.159490
프리스틴밸리     0.090819
포레스트힐      0.085145
사우스스프링스    0.075224
Name: 가평베네스트, dtype: float64
 


### 개인화 된 골프장 추천

In [52]:
# 평점 벡터(행 벡터)와 유사도 벡터(열 벡터)를 내적(dot)해서 예측 평점을 계산하는 함수 정의
def predict_rating(ratings_arr, item_sim_arr):
    ratings_pred = ratings_arr.dot(item_sim_arr) / np.array([np.abs(item_sim_arr).sum(axis=1)])
    return ratings_pred

In [53]:
# 카테고리 별 예측 평점 데이터프레임
for i in range(len(feature_list)):
    locals()['ratings_pred_'+feature_list[i]] = predict_rating(locals()[matrices_list[i]].to_numpy(), locals()['item_sim_'+feature_list[i]+'_df'].to_numpy())
    locals()['ratings_pred_'+feature_list[i]+'_matrix'] = pd.DataFrame(data=locals()['ratings_pred_'+feature_list[i]], index=locals()[matrices_list[i]].index, columns=locals()[matrices_list[i]].columns)

In [54]:
# 캐디 평점 예측치 데이터프레임 확인
ratings_pred_cc_matrix

cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.129753,0.155174,0.096732,0.0,0.193268,0.137109,0.000000,0.000000,0.101496,0.115541,...,0.103214,0.123184,0.083964,0.137698,0.175348,0.133044,0.087886,0.112034,0.097454,0.180592
2,0.273402,0.288975,0.249187,0.0,0.117313,0.289086,0.000000,0.177139,0.298910,0.303644,...,0.205106,0.276293,0.188097,0.302535,0.145902,0.158580,0.179162,0.302249,0.091605,0.166628
3,0.118708,0.178084,0.153067,0.0,0.000000,0.148182,0.000000,0.000000,0.202943,1.408832,...,0.091178,0.176498,0.124077,0.190736,0.180576,0.148093,0.164831,0.170387,0.100865,0.126565
4,0.157735,0.095144,0.115599,0.0,0.000000,0.089534,0.000000,0.000000,0.085504,0.094857,...,0.090852,0.103596,0.094991,0.086250,0.012522,0.086475,0.124384,0.127308,0.049122,0.000000
5,0.045210,0.053243,0.055552,0.0,0.067360,0.057547,0.000000,0.000000,0.047016,0.041025,...,0.081630,0.089102,0.043136,0.048205,0.072745,0.043132,0.032241,0.026075,0.016373,0.014688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,0.102331,0.144001,0.131626,0.0,0.000000,0.144823,0.000000,0.112589,0.943493,0.145724,...,0.078703,0.118665,0.131553,0.104202,0.126622,0.096852,0.038512,0.164615,0.083152,0.030743
6105,0.138493,0.099306,0.098242,0.0,0.000000,0.097620,0.000000,0.000000,0.113463,0.073844,...,0.121795,0.121208,0.118224,0.083922,0.026327,0.032759,0.065783,0.124298,0.097431,0.087771
6106,0.139443,0.222661,0.192504,0.0,0.210988,0.233638,0.000000,0.000000,0.179400,0.133683,...,0.142372,0.217947,0.139678,0.277398,0.093327,0.132528,0.203598,0.144194,0.152469,0.136205
6107,0.227075,0.262199,0.178744,0.0,0.461891,0.266031,0.000000,0.172572,0.205886,0.197118,...,0.238012,0.250023,0.143796,0.226249,0.019536,0.185167,0.076403,0.163506,0.076611,0.220334


In [55]:
# 사용자가 평점을 부여한 골프장에 대해서만 예측 성능 평가 MSE 를 구함. 
def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [56]:
# 카테고리 별 평점 기반 모든 인접 이웃 MSE
for i in range(len(feature_list)):
    print(f'{feature_list[i]} 평점 기반 모든 인접 이웃 MSE: ', get_mse(locals()['ratings_pred_'+feature_list[i]], locals()[matrices_list[i]].values))

cc 평점 기반 모든 인접 이웃 MSE:  52.40870864108464
caddie 평점 기반 모든 인접 이웃 MSE:  55.85508809306472
course 평점 기반 모든 인접 이웃 MSE:  53.111744878322725
price 평점 기반 모든 인접 이웃 MSE:  53.26003655418972
facility 평점 기반 모든 인접 이웃 MSE:  51.196772438677286


### top-n 유사도를 가진 데이터들에 대해서만 예측 평점 계산
- MSE 값을 감소하기 위해 특정 골프장과 가장 비슷한 유사도를 가지는 골프장에 대해서만 유사도 벡터를 적용

In [57]:
# 함수 생성
def predict_rating_topsim(ratings_arr, item_sim_arr, n=20):
    # 사용자-아이템 평점 행렬 크기만큼 0으로 채운 예측 행렬 초기화
    pred = np.zeros(ratings_arr.shape)

    # 사용자-아이템 평점 행렬의 열 크기만큼 Loop 수행. 
    for col in range(ratings_arr.shape[1]):
        # 유사도 행렬에서 유사도가 큰 순으로 n개 데이터 행렬의 index 반환
        top_n_items = [np.argsort(item_sim_arr[:, col])[:-n-1:-1]]
        # 개인화된 예측 평점을 계산
        for row in range(ratings_arr.shape[0]):
            pred[row, col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row, :][top_n_items].T) 
            pred[row, col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))        
    return pred

In [58]:
# 계산 후 예측 평점 데이터프레임 재생성
for i in range(len(feature_list)):
    locals()['ratings_pred_'+feature_list[i]] = predict_rating_topsim(locals()[matrices_list[i]].to_numpy(), locals()['item_sim_'+feature_list[i]+'_df'].to_numpy(), n=10)
    print(f'{feature_list[i]} 평점 기반 인접 TOP-10 이웃 MSE: ', get_mse(locals()['ratings_pred_'+feature_list[i]], locals()[matrices_list[i]].values))
    locals()['ratings_pred_'+feature_list[i]+'_matrix'] = pd.DataFrame(data=locals()['ratings_pred_'+feature_list[i]], index=locals()[matrices_list[i]].index, columns=locals()[matrices_list[i]].columns)

cc 평점 기반 인접 TOP-10 이웃 MSE:  19.371875247435657
caddie 평점 기반 인접 TOP-10 이웃 MSE:  20.588028134762485
course 평점 기반 인접 TOP-10 이웃 MSE:  19.610456783625402
price 평점 기반 인접 TOP-10 이웃 MSE:  19.61020075127176
facility 평점 기반 인접 TOP-10 이웃 MSE:  18.95360940815949


### 6번 사용자가 평가하지 않은(미방문) 골프장 중에서 추천해보자
- user_rating이 0보다 크면 기존에 방문한 골프장이라는 점을 이용해서 계산

In [59]:
# 추천에 앞서 6번 사용자가 높은 평점을 준 TOP-10 골프장을 확인
user_rating_id = ratings_cc_matrix.loc[6, :]
user_rating_id[ user_rating_id > 0].sort_values(ascending=False)[:10]

cc_name
포천힐스     10.0
여주신라     10.0
양지파인     10.0
솔모로      10.0
스카이밸리     9.5
필로스       9.0
아시아나      9.0
서원힐스      9.0
안성        8.5
써닝포인트     8.5
Name: 6, dtype: float64

In [60]:
# 평가하지 않은 골프장 리스트 함수 생성
def get_unseen_cc(ratings_matrix, id_num):
    # id_num으로 입력받은 사용자의 모든 골프장 추출하여 Series로 반환함. 
    # 반환된 user_rating 은 골프장명(cc_name)을 index로 가지는 Series 객체임. 
    user_rating = ratings_matrix.loc[id_num,:]
    
    # user_rating이 0보다 크면 기존에 방문한 골프장. 대상 index를 추출하여 list 객체로 만듬
    already_seen = user_rating[user_rating > 0].index.tolist()
    
    # 모든 골프장을 list 객체로 만듬.
    cc_list = ratings_matrix.columns.tolist()
    
    # list comprehension으로 already_seen에 해당하는 cc는 cc_list에서 제외함. 
    unseen_list = [cc for cc in cc_list if cc not in already_seen]
    
    return unseen_list

In [61]:
# pred_df : 앞서 계산된 골프장별 예측 평점
# unseen_list : 사용자가 방문하지 않은 골프장
# top_n : 상위 n개를 가져온다.

def recomm_cc_by_id(pred_df, id_num, unseen_list, top_n=10):
    # 예측 평점 DataFrame에서 사용자id index와 unseen_list로 들어온 골프장명 컬럼을 추출하여
    # 가장 예측 평점이 높은 순으로 정렬함. 
    recomm_cc = pred_df.loc[id_num, unseen_list].sort_values(ascending=False)[:top_n]
    return recomm_cc

In [62]:
# 6번 사용자의 미평가 골프장의 각 카테고리 별 예측 평점 상위 TOP-5
golfer_id = 6

for i in range(len(feature_list)):
    # 사용자가 방문하지 않은 골프장 추출   
    locals()['unseen_list_by_'+feature_list[i]] = get_unseen_cc(locals()[matrices_list[i]], golfer_id)

    # 아이템 기반의 인접 이웃 협업 필터링으로 골프장 추천 
    locals()['recomm_10_by_'+feature_list[i]] = recomm_cc_by_id(locals()['ratings_pred_'+feature_list[i]+'_matrix'], 2, locals()['unseen_list_by_'+feature_list[i]], top_n=5)

    # 평점 데이타를 DataFrame으로 생성. 
    locals()['recomm_10_by_'+feature_list[i]] = pd.DataFrame(data=locals()['recomm_10_by_'+feature_list[i]].values, index=locals()['recomm_10_by_'+feature_list[i]].index, columns=['pred_score'])

    print(f'{golfer_id}번 골퍼의 미방문 골프장 중 {feature_list[i]} 예측 평점 TOP-5')
    display(locals()['recomm_10_by_'+feature_list[i]])

6번 골퍼의 미방문 골프장 중 cc 예측 평점 TOP-5


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
아리지,5.045687
그린힐,4.216246
금강,2.858614
여주,1.14924
에덴블루,1.110985


6번 골퍼의 미방문 골프장 중 caddie 예측 평점 TOP-5


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
그린힐,5.489473
아리지,5.140176
금강,2.863936
여주,1.143462
리베라,1.040474


6번 골퍼의 미방문 골프장 중 course 예측 평점 TOP-5


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
아리지,5.165095
금강,3.606794
그린힐,3.596817
여주,1.289407
에덴블루,1.240755


6번 골퍼의 미방문 골프장 중 price 예측 평점 TOP-5


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
아리지,4.928836
그린힐,4.409854
금강,2.168203
여주,0.995436
에덴블루,0.977433


6번 골퍼의 미방문 골프장 중 facility 예측 평점 TOP-5


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
아리지,5.004597
그린힐,3.466923
금강,2.844712
에덴블루,1.150482
여주,1.148697


### 개인화 추천 - 계산 방식이 다른 버전

In [63]:
# cc: 137개, user: 6108개
# user id_num을 계산에 반영한다
userId_grouped = ratings_cc.groupby('id_num')
# index: id_num, columns: total cc_num
for i in feature_list:
    locals()[f'item_prediction_result_{i}'] = pd.DataFrame(index=list(userId_grouped.indices.keys()), columns=locals()[f'ratings_{i}_matrix_T'].index)

# 하나만 확인
item_prediction_result_cc

cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,,,,,,,,,,,...,,,,,,,,,,
6105,,,,,,,,,,,...,,,,,,,,,,
6106,,,,,,,,,,,...,,,,,,,,,,
6107,,,,,,,,,,,...,,,,,,,,,,


In [64]:
# 다른 방식의 계산 - matmul
for userId, group in tqdm(userId_grouped):
    for i in feature_list:
        # user가 rating한 cc_name * 전체 cc_name
        locals()[f'user_sim_{i}'] = locals()[f'item_sim_{i}_df'].loc[group['cc_name']]
        # user가 rating한 cc_name * 1
        locals()[f'user_rating_{i}'] = group['cc_score']
        # 전체 cc_name * 1
        locals()[f'sim_sum_{i}'] = locals()[f'user_sim_{i}'].sum(axis=0)

        # userId의 전체 rating predictions (6108 * 1)
        locals()[f'pred_ratings_{i}'] = np.matmul(locals()[f'user_sim_{i}'].T.to_numpy(), locals()[f'user_rating_{i}']) / (locals()[f'sim_sum_{i}']+1)
        locals()[f'item_prediction_result_{i}'].loc[userId] = locals()[f'pred_ratings_{i}']

  0%|          | 0/6108 [00:00<?, ?it/s]

In [65]:
# 카테고리 별 평점 기반 모든 인접 이웃 MSE
for i in feature_list:
    print(f'{i} 평점 기반 모든 인접 이웃 MSE: ', get_mse(locals()[f'item_prediction_result_{i}'].values, locals()[f'ratings_{i}_matrix'].values))

cc 평점 기반 모든 인접 이웃 MSE:  11.148817119642583
caddie 평점 기반 모든 인접 이웃 MSE:  13.967683973480248
course 평점 기반 모든 인접 이웃 MSE:  12.27546976594942
price 평점 기반 모든 인접 이웃 MSE:  12.38460320949954
facility 평점 기반 모든 인접 이웃 MSE:  11.20463520617389


In [67]:
# TOP-5 유사도 가진 데이터들에 대한 예측 평점 계산 및 데이터프레임 재생성
for i in feature_list:
    locals()[f'item_rating_prediction_by_{i}'] = predict_rating_topsim(locals()[f'ratings_{i}_matrix'].values, locals()[f'item_sim_{i}_df'].values, n=5)
    print(f'{i} 평점 기반 인접 TOP-5 이웃 MSE: ', get_mse(locals()[f'item_rating_prediction_by_{i}'], locals()[f'ratings_{i}_matrix'].values))
    locals()[f'ratings_pred_{i}_matrix'] = pd.DataFrame(data=locals()[f'item_rating_prediction_by_{i}'], index=locals()[f'ratings_{i}_matrix'].index, columns=locals()[f'ratings_{i}_matrix'].columns)

cc 평점 기반 인접 TOP-5 이웃 MSE:  9.571428025842971
caddie 평점 기반 인접 TOP-5 이웃 MSE:  10.134930312550942
course 평점 기반 인접 TOP-5 이웃 MSE:  9.69295651949642
price 평점 기반 인접 TOP-5 이웃 MSE:  9.654432605024411
facility 평점 기반 인접 TOP-5 이웃 MSE:  9.370221050499088


In [68]:
# 6번 사용자의 미평가 골프장의 각 카테고리 별 예측 평점 상위 TOP-5
golfer_id = 6

for i in feature_list:
    # 사용자가 방문하지 않은 골프장 추출   
    locals()[f'item_unseen_list_by_{i}'] = get_unseen_cc(locals()[f'ratings_{i}_matrix'], golfer_id)

    # 아이템 기반의 인접 이웃 협업 필터링으로 골프장 추천 
    locals()[f'item_recomm_10_by_{i}'] = recomm_cc_by_id(locals()[f'item_prediction_result_{i}'], golfer_id, locals()[f'item_unseen_list_by_{i}'], top_n=5)

    # 평점 데이타를 DataFrame으로 생성. 
    locals()[f'item_recomm_10_by_{i}'] = pd.DataFrame(locals()[f'item_recomm_10_by_{i}'].values, index=locals()[f'item_recomm_10_by_{i}'].index, columns=['pred_score'])

    print(f'{golfer_id}번 골퍼의 미방문 골프장 중 {i} 예측 평점 TOP-5')
    display(locals()[f'item_recomm_10_by_{i}'])

6번 골퍼의 미방문 골프장 중 cc 예측 평점 TOP-5


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
금강,6.5462
은화삼,6.489207
88,6.408399
한림광릉,6.351856
레이크사이드,6.279913


6번 골퍼의 미방문 골프장 중 caddie 예측 평점 TOP-5


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
금강,6.538885
은화삼,6.476422
88,6.404594
한림광릉,6.33263
레이크사이드,6.269084


6번 골퍼의 미방문 골프장 중 course 예측 평점 TOP-5


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
금강,6.528973
은화삼,6.476993
88,6.401564
한림광릉,6.320089
레이크사이드,6.264371


6번 골퍼의 미방문 골프장 중 price 예측 평점 TOP-5


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
금강,6.524416
은화삼,6.475107
88,6.39188
한림광릉,6.306964
레이크사이드,6.267108


6번 골퍼의 미방문 골프장 중 facility 예측 평점 TOP-5


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
금강,6.551091
은화삼,6.481905
88,6.408392
한림광릉,6.367221
레이크사이드,6.269826
