In [1]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
# Directory holding the raw review export.
path = './data/'
data = pd.read_csv(
    os.path.join(path, 'xgolf_contents_final.csv'),
    index_col=0,
    encoding='utf-8',
)

# Preview the loaded reviews.
data

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents
1,금강,굿 금강,yun****,2022.07.23 11:13,2022,7,23,11:13,10.0,10,10,10,10,페어웨이 그린 베스트 명품구장
2,비에이비스타퍼블릭,"양프로,티칭프로테스트를 앞둔 예비프로와 함께",Wls******,2022.07.23 11:08,2022,7,23,11:08,10.0,10,10,10,10,라운딩 체험으로 운동했는데 날씨도 햇빛 없는 날과함께 즐거웠습니다\n모든 좋았으나한...
3,몽베르,경기북부의 몽베르,NV2*******,2022.07.23 10:04,2022,7,23,10:04,10.0,10,10,10,10,편안하고 관리가 잘된 골프코스에서 라운딩했네요
4,스카이밸리,재촉만 하지 않았으면 최고였을텐데...,bab***,2022.07.22 23:11,2022,7,22,23:11,10.0,10,10,10,10,아침부터 비가 온 것을 제외하고는 좋았습니다.\n잘 관리된 페어웨이와 그린 그리고 ...
5,몽베르,몽베르라운딩후기,ljk******,2022.07.22 11:25,2022,7,22,11:25,10.0,10,10,10,10,첫홀은 비가와서 징행할까 말까 했었는데\n그래도 멀리왔는데 치고가자 고 일행이 적극...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47860,기흥,좋은골프장 좋은그린,NV5******,2018.01.01 18:46,2018,1,1,18:46,10.0,10,10,10,10,겨울에도 기흥은 그린이 좋아요.속도도 맘에듭니다
47861,아일랜드,아일랜드CC,dda****,2018.01.01 14:58,2018,1,1,14:58,10.0,10,10,10,10,바람이 좀 춥긴했지만 관리도 잘되있고 좋았습니다
47862,기흥,눈 속에서,l3l***,2018.01.01 12:13,2018,1,1,12:13,9.5,10,10,8,10,눈이 오는 바람에 정상적인 라운드를 할 수 없었지만 너무 좋은 멤버들과 캐디 덕분에...
47863,88,2017년 마지막 라운딩,pol*******,2018.01.01 04:17,2018,1,1,04:17,9.0,8,8,10,10,다행히 날씨가 춥지 않아 즐거운 라운딩이었음.\n11:32 티업. 전반 지나고 그늘...


In [3]:
data.info() # quick overview: columns, dtypes and non-null counts

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47828 entries, 1 to 47864
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   golf_name    47828 non-null  object 
 1   title        47828 non-null  object 
 2   id           47828 non-null  object 
 3   datetime     47828 non-null  object 
 4   year         47828 non-null  int64  
 5   month        47828 non-null  int64  
 6   day          47828 non-null  int64  
 7   time         47828 non-null  object 
 8   golfscore    47828 non-null  float64
 9   caddiescore  47828 non-null  int64  
 10  coursescore  47828 non-null  int64  
 11  pricescore   47828 non-null  int64  
 12  facility     47828 non-null  int64  
 13  contents     47828 non-null  object 
dtypes: float64(1), int64(7), object(6)
memory usage: 5.5+ MB


### 데이터 전처리

In [4]:
# Show rows that are exact duplicates of an earlier row.
data.loc[data.duplicated()]

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents
82,그린힐,최고의 캐디분을 만난 날,don*******,2022.07.04 12:26,2022,7,4,12:26,8.5,10,8,10,6,폭염으로 아주 힘든 라운딩였지만 그래도 캐디님의 밝은 웃음과 성실한 태도에 정말 감...
303,샴발라,중간이상,kaa***,2022.05.26 09:15,2022,5,26,09:15,6.5,10,8,4,4,코스가 어렵지만 재미있었어요
1422,서원힐스,겨울골프,kin****,2022.02.04 09:44,2022,2,4,09:44,9.0,10,10,8,8,올해 첫 라운딩. 부킹할 때만해도 조금은 덜 추우리라 예상했지만 역시 겨울은 겨울....
2385,스카이밸리,？？,yh0***,2021.09.12 11:05,2021,9,12,11:05,9.5,10,8,10,10,코스 관리 최고 그린 관리가 조금 아쉽네요 ??
3091,강남300,다 좋았네요,KK1*********,2021.07.31 13:59,2021,7,31,13:59,10.0,10,10,10,10,가격대비 관리도 잘 되어있고 ??
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47860,기흥,좋은골프장 좋은그린,NV5******,2018.01.01 18:46,2018,1,1,18:46,10.0,10,10,10,10,겨울에도 기흥은 그린이 좋아요.속도도 맘에듭니다
47861,아일랜드,아일랜드CC,dda****,2018.01.01 14:58,2018,1,1,14:58,10.0,10,10,10,10,바람이 좀 춥긴했지만 관리도 잘되있고 좋았습니다
47862,기흥,눈 속에서,l3l***,2018.01.01 12:13,2018,1,1,12:13,9.5,10,10,8,10,눈이 오는 바람에 정상적인 라운드를 할 수 없었지만 너무 좋은 멤버들과 캐디 덕분에...
47863,88,2017년 마지막 라운딩,pol*******,2018.01.01 04:17,2018,1,1,04:17,9.0,8,8,10,10,다행히 날씨가 춥지 않아 즐거운 라운딩이었음.\n11:32 티업. 전반 지나고 그늘...


In [5]:
# Remove the fully duplicated rows (the original run reported 18073 of them),
# keeping the first occurrence of each.
data = data.drop_duplicates(keep='first')
data

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents
1,금강,굿 금강,yun****,2022.07.23 11:13,2022,7,23,11:13,10.0,10,10,10,10,페어웨이 그린 베스트 명품구장
2,비에이비스타퍼블릭,"양프로,티칭프로테스트를 앞둔 예비프로와 함께",Wls******,2022.07.23 11:08,2022,7,23,11:08,10.0,10,10,10,10,라운딩 체험으로 운동했는데 날씨도 햇빛 없는 날과함께 즐거웠습니다\n모든 좋았으나한...
3,몽베르,경기북부의 몽베르,NV2*******,2022.07.23 10:04,2022,7,23,10:04,10.0,10,10,10,10,편안하고 관리가 잘된 골프코스에서 라운딩했네요
4,스카이밸리,재촉만 하지 않았으면 최고였을텐데...,bab***,2022.07.22 23:11,2022,7,22,23:11,10.0,10,10,10,10,아침부터 비가 온 것을 제외하고는 좋았습니다.\n잘 관리된 페어웨이와 그린 그리고 ...
5,몽베르,몽베르라운딩후기,ljk******,2022.07.22 11:25,2022,7,22,11:25,10.0,10,10,10,10,첫홀은 비가와서 징행할까 말까 했었는데\n그래도 멀리왔는데 치고가자 고 일행이 적극...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38084,기흥,좋은골프장 좋은그린,NV5******,2018.01.01 18:46,2018,1,1,18:46,10.0,10,10,10,10,겨울에도 기흥은 그린이 좋아요.속도도 맘에듭니다
38085,아일랜드,아일랜드CC,dda****,2018.01.01 14:58,2018,1,1,14:58,10.0,10,10,10,10,바람이 좀 춥긴했지만 관리도 잘되있고 좋았습니다
38086,기흥,눈 속에서,l3l***,2018.01.01 12:13,2018,1,1,12:13,9.5,10,10,8,10,눈이 오는 바람에 정상적인 라운드를 할 수 없었지만 너무 좋은 멤버들과 캐디 덕분에...
38087,88,2017년 마지막 라운딩,pol*******,2018.01.01 04:17,2018,1,1,04:17,9.0,8,8,10,10,다행히 날씨가 춥지 않아 즐거운 라운딩이었음.\n11:32 티업. 전반 지나고 그늘...


In [6]:
# Check whether the same id posted the same title and contents more than once.
data_dup = data.loc[data.duplicated(subset=['id', 'title', 'contents'])]
data_dup

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents
81,그린힐,최고의 캐디분을 만난 날,don*******,2022.07.04 12:26,2022,7,4,12:26,8.5,10,8,10,6,폭염으로 아주 힘든 라운딩였지만 그래도 캐디님의 밝은 웃음과 성실한 태도에 정말 감...
7245,레이크사이드,좋아요,kei*****,2020.10.04 11:22,2020,10,4,11:22,10.0,10,10,10,10,역시 레이크사이드♥?
15928,자유,자유cc 후기,khl****,2019.03.07 21:12,2019,3,7,21:12,9.0,10,8,10,8,평일오후 선결제로 저렴하게 다녀왔어요.\n겨울이 끝나가는데 날씨도 좋고 밀리지도 않...
16035,레이크힐스안성(P9),가성비굿,kks*****,2019.03.03 20:28,2019,3,3,20:28,10.0,10,10,10,10,가격대비 아주좋습니다
16772,88,88,act*****,2018.12.17 15:07,2018,12,17,15:07,10.0,10,10,10,10,너무 좋아요 최고에요
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37022,아일랜드,잘치고ㅡ,lhy****,2018.03.06 11:11,2018,3,6,11:11,10.0,10,10,10,10,날씨도넘좋았고ㅡ동반자들도좋아했다ㅡ\n나중에또오고십다는말을
37216,블루헤런,블루헤런을 다녀와서~^^,oza**,2018.03.02 11:11,2018,3,2,11:11,10.0,10,10,10,10,가성비날씨그리고 캐디까지 친절해서 즐거운 롼딩이였네요 ^^
37266,써닝포인트,최악이었네요,NV4*******,2018.03.01 10:06,2018,3,1,10:06,7.5,8,6,8,8,캐디가 착해서 그냥 재밌게치고 오자했지만\n시작부터 티업시간도 되지 않았는데 짜증나...
37276,남서울,코스관리 잘 되어있는 골프장,plo***,2018.02.28 22:25,2018,2,28,22:25,10.0,10,10,10,10,캐디도 좋았고 코스도 좋았습니딘


In [7]:
# Those repeated posts (103 in the original run) are judged unreliable => drop them,
# keeping only the first occurrence per (id, title, contents).
data = data.drop_duplicates(subset=['id', 'title', 'contents'])
data.shape

(29652, 14)

In [8]:
# Re-check: no fully duplicated rows should remain.
data.loc[data.duplicated()]

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents


In [9]:
data['id'].nunique() # number of distinct reviewer ids

6134

In [10]:
data['id'].value_counts() # reviews per id (social-login accounts - NV: Naver, KK: Kakao); NOTE(review): ids look masked, so one masked id may cover several real users - confirm

NV2*******      889
KK1*********    808
NV3*******      685
NV1*******      549
NV4*******      479
               ... 
Woo***            1
skk******         1
nk8***            1
hwl****           1
pac*****          1
Name: id, Length: 6134, dtype: int64

In [11]:
# Reviews written through Naver social login.
# Match "NV" as a prefix only: a substring search would also pick up any
# regular id that merely contains "NV" somewhere in it.
data[data['id'].str.startswith('NV')]

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents
3,몽베르,경기북부의 몽베르,NV2*******,2022.07.23 10:04,2022,7,23,10:04,10.0,10,10,10,10,편안하고 관리가 잘된 골프코스에서 라운딩했네요
15,서서울,언제나좋은 서서울,NV4*******,2022.07.20 11:19,2022,7,20,11:19,9.5,10,10,10,8,음식가격비싼거만빼면 너무좋은 코스관리등
19,크리스탈밸리,칭찬합니다.,NV9*******,2022.07.19 15:43,2022,7,19,15:43,8.5,10,8,8,8,더운 날씨\n짜증 한 번 없이 열심히 해 준\n캐디 ?혜원 양 칭찬합니다.
22,360도,귿,NV5*******,2022.07.18 11:10,2022,7,18,11:10,10.0,10,10,10,10,귿 또 방문할 예정입니다!
23,서서울,서서울cc,NV1*******,2022.07.18 11:08,2022,7,18,11:08,7.5,6,10,8,6,그린피가 비싼만큼 코스관리는 훌륭했습니다\n다만 캐디님이 신입이셔서.... 캐디피가...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38056,레이크사이드,겨울 추운 날,NV_********,2018.01.03 16:38,2018,1,3,16:38,9.0,6,10,10,10,코스는 얼었어도 재미있음 영상기온을 기다리며~~~
38057,기흥,기흥cc 후기,NV2*******,2018.01.03 15:28,2018,1,3,15:28,8.0,8,8,8,8,12월 30일에 갔는데 날씨도 따뜻해서 좋았습니다\n그린이 얼어서 볼이 튀기도 하고...
38082,솔모로,2017마지막 라운딩,NV2*******,2018.01.01 21:20,2018,1,1,21:20,10.0,10,10,10,10,12/29일 진행한 2인 라운딩\n그주에 가장 따뜻한 하루여서 아주아주 행복하게 게...
38083,은화삼,겨울골프 중 최고,NV8******,2018.01.01 20:24,2018,1,1,20:24,10.0,10,10,10,10,최고의 날씨에서 즐거운 란딩했어요


In [12]:
# Reviews written through Kakao social login.
# Match "KK" as a prefix only: a substring search would also pick up any
# regular id that merely contains "KK" somewhere in it.
data[data['id'].str.startswith('KK')]

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents
8,이포,이포CC 좋아요,KK2*********,2022.07.21 13:06,2022,7,21,13:06,10.0,10,10,10,10,아늑하고 편안한 라운딩 되었어요\n\n재방문 의사 있어요
17,포레스트힐(P6),연습 겸 2인 라운딩,KK1*********,2022.07.19 15:57,2022,7,19,15:57,10.0,10,10,10,10,노캐디\n연습 2인 라운딩 잘 다녀왔습니다\n가깝고 저렴하고 좋아요
26,중부,"필드는 명문, 캐디는 최악",KK1*********,2022.07.18 07:24,2022,7,18,07:24,8.0,2,10,10,10,역시나 필드 관리나 클럽하우스 시설은 최고로 좋았지만..\n캐디 때문에 완전 기분 ...
27,이포,최악 이포,KK1*********,2022.07.17 14:36,2022,7,17,14:36,4.0,2,8,4,2,"장점 : 근거리, 넓은 페어웨이, 잔디 상태\n단점 : 최악 캐디(고령, 불친절, ..."
29,해솔리아,굿,KK2*********,2022.07.17 11:21,2022,7,17,11:21,10.0,10,10,10,10,인터넷 이용자 댓글보고 기대도 안했는데 아두 만족함
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10263,한림광릉,굿..,KK1*********,2020.04.16 11:27,2020,4,16,11:27,10.0,10,10,10,10,부킹을쉽게할수있어서조앗구\n시설두아주깔끔하구잔디상태완전조앗습니다.
10612,크리스탈밸리,날씨도 좋고 기분좋게 다녀왓어요~!!,KK1*********,2020.03.24 18:25,2020,3,24,18:25,8.5,8,8,8,10,크리스탈밸리 너무 기분좋게 다녀왔어요ㅎㅎ\n날씨도 좋고 클럽하우스 음식도 너무 맛있...
11310,서원힐스,서원 야간 다녀왔습니다~^^,KK1*********,2019.11.21 00:00,2019,11,21,00:00,9.0,8,10,8,10,야간 라운딩 했어요~\n날씨도 좋고 코스도 좋고 즐거운 란딩했습니다~
13244,서원힐스,골뱅이 무침이 짱이에요~,KK1*********,2019.07.11 00:00,2019,7,11,00:00,9.5,8,10,10,10,그늘집에서 골뱅이 무침이랑 막걸리 한잔하면 크~ 홀인원 할수있을 정도로 최곱니다 ㅎㅎ


In [13]:
# Drop reviews from social-login accounts (ids prefixed with NV or KK) to avoid
# excessive duplication under a handful of masked ids.
# The original run reported 4010 + 848 such reviews: 29652 - (4010 + 848) = 24794 rows.
# Prefix matching is used so that regular ids that merely contain "NV"/"KK" are kept.
data = data[~(data['id'].str.startswith('NV') | data['id'].str.startswith('KK'))]
data.shape

(24794, 14)

In [14]:
# Sanity check: fully duplicated rows remaining after the filter.
data.loc[data.duplicated()]

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents


In [15]:
# Build the base DataFrame with shorter column names.
# Assign the result instead of using inplace=True: `data` is a filtered slice of an
# earlier frame, and rebinding keeps the cell free of in-place mutation.
data = data.rename(columns={'golf_name':'cc_name', 'golfscore':'cc_score', 'caddiescore':'caddie', 'coursescore':'course', 'pricescore': 'price'})
print(data.shape)
data.head()

(24794, 14)


Unnamed: 0,cc_name,title,id,datetime,year,month,day,time,cc_score,caddie,course,price,facility,contents
1,금강,굿 금강,yun****,2022.07.23 11:13,2022,7,23,11:13,10.0,10,10,10,10,페어웨이 그린 베스트 명품구장
2,비에이비스타퍼블릭,"양프로,티칭프로테스트를 앞둔 예비프로와 함께",Wls******,2022.07.23 11:08,2022,7,23,11:08,10.0,10,10,10,10,라운딩 체험으로 운동했는데 날씨도 햇빛 없는 날과함께 즐거웠습니다\n모든 좋았으나한...
4,스카이밸리,재촉만 하지 않았으면 최고였을텐데...,bab***,2022.07.22 23:11,2022,7,22,23:11,10.0,10,10,10,10,아침부터 비가 온 것을 제외하고는 좋았습니다.\n잘 관리된 페어웨이와 그린 그리고 ...
5,몽베르,몽베르라운딩후기,ljk******,2022.07.22 11:25,2022,7,22,11:25,10.0,10,10,10,10,첫홀은 비가와서 징행할까 말까 했었는데\n그래도 멀리왔는데 치고가자 고 일행이 적극...
6,썬힐,정말 해도해도 너무하네요,cli*****,2022.07.22 09:30,2022,7,22,09:30,3.5,8,2,2,2,좋은 날 기분 더럽게 다녀왔네요.. 코스관리 개엉망이고 그린은 완전 걸레짝. 카운터...


### 사용자 데이터프레임 생성

In [16]:
# Number of distinct reviewer ids after preprocessing
data['id'].nunique()

6111

In [17]:
# USER lookup table: one row per distinct id (sorted), numbered 1..N in `id_num`.
data_id = data[['id']].drop_duplicates().sort_values('id').reset_index(drop=True)
data_id['id_num'] = range(1, len(data_id) + 1)
data_id = data_id[['id_num', 'id']]
data_id

Unnamed: 0,id_num,id
0,1,002***
1,2,003****
2,3,007***
3,4,007*****
4,5,010*******
...,...,...
6106,6107,zzi****
6107,6108,zzi*********
6108,6109,zzu****
6109,6110,zzz*


### 골프장 데이터프레임 생성

In [18]:
# Golf-course lookup table: one row per distinct cc_name (sorted), numbered 1..N in `cc_num`.
data_cc = data[['cc_name']].drop_duplicates().sort_values('cc_name').reset_index(drop=True)
data_cc['cc_num'] = range(1, len(data_cc) + 1)
data_cc = data_cc[['cc_num', 'cc_name']]
data_cc

Unnamed: 0,cc_num,cc_name
0,1,360도
1,2,88
2,3,H1 CLUB(구 덕평)
3,4,XGOLF
4,5,XGOLF(테스트)
...,...,...
134,135,한양파인(P9)
135,136,한원
136,137,해솔리아
137,138,화성(P9)


In [19]:
# First ten golf courses in the lookup table.
data_cc.iloc[:10]

Unnamed: 0,cc_num,cc_name
0,1,360도
1,2,88
2,3,H1 CLUB(구 덕평)
3,4,XGOLF
4,5,XGOLF(테스트)
5,6,가산노블리스
6,7,가평베네스트
7,8,강남300
8,9,고양컨트리클럽(P9)
9,10,곤지암


In [20]:
# The "XGOLF" entries look like test records rather than real golf courses.
data_test = data.loc[data['cc_name'].str.contains('XGOLF')]
data_test.shape

(27, 14)

In [21]:
# Extra cleaning: drop the XGOLF rows from the base DataFrame -> 24794 - 27 = 24767 rows.
data = data.loc[~data['cc_name'].str.contains('XGOLF')]
data.shape

(24767, 14)

In [22]:
# Rebuild the USER lookup table because the base DataFrame changed:
# one row per distinct id (sorted), numbered 1..N in `id_num`.
data_id = data[['id']].drop_duplicates().sort_values('id').reset_index(drop=True)
data_id['id_num'] = range(1, len(data_id) + 1)
data_id = data_id[['id_num', 'id']]
data_id

Unnamed: 0,id_num,id
0,1,002***
1,2,003****
2,3,007***
3,4,007*****
4,5,010*******
...,...,...
6103,6104,zzi****
6104,6105,zzi*********
6105,6106,zzu****
6106,6107,zzz*


In [23]:
# id 정보 csv 추출
# data_id.to_csv('./data/xgolf_user_completed.csv')

In [24]:
# Rebuild the golf-course lookup table because the base DataFrame changed:
# one row per distinct cc_name (sorted), numbered 1..N in `cc_num`.
data_cc = data[['cc_name']].drop_duplicates().sort_values('cc_name').reset_index(drop=True)
data_cc['cc_num'] = range(1, len(data_cc) + 1)
data_cc = data_cc[['cc_num', 'cc_name']]
data_cc

Unnamed: 0,cc_num,cc_name
0,1,360도
1,2,88
2,3,H1 CLUB(구 덕평)
3,4,가산노블리스
4,5,가평베네스트
...,...,...
132,133,한양파인(P9)
133,134,한원
134,135,해솔리아
135,136,화성(P9)


In [25]:
# 골프장 정보 csv 추출
# data_cc.to_csv('./data/xgolf_cc_completed.csv')

In [26]:
# Attach the numeric keys (id_num, cc_num) to every review via left joins.
data = (
    data
    .merge(data_id, how='left', on='id')
    .merge(data_cc, how='left', on='cc_name')
)
data

Unnamed: 0,cc_name,title,id,datetime,year,month,day,time,cc_score,caddie,course,price,facility,contents,id_num,cc_num
0,금강,굿 금강,yun****,2022.07.23 11:13,2022,7,23,11:13,10.0,10,10,10,10,페어웨이 그린 베스트 명품구장,6029,15
1,비에이비스타퍼블릭,"양프로,티칭프로테스트를 앞둔 예비프로와 함께",Wls******,2022.07.23 11:08,2022,7,23,11:08,10.0,10,10,10,10,라운딩 체험으로 운동했는데 날씨도 햇빛 없는 날과함께 즐거웠습니다\n모든 좋았으나한...,186,53
2,스카이밸리,재촉만 하지 않았으면 최고였을텐데...,bab***,2022.07.22 23:11,2022,7,22,23:11,10.0,10,10,10,10,아침부터 비가 온 것을 제외하고는 좋았습니다.\n잘 관리된 페어웨이와 그린 그리고 ...,460,68
3,몽베르,몽베르라운딩후기,ljk******,2022.07.22 11:25,2022,7,22,11:25,10.0,10,10,10,10,첫홀은 비가와서 징행할까 말까 했었는데\n그래도 멀리왔는데 치고가자 고 일행이 적극...,3498,44
4,썬힐,정말 해도해도 너무하네요,cli*****,2022.07.22 09:30,2022,7,22,09:30,3.5,8,2,2,2,좋은 날 기분 더럽게 다녀왔네요.. 코스관리 개엉망이고 그린은 완전 걸레짝. 카운터...,887,73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24762,스카이밸리,마운틴코스라기에. . .,ana*****,2018.01.01 21:50,2018,1,1,21:50,9.5,10,8,10,10,눈이라도 많을까봐 걱정했는데.\n플레이하기에 전혀 지장이 없는\n상태로 코스는 잘 ...,333,68
24763,아일랜드,아일랜드CC,dda****,2018.01.01 14:58,2018,1,1,14:58,10.0,10,10,10,10,바람이 좀 춥긴했지만 관리도 잘되있고 좋았습니다,1097,78
24764,기흥,눈 속에서,l3l***,2018.01.01 12:13,2018,1,1,12:13,9.5,10,10,8,10,눈이 오는 바람에 정상적인 라운드를 할 수 없었지만 너무 좋은 멤버들과 캐디 덕분에...,3355,16
24765,88,2017년 마지막 라운딩,pol*******,2018.01.01 04:17,2018,1,1,04:17,9.0,8,8,10,10,다행히 날씨가 춥지 않아 즐거운 라운딩이었음.\n11:32 티업. 전반 지나고 그늘...,4386,2


In [27]:
# Verify the merges: both key columns should have zero missing values.
print(data['id_num'].isna().sum(), data['cc_num'].isna().sum())

0 0


In [28]:
# Just in case: look for duplicated rows introduced by the merges.
data.loc[data.duplicated()]

Unnamed: 0,cc_name,title,id,datetime,year,month,day,time,cc_score,caddie,course,price,facility,contents,id_num,cc_num


In [29]:
# 전처리 끝난 최종 데이터 csv 추출
# data.to_csv('./data/xgolf_contents_completed.csv')

In [30]:
# Reload the preprocessed review table and the user / golf-course lookup tables from disk,
# replacing the in-memory frames built by the cells above.
# NOTE(review): the cells that export these three CSV files are commented out in this
# notebook, so this cell reads whatever copies already exist in ./data/. The rating matrix
# built further down has shape (6108, 137), while the lookup tables built above contain
# 6107 users and 136 courses - the files on disk look like they come from a different
# run. Confirm and re-export before trusting downstream results.
path = './data/'
data = pd.read_csv(os.path.join(path, 'xgolf_contents_completed.csv'), encoding='utf-8', index_col=0)
data_id = pd.read_csv(os.path.join(path, 'xgolf_user_completed.csv'), encoding='utf-8', index_col=0)
data_cc = pd.read_csv(os.path.join(path, 'xgolf_cc_completed.csv'), encoding='utf-8', index_col=0)

## CF - KNN

In [31]:
# Keep only the columns needed for collaborative filtering.
ratings_cc_df = data.loc[:, ['id_num', 'cc_num', 'cc_score']]
ratings_cc_df

Unnamed: 0,id_num,cc_num,cc_score
0,6029,15,10.0
1,186,53,10.0
2,460,68,10.0
3,3498,44,10.0
4,887,73,3.5
...,...,...,...
24762,333,68,9.5
24763,1097,78,10.0
24764,3355,16,9.5
24765,4386,2,9.0


In [32]:
# Missing-value count per column.
ratings_cc_df.isna().sum()

id_num      0
cc_num      0
cc_score    0
dtype: int64

In [131]:
# Join with the golf-course lookup table to bring in the cc_name column.
rating_cc_info = ratings_cc_df.merge(data_cc, on='cc_num')
rating_cc_info

Unnamed: 0,id_num,cc_num,cc_score,cc_name
0,6029,15,10.0,금강
1,5049,15,8.0,금강
2,2005,15,10.0,금강
3,5019,15,8.5,금강
4,5036,15,10.0,금강
...,...,...,...,...
24762,230,59,10.0,서원밸리
24763,2974,115,10.0,파인비치
24764,230,8,10.0,곤지암
24765,230,18,10.0,남부


In [132]:
# Pivot the long ratings table into a user (id_num) x golf-course (cc_num) matrix.
# When a user reviewed the same course more than once, the highest score is kept.
# aggfunc is given as the string 'max': passing the builtin `max` is deprecated in
# recent pandas releases, and the string keeps the current behaviour.
r_matrix = ratings_cc_df.pivot_table(values='cc_score', index='id_num', columns='cc_num', aggfunc='max')

print(r_matrix.shape)
r_matrix

(6108, 137)


cc_num,1,2,3,4,5,6,7,8,9,10,...,128,129,130,131,132,133,134,135,136,137
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,9.5,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,,,,,,,,,10.0,,...,,,,,,,,,,
6105,,,,,,,,,,,...,,,,,,,,,,
6106,,,,,,,,,,,...,,,,,,,,,,
6107,,,,,,,,,,,...,,,,,,,,,,


In [133]:
# Pivot with columns='cc_name' so the matrix columns are golf-course names.
# When a user reviewed the same course more than once, the highest score is kept.
# aggfunc is given as the string 'max': passing the builtin `max` is deprecated in
# recent pandas releases, and the string keeps the current behaviour.
ratings_matrix = rating_cc_info.pivot_table(values='cc_score', index='id_num', columns='cc_name', aggfunc='max')
ratings_matrix

cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,9.5,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,,,,,,,,,10.0,,...,,,,,,,,,,
6105,,,,,,,,,,,...,,,,,,,,,,
6106,,,,,,,,,,,...,,,,,,,,,,
6107,,,,,,,,,,,...,,,,,,,,,,


In [134]:
# Replace every NaN (no review) with 0 so the matrix has no missing values.
ratings_matrix = ratings_matrix.fillna(value=0)
ratings_matrix

cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [135]:
# Transpose into an item x user matrix (golf courses as rows).
ratings_matrix_T = ratings_matrix.T

print(ratings_matrix_T.shape)
ratings_matrix_T.head()

(137, 6108)


id_num,1,2,3,4,5,6,7,8,9,10,...,6099,6100,6101,6102,6103,6104,6105,6106,6107,6108
cc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
360도,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H1 CLUB(구 덕평),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가산노블리스,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가평베네스트,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 코사인 유사도

In [151]:
# Pairwise cosine similarity between golf courses; each row of ratings_matrix_T is
# one course's rating vector across all users.
item_sim = cosine_similarity(ratings_matrix_T)

# Wrap the returned NumPy array in a DataFrame labelled by golf-course name on both axes.
item_sim_df = pd.DataFrame(item_sim, index=ratings_matrix.columns, columns=ratings_matrix.columns)

print(item_sim_df.shape)
item_sim_df

(137, 137)


cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
cc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
360도,1.000000,0.129687,0.082993,0.0,0.0,0.080971,0.0,0.076143,0.090617,0.051383,...,0.120631,0.110415,0.021936,0.036654,0.019916,0.014859,0.062807,0.069845,0.000000,0.021751
88,0.129687,1.000000,0.122176,0.0,0.0,0.169984,0.0,0.045810,0.157761,0.090107,...,0.062318,0.140730,0.041519,0.158343,0.029805,0.048273,0.054948,0.031612,0.026383,0.045506
H1 CLUB(구 덕평),0.082993,0.122176,1.000000,0.0,0.0,0.105661,0.0,0.096778,0.121914,0.105270,...,0.085277,0.091027,0.054037,0.121180,0.043032,0.065602,0.054267,0.025507,0.037024,0.015205
가산노블리스,0.000000,0.000000,0.000000,1.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
가평베네스트,0.000000,0.000000,0.000000,0.0,1.0,0.000000,0.0,0.000000,0.000000,0.000000,...,0.076402,0.040577,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
한양파인(P9),0.014859,0.048273,0.065602,0.0,0.0,0.076820,0.0,0.000000,0.054888,0.032802,...,0.057544,0.046259,0.038333,0.014473,0.026326,1.000000,0.063062,0.000000,0.015925,0.041807
한원,0.062807,0.054948,0.054267,0.0,0.0,0.065480,0.0,0.000000,0.022148,0.042085,...,0.019738,0.029165,0.101461,0.058668,0.000000,0.063062,1.000000,0.043069,0.055504,0.000000
해솔리아,0.069845,0.031612,0.025507,0.0,0.0,0.040993,0.0,0.000000,0.089218,0.059541,...,0.027695,0.042083,0.044180,0.052363,0.000000,0.000000,0.043069,1.000000,0.020826,0.000000
화성(P9),0.000000,0.026383,0.037024,0.0,0.0,0.000000,0.0,0.000000,0.045073,0.033961,...,0.035454,0.053755,0.014254,0.024154,0.049247,0.015925,0.055504,0.020826,1.000000,0.000000


In [137]:
# Five golf courses most similar to 가평베네스트.
# The course itself is dropped explicitly instead of skipping position 0, so the
# result stays correct even if another course ties with it at similarity 1.0.
item_sim_df['가평베네스트'].drop('가평베네스트').sort_values(ascending=False)[:5]

cc_name
파인크리크      0.196455
일동레이크      0.168177
프리스틴밸리     0.085060
사우스스프링스    0.084510
자유         0.082303
Name: 가평베네스트, dtype: float64

### 개인화 된 골프장 추천

In [138]:
# Predict ratings as the similarity-weighted average of each user's ratings.
def predict_rating(ratings_arr, item_sim_arr):
    """Predict a rating for every (user, item) pair from item-item similarities.

    Parameters
    ----------
    ratings_arr : array of shape (n_users, n_items)
        User-item rating matrix.
    item_sim_arr : array of shape (n_items, n_items)
        Item-item similarity matrix; entry [i, j] weights item i's rating when
        predicting item j.

    Returns
    -------
    array of shape (n_users, n_items)
        ``ratings_arr.dot(item_sim_arr)`` with column j divided by the sum of the
        absolute similarities in column j. Columns whose similarities are all zero
        are returned as 0 instead of NaN.
    """
    # The dot product accumulates over column j of the similarity matrix, so the
    # normaliser must be the column sum (axis=0). For a symmetric matrix this is
    # equal to the row sum, but only the column sum is correct in general.
    sim_norm = np.abs(np.asarray(item_sim_arr)).sum(axis=0)
    # An all-zero column gives a zero numerator as well; divide by 1 to avoid 0/0.
    safe_norm = np.where(sim_norm == 0, 1.0, sim_norm)
    ratings_pred = ratings_arr.dot(item_sim_arr) / safe_norm
    return ratings_pred

# Predict a rating for every user / golf-course pair using all item similarities.
ratings_pred = predict_rating(ratings_arr=ratings_matrix.values, item_sim_arr=item_sim_df.values)

# Wrap the predictions in a DataFrame that shares the rating matrix's labels.
ratings_pred_matrix = pd.DataFrame(
    ratings_pred,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)

In [139]:
# Shape and preview of the predicted-rating matrix (rows: id_num, columns: cc_name).
print(ratings_pred_matrix.shape)
ratings_pred_matrix

(6108, 137)


cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.129753,0.155174,0.096732,0.0,0.193268,0.137109,0.000000,0.000000,0.101496,0.115541,...,0.103214,0.123184,0.083964,0.137698,0.175348,0.133044,0.087886,0.112034,0.097454,0.180592
2,0.273402,0.288975,0.249187,0.0,0.117313,0.289086,0.000000,0.177139,0.298910,0.303644,...,0.205106,0.276293,0.188097,0.302535,0.145902,0.158580,0.179162,0.302249,0.091605,0.166628
3,0.118708,0.178084,0.153067,0.0,0.000000,0.148182,0.000000,0.000000,0.202943,1.408832,...,0.091178,0.176498,0.124077,0.190736,0.180576,0.148093,0.164831,0.170387,0.100865,0.126565
4,0.157735,0.095144,0.115599,0.0,0.000000,0.089534,0.000000,0.000000,0.085504,0.094857,...,0.090852,0.103596,0.094991,0.086250,0.012522,0.086475,0.124384,0.127308,0.049122,0.000000
5,0.045210,0.053243,0.055552,0.0,0.067360,0.057547,0.000000,0.000000,0.047016,0.041025,...,0.081630,0.089102,0.043136,0.048205,0.072745,0.043132,0.032241,0.026075,0.016373,0.014688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,0.102331,0.144001,0.131626,0.0,0.000000,0.144823,0.000000,0.112589,0.943493,0.145724,...,0.078703,0.118665,0.131553,0.104202,0.126622,0.096852,0.038512,0.164615,0.083152,0.030743
6105,0.138493,0.099306,0.098242,0.0,0.000000,0.097620,0.000000,0.000000,0.113463,0.073844,...,0.121795,0.121208,0.118224,0.083922,0.026327,0.032759,0.065783,0.124298,0.097431,0.087771
6106,0.139443,0.222661,0.192504,0.0,0.210988,0.233638,0.000000,0.000000,0.179400,0.133683,...,0.142372,0.217947,0.139678,0.277398,0.093327,0.132528,0.203598,0.144194,0.152469,0.136205
6107,0.227075,0.262199,0.178744,0.0,0.461891,0.266031,0.000000,0.172572,0.205886,0.197118,...,0.238012,0.250023,0.143796,0.226249,0.019536,0.185167,0.076403,0.163506,0.076611,0.220334


In [140]:
# Evaluate prediction quality (MSE) only on golf courses the user actually rated.
def get_mse(pred, actual):
    """Mean squared error between predictions and the ratings that exist.

    Parameters
    ----------
    pred : ndarray
        Predicted rating matrix, same shape as ``actual``.
    actual : ndarray
        Observed rating matrix in which 0 means "no rating".

    Returns
    -------
    float
        MSE computed over the positions where ``actual`` is non-zero.
    """
    # Keep only the rated (non-zero) positions; zero entries are ignored.
    rated = actual.nonzero()
    # scikit-learn's signature is (y_true, y_pred).
    return mean_squared_error(actual[rated], pred[rated])

# NOTE: the predictions are scored against the same ratings they were built from
# (there is no hold-out split), so this is an in-sample error.
print('아이템 기반 모든 인접 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values))

아이템 기반 모든 인접 이웃 MSE:  52.40870864108464


### top-n 유사도를 가진 데이터들에 대해서만 예측 평점 계산
- MSE를 줄이기 위해, 각 골프장과 유사도가 가장 높은 상위 n개 골프장에 대해서만 유사도 벡터를 적용하여 예측 평점을 계산

In [141]:
def predict_rating_topsim(ratings_arr, item_sim_arr, n=20):
    """Predict ratings using only the ``n`` most similar items for each item.

    Parameters
    ----------
    ratings_arr : ndarray of shape (n_users, n_items)
        User-item rating matrix.
    item_sim_arr : ndarray of shape (n_items, n_items)
        Item-item similarity matrix (expected to be symmetric, as produced by
        a pairwise cosine similarity).
    n : int, default 20
        Number of most-similar items used per target item. Values larger than
        the number of items simply use every item.

    Returns
    -------
    ndarray of shape (n_users, n_items)
        For each item, the similarity-weighted average of every user's ratings
        on that item's top-``n`` neighbours. Items whose selected similarities
        are all zero are left at 0.
    """
    # Start from an all-zero prediction matrix shaped like the rating matrix.
    pred = np.zeros(ratings_arr.shape)

    for col in range(ratings_arr.shape[1]):
        sim_to_col = item_sim_arr[:, col]
        # Indices of the n largest similarities, in descending order. This is a
        # plain 1-D index array: wrapping it in a list is deprecated
        # multidimensional indexing in NumPy and changes the result shape on
        # newer releases.
        top_n_items = np.argsort(sim_to_col)[:-n-1:-1]
        weights = sim_to_col[top_n_items]
        denom = np.abs(weights).sum()
        if denom == 0:
            # No similarity information for this item: keep the 0 prediction
            # rather than producing NaN from 0/0.
            continue
        # One matrix-vector product handles every user at once instead of a
        # Python loop over all rows.
        pred[:, col] = ratings_arr[:, top_n_items].dot(weights) / denom
    return pred

In [142]:
# The original author notes that this cell takes roughly two minutes to run.
ratings_pred = predict_rating_topsim(
    ratings_arr=ratings_matrix.values,
    item_sim_arr=item_sim_df.values,
    n=20,
)
print('아이템 기반 인접 TOP-20 이웃 MSE: ', get_mse(pred=ratings_pred, actual=ratings_matrix.values))

# Rebuild the predicted-rating DataFrame from the top-20 predictions.
ratings_pred_matrix = pd.DataFrame(
    ratings_pred,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)
ratings_pred_matrix

  pred[row, col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row, :][top_n_items].T)
  pred[row, col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))


아이템 기반 인접 TOP-20 이웃 MSE:  30.646651101539355


cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.351155,0.375553,0.000000,0.0,0.196122,0.345695,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.346257,0.000000,0.302714,0.268827,0.264415,0.000000,0.000000,0.000000,0.293569
2,0.300329,0.328598,0.305155,0.0,0.119045,0.312877,0.000000,0.186496,0.621781,0.597551,...,0.000000,0.313931,0.000000,0.260141,0.000000,0.000000,0.000000,0.240902,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,3.407351,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.205743
4,0.426883,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.235107,0.273705,0.000000,0.000000
5,0.000000,0.000000,0.151041,0.0,0.068355,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.211216,0.250458,0.000000,0.000000,0.111525,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,0.000000,0.348511,0.357881,0.0,0.000000,0.365145,0.000000,0.118536,2.480801,0.352443,...,0.000000,0.000000,0.280649,0.000000,0.000000,0.000000,0.000000,0.353914,0.000000,0.000000
6105,0.374807,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.315143,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.267234,0.000000,0.000000
6106,0.000000,0.305243,0.296429,0.0,0.214104,0.364726,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.362889,0.000000,0.609828,0.000000,0.000000,0.242133,0.000000,0.192495,0.000000
6107,0.408681,0.358272,0.296718,0.0,0.335812,0.360766,0.000000,0.181687,0.374305,0.385013,...,0.372018,0.702788,0.000000,0.309077,0.000000,0.210226,0.000000,0.000000,0.000000,0.000000


In [45]:
# Before recommending, look at the courses user 2 rated highest (up to five).
user_rating_id = ratings_matrix.loc[2]
user_rating_id.loc[user_rating_id > 0].sort_values(ascending=False).head(5)

cc_name
아리지    10.0
금강      8.0
그린힐     7.5
Name: 2, dtype: float64

#### 사용자가 방문하지 않은 골프장 중에서 추천해보자
- user_rating이 0보다 크면 기존에 방문한 골프장이라는 점을 이용해서 계산

In [46]:
def get_unseen_cc(ratings_matrix, id_num):
    """List the golf courses that user ``id_num`` has not rated.

    ``ratings_matrix`` is a user x golf-course DataFrame; a value greater than 0
    means the user already visited (rated) that course. The returned list keeps
    the column order of ``ratings_matrix``.
    """
    # Row of the requested user: a Series indexed by golf course.
    user_rating = ratings_matrix.loc[id_num, :]

    # Courses with a positive rating count as already visited.
    visited = set(user_rating[user_rating > 0].index.tolist())

    # Everything else, in column order, is a recommendation candidate.
    return [cc for cc in ratings_matrix.columns.tolist() if cc not in visited]

In [47]:
# pred_df     : predicted ratings per golf course computed earlier
# unseen_list : golf courses the user has not visited
# top_n       : how many of the best-scoring courses to return

def recomm_cc_by_id(pred_df, id_num, unseen_list, top_n=10):
    """Return the ``top_n`` highest predicted ratings among ``unseen_list``.

    Looks up row ``id_num`` of ``pred_df`` restricted to the ``unseen_list``
    columns and returns a Series sorted by predicted rating, highest first.
    """
    candidate_scores = pred_df.loc[id_num, unseen_list]
    ranked = candidate_scores.sort_values(ascending=False)
    return ranked.iloc[:top_n]

In [49]:
# Collect the golf courses that user 2 has not visited yet.
unseen_list = get_unseen_cc(ratings_matrix, 2)

# Recommend golf courses with item-based nearest-neighbour collaborative filtering:
# take the 10 unvisited courses with the highest predicted score for user 2.
recomm_10_cc = recomm_cc_by_id(ratings_pred_matrix, 2, unseen_list, top_n=10)

# Wrap the predicted scores in a single-column DataFrame for display.
recomm_10_cc = pd.DataFrame(data=recomm_10_cc.values, index=recomm_10_cc.index, columns=['pred_score'])
recomm_10_cc

Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
안성,0.793017
여주,0.766046
에덴블루,0.72707
캐슬파인,0.709348
루나힐스안성,0.688947
타이거,0.651284
큐로,0.646774
리베라,0.631008
골드,0.621781
골프존카운티안성H,0.597551


#### LJW style

In [144]:
# train_df, test_df = train_test_split(ratings_cc_df, test_size=0.02, random_state=123)

# print(train_df.shape)
# print(test_df.shape)

# Build the user (id_num) x course (cc_num) rating matrix. When a user rated the same
# course more than once, keep the highest score. The aggregation is passed as the string
# 'max' because handing the Python builtin `max` to pandas is deprecated.
sparse_matrix = rating_cc_info.pivot_table('cc_score', index='id_num', columns='cc_num', aggfunc='max')
sparse_matrix

cc_num,1,2,3,4,5,6,7,8,9,10,...,128,129,130,131,132,133,134,135,136,137
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,9.5,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,,,,,,,,,10.0,,...,,,,,,,,,,
6105,,,,,,,,,,,...,,,,,,,,,,
6106,,,,,,,,,,,...,,,,,,,,,,
6107,,,,,,,,,,,...,,,,,,,,,,


In [145]:
# Transpose to a course x user matrix and replace missing ratings with 0.
item_sparse_matrix = sparse_matrix.T.fillna(0)
item_sparse_matrix.shape

(137, 6108)

In [101]:
def cossim_matrix(a, b):
    """Compute the pairwise cosine similarity between the rows of ``a`` and ``b``.

    Parameters
    ----------
    a, b : pandas.DataFrame
        Matrices with one sample per row and the same number of columns.

    Returns
    -------
    pandas.DataFrame
        Similarity matrix of shape ``(len(a), len(b))`` whose rows are
        labelled with ``a.index`` and whose columns are labelled with the
        values of ``b.index``.
    """
    cossim_values = cosine_similarity(a.values, b.values)
    # Columns correspond to the rows of `b`, so label them with b's index
    # (the original used a's index, which is only right when a and b share it).
    cossim_df = pd.DataFrame(data=cossim_values, columns=b.index.values, index=a.index)

    return cossim_df

In [118]:
# item_sparse_matrix is already course x user, so its rows are the course vectors.
# Passing it untransposed yields the course-to-course similarity matrix indexed by
# cc_num; transposing it here would compute user-to-user similarity instead.
item_cossim_df = cossim_matrix(item_sparse_matrix, item_sparse_matrix)
item_cossim_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,128,129,130,131,132,133,134,135,136,137
cc_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.129687,0.082993,0.0,0.0,0.080971,0.0,0.076143,0.090617,0.051383,...,0.120631,0.110415,0.021936,0.036654,0.019916,0.014859,0.062807,0.069845,0.000000,0.021751
2,0.129687,1.000000,0.122176,0.0,0.0,0.169984,0.0,0.045810,0.157761,0.090107,...,0.062318,0.140730,0.041519,0.158343,0.029805,0.048273,0.054948,0.031612,0.026383,0.045506
3,0.082993,0.122176,1.000000,0.0,0.0,0.105661,0.0,0.096778,0.121914,0.105270,...,0.085277,0.091027,0.054037,0.121180,0.043032,0.065602,0.054267,0.025507,0.037024,0.015205
4,0.000000,0.000000,0.000000,1.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.000000,0.000000,0.000000,0.0,1.0,0.000000,0.0,0.000000,0.000000,0.000000,...,0.076402,0.040577,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,0.014859,0.048273,0.065602,0.0,0.0,0.076820,0.0,0.000000,0.054888,0.032802,...,0.057544,0.046259,0.038333,0.014473,0.026326,1.000000,0.063062,0.000000,0.015925,0.041807
134,0.062807,0.054948,0.054267,0.0,0.0,0.065480,0.0,0.000000,0.022148,0.042085,...,0.019738,0.029165,0.101461,0.058668,0.000000,0.063062,1.000000,0.043069,0.055504,0.000000
135,0.069845,0.031612,0.025507,0.0,0.0,0.040993,0.0,0.000000,0.089218,0.059541,...,0.027695,0.042083,0.044180,0.052363,0.000000,0.000000,0.043069,1.000000,0.020826,0.000000
136,0.000000,0.026383,0.037024,0.0,0.0,0.000000,0.0,0.000000,0.045073,0.033961,...,0.035454,0.053755,0.014254,0.024154,0.049247,0.015925,0.055504,0.020826,1.000000,0.000000


In [117]:
# Count the non-null entries of every column per golf course (cc_num).
ratings_cc_df.groupby('cc_num').count()

Unnamed: 0_level_0,id_num,cc_score
cc_num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,245,245
2,1038,1038
3,187,187
4,1,1
5,1,1
...,...,...
133,38,38
134,42,42
135,109,109
136,49,49


In [121]:
# 137 golf courses (cc) and 6108 users in total.
# Predictions are produced for every user id_num that appears in ratings_cc_df.
userId_grouped = ratings_cc_df.groupby('id_num')
# Empty result frame: one row per id_num, one column per cc_num.
# dtype=float keeps the frame numeric instead of the default object dtype.
item_prediction_result_df = pd.DataFrame(index=list(userId_grouped.indices.keys()), columns=item_sparse_matrix.index, dtype=float)
item_prediction_result_df

cc_num,1,2,3,4,5,6,7,8,9,10,...,128,129,130,131,132,133,134,135,136,137
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,,,,,,,,,,,...,,,,,,,,,,
6105,,,,,,,,,,,...,,,,,,,,,,
6106,,,,,,,,,,,...,,,,,,,,,,
6107,,,,,,,,,,,...,,,,,,,,,,


In [122]:
# Fill item_prediction_result_df one user at a time.
for userId, group in tqdm(userId_grouped):
    # Similarity rows of the courses this user rated: (rated courses) x (all columns of item_cossim_df).
    user_sim = item_cossim_df.loc[group['cc_num']]
    # Scores this user gave, one per rated course.
    user_rating = group['cc_score']
    # Per column, the summed similarity to the rated courses.
    sim_sum = user_sim.sum(axis=0)

    # Similarity-weighted sum of the user's scores for every column, divided by
    # (similarity sum + 1); this is one value per column, not per user.
    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / (sim_sum+1)
    # Store the predictions as this user's row.
    item_prediction_result_df.loc[userId] = pred_ratings

  0%|          | 0/6108 [00:00<?, ?it/s]

In [123]:
# Show the filled prediction matrix (rows: id_num, columns: cc_num).
item_prediction_result_df

cc_num,1,2,3,4,5,6,7,8,9,10,...,128,129,130,131,132,133,134,135,136,137
1,1.030585,1.453006,0.82227,0.0,0.450663,1.169781,0.0,0.0,0.971259,0.797174,...,0.84463,1.258997,0.41975,0.985156,0.551706,0.701117,0.481122,0.572444,0.50175,0.595522
2,1.882493,2.296358,1.820313,0.0,0.276557,2.100783,0.0,0.782067,2.315749,1.801558,...,1.518036,2.342039,0.878664,1.87057,0.458989,0.815765,0.916195,1.371374,0.467082,0.549236
3,0.943663,1.610042,1.23101,0.0,0.0,1.238736,0.0,0.0,1.744605,4.986821,...,0.749123,1.686733,0.605474,1.300031,0.564768,0.76896,0.859444,0.840048,0.516354,0.423985
4,1.225594,0.943958,0.967144,0.0,0.0,0.796202,0.0,0.0,0.830947,0.663942,...,0.751067,1.080428,0.472277,0.640656,0.041527,0.467175,0.667589,0.645448,0.259365,0.0
5,0.367637,0.516374,0.461735,0.0,0.158683,0.494873,0.0,0.0,0.448637,0.287895,...,0.627831,0.845977,0.214371,0.352597,0.22987,0.231842,0.178085,0.137016,0.087035,0.050921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,0.830877,1.362638,1.08666,0.0,0.0,1.227517,0.0,0.52577,5.0,0.98491,...,0.657234,1.218439,0.642377,0.763817,0.404599,0.520317,0.216684,0.819102,0.431293,0.106647
6105,1.092415,0.981203,0.834042,0.0,0.0,0.861914,0.0,0.0,1.073486,0.524578,...,0.981754,1.241312,0.581072,0.624441,0.086908,0.182269,0.364533,0.631151,0.501638,0.298573
6106,1.516198,2.433324,2.070122,0.0,0.912828,2.243131,0.0,0.0,2.099888,1.33276,...,1.654897,2.692266,0.934089,2.3045,0.411272,1.005892,1.336874,1.026511,1.165746,0.573168
6107,1.6489,2.189671,1.402795,0.0,1.006491,2.010242,0.0,0.780698,1.761909,1.271197,...,1.727559,2.220622,0.693456,1.501911,0.06459,0.942014,0.418946,0.808119,0.397227,0.713018


In [146]:
# Evaluate the MSE only on the (user, course) pairs that actually have a rating.
# NOTE(review): get_mse is not defined in this cell; the commented-out copy below is for
# reference only, so it presumably comes from an earlier cell — confirm on a fresh kernel.
# def get_mse(pred, actual):
#     # Keep only the positions where `actual` is nonzero.
#     pred = pred[actual.nonzero()].flatten()
#     actual = actual[actual.nonzero()].flatten()
#     return mean_squared_error(pred, actual)

print('아이템 기반 모든 인접 이웃 MSE: ', get_mse(item_prediction_result_df.values, sparse_matrix.fillna(0).values))

아이템 기반 모든 인접 이웃 MSE:  11.148817119642583


### top-n 유사도를 가진 데이터들에 대해서만 예측 평점 계산
- MSE 값을 줄이기 위해, 특정 골프장과 유사도가 가장 높은 상위 n개 골프장에 대해서만 유사도 벡터를 적용

In [148]:
def predict_rating_topsim(ratings_arr, item_sim_arr, n=10):
    """Predict ratings using only the ``n`` most similar items of each item.

    Parameters
    ----------
    ratings_arr : numpy.ndarray, shape (num_users, num_items)
        User x item rating matrix with missing ratings already filled
        (the callers in this notebook pass 0 for missing).
    item_sim_arr : numpy.ndarray, shape (num_items, num_items)
        Item x item similarity matrix.
    n : int, default 10
        Number of most similar items used per target item.

    Returns
    -------
    numpy.ndarray, shape (num_users, num_items)
        Similarity-weighted average of each user's ratings over the top-``n``
        neighbours of every item. Items whose neighbour similarities sum to 0
        keep a prediction of 0 instead of producing NaN from a 0/0 division.
    """
    # Start from an all-zero prediction matrix of the same shape as the ratings.
    pred = np.zeros(ratings_arr.shape)

    for col in range(ratings_arr.shape[1]):
        # Indices of the n most similar items, most similar first. This is a plain
        # 1-D index array: wrapping it in a list triggers NumPy's deprecated
        # "non-tuple sequence" indexing and adds a spurious leading axis.
        top_n_items = np.argsort(item_sim_arr[:, col])[:-n-1:-1]

        weights = item_sim_arr[col, top_n_items]
        weight_sum = np.sum(np.abs(weights))
        if weight_sum == 0:
            # No similarity information for this item; leave its predictions at 0.
            continue

        # All users at once: (num_users, n) @ (n,) -> (num_users,)
        pred[:, col] = ratings_arr[:, top_n_items].dot(weights) / weight_sum
    return pred

In [163]:
# Runtime was about 2 minutes when this cell was last executed.
# Predict with the 5 most similar courses only, then score against the known ratings.
item_rating_prediction = predict_rating_topsim(sparse_matrix.fillna(0).values, item_cossim_df.values, n=5)
print('아이템 기반 인접 TOP-5 이웃 MSE: ', get_mse(item_rating_prediction, sparse_matrix.fillna(0).values))

  pred[row, col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row, :][top_n_items].T)
  pred[row, col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))


아이템 기반 인접 TOP-5 이웃 MSE:  9.571428025842971


In [162]:
# Rebuild the predicted ratings as a DataFrame with the same row (id_num)
# and column (cc_num) labels as sparse_matrix.
item_rating_prediction_matrix = pd.DataFrame(data=item_rating_prediction, index= sparse_matrix.index,
                                   columns = sparse_matrix.columns)
item_rating_prediction_matrix

cc_num,1,2,3,4,5,6,7,8,9,10,...,128,129,130,131,132,133,134,135,136,137
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,6.134212,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.900265,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.439045,0.575962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,5.508526,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6105,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6106,0.000000,0.0,0.0,0.0,0.000000,0.826143,0.000000,0.0,0.000000,0.000000,...,0.000000,0.834513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6107,0.861879,0.0,0.0,0.0,0.526704,0.000000,0.000000,0.0,0.000000,0.770634,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [171]:
# Before recommending, inspect the golf courses (by cc_num) that user 2 rated highest.
user_rating_id = sparse_matrix.loc[2, :]
# Keep only entries with a score > 0 (NaN is dropped) and show the top 5 by score.
user_rating_id[ user_rating_id > 0].sort_values(ascending=False)[:5]

cc_num
75    10.0
15     8.0
13     7.5
Name: 2, dtype: float64

#### 사용자가 방문하지 않은 골프장 중에서 추천해보자
- user_rating이 0보다 크면 기존에 방문한 골프장이라는 점을 이용해서 계산

In [167]:
def get_unseen_cc(ratings_matrix, id_num):
    """Return the golf courses that user ``id_num`` has not rated yet.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        User x course rating matrix; rows are looked up by ``id_num`` and
        the columns identify the golf courses.
    id_num : hashable
        Row label of the user.

    Returns
    -------
    list
        Column labels of ``ratings_matrix``, in column order, whose value
        for this user is not greater than 0 (i.e. 0, negative or NaN).
    """
    # Use the matrix that was passed in. The previous version read the global
    # `sparse_matrix` and silently ignored this argument.
    user_rating = ratings_matrix.loc[id_num, :]

    # A score above 0 means the user has already visited that course.
    # A set gives constant-time membership checks below.
    already_seen = set(user_rating[user_rating > 0].index.tolist())

    # Every course in the matrix, minus the ones already visited.
    unseen_list = [cc for cc in ratings_matrix.columns.tolist() if cc not in already_seen]

    return unseen_list

In [168]:
def recomm_cc_by_id(pred_df, id_num, unseen_list, top_n=5):
    """Pick the unvisited golf courses with the highest predicted score.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Predicted scores; rows are users, columns are golf courses.
    id_num : hashable
        Row label of the user to recommend for.
    unseen_list : list
        Column labels of the courses the user has not visited.
    top_n : int, default 5
        Number of courses to return.

    Returns
    -------
    pandas.Series
        The ``top_n`` highest predicted scores, sorted in descending order
        and indexed by course label.
    """
    # Restrict this user's predictions to the unvisited courses.
    candidate_scores = pred_df.loc[id_num, unseen_list]
    # Highest predicted score first.
    ranked = candidate_scores.sort_values(ascending=False)
    recomm_cc = ranked[:top_n]
    return recomm_cc

In [181]:
# Collect the golf courses (cc_num) that user 2 has not visited yet.
unseen_list = get_unseen_cc(sparse_matrix, 2)

# Recommend golf courses with item-based nearest-neighbour collaborative filtering.
# Only the top 5 are requested here, although the variable name still says 10.
recomm_10_cc = recomm_cc_by_id(item_prediction_result_df, 2, unseen_list, top_n=5)

# Wrap the predicted scores in a single-column DataFrame for display.
recomm_10_cc = pd.DataFrame(data=recomm_10_cc.values, index=recomm_10_cc.index, columns=['pred_score'])
recomm_10_cc

Unnamed: 0_level_0,pred_score
cc_num,Unnamed: 1_level_1
68,2.658183
93,2.611767
16,2.520844
79,2.453996
51,2.409353


## CF - MF기반 잠재요인 CF

In [50]:
# Pivot to a user (id_num) x course (cc_name) rating matrix. When a user rated the same
# course more than once, keep the highest score. The aggregation is passed as the string
# 'max' because handing the Python builtin `max` to pandas is deprecated.
ratings_matrix = rating_cc_info.pivot_table('cc_score', index='id_num', columns='cc_name', aggfunc='max')

print(ratings_matrix.shape)
ratings_matrix.head()

(6108, 137)


cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,9.5,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [51]:
def get_rmse(R, P, Q, non_zeros):
    """Compute the RMSE between R and its reconstruction ``P @ Q.T`` on the rated cells.

    Parameters
    ----------
    R : numpy.ndarray
        Actual rating matrix.
    P, Q : numpy.ndarray
        Factor matrices; ``np.dot(P, Q.T)`` is the predicted rating matrix.
    non_zeros : sequence
        Entries whose first two elements are the row and column position of a
        rated cell in ``R``.

    Returns
    -------
    float
        Root mean squared error over the positions listed in ``non_zeros``.
    """
    # Split the rated positions into parallel row / column index lists.
    row_idx, col_idx = [], []
    for entry in non_zeros:
        row_idx.append(entry[0])
        col_idx.append(entry[1])

    # Predicted rating matrix from the two factor matrices.
    reconstructed = np.dot(P, Q.T)

    # Compare actual and predicted values on the rated positions only.
    actual = R[row_idx, col_idx]
    predicted = reconstructed[row_idx, col_idx]

    return np.sqrt(mean_squared_error(actual, predicted))

In [52]:
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda = 0.01):
    """Factorize the rating matrix R into P and Q with stochastic gradient descent.

    Parameters
    ----------
    R : numpy.ndarray, shape (num_users, num_items)
        Rating matrix. Only entries with ``R[i, j] > 0`` are used for training;
        NaN entries compare False and are skipped.
    K : int
        Number of latent factors.
    steps : int, default 200
        Number of passes over the rated entries.
    learning_rate : float, default 0.01
        SGD step size.
    r_lambda : float, default 0.01
        L2 regularization strength.

    Returns
    -------
    (P, Q) : tuple of numpy.ndarray
        ``P`` has shape (num_users, K) and ``Q`` has shape (num_items, K);
        ``np.dot(P, Q.T)`` is the predicted rating matrix.

    Notes
    -----
    Resets NumPy's global random seed to 1 on every call and prints the
    training RMSE every 10th step.
    """
    num_users, num_items = R.shape
    # Initialise P and Q with normally distributed random values (fixed seed).
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # (row, column, value) for every entry with R > 0.
    non_zeros = [ (i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0 ]

    # Repeatedly update P and Q with SGD.
    for step in tqdm(range(steps)):
        for i, j, r in non_zeros:
            # Error between the actual and the currently predicted rating.
            eij = r - np.dot(P[i, :], Q[j, :].T)

            # Regularized SGD updates. The Q update uses the P row updated just above.
            P[i,:] = P[i,:] + learning_rate*(eij * Q[j, :] - r_lambda*P[i,:])
            Q[j,:] = Q[j,:] + learning_rate*(eij * P[i, :] - r_lambda*Q[j,:])

        # The RMSE is only reported every 10th step, so only compute it then.
        if (step % 10) == 0 :
            rmse = get_rmse(R, P, Q, non_zeros)
            print("### iteration step : ", step," rmse : ", rmse)

    return P, Q

In [53]:
%%time
# Matrix factorization with gradient descent (50 latent factors, 200 steps).
P, Q = matrix_factorization(ratings_matrix.values, K=50, steps=200, learning_rate=0.001, r_lambda = 0.01)

# Predicted rating for every (user, course) pair.
pred_matrix = np.dot(P, Q.T)

  0%|          | 0/200 [00:00<?, ?it/s]

### iteration step :  0  rmse :  8.943057139214142
### iteration step :  10  rmse :  8.420151500953512
### iteration step :  20  rmse :  3.5946707218832397
### iteration step :  30  rmse :  2.080241024875369
### iteration step :  40  rmse :  1.499420566400949
### iteration step :  50  rmse :  1.2306587657367032
### iteration step :  60  rmse :  1.0816399409916493
### iteration step :  70  rmse :  0.9815611391618155
### iteration step :  80  rmse :  0.9025060399651013
### iteration step :  90  rmse :  0.8341311702910309
### iteration step :  100  rmse :  0.7727680966283117
### iteration step :  110  rmse :  0.7169637740265568
### iteration step :  120  rmse :  0.6659173609045725
### iteration step :  130  rmse :  0.6190506084226333
### iteration step :  140  rmse :  0.5759150849352659
### iteration step :  150  rmse :  0.536158959310684
### iteration step :  160  rmse :  0.49950160826638024
### iteration step :  170  rmse :  0.46571332090192813
### iteration step :  180  rmse :  0.43459

In [54]:
# Rebuild the predicted ratings as a DataFrame with the same row (id_num)
# and column (cc_name) labels as ratings_matrix.
ratings_pred_matrix = pd.DataFrame(data=pred_matrix, index= ratings_matrix.index,
                                   columns = ratings_matrix.columns)
# Inspect the predicted rating matrix.
print(ratings_pred_matrix.shape)
ratings_pred_matrix.head(15)

(6108, 137)


cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,8.682319,8.546985,8.164804,2.445426,1.836407,7.351236,2.253563,2.485215,7.806419,8.720424,...,8.015363,7.991532,7.225967,8.430541,7.934731,8.328245,8.255714,7.834611,8.062451,8.300939
2,8.531998,8.975374,7.97312,2.368879,1.827173,6.603919,2.134654,2.40712,7.570664,8.331159,...,7.553759,7.201433,7.228746,8.297386,7.648491,8.069964,8.084432,7.621092,7.79569,8.352654
3,9.034939,8.937357,8.611895,2.522482,1.93291,7.705067,2.294843,2.597785,8.325531,9.22369,...,8.359674,8.212793,7.611798,9.014598,8.358066,8.652152,8.705304,8.210216,8.491922,8.806735
4,9.306657,9.404543,8.87056,2.570771,1.982333,8.009465,2.453588,2.78153,8.482916,9.397621,...,8.444299,9.03617,8.000701,9.541604,8.598647,9.040342,9.097724,8.396912,8.840579,9.003787
5,4.057648,4.024262,3.664236,1.15259,0.832999,3.503288,1.061885,1.188627,3.545249,4.038909,...,4.018795,3.956827,3.405173,4.061363,3.772768,3.896152,3.924611,3.732303,3.791287,3.960101
6,8.457858,8.006967,7.885323,2.349891,1.744761,5.792202,2.127796,2.353465,6.764482,8.620819,...,8.688687,8.54766,7.279587,7.652607,8.020909,8.290771,8.356524,8.215999,8.351023,8.545854
7,10.175757,10.191196,9.539902,2.87659,2.144967,8.933681,2.700628,2.990253,9.308178,10.359235,...,9.846029,9.762405,8.794131,10.442574,9.502781,9.978829,10.003614,9.483668,9.828363,10.054593
8,9.359441,9.075791,8.699873,2.602574,1.953421,7.904202,2.36493,2.624257,8.681233,9.247839,...,8.422209,8.358274,7.708976,9.178396,8.484773,8.92822,8.869672,8.491987,8.620653,9.017144
9,7.026467,6.959096,6.637769,2.048308,1.566735,6.358448,1.880297,2.079368,6.849514,7.133191,...,6.656298,6.550496,6.078334,7.141596,6.572754,6.911776,7.002669,6.548066,6.828327,7.038106
10,8.026788,8.161315,7.771281,2.265602,1.673876,7.144984,2.058462,2.378536,7.066807,8.291795,...,7.527193,7.05367,6.964024,8.228794,7.75718,7.907212,7.964077,7.408773,7.619947,7.803706


In [55]:
# Inspect the original rating matrix for comparison with the predictions.
print(ratings_matrix.shape)
ratings_matrix.head(15)

(6108, 137)


cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,9.5,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,6.0,,,7.0,,...,9.0,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


### 2번 사용자에게 예측 평점이 높은 미방문 골프장 추천

In [56]:
# Collect the golf courses that user 2 has not visited yet.
unseen_list = get_unseen_cc(ratings_matrix, 2)

# Recommend the 10 unvisited courses with the highest predicted score for user 2,
# using the matrix-factorization predictions in ratings_pred_matrix.
recomm_10_cc = recomm_cc_by_id(ratings_pred_matrix, 2, unseen_list, top_n=10)

# Wrap the predicted scores in a single-column DataFrame for display.
recomm_10_cc = pd.DataFrame(data=recomm_10_cc.values, index=recomm_10_cc.index, columns=['pred_score'])
recomm_10_cc

Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
안성베네스트,9.050252
청평마이다스,9.03766
페럼,9.021302
88,8.975374
자유,8.937446
일동레이크,8.914824
글렌로스(P9),8.907501
블랙스톤,8.900461
뉴서울,8.798927
스카이밸리,8.795132
