In [1]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Folder holding the raw XGOLF review export.
path = './data/'
csv_file = os.path.join(path, 'xgolf_contents_final.csv')
data = pd.read_csv(csv_file, encoding='utf-8')

# Preview the loaded reviews.
data

Unnamed: 0.1,Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents
0,1,금강,굿 금강,yun****,2022.07.23 11:13,2022,7,23,11:13,10.0,10,10,10,10,페어웨이 그린 베스트 명품구장
1,2,비에이비스타퍼블릭,"양프로,티칭프로테스트를 앞둔 예비프로와 함께",Wls******,2022.07.23 11:08,2022,7,23,11:08,10.0,10,10,10,10,라운딩 체험으로 운동했는데 날씨도 햇빛 없는 날과함께 즐거웠습니다\n모든 좋았으나한...
2,3,몽베르,경기북부의 몽베르,NV2*******,2022.07.23 10:04,2022,7,23,10:04,10.0,10,10,10,10,편안하고 관리가 잘된 골프코스에서 라운딩했네요
3,4,스카이밸리,재촉만 하지 않았으면 최고였을텐데...,bab***,2022.07.22 23:11,2022,7,22,23:11,10.0,10,10,10,10,아침부터 비가 온 것을 제외하고는 좋았습니다.\n잘 관리된 페어웨이와 그린 그리고 ...
4,5,몽베르,몽베르라운딩후기,ljk******,2022.07.22 11:25,2022,7,22,11:25,10.0,10,10,10,10,첫홀은 비가와서 징행할까 말까 했었는데\n그래도 멀리왔는데 치고가자 고 일행이 적극...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47823,47860,기흥,좋은골프장 좋은그린,NV5******,2018.01.01 18:46,2018,1,1,18:46,10.0,10,10,10,10,겨울에도 기흥은 그린이 좋아요.속도도 맘에듭니다
47824,47861,아일랜드,아일랜드CC,dda****,2018.01.01 14:58,2018,1,1,14:58,10.0,10,10,10,10,바람이 좀 춥긴했지만 관리도 잘되있고 좋았습니다
47825,47862,기흥,눈 속에서,l3l***,2018.01.01 12:13,2018,1,1,12:13,9.5,10,10,8,10,눈이 오는 바람에 정상적인 라운드를 할 수 없었지만 너무 좋은 멤버들과 캐디 덕분에...
47826,47863,88,2017년 마지막 라운딩,pol*******,2018.01.01 04:17,2018,1,1,04:17,9.0,8,8,10,10,다행히 날씨가 춥지 않아 즐거운 라운딩이었음.\n11:32 티업. 전반 지나고 그늘...


In [3]:
# Overview of the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47828 entries, 0 to 47827
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   47828 non-null  int64  
 1   golf_name    47828 non-null  object 
 2   title        47828 non-null  object 
 3   id           47828 non-null  object 
 4   datetime     47828 non-null  object 
 5   year         47828 non-null  int64  
 6   month        47828 non-null  int64  
 7   day          47828 non-null  int64  
 8   time         47828 non-null  object 
 9   golfscore    47828 non-null  float64
 10  caddiescore  47828 non-null  int64  
 11  coursescore  47828 non-null  int64  
 12  pricescore   47828 non-null  int64  
 13  facility     47828 non-null  int64  
 14  contents     47828 non-null  object 
dtypes: float64(1), int64(8), object(6)
memory usage: 5.5+ MB


### 데이터 전처리

In [4]:
# Look for rows that are exact duplicates of an earlier row.
data.loc[data.duplicated()]

Unnamed: 0.1,Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents


In [5]:
# Drop the unneeded 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps the cell re-runnable: once the column is gone a second
# execution would otherwise fail with a KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore').reset_index(drop=True)
data.head()

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents
0,금강,굿 금강,yun****,2022.07.23 11:13,2022,7,23,11:13,10.0,10,10,10,10,페어웨이 그린 베스트 명품구장
1,비에이비스타퍼블릭,"양프로,티칭프로테스트를 앞둔 예비프로와 함께",Wls******,2022.07.23 11:08,2022,7,23,11:08,10.0,10,10,10,10,라운딩 체험으로 운동했는데 날씨도 햇빛 없는 날과함께 즐거웠습니다\n모든 좋았으나한...
2,몽베르,경기북부의 몽베르,NV2*******,2022.07.23 10:04,2022,7,23,10:04,10.0,10,10,10,10,편안하고 관리가 잘된 골프코스에서 라운딩했네요
3,스카이밸리,재촉만 하지 않았으면 최고였을텐데...,bab***,2022.07.22 23:11,2022,7,22,23:11,10.0,10,10,10,10,아침부터 비가 온 것을 제외하고는 좋았습니다.\n잘 관리된 페어웨이와 그린 그리고 ...
4,몽베르,몽베르라운딩후기,ljk******,2022.07.22 11:25,2022,7,22,11:25,10.0,10,10,10,10,첫홀은 비가와서 징행할까 말까 했었는데\n그래도 멀리왔는데 치고가자 고 일행이 적극...


In [6]:
# Re-check for duplicated rows now that the extra column has been removed.
data.loc[data.duplicated()]

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents
81,그린힐,최고의 캐디분을 만난 날,don*******,2022.07.04 12:26,2022,7,4,12:26,8.5,10,8,10,6,폭염으로 아주 힘든 라운딩였지만 그래도 캐디님의 밝은 웃음과 성실한 태도에 정말 감...
302,샴발라,중간이상,kaa***,2022.05.26 09:15,2022,5,26,09:15,6.5,10,8,4,4,코스가 어렵지만 재미있었어요
1421,서원힐스,겨울골프,kin****,2022.02.04 09:44,2022,2,4,09:44,9.0,10,10,8,8,올해 첫 라운딩. 부킹할 때만해도 조금은 덜 추우리라 예상했지만 역시 겨울은 겨울....
2384,스카이밸리,？？,yh0***,2021.09.12 11:05,2021,9,12,11:05,9.5,10,8,10,10,코스 관리 최고 그린 관리가 조금 아쉽네요 ??
3090,강남300,다 좋았네요,KK1*********,2021.07.31 13:59,2021,7,31,13:59,10.0,10,10,10,10,가격대비 관리도 잘 되어있고 ??
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47823,기흥,좋은골프장 좋은그린,NV5******,2018.01.01 18:46,2018,1,1,18:46,10.0,10,10,10,10,겨울에도 기흥은 그린이 좋아요.속도도 맘에듭니다
47824,아일랜드,아일랜드CC,dda****,2018.01.01 14:58,2018,1,1,14:58,10.0,10,10,10,10,바람이 좀 춥긴했지만 관리도 잘되있고 좋았습니다
47825,기흥,눈 속에서,l3l***,2018.01.01 12:13,2018,1,1,12:13,9.5,10,10,8,10,눈이 오는 바람에 정상적인 라운드를 할 수 없었지만 너무 좋은 멤버들과 캐디 덕분에...
47826,88,2017년 마지막 라운딩,pol*******,2018.01.01 04:17,2018,1,1,04:17,9.0,8,8,10,10,다행히 날씨가 춥지 않아 즐거운 라운딩이었음.\n11:32 티업. 전반 지나고 그늘...


In [7]:
# Remove the duplicated rows (18073 of them), keeping the first occurrence of each.
data = data.drop_duplicates(keep='first')
data

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents
0,금강,굿 금강,yun****,2022.07.23 11:13,2022,7,23,11:13,10.0,10,10,10,10,페어웨이 그린 베스트 명품구장
1,비에이비스타퍼블릭,"양프로,티칭프로테스트를 앞둔 예비프로와 함께",Wls******,2022.07.23 11:08,2022,7,23,11:08,10.0,10,10,10,10,라운딩 체험으로 운동했는데 날씨도 햇빛 없는 날과함께 즐거웠습니다\n모든 좋았으나한...
2,몽베르,경기북부의 몽베르,NV2*******,2022.07.23 10:04,2022,7,23,10:04,10.0,10,10,10,10,편안하고 관리가 잘된 골프코스에서 라운딩했네요
3,스카이밸리,재촉만 하지 않았으면 최고였을텐데...,bab***,2022.07.22 23:11,2022,7,22,23:11,10.0,10,10,10,10,아침부터 비가 온 것을 제외하고는 좋았습니다.\n잘 관리된 페어웨이와 그린 그리고 ...
4,몽베르,몽베르라운딩후기,ljk******,2022.07.22 11:25,2022,7,22,11:25,10.0,10,10,10,10,첫홀은 비가와서 징행할까 말까 했었는데\n그래도 멀리왔는데 치고가자 고 일행이 적극...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38083,기흥,좋은골프장 좋은그린,NV5******,2018.01.01 18:46,2018,1,1,18:46,10.0,10,10,10,10,겨울에도 기흥은 그린이 좋아요.속도도 맘에듭니다
38084,아일랜드,아일랜드CC,dda****,2018.01.01 14:58,2018,1,1,14:58,10.0,10,10,10,10,바람이 좀 춥긴했지만 관리도 잘되있고 좋았습니다
38085,기흥,눈 속에서,l3l***,2018.01.01 12:13,2018,1,1,12:13,9.5,10,10,8,10,눈이 오는 바람에 정상적인 라운드를 할 수 없었지만 너무 좋은 멤버들과 캐디 덕분에...
38086,88,2017년 마지막 라운딩,pol*******,2018.01.01 04:17,2018,1,1,04:17,9.0,8,8,10,10,다행히 날씨가 춥지 않아 즐거운 라운딩이었음.\n11:32 티업. 전반 지나고 그늘...


In [8]:
# Check whether the same id posted the same title and contents more than once.
dup_keys = ['id', 'title', 'contents']
data_dup = data.loc[data.duplicated(subset=dup_keys)]
data_dup

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents
80,그린힐,최고의 캐디분을 만난 날,don*******,2022.07.04 12:26,2022,7,4,12:26,8.5,10,8,10,6,폭염으로 아주 힘든 라운딩였지만 그래도 캐디님의 밝은 웃음과 성실한 태도에 정말 감...
7244,레이크사이드,좋아요,kei*****,2020.10.04 11:22,2020,10,4,11:22,10.0,10,10,10,10,역시 레이크사이드♥?
15927,자유,자유cc 후기,khl****,2019.03.07 21:12,2019,3,7,21:12,9.0,10,8,10,8,평일오후 선결제로 저렴하게 다녀왔어요.\n겨울이 끝나가는데 날씨도 좋고 밀리지도 않...
16034,레이크힐스안성(P9),가성비굿,kks*****,2019.03.03 20:28,2019,3,3,20:28,10.0,10,10,10,10,가격대비 아주좋습니다
16771,88,88,act*****,2018.12.17 15:07,2018,12,17,15:07,10.0,10,10,10,10,너무 좋아요 최고에요
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37021,아일랜드,잘치고ㅡ,lhy****,2018.03.06 11:11,2018,3,6,11:11,10.0,10,10,10,10,날씨도넘좋았고ㅡ동반자들도좋아했다ㅡ\n나중에또오고십다는말을
37215,블루헤런,블루헤런을 다녀와서~^^,oza**,2018.03.02 11:11,2018,3,2,11:11,10.0,10,10,10,10,가성비날씨그리고 캐디까지 친절해서 즐거운 롼딩이였네요 ^^
37265,써닝포인트,최악이었네요,NV4*******,2018.03.01 10:06,2018,3,1,10:06,7.5,8,6,8,8,캐디가 착해서 그냥 재밌게치고 오자했지만\n시작부터 티업시간도 되지 않았는데 짜증나...
37275,남서울,코스관리 잘 되어있는 골프장,plo***,2018.02.28 22:25,2018,2,28,22:25,10.0,10,10,10,10,캐디도 좋았고 코스도 좋았습니딘


In [9]:
# Those repeated posts (103 rows) are judged to be unreliable reviews => delete
# them, keeping only the first occurrence of each (id, title, contents).
# NOTE(review): ids appear masked in the export (e.g. `yun****`), so different
# users may share an id -- confirm this does not remove legitimate reviews.
data = data.drop_duplicates(subset=['id', 'title', 'contents'])
data.shape

(29652, 14)

In [10]:
# Confirm that no fully duplicated rows remain.
data.loc[data.duplicated()]

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents


In [11]:
# Number of distinct reviewer ids.
data['id'].nunique()

6134

In [12]:
# Number of reviews per id (social-login accounts - NV: Naver, KK: Kakao).
data['id'].value_counts()

NV2*******      889
KK1*********    808
NV3*******      685
NV1*******      549
NV4*******      479
               ... 
042*              1
dea*****          1
osc****           1
pdw****           1
djk****           1
Name: id, Length: 6134, dtype: int64

In [13]:
# Reviews written through a Naver social-login account (ids prefixed with "NV").
# Match the prefix only: a substring test would also pick up ordinary ids that
# merely contain "NV" somewhere.
data[data['id'].str.startswith('NV')]

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents
2,몽베르,경기북부의 몽베르,NV2*******,2022.07.23 10:04,2022,7,23,10:04,10.0,10,10,10,10,편안하고 관리가 잘된 골프코스에서 라운딩했네요
14,서서울,언제나좋은 서서울,NV4*******,2022.07.20 11:19,2022,7,20,11:19,9.5,10,10,10,8,음식가격비싼거만빼면 너무좋은 코스관리등
18,크리스탈밸리,칭찬합니다.,NV9*******,2022.07.19 15:43,2022,7,19,15:43,8.5,10,8,8,8,더운 날씨\n짜증 한 번 없이 열심히 해 준\n캐디 ?혜원 양 칭찬합니다.
21,360도,귿,NV5*******,2022.07.18 11:10,2022,7,18,11:10,10.0,10,10,10,10,귿 또 방문할 예정입니다!
22,서서울,서서울cc,NV1*******,2022.07.18 11:08,2022,7,18,11:08,7.5,6,10,8,6,그린피가 비싼만큼 코스관리는 훌륭했습니다\n다만 캐디님이 신입이셔서.... 캐디피가...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38055,레이크사이드,겨울 추운 날,NV_********,2018.01.03 16:38,2018,1,3,16:38,9.0,6,10,10,10,코스는 얼었어도 재미있음 영상기온을 기다리며~~~
38056,기흥,기흥cc 후기,NV2*******,2018.01.03 15:28,2018,1,3,15:28,8.0,8,8,8,8,12월 30일에 갔는데 날씨도 따뜻해서 좋았습니다\n그린이 얼어서 볼이 튀기도 하고...
38081,솔모로,2017마지막 라운딩,NV2*******,2018.01.01 21:20,2018,1,1,21:20,10.0,10,10,10,10,12/29일 진행한 2인 라운딩\n그주에 가장 따뜻한 하루여서 아주아주 행복하게 게...
38082,은화삼,겨울골프 중 최고,NV8******,2018.01.01 20:24,2018,1,1,20:24,10.0,10,10,10,10,최고의 날씨에서 즐거운 란딩했어요


In [14]:
# Reviews written through a Kakao social-login account (ids prefixed with "KK").
# Match the prefix only: a substring test would also pick up ordinary ids that
# merely contain "KK" somewhere.
data[data['id'].str.startswith('KK')]

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents
7,이포,이포CC 좋아요,KK2*********,2022.07.21 13:06,2022,7,21,13:06,10.0,10,10,10,10,아늑하고 편안한 라운딩 되었어요\n\n재방문 의사 있어요
16,포레스트힐(P6),연습 겸 2인 라운딩,KK1*********,2022.07.19 15:57,2022,7,19,15:57,10.0,10,10,10,10,노캐디\n연습 2인 라운딩 잘 다녀왔습니다\n가깝고 저렴하고 좋아요
25,중부,"필드는 명문, 캐디는 최악",KK1*********,2022.07.18 07:24,2022,7,18,07:24,8.0,2,10,10,10,역시나 필드 관리나 클럽하우스 시설은 최고로 좋았지만..\n캐디 때문에 완전 기분 ...
26,이포,최악 이포,KK1*********,2022.07.17 14:36,2022,7,17,14:36,4.0,2,8,4,2,"장점 : 근거리, 넓은 페어웨이, 잔디 상태\n단점 : 최악 캐디(고령, 불친절, ..."
28,해솔리아,굿,KK2*********,2022.07.17 11:21,2022,7,17,11:21,10.0,10,10,10,10,인터넷 이용자 댓글보고 기대도 안했는데 아두 만족함
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10262,한림광릉,굿..,KK1*********,2020.04.16 11:27,2020,4,16,11:27,10.0,10,10,10,10,부킹을쉽게할수있어서조앗구\n시설두아주깔끔하구잔디상태완전조앗습니다.
10611,크리스탈밸리,날씨도 좋고 기분좋게 다녀왓어요~!!,KK1*********,2020.03.24 18:25,2020,3,24,18:25,8.5,8,8,8,10,크리스탈밸리 너무 기분좋게 다녀왔어요ㅎㅎ\n날씨도 좋고 클럽하우스 음식도 너무 맛있...
11309,서원힐스,서원 야간 다녀왔습니다~^^,KK1*********,2019.11.21 00:00,2019,11,21,00:00,9.0,8,10,8,10,야간 라운딩 했어요~\n날씨도 좋고 코스도 좋고 즐거운 란딩했습니다~
13243,서원힐스,골뱅이 무침이 짱이에요~,KK1*********,2019.07.11 00:00,2019,7,11,00:00,9.5,8,10,10,10,그늘집에서 골뱅이 무침이랑 막걸리 한잔하면 크~ 홀인원 할수있을 정도로 최곱니다 ㅎㅎ


In [15]:
# Remove reviews written through social-login accounts (ids prefixed NV = Naver,
# KK = Kakao) to avoid excessive duplication.
# Match the prefix only: a substring test would also drop ordinary ids that
# merely contain "NV" or "KK" somewhere.
# Recorded run: 29652 - (4010 + 848) = 24794 rows remain; re-check the shape
# printed below if the counts shift with the prefix match.
is_social = data['id'].str.startswith('NV') | data['id'].str.startswith('KK')
data = data[~is_social]
data.shape

(24794, 14)

In [16]:
# Sanity check: still no fully duplicated rows.
data.loc[data.duplicated()]

Unnamed: 0,golf_name,title,id,datetime,year,month,day,time,golfscore,caddiescore,coursescore,pricescore,facility,contents


In [17]:
# Build the base DataFrame: shorten the course / score column names.
# Reassign instead of renaming in place -- `data` is the result of a boolean
# filter, and in-place mutation of such a frame is a known source of
# hidden-state surprises in notebooks.
data = data.rename(columns={
    'golf_name': 'cc_name',
    'golfscore': 'cc_score',
    'caddiescore': 'caddie',
    'coursescore': 'course',
    'pricescore': 'price',
})
print(data.shape)
data.head()

(24794, 14)


Unnamed: 0,cc_name,title,id,datetime,year,month,day,time,cc_score,caddie,course,price,facility,contents
0,금강,굿 금강,yun****,2022.07.23 11:13,2022,7,23,11:13,10.0,10,10,10,10,페어웨이 그린 베스트 명품구장
1,비에이비스타퍼블릭,"양프로,티칭프로테스트를 앞둔 예비프로와 함께",Wls******,2022.07.23 11:08,2022,7,23,11:08,10.0,10,10,10,10,라운딩 체험으로 운동했는데 날씨도 햇빛 없는 날과함께 즐거웠습니다\n모든 좋았으나한...
3,스카이밸리,재촉만 하지 않았으면 최고였을텐데...,bab***,2022.07.22 23:11,2022,7,22,23:11,10.0,10,10,10,10,아침부터 비가 온 것을 제외하고는 좋았습니다.\n잘 관리된 페어웨이와 그린 그리고 ...
4,몽베르,몽베르라운딩후기,ljk******,2022.07.22 11:25,2022,7,22,11:25,10.0,10,10,10,10,첫홀은 비가와서 징행할까 말까 했었는데\n그래도 멀리왔는데 치고가자 고 일행이 적극...
5,썬힐,정말 해도해도 너무하네요,cli*****,2022.07.22 09:30,2022,7,22,09:30,3.5,8,2,2,2,좋은 날 기분 더럽게 다녀왔네요.. 코스관리 개엉망이고 그린은 완전 걸레짝. 카운터...


### 사용자 데이터프레임 생성

In [18]:
# Number of distinct ids left after preprocessing.
data['id'].nunique()

6111

In [19]:
# USER DataFrame: every distinct id, sorted, gets a 1-based serial number `id_num`.
sorted_ids = data['id'].drop_duplicates().sort_values()
data_id = pd.DataFrame({
    'id_num': range(1, len(sorted_ids) + 1),
    'id': sorted_ids.to_numpy(),
})
data_id

Unnamed: 0,id_num,id
0,1,002***
1,2,003****
2,3,007***
3,4,007*****
4,5,010*******
...,...,...
6106,6107,zzi****
6107,6108,zzi*********
6108,6109,zzu****
6109,6110,zzz*


### 골프장 데이터프레임 생성

In [20]:
# Golf-course DataFrame: every distinct course name, sorted, gets a 1-based
# serial number `cc_num`.
sorted_ccs = data['cc_name'].drop_duplicates().sort_values()
data_cc = pd.DataFrame({
    'cc_num': range(1, len(sorted_ccs) + 1),
    'cc_name': sorted_ccs.to_numpy(),
})
data_cc

Unnamed: 0,cc_num,cc_name
0,1,360도
1,2,88
2,3,H1 CLUB(구 덕평)
3,4,XGOLF
4,5,XGOLF(테스트)
...,...,...
134,135,한양파인(P9)
135,136,한원
136,137,해솔리아
137,138,화성(P9)


In [21]:
# First ten rows of the numbered course table.
data_cc.head(10)

Unnamed: 0,cc_num,cc_name
0,1,360도
1,2,88
2,3,H1 CLUB(구 덕평)
3,4,XGOLF
4,5,XGOLF(테스트)
5,6,가산노블리스
6,7,가평베네스트
7,8,강남300
8,9,고양컨트리클럽(P9)
9,10,곤지암


In [22]:
# "XGOLF" entries look like test placeholders rather than real golf courses.
is_xgolf = data['cc_name'].str.contains('XGOLF')
data_test = data[is_xgolf]
data_test.shape

(27, 14)

In [23]:
# Extra preprocessing: remove the XGOLF rows from the base DataFrame
# -> row count: 24794 - 27 = 24767
data = data.loc[~data['cc_name'].str.contains('XGOLF')]
data.shape

(24767, 14)

In [24]:
# The base DataFrame changed, so rebuild the USER DataFrame:
# every distinct id, sorted, gets a 1-based serial number `id_num`.
sorted_ids = data['id'].drop_duplicates().sort_values()
data_id = pd.DataFrame({
    'id_num': range(1, len(sorted_ids) + 1),
    'id': sorted_ids.to_numpy(),
})
data_id

Unnamed: 0,id_num,id
0,1,002***
1,2,003****
2,3,007***
3,4,007*****
4,5,010*******
...,...,...
6103,6104,zzi****
6104,6105,zzi*********
6105,6106,zzu****
6106,6107,zzz*


In [25]:
# id 정보 csv 추출
# data_id.to_csv('./data/xgolf_user_completed.csv')

In [26]:
# The base DataFrame changed, so rebuild the golf-course DataFrame:
# every distinct course name, sorted, gets a 1-based serial number `cc_num`.
sorted_ccs = data['cc_name'].drop_duplicates().sort_values()
data_cc = pd.DataFrame({
    'cc_num': range(1, len(sorted_ccs) + 1),
    'cc_name': sorted_ccs.to_numpy(),
})
data_cc

Unnamed: 0,cc_num,cc_name
0,1,360도
1,2,88
2,3,H1 CLUB(구 덕평)
3,4,가산노블리스
4,5,가평베네스트
...,...,...
132,133,한양파인(P9)
133,134,한원
134,135,해솔리아
135,136,화성(P9)


In [27]:
# 골프장 정보 csv 추출
# data_cc.to_csv('./data/xgolf_cc_completed.csv')

In [28]:
# Insert id_num and cc_num into the base DataFrame.
# Drop any id_num / cc_num left over from a previous execution first, so that
# re-running the cell does not produce id_num_x / id_num_y style columns.
data = data.drop(columns=['id_num', 'cc_num'], errors='ignore')
# validate='many_to_one' makes pandas raise if a lookup table ever contains a
# repeated key, which would silently multiply review rows.
data = pd.merge(data, data_id, on='id', how='left', validate='many_to_one')
data = pd.merge(data, data_cc, on='cc_name', how='left', validate='many_to_one')
data

Unnamed: 0,cc_name,title,id,datetime,year,month,day,time,cc_score,caddie,course,price,facility,contents,id_num,cc_num
0,금강,굿 금강,yun****,2022.07.23 11:13,2022,7,23,11:13,10.0,10,10,10,10,페어웨이 그린 베스트 명품구장,6029,15
1,비에이비스타퍼블릭,"양프로,티칭프로테스트를 앞둔 예비프로와 함께",Wls******,2022.07.23 11:08,2022,7,23,11:08,10.0,10,10,10,10,라운딩 체험으로 운동했는데 날씨도 햇빛 없는 날과함께 즐거웠습니다\n모든 좋았으나한...,186,53
2,스카이밸리,재촉만 하지 않았으면 최고였을텐데...,bab***,2022.07.22 23:11,2022,7,22,23:11,10.0,10,10,10,10,아침부터 비가 온 것을 제외하고는 좋았습니다.\n잘 관리된 페어웨이와 그린 그리고 ...,460,68
3,몽베르,몽베르라운딩후기,ljk******,2022.07.22 11:25,2022,7,22,11:25,10.0,10,10,10,10,첫홀은 비가와서 징행할까 말까 했었는데\n그래도 멀리왔는데 치고가자 고 일행이 적극...,3498,44
4,썬힐,정말 해도해도 너무하네요,cli*****,2022.07.22 09:30,2022,7,22,09:30,3.5,8,2,2,2,좋은 날 기분 더럽게 다녀왔네요.. 코스관리 개엉망이고 그린은 완전 걸레짝. 카운터...,887,73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24762,스카이밸리,마운틴코스라기에. . .,ana*****,2018.01.01 21:50,2018,1,1,21:50,9.5,10,8,10,10,눈이라도 많을까봐 걱정했는데.\n플레이하기에 전혀 지장이 없는\n상태로 코스는 잘 ...,333,68
24763,아일랜드,아일랜드CC,dda****,2018.01.01 14:58,2018,1,1,14:58,10.0,10,10,10,10,바람이 좀 춥긴했지만 관리도 잘되있고 좋았습니다,1097,78
24764,기흥,눈 속에서,l3l***,2018.01.01 12:13,2018,1,1,12:13,9.5,10,10,8,10,눈이 오는 바람에 정상적인 라운드를 할 수 없었지만 너무 좋은 멤버들과 캐디 덕분에...,3355,16
24765,88,2017년 마지막 라운딩,pol*******,2018.01.01 04:17,2018,1,1,04:17,9.0,8,8,10,10,다행히 날씨가 춥지 않아 즐거운 라운딩이었음.\n11:32 티업. 전반 지나고 그늘...,4386,2


In [29]:
# Verify the merge: count missing id_num and cc_num values (both should be 0).
print(*data[['id_num', 'cc_num']].isnull().sum())

0 0


In [30]:
# Check for duplicated rows once more, just in case.
data.loc[data.duplicated()]

Unnamed: 0,cc_name,title,id,datetime,year,month,day,time,cc_score,caddie,course,price,facility,contents,id_num,cc_num


In [31]:
# 전처리 끝난 최종 데이터 csv 추출
# data.to_csv('./data/xgolf_contents_completed.csv')

## CF - KNN

In [32]:
# Keep only the columns needed for collaborative filtering, sorted by
# user, course and score.
rating_cols = ['id_num', 'cc_num', 'cc_score']
ratings_cc_df = data.loc[:, rating_cols].sort_values(by=rating_cols).reset_index(drop=True)
ratings_cc_df

Unnamed: 0,id_num,cc_num,cc_score
0,1,51,10.0
1,2,13,7.5
2,2,15,8.0
3,2,75,10.0
4,3,10,9.5
...,...,...,...
24762,6106,82,8.0
24763,6107,68,9.0
24764,6107,126,9.5
24765,6108,129,10.0


### Sparse Matrix - Item based

In [33]:
# Items are golf courses: rows = cc_num, columns = id_num,
# cell = mean cc_score that user gave that course.
sparse_matrix = ratings_cc_df.pivot_table(
    values='cc_score',
    index='cc_num',
    columns='id_num',
    aggfunc='mean',
)

print(sparse_matrix.shape)
sparse_matrix

(137, 6108)


id_num,1,2,3,4,5,6,7,8,9,10,...,6099,6100,6101,6102,6103,6104,6105,6106,6107,6108
cc_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,,,,,,,,,,,...,,,,,,,,,,
134,,,,,,,,,,,...,,,,,,,,,,
135,,,,,,,,,,,...,,,,,,,,,,
136,,,,,,,,,,,...,,,,,,,,,,


In [34]:
# Treat every (course, user) pair without a review as a 0 score.
item_sparse_matrix = sparse_matrix.fillna(value=0)
item_sparse_matrix.shape

(137, 6108)

### 코사인 유사도 활용

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine-similarity helper
def cossim_matrix(a, b):
    """Return the pairwise cosine similarity between the rows of two frames.

    Parameters
    ----------
    a, b : pandas.DataFrame
        Numeric frames with the same number of columns; each row is one vector.

    Returns
    -------
    pandas.DataFrame
        Frame of shape ``(len(a), len(b))`` whose rows are labelled with
        ``a.index`` and whose columns are labelled with the values of
        ``b.index``.
    """
    cossim_values = cosine_similarity(a.values, b.values)
    # Columns correspond to the rows of `b`, so they must carry b's labels;
    # labelling them with a's index is wrong (and fails outright when the two
    # frames have a different number of rows).
    cossim_df = pd.DataFrame(data=cossim_values, columns=b.index.values, index=a.index)

    return cossim_df

In [36]:
# Course-to-course cosine similarity matrix.
item_cossim_df = cossim_matrix(a=item_sparse_matrix, b=item_sparse_matrix)
item_cossim_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,128,129,130,131,132,133,134,135,136,137
cc_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.127315,0.083301,0.0,0.0,0.080223,0.0,0.076589,0.089323,0.051565,...,0.119028,0.108239,0.022208,0.036934,0.020085,0.015109,0.063174,0.070750,0.000000,0.021922
2,0.127315,1.000000,0.121103,0.0,0.0,0.165174,0.0,0.046346,0.152532,0.088981,...,0.061556,0.137236,0.040934,0.153491,0.030233,0.049017,0.053266,0.031397,0.026264,0.045146
3,0.083301,0.121103,1.000000,0.0,0.0,0.106111,0.0,0.097707,0.120663,0.105919,...,0.082755,0.089859,0.054910,0.120163,0.043559,0.065348,0.053949,0.025347,0.037731,0.015382
4,0.000000,0.000000,0.000000,1.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.000000,0.000000,0.000000,0.0,1.0,0.000000,0.0,0.000000,0.000000,0.000000,...,0.074358,0.041324,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,0.015109,0.049017,0.065348,0.0,0.0,0.072396,0.0,0.000000,0.056063,0.033312,...,0.056003,0.047626,0.039003,0.014750,0.026684,1.000000,0.063751,0.000000,0.016251,0.042349
134,0.063174,0.053266,0.053949,0.0,0.0,0.064732,0.0,0.000000,0.022292,0.040795,...,0.019949,0.028375,0.102120,0.058563,0.000000,0.063751,1.000000,0.043373,0.052212,0.000000
135,0.070750,0.031397,0.025347,0.0,0.0,0.042029,0.0,0.000000,0.087234,0.058366,...,0.025494,0.043039,0.044781,0.051965,0.000000,0.000000,0.043373,1.000000,0.021170,0.000000
136,0.000000,0.026264,0.037731,0.0,0.0,0.000000,0.0,0.000000,0.046209,0.030896,...,0.035050,0.053446,0.014482,0.022925,0.049841,0.016251,0.052212,0.021170,1.000000,0.000000


In [37]:
# Courses: 137, users: 6108 (as of the recorded run)
# Group the ratings per user so that only id_num values present in
# ratings_cc_df take part in the computation.
id_num_grouped = ratings_cc_df.groupby('id_num')
# Empty prediction table -- index: id_num, columns: every cc_num.
# dtype=float keeps the table numeric; without it the all-NaN frame is
# object-dtype and stays that way after the rows are filled in.
item_prediction_result_df = pd.DataFrame(
    index=list(id_num_grouped.indices.keys()),
    columns=item_sparse_matrix.index,
    dtype=float,
)
item_prediction_result_df

cc_num,1,2,3,4,5,6,7,8,9,10,...,128,129,130,131,132,133,134,135,136,137
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,,,,,,,,,,,...,,,,,,,,,,
6105,,,,,,,,,,,...,,,,,,,,,,
6106,,,,,,,,,,,...,,,,,,,,,,
6107,,,,,,,,,,,...,,,,,,,,,,


In [39]:
# For each golfer, predict a score for every course as a similarity-weighted
# combination of the scores that golfer actually gave.
for id_num, group in tqdm(id_num_grouped):
    # (courses rated by this golfer) x (all courses) slice of the similarity matrix
    user_sim = item_cossim_df.loc[group['cc_num']]
    # scores this golfer gave, one per rated course
    user_rating = group['cc_score']
    # per-course sum of similarities to the rated courses (one value per course)
    sim_sum = user_sim.sum(axis=0)

    # Predicted scores of this golfer for all courses (one value per course).
    # NOTE(review): the `+1` in the denominator avoids dividing by zero but also
    # shrinks every prediction -- for a golfer whose only review is a 10, the
    # prediction for that same course is 10 / (1 + 1) = 5. Confirm this is intended.
    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / (sim_sum+1)
    item_prediction_result_df.loc[id_num] = pred_ratings

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for id_num, group in tqdm(id_num_grouped):


  0%|          | 0/6108 [00:00<?, ?it/s]

In [41]:
# Show the filled prediction table (rows: id_num, columns: cc_num).
item_prediction_result_df

cc_num,1,2,3,4,5,6,7,8,9,10,...,128,129,130,131,132,133,134,135,136,137
1,1.003954,1.432387,0.825223,0.0,0.454371,1.144316,0.0,0.0,0.959027,0.801813,...,0.839423,1.242538,0.418595,0.969129,0.552634,0.710348,0.483014,0.574779,0.496594,0.591782
2,1.873961,2.281312,1.819262,0.0,0.279146,2.094165,0.0,0.789521,2.272896,1.804871,...,1.514157,2.297985,0.869882,1.850747,0.453309,0.826471,0.906341,1.336531,0.473409,0.553037
3,0.948996,1.585248,1.233794,0.0,0.0,1.207191,0.0,0.0,1.73463,4.985101,...,0.753127,1.659819,0.604694,1.277155,0.55593,0.781057,0.852551,0.832288,0.491718,0.422473
4,1.228821,0.929876,0.944287,0.0,0.0,0.782795,0.0,0.0,0.801118,0.652985,...,0.75418,1.061037,0.43723,0.63311,0.04194,0.475331,0.669298,0.622957,0.251435,0.0
5,0.356246,0.500687,0.456787,0.0,0.160302,0.485161,0.0,0.0,0.439861,0.2903,...,0.61268,0.828813,0.210538,0.346566,0.229488,0.234204,0.177551,0.131509,0.088748,0.051556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,0.819986,1.323453,1.076711,0.0,0.0,1.204175,0.0,0.533564,5.0,0.979201,...,0.651911,1.20334,0.632934,0.748877,0.406955,0.53087,0.218055,0.802351,0.441684,0.108515
6105,1.08839,0.974983,0.827846,0.0,0.0,0.860554,0.0,0.0,1.070231,0.527388,...,0.963319,1.225921,0.577897,0.625432,0.08761,0.182357,0.356197,0.629858,0.506608,0.300753
6106,1.499331,2.418954,2.042315,0.0,0.91804,2.227619,0.0,0.0,2.098686,1.332751,...,1.637611,2.665188,0.931941,2.285172,0.38999,1.016908,1.337863,1.020972,1.171644,0.579665
6107,1.618556,2.172576,1.386937,0.0,0.987412,1.974042,0.0,0.783201,1.724157,1.270033,...,1.705956,2.188301,0.68035,1.494862,0.060064,0.911195,0.420515,0.816601,0.400026,0.70777
