In [21]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw review export from the local data directory.
path = './data/'
data = pd.read_csv(os.path.join(path, 'xgolf_contents_final.csv'), encoding='utf-8')

data # inspect the loaded data

In [None]:
data.info() # overview of the frame

### 데이터 전처리

In [None]:
# Show fully duplicated rows
data[(data.duplicated())]

In [None]:
# Drop the unneeded 'Unnamed: 0' column and renumber the rows.
# errors='ignore' keeps this cell idempotent: re-running it after the column has
# already been removed no longer raises KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore').reset_index(drop=True)
data.head()

In [None]:
# Re-check for duplicated rows after dropping the column
data[(data.duplicated())]

In [None]:
# Remove the duplicated rows (18073 of them)
data = data.drop_duplicates()
data

In [None]:
# Check whether the same id posted reviews with identical title and contents
data_dup = data[(data.duplicated(['id', 'title', 'contents']))]
data_dup

In [None]:
# Those reviews (103) are judged to have low reliability => drop the repeats
data = data.drop_duplicates(['id', 'title', 'contents'])
data.shape

In [None]:
# Show fully duplicated rows
data[(data.duplicated())]

In [None]:
data['id'].nunique() # number of unique ids

In [None]:
data['id'].value_counts() # review count per id (social-login accounts - NV: Naver, KK: Kakao)

In [None]:
# Reviews written with Naver accounts (ids containing "NV")
data[data['id'].str.contains("NV", case=True)]

In [None]:
# Reviews written with Kakao accounts (ids containing "KK")
data[data['id'].str.contains("KK", case=True)]

In [None]:
# Drop reviews from social-login accounts (4010 + 848) to avoid excessive duplication
# => row count: 29652 - (4010 + 848) = 24794
is_naver = data['id'].str.contains('NV', case=True)
is_kakao = data['id'].str.contains('KK', case=True)
data = data[~(is_naver | is_kakao)]
data.shape

In [None]:
# Re-check for fully duplicated rows after the filtering above
data[(data.duplicated())]

In [None]:
# Build the base DF with shorter, consistent column names.
# rename() returns a new frame; assigning the result (instead of inplace=True) avoids
# mutating a filtered slice of the earlier frame and keeps the cell safe to re-run.
data = data.rename(columns={'golf_name':'cc_name', 'golfscore':'cc_score', 'caddiescore':'caddie', 'coursescore':'course', 'pricescore': 'price'})
print(data.shape)
data.head()

### 사용자 데이터프레임 생성

In [None]:
# Number of unique ids after preprocessing
data['id'].nunique()

In [None]:
# USER DF: one row per distinct id (sorted), numbered 1..N in the id_num column
data_id = (
    data['id']
    .sort_values()
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame()
)
data_id.insert(0, 'id_num', range(1, len(data_id) + 1))
data_id

### 골프장 데이터프레임 생성

In [None]:
# Golf-course DF: one row per distinct cc_name (sorted), numbered 1..N in the cc_num column
data_cc = (
    data['cc_name']
    .sort_values()
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame()
)
data_cc.insert(0, 'cc_num', range(1, len(data_cc) + 1))
data_cc

In [None]:
# First ten rows of the golf-course table
data_cc.head(10)

In [None]:
# 'XGOLF' entries appear to be test records rather than real golf courses
data_test = data[(data['cc_name'].str.contains('XGOLF'))]
data_test.shape

In [None]:
# Extra preprocessing: drop the XGOLF rows from the base DF -> row count: 24794 - 27 = 24767
data = data[~(data['cc_name'].str.contains('XGOLF'))]
data.shape

In [None]:
# Rebuild the USER DF because the base DF changed:
# one row per distinct id (sorted), numbered 1..N in the id_num column
data_id = (
    data['id']
    .sort_values()
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame()
)
data_id.insert(0, 'id_num', range(1, len(data_id) + 1))
data_id

In [None]:
# Export the id table to csv (disabled).
# NOTE: a later cell reads './data/xgolf_user_completed.csv'; enable this line when that file must be (re)generated.
# data_id.to_csv('./data/xgolf_user_completed.csv')

In [None]:
# Rebuild the golf-course DF because the base DF changed:
# one row per distinct cc_name (sorted), numbered 1..N in the cc_num column
data_cc = (
    data['cc_name']
    .sort_values()
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame()
)
data_cc.insert(0, 'cc_num', range(1, len(data_cc) + 1))
data_cc

In [None]:
# Export the golf-course table to csv (disabled).
# NOTE: a later cell reads './data/xgolf_cc_completed.csv'; enable this line when that file must be (re)generated.
# data_cc.to_csv('./data/xgolf_cc_completed.csv')

In [None]:
# Attach id_num and cc_num to the base DF via left joins on 'id' and 'cc_name'
data = (
    data
    .merge(data_id, on='id', how='left')
    .merge(data_cc, on='cc_name', how='left')
)
data

In [None]:
# Verify the merge: count rows that did not receive an id_num / cc_num
print(data['id_num'].isnull().sum(), data['cc_num'].isnull().sum())

In [None]:
# Check for duplicates once more, just in case
data[(data.duplicated())]

In [None]:
# Export the fully preprocessed data to csv (disabled).
# NOTE: a later cell reads './data/xgolf_contents_completed.csv'; enable this line when that file must be (re)generated.
# data.to_csv('./data/xgolf_contents_completed.csv')

In [10]:
# Reload the preprocessed tables from disk.
# These file names match the to_csv calls that are commented out in the cells above,
# so the files must already exist for this cell to run.
# 'Unnamed: 0' is dropped from each file right after reading.
path = './data/'
data = pd.read_csv(os.path.join(path, 'xgolf_contents_completed.csv'), encoding='utf-8').drop(columns='Unnamed: 0')
data_id = pd.read_csv(os.path.join(path, 'xgolf_user_completed.csv'), encoding='utf-8').drop(columns='Unnamed: 0')
data_cc = pd.read_csv(os.path.join(path, 'xgolf_cc_completed.csv'), encoding='utf-8').drop(columns='Unnamed: 0')

## CF - KNN

In [14]:
# Keep only the columns needed for collaborative filtering
ratings_cc_df = data[['id_num', 'cc_num', 'cc_score']]
ratings_cc_df

Unnamed: 0,id_num,cc_num,cc_score
0,6029,15,10.0
1,186,53,10.0
2,460,68,10.0
3,3498,44,10.0
4,887,73,3.5
...,...,...,...
24762,333,68,9.5
24763,1097,78,10.0
24764,3355,16,9.5
24765,4386,2,9.0


In [15]:
# Null check
ratings_cc_df.isnull().sum()

id_num      0
cc_num      0
cc_score    0
dtype: int64

In [16]:
# Reshape into a user (id_num) x course (cc_num) matrix of cc_score with pivot_table.
# Cells with no review are NaN; pivot_table aggregates repeated (id_num, cc_num)
# pairs with its default aggfunc.
r_matrix = ratings_cc_df.pivot_table('cc_score', index='id_num', columns='cc_num')

print(r_matrix.shape)
r_matrix

(6108, 137)


cc_num,1,2,3,4,5,6,7,8,9,10,...,128,129,130,131,132,133,134,135,136,137
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,9.5,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,,,,,,,,,10.0,,...,,,,,,,,,,
6105,,,,,,,,,,,...,,,,,,,,,,
6106,,,,,,,,,,,...,,,,,,,,,,
6107,,,,,,,,,,,...,,,,,,,,,,


In [17]:
# Join with the golf-course table (data_cc) on cc_num to attach the cc_name column
rating_cc_info = pd.merge(ratings_cc_df, data_cc, on='cc_num')
rating_cc_info

Unnamed: 0,id_num,cc_num,cc_score,cc_name
0,6029,15,10.0,금강
1,5049,15,8.0,금강
2,2005,15,10.0,금강
3,5019,15,8.5,금강
4,5036,15,10.0,금강
...,...,...,...,...
24762,230,59,10.0,서원밸리
24763,2974,115,10.0,파인비치
24764,230,8,10.0,곤지암
24765,230,18,10.0,남부


In [18]:
# Pivot with columns='cc_name' so the matrix columns are golf-course names
ratings_matrix = rating_cc_info.pivot_table('cc_score', index='id_num', columns='cc_name')
ratings_matrix

cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,9.5,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,,,,,,,,,10.0,,...,,,,,,,,,,
6105,,,,,,,,,,,...,,,,,,,,,,
6106,,,,,,,,,,,...,,,,,,,,,,
6107,,,,,,,,,,,...,,,,,,,,,,


In [19]:
# Replace every NaN with 0.
# Later cells treat 0 as "not rated": get_mse keeps only nonzero entries and
# get_unseen_cc treats values > 0 as already visited.
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix

cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Transpose into an item x user matrix.
ratings_matrix_T = ratings_matrix.transpose()    # transposed matrix

print(ratings_matrix_T.shape)
ratings_matrix_T.head(5)

(137, 6108)


id_num,1,2,3,4,5,6,7,8,9,10,...,6099,6100,6101,6102,6103,6104,6105,6106,6107,6108
cc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
360도,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H1 CLUB(구 덕평),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가산노블리스,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가평베네스트,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 코사인 유사도

In [22]:
# Cosine similarity between golf courses (rows of the item x user matrix)
item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)

# Wrap the NumPy matrix returned by cosine_similarity() in a DataFrame labelled
# by golf-course name on both axes
item_sim_df = pd.DataFrame(data=item_sim, index=ratings_matrix.columns,
                          columns=ratings_matrix.columns)

print(item_sim_df.shape)
item_sim_df.head(3)

(137, 137)


cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
cc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
360도,1.0,0.127315,0.083301,0.0,0.0,0.080223,0.0,0.076589,0.089323,0.051565,...,0.119028,0.108239,0.022208,0.036934,0.020085,0.015109,0.063174,0.07075,0.0,0.021922
88,0.127315,1.0,0.121103,0.0,0.0,0.165174,0.0,0.046346,0.152532,0.088981,...,0.061556,0.137236,0.040934,0.153491,0.030233,0.049017,0.053266,0.031397,0.026264,0.045146
H1 CLUB(구 덕평),0.083301,0.121103,1.0,0.0,0.0,0.106111,0.0,0.097707,0.120663,0.105919,...,0.082755,0.089859,0.05491,0.120163,0.043559,0.065348,0.053949,0.025347,0.037731,0.015382


In [24]:
# Five golf courses most similar to '가평베네스트'.
# The course itself is removed by label instead of relying on it sorting first
# ([1:6]), which would skip the wrong entry if another course tied with its
# self-similarity or if the self-similarity were not the maximum.
item_sim_df['가평베네스트'].drop('가평베네스트').sort_values(ascending=False).head(5)

cc_name
파인크리크      0.196455
일동레이크      0.169238
사우스스프링스    0.084848
프리스틴밸리     0.083111
자유         0.082756
Name: 가평베네스트, dtype: float64

### 개인화 된 골프장 추천

In [25]:
# Predicted rating = similarity-weighted average of a user's ratings over all items.
def predict_rating(ratings_arr, item_sim_arr):
    """Predict ratings as a similarity-weighted average over all items.

    Parameters
    ----------
    ratings_arr : array-like, shape (n_users, n_items)
        User x item rating matrix.
    item_sim_arr : array-like, shape (n_items, n_items)
        Item x item similarity matrix.

    Returns
    -------
    numpy.ndarray
        ``ratings_arr.dot(item_sim_arr)`` with column ``j`` divided by
        ``sum_i |item_sim_arr[i, j]|``. Columns whose similarity weights are
        all zero are predicted as 0 instead of NaN.
    """
    ratings_arr = np.asarray(ratings_arr, dtype=float)
    item_sim_arr = np.asarray(item_sim_arr, dtype=float)

    weighted_sum = ratings_arr.dot(item_sim_arr)
    # Entry (u, j) of the product sums over item_sim_arr[:, j], so the matching
    # normaliser is the column sum (axis=0). For a symmetric similarity matrix
    # this equals the row sum used previously.
    norm = np.abs(item_sim_arr).sum(axis=0)[np.newaxis, :]

    # Keep the (1, n_items) broadcast of the original so the output shape is unchanged.
    ratings_pred = np.zeros(np.broadcast(weighted_sum, norm).shape)
    np.divide(weighted_sum, norm, out=ratings_pred, where=norm != 0)
    return ratings_pred

# Predict ratings for every (user, course) pair using all courses as neighbours
ratings_pred = predict_rating(ratings_matrix.values , item_sim_df.values)

# Convert to a DataFrame labelled like ratings_matrix
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index= ratings_matrix.index,
                                   columns = ratings_matrix.columns)

In [26]:
# Shape and contents of the predicted rating matrix
print(ratings_pred_matrix.shape)
ratings_pred_matrix

(6108, 137)


cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.126661,0.154646,0.097587,0.0,0.195213,0.135697,0.000000,0.000000,0.101452,0.117260,...,0.103588,0.123115,0.084206,0.136896,0.178283,0.134735,0.088635,0.113092,0.097834,0.180032
2,0.273082,0.290221,0.250303,0.0,0.118620,0.292067,0.000000,0.178344,0.295361,0.306964,...,0.206585,0.273211,0.187045,0.302079,0.146161,0.160645,0.177738,0.294799,0.094308,0.168462
3,0.120058,0.177105,0.154216,0.0,0.000000,0.145958,0.000000,0.000000,0.204272,1.419705,...,0.092665,0.175591,0.124618,0.189150,0.180221,0.150423,0.164096,0.169523,0.097216,0.126546
4,0.159005,0.094831,0.113135,0.0,0.000000,0.089186,0.000000,0.000000,0.083292,0.093975,...,0.092211,0.102997,0.088126,0.086223,0.012836,0.087934,0.125270,0.123201,0.048289,0.000000
5,0.043909,0.052111,0.055159,0.0,0.068170,0.057105,0.000000,0.000000,0.046626,0.041743,...,0.080176,0.088154,0.042571,0.047899,0.073701,0.043533,0.032281,0.025122,0.016950,0.014927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,0.101378,0.141091,0.130916,0.0,0.000000,0.143767,0.000000,0.113917,0.956408,0.146018,...,0.078835,0.118700,0.130237,0.103266,0.129292,0.098784,0.038930,0.161775,0.086516,0.031400
6105,0.138614,0.099928,0.097925,0.0,0.000000,0.098879,0.000000,0.000000,0.114625,0.074893,...,0.120507,0.121239,0.118218,0.085108,0.026938,0.032728,0.064504,0.124658,0.099912,0.088750
6106,0.138018,0.223201,0.189773,0.0,0.212653,0.234922,0.000000,0.000000,0.180819,0.134892,...,0.142438,0.218108,0.140986,0.277556,0.088055,0.134180,0.204248,0.143999,0.155821,0.138515
6107,0.223130,0.262984,0.177239,0.0,0.452754,0.263741,0.000000,0.172515,0.203204,0.198578,...,0.236791,0.248845,0.141680,0.227762,0.018429,0.178197,0.077038,0.166235,0.078323,0.219365


In [27]:
# Evaluate prediction quality (MSE) only on the courses each user actually rated.
def get_mse(pred, actual):
    """Return the MSE between ``pred`` and ``actual`` over the rated entries only.

    Entries where ``actual`` is zero are skipped; only positions returned by
    ``actual.nonzero()`` are compared.
    """
    rated = actual.nonzero()
    predicted_scores = pred[rated].flatten()
    observed_scores = actual[rated].flatten()
    return mean_squared_error(predicted_scores, observed_scores)

# MSE of the all-neighbours item-based prediction, measured on the rated entries
print('아이템 기반 모든 인접 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values))

아이템 기반 모든 인접 이웃 MSE:  51.40056886949116


### top-n 유사도를 가진 데이터들에 대해서만 예측 평점 계산
- MSE 값을 줄이기 위해, 각 골프장과 유사도가 가장 높은 상위 n개 골프장에 대해서만 유사도 벡터를 적용

In [28]:
def predict_rating_topsim(ratings_arr, item_sim_arr, n=20):
    """Predict ratings from each item's ``n`` most similar items only.

    Parameters
    ----------
    ratings_arr : array-like, shape (n_users, n_items)
        User x item rating matrix.
    item_sim_arr : array-like, shape (n_items, n_items)
        Item x item similarity matrix.
    n : int, default 20
        Number of most-similar items used per target item.

    Returns
    -------
    numpy.ndarray, shape (n_users, n_items)
        For every item, the similarity-weighted average of each user's ratings
        over that item's top-``n`` neighbours. Items whose selected weights sum
        to zero keep a prediction of 0.
    """
    ratings_arr = np.asarray(ratings_arr, dtype=float)
    item_sim_arr = np.asarray(item_sim_arr, dtype=float)

    # Prediction matrix with the same shape as the rating matrix, initialised to 0.
    pred = np.zeros(ratings_arr.shape)

    for col in range(ratings_arr.shape[1]):
        # Similarities of every item to the target item. The same column is used
        # for selecting neighbours and for weighting them (identical to the
        # previous row/column mix when the matrix is symmetric).
        sim_to_col = item_sim_arr[:, col]

        # Indices of the n largest similarities, in descending order. A plain
        # index array is used; wrapping it in a list is a deprecated indexing
        # form in NumPy.
        top_n_items = np.argsort(sim_to_col)[:-n-1:-1]
        weights = sim_to_col[top_n_items]

        denom = np.abs(weights).sum()
        if denom == 0:
            # No usable neighbour: leave the column at 0 rather than produce NaN.
            continue

        # All users at once: (n_users, k) · (k,) -> (n_users,)
        pred[:, col] = ratings_arr[:, top_n_items].dot(weights) / denom
    return pred

In [29]:
# Original author's note: this takes about 2 minutes to run
ratings_pred = predict_rating_topsim(ratings_matrix.values , item_sim_df.values, n=20)
# get_mse returns the mean squared error (no square root is taken), so the value
# is labelled MSE rather than RMSE.
print('아이템 기반 인접 TOP-20 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values))

# Rebuild the predicted ratings as a DataFrame labelled like ratings_matrix
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index= ratings_matrix.index,
                                   columns = ratings_matrix.columns)
ratings_pred_matrix

  pred[row, col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row, :][top_n_items].T)
  pred[row, col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))


아이템 기반 인접 TOP-20 이웃 RMSE:  29.962951426751125


cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.343310,0.375261,0.000000,0.0,0.198051,0.343177,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.346878,0.00000,0.300651,0.271328,0.268908,0.000000,0.000000,0.000000,0.292445
2,0.297899,0.328964,0.309845,0.0,0.120345,0.314239,0.000000,0.187866,0.617075,0.601112,...,0.000000,0.315593,0.00000,0.257814,0.000000,0.000000,0.000000,0.237027,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,3.426837,...,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.205563
4,0.430978,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.237007,0.265606,0.000000,0.000000
5,0.000000,0.000000,0.150390,0.0,0.069161,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.208033,0.248375,0.00000,0.000000,0.112166,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,0.000000,0.342369,0.356938,0.0,0.000000,0.363586,0.000000,0.119999,2.519162,0.352454,...,0.000000,0.000000,0.27717,0.000000,0.000000,0.000000,0.000000,0.348766,0.000000,0.000000
6105,0.375710,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.312678,0.341590,0.00000,0.000000,0.000000,0.000000,0.000000,0.268747,0.000000,0.000000
6106,0.000000,0.304950,0.293837,0.0,0.215744,0.368315,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.362590,0.00000,0.609566,0.000000,0.000000,0.242043,0.000000,0.197772,0.000000
6107,0.405289,0.358913,0.297570,0.0,0.328513,0.666999,0.000000,0.181726,0.372063,0.386247,...,0.369212,0.701121,0.00000,0.308389,0.000000,0.206755,0.000000,0.000000,0.000000,0.000000


In [45]:
# Before recommending, look at the courses user 2 rated highest (up to five)
user_rating_id = ratings_matrix.loc[2, :]
user_rating_id[ user_rating_id > 0].sort_values(ascending=False)[:5]

cc_name
아리지    10.0
금강      8.0
그린힐     7.5
Name: 2, dtype: float64

#### 사용자가 방문하지 않은 골프장 중에서 추천해보자
- user_rating이 0보다 크면 기존에 방문한 골프장이라는 점을 이용해서 계산

In [46]:
def get_unseen_cc(ratings_matrix, id_num):
    """List the golf courses that user ``id_num`` has not rated yet.

    ``ratings_matrix`` is a user x course DataFrame. A score greater than 0
    marks a course as already visited; every other column label is returned,
    in the column order of ``ratings_matrix``.
    """
    # Row of scores for this user, indexed by the column labels.
    scores = ratings_matrix.loc[id_num, :]

    # Labels of the courses the user already rated.
    visited = set(scores[scores > 0].index.tolist())

    return [name for name in ratings_matrix.columns.tolist() if name not in visited]

In [47]:
# pred_df     : predicted ratings per golf course, computed earlier
# unseen_list : golf courses the user has not visited
# top_n       : number of top entries to return

def recomm_cc_by_id(pred_df, id_num, unseen_list, top_n=10):
    """Return the ``top_n`` highest predicted scores among ``unseen_list`` for one user.

    Selects row ``id_num`` and the ``unseen_list`` columns of ``pred_df``, sorts
    the scores in descending order and keeps the first ``top_n`` as a Series.
    """
    candidate_scores = pred_df.loc[id_num, unseen_list]
    ranked = candidate_scores.sort_values(ascending=False)
    return ranked.iloc[:top_n]

In [49]:
# Golf courses user 2 has not visited yet
unseen_list = get_unseen_cc(ratings_matrix, 2)

# Top-10 recommendations from item-based nearest-neighbour collaborative filtering
recomm_movies = recomm_cc_by_id(ratings_pred_matrix, 2, unseen_list, top_n=10)

# Present the predicted scores as a single-column DataFrame
recomm_movies = recomm_movies.to_frame(name='pred_score')
recomm_movies

Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
안성,0.795842
여주,0.759187
에덴블루,0.726302
캐슬파인,0.707528
루나힐스안성,0.692187
타이거,0.650705
큐로,0.647471
리베라,0.64006
골드,0.617075
골프존카운티안성H,0.601112


## CF - MF

In [None]:
# Rebuild the user x course rating matrix (columns = cc_name) for matrix factorization.
# Unrated cells stay NaN here (no fillna).
# The earlier pivot on cc_num was removed because its result was overwritten two
# lines later, and the bare `ratings_matrix` expression in the middle of the cell
# displayed nothing.
rating_cc_info = pd.merge(ratings_cc_df, data_cc, on='cc_num')
ratings_matrix = rating_cc_info.pivot_table('cc_score', index='id_num', columns='cc_name')

print(ratings_matrix.shape)
ratings_matrix.head()

In [None]:
# Rating matrix as a NumPy array
R = ratings_matrix.to_numpy()
R

In [None]:
# Unpack the shape into row and column counts in a single assignment
num_users, num_items = R.shape
print(num_users, num_items)

In [None]:
# Number of latent factors
K=50
# Set the sizes of P and Q and fill them with normally distributed random values
# Seed the random number generator
np.random.seed(3)
# P: user - latent factor matrix
P = np.random.normal(scale=1./K, size=(num_users, K))
# Q: item - latent factor matrix (it enters the factorization as its transpose, Q.T)
Q = np.random.normal(scale=1./K, size=(num_items, K))
print('P:', P)
print('Q:', Q)

In [None]:
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    """RMSE between ``R`` and the reconstruction ``P·Qᵀ`` at the given positions.

    ``non_zeros`` is a sequence whose items carry a row index at position 0 and
    a column index at position 1; only those cells are compared.
    """
    # Full predicted matrix R' from the two factor matrices.
    predicted = np.dot(P, Q.T)

    row_idx = [entry[0] for entry in non_zeros]
    col_idx = [entry[1] for entry in non_zeros]

    # Actual and predicted values at the same positions, as 1-D arrays.
    observed_vals = R[row_idx, col_idx]
    predicted_vals = predicted[row_idx, col_idx]

    return np.sqrt(mean_squared_error(observed_vals, predicted_vals))

In [None]:
# Collect (row, col, value) triples for every entry of R greater than 0;
# this is the non_zeros argument expected by get_rmse
non_zeros = [(i, j, R[i, j]) for i in range(num_users) for j in range(num_items) if R[i, j] > 0]

steps = 200
learning_rate = 0.01
r_lambda = 0.01

# Keep updating the P and Q matrices with stochastic gradient descent
for step in range(steps):
    for i, j, v in non_zeros:
        # Error between the actual value in R and the prediction at the same position
        eij = v - np.dot(P[i, :], Q[j, :].T)
        # SGD update rule with regularization (Q is updated with the already-updated P row)
        P[i, :] = P[i, :] + learning_rate*(eij*Q[j, :] - r_lambda*P[i,:])
        Q[j, :] = Q[j, :] + learning_rate*(eij*P[i, :] - r_lambda*Q[j, :])
    # After each step, evaluate the current factors.
    # get_rmse compares R and the prediction P·Q.T only at the positions listed in non_zeros,
    # although every entry of the prediction changes as P and Q are updated.
    rmse = get_rmse(R, P, Q, non_zeros)
    # Print every 50 steps (% is the remainder operator)
    if (step % 50) == 0:
        print('### iteration step: ', step, "RMSE: ", rmse)

In [None]:
def get_rmse(R, P, Q, non_zeros):
    """RMSE between ``R`` and the reconstruction ``P·Qᵀ`` at the given positions.

    Note: this notebook also defines ``get_rmse`` in an earlier cell; once this
    cell runs, this definition is the one in effect.

    Parameters
    ----------
    R : numpy.ndarray
        Actual rating matrix.
    P, Q : numpy.ndarray
        Factor matrices; the prediction is ``np.dot(P, Q.T)``.
    non_zeros : sequence
        Items carrying a row index at position 0 and a column index at
        position 1; only those cells are compared.

    Returns
    -------
    float
        Square root of the mean squared error over the listed cells.
    """
    # Predicted R matrix from the two factor matrices.
    full_pred_matrix = np.dot(P, Q.T)

    # Row / column positions of the observed ratings.
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]

    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]

    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    return np.sqrt(mse)

In [None]:
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda = 0.01, seed=1):
    """Factorize ``R`` into ``P`` and ``Q`` with regularized stochastic gradient descent.

    Parameters
    ----------
    R : numpy.ndarray, shape (num_users, num_items)
        Rating matrix. Only entries greater than 0 are used for training; NaN
        entries are skipped because ``NaN > 0`` is False.
    K : int
        Number of latent factors.
    steps : int, default 200
        Number of passes over the observed ratings.
    learning_rate : float, default 0.01
        SGD step size.
    r_lambda : float, default 0.01
        L2 regularization strength.
    seed : int, default 1
        Value passed to ``np.random.seed`` before initialising ``P`` and ``Q``.
        This reseeds NumPy's global generator, as the hard-coded seed did.

    Returns
    -------
    (P, Q) : tuple of numpy.ndarray
        ``P`` has shape (num_users, K) and ``Q`` has shape (num_items, K);
        ``np.dot(P, Q.T)`` is the predicted rating matrix.
    """
    num_users, num_items = R.shape
    # Initialise P and Q with normally distributed random values.
    np.random.seed(seed)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # (row, col, value) triples for every R > 0, in row-major order, the same
    # order the previous nested Python loop produced.
    with np.errstate(invalid='ignore'):
        rows, cols = np.nonzero(R > 0)
    non_zeros = list(zip(rows, cols, R[rows, cols]))

    # Keep updating P and Q (stochastic gradient descent).
    for step in tqdm(range(steps)):
        for i, j, r in non_zeros:
            # Error between the actual rating and the current prediction.
            eij = r - np.dot(P[i, :], Q[j, :].T)

            # Regularized SGD update; Q is updated with the already-updated P row.
            P[i,:] = P[i,:] + learning_rate*(eij * Q[j, :] - r_lambda*P[i,:])
            Q[j,:] = Q[j,:] + learning_rate*(eij * P[i, :] - r_lambda*Q[j,:])

        # RMSE is only needed on the steps that report it.
        if (step % 10) == 0 :
            rmse = get_rmse(R, P, Q, non_zeros)
            print("### iteration step : ", step," rmse : ", rmse)

    return P, Q

In [None]:
%%time
# Matrix factorization with gradient descent (original author's note: takes about 4-5 minutes)
P, Q = matrix_factorization(ratings_matrix.values, K=5, steps=200, learning_rate=0.01, r_lambda = 0.01)

# Predicted rating matrix reconstructed from the two factor matrices
pred_matrix = np.dot(P, Q.T)