## CF - MF기반 잠재요인 CF

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error

In [2]:
# Load the preprocessed rating tables, one CSV per rating category.
path = './data/'


def _read_ratings(filename):
    """Read one rating CSV located under `path`, using its first column as the index."""
    return pd.read_csv(os.path.join(path, filename), encoding='utf-8', index_col=0)


ratings_cc = _read_ratings('overall_rating_completed.csv')
ratings_caddie = _read_ratings('caddie_rating_completed.csv')
ratings_course = _read_ratings('course_rating_completed.csv')
ratings_price = _read_ratings('price_rating_completed.csv')
ratings_facility = _read_ratings('facility_rating_completed.csv')

In [3]:
# Inspect the loaded table: column dtypes and non-null counts
ratings_cc.info()
# Count the missing values in each column
ratings_cc.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24767 entries, 0 to 24766
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id_num    24767 non-null  int64  
 1   cc_num    24767 non-null  int64  
 2   cc_score  24767 non-null  float64
 3   cc_name   24767 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 967.5+ KB


id_num      0
cc_num      0
cc_score    0
cc_name     0
dtype: int64

In [4]:
# Pivot each long rating table into a matrix (rows: id_num, columns: cc_name).
# When the same (id_num, cc_name) pair was rated more than once, keep the maximum score.
def _to_rating_matrix(ratings, score_col):
    """Pivot `ratings` into an id_num x cc_name matrix holding `score_col`."""
    return ratings.pivot_table(values=score_col, index='id_num', columns='cc_name', aggfunc='max')


ratings_cc_matrix = _to_rating_matrix(ratings_cc, 'cc_score')
ratings_caddie_matrix = _to_rating_matrix(ratings_caddie, 'caddie')
ratings_course_matrix = _to_rating_matrix(ratings_course, 'course')
ratings_price_matrix = _to_rating_matrix(ratings_price, 'price')
ratings_facility_matrix = _to_rating_matrix(ratings_facility, 'facility')

In [5]:
# Names used to repeat the same work for every rating category
feature_list = ['cc', 'caddie', 'course', 'price', 'facility']
# Variable names of the observed and the predicted rating matrix of each category
matrices_list = [f'ratings_{feature}_matrix' for feature in feature_list]
pred_matrices_list = [f'ratings_pred_{feature}_matrix' for feature in feature_list]

In [6]:
# RMSE helper
def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between observed ratings and their factorized predictions.

    Only the listed positions are evaluated, so the prediction is computed as
    the row-wise dot product of the matching rows of P and Q instead of
    building the full ``P @ Q.T`` matrix.

    Parameters
    ----------
    R : numpy.ndarray
        2-D rating matrix, indexed as ``R[row, col]``.
    P : numpy.ndarray
        Factor matrix whose row ``i`` pairs with row ``i`` of R.
    Q : numpy.ndarray
        Factor matrix whose row ``j`` pairs with column ``j`` of R.
    non_zeros : iterable of tuple
        Positions to evaluate. The first two elements of every tuple are the
        row and column index; any further elements are ignored.

    Returns
    -------
    float
        Root mean squared error over the listed positions, or NaN when
        ``non_zeros`` is empty.
    """
    # Materialize once so that a generator can be consumed safely.
    positions = list(non_zeros)
    if not positions:
        return np.nan

    rows = np.array([pos[0] for pos in positions], dtype=np.intp)
    cols = np.array([pos[1] for pos in positions], dtype=np.intp)

    observed = R[rows, cols]
    # Row-wise dot product: predicted[k] == P[rows[k]] . Q[cols[k]]
    predicted = np.sum(P[rows] * Q[cols], axis=1)

    return np.sqrt(np.mean((observed - predicted) ** 2))

In [7]:
# Matrix factorization trained with stochastic gradient descent
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda = 0.01):
    """Factorize the rating matrix R into P and Q so that ``P @ Q.T`` approximates R.

    Only entries of R that are greater than 0 are used for training; NaN
    entries compare False against 0 and are therefore skipped as well.

    Parameters
    ----------
    R : numpy.ndarray
        2-D rating matrix of shape (num_users, num_items).
    K : int
        Number of latent factors. Must be non-zero because the initial
        values are drawn with standard deviation ``1 / K``.
    steps : int, default 200
        Number of passes over the observed ratings.
    learning_rate : float, default 0.01
        SGD step size.
    r_lambda : float, default 0.01
        L2 regularization strength.

    Returns
    -------
    (P, Q) : tuple of numpy.ndarray
        P has shape (num_users, K) and Q has shape (num_items, K).

    Notes
    -----
    The function reseeds NumPy's global random generator with 1 on every
    call, so repeated calls start from identical P and Q. The training RMSE
    is printed every 10th step.
    """
    num_users, num_items = R.shape

    # Initialize P and Q with normally distributed random values.
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # Collect (row, col, rating) for every entry with R > 0. np.argwhere
    # returns the positions in row-major order, i.e. the same order as a
    # nested "for i ... for j ..." loop. errstate silences the warning some
    # NumPy versions emit when comparing NaN.
    with np.errstate(invalid='ignore'):
        observed_positions = np.argwhere(R > 0)
    non_zeros = [(i, j, R[i, j]) for i, j in observed_positions]

    # Repeatedly update P and Q with stochastic gradient descent.
    for step in tqdm(range(steps)):
        for i, j, r in non_zeros:
            # Error between the actual and the predicted rating
            eij = r - np.dot(P[i, :], Q[j, :].T)

            # Regularized SGD update. Q's update reads the row of P that was
            # just updated on the previous line (sequential update).
            P[i,:] = P[i,:] + learning_rate*(eij * Q[j, :] - r_lambda*P[i,:])
            Q[j,:] = Q[j,:] + learning_rate*(eij * P[i, :] - r_lambda*Q[j,:])

        # The RMSE is only reported every 10th step, so only compute it then.
        if (step % 10) == 0 :
            rmse = get_rmse(R, P, Q, non_zeros)
            print("### iteration step : ", step," rmse : ", rmse)

    return P, Q

In [8]:
%%time
# Factorize every rating matrix with gradient descent, using the same hyperparameters
mf_kwargs = dict(K=5, steps=100, learning_rate=0.001, r_lambda=0.01)

P1, Q1 = matrix_factorization(ratings_cc_matrix.to_numpy(), **mf_kwargs)
P2, Q2 = matrix_factorization(ratings_caddie_matrix.to_numpy(), **mf_kwargs)
P3, Q3 = matrix_factorization(ratings_course_matrix.to_numpy(), **mf_kwargs)
P4, Q4 = matrix_factorization(ratings_price_matrix.to_numpy(), **mf_kwargs)
P5, Q5 = matrix_factorization(ratings_facility_matrix.to_numpy(), **mf_kwargs)

# Rebuild the full predicted rating matrix of every category from its factors
pred_cc_matrix = P1 @ Q1.T
pred_caddie_matrix = P2 @ Q2.T
pred_course_matrix = P3 @ Q3.T
pred_price_matrix = P4 @ Q4.T
pred_facility_matrix = P5 @ Q5.T

  0%|          | 0/100 [00:00<?, ?it/s]

### iteration step :  0  rmse :  8.939813996345885
### iteration step :  10  rmse :  7.119995786369249
### iteration step :  20  rmse :  3.127398066394236
### iteration step :  30  rmse :  1.90464958808673
### iteration step :  40  rmse :  1.4133825478427142
### iteration step :  50  rmse :  1.1832729235907165
### iteration step :  60  rmse :  1.060731600723186
### iteration step :  70  rmse :  0.9857615061270414
### iteration step :  80  rmse :  0.9334903726197252
### iteration step :  90  rmse :  0.8936600641579105


  0%|          | 0/100 [00:00<?, ?it/s]

### iteration step :  0  rmse :  9.234470180895872
### iteration step :  10  rmse :  7.178454712984481
### iteration step :  20  rmse :  3.182252484709174
### iteration step :  30  rmse :  1.9971265071584026
### iteration step :  40  rmse :  1.5341809660593
### iteration step :  50  rmse :  1.3205116496186775
### iteration step :  60  rmse :  1.2049077563609114
### iteration step :  70  rmse :  1.131285861046412
### iteration step :  80  rmse :  1.077936349182044
### iteration step :  90  rmse :  1.036205669675044


  0%|          | 0/100 [00:00<?, ?it/s]

### iteration step :  0  rmse :  9.002417332591492
### iteration step :  10  rmse :  7.110415194673168
### iteration step :  20  rmse :  3.2181351337241226
### iteration step :  30  rmse :  2.054437690759688
### iteration step :  40  rmse :  1.5896382471773072
### iteration step :  50  rmse :  1.3643309336989986
### iteration step :  60  rmse :  1.2367259011665388
### iteration step :  70  rmse :  1.154176486140704
### iteration step :  80  rmse :  1.0946944630230222
### iteration step :  90  rmse :  1.0487325963227598


  0%|          | 0/100 [00:00<?, ?it/s]

### iteration step :  0  rmse :  9.01686940012142
### iteration step :  10  rmse :  7.214407702620053
### iteration step :  20  rmse :  3.240485891404246
### iteration step :  30  rmse :  2.0399385302747475
### iteration step :  40  rmse :  1.573288230502966
### iteration step :  50  rmse :  1.3561753294080008
### iteration step :  60  rmse :  1.2373572902670447
### iteration step :  70  rmse :  1.161356988205203
### iteration step :  80  rmse :  1.1063829951206647
### iteration step :  90  rmse :  1.063635429531065


  0%|          | 0/100 [00:00<?, ?it/s]

### iteration step :  0  rmse :  8.841487884193684
### iteration step :  10  rmse :  7.176105664826889
### iteration step :  20  rmse :  3.2507144083876356
### iteration step :  30  rmse :  2.0693671492472596
### iteration step :  40  rmse :  1.6017956734841696
### iteration step :  50  rmse :  1.3817892815357893
### iteration step :  60  rmse :  1.2607895673896705
### iteration step :  70  rmse :  1.1834481938463952
### iteration step :  80  rmse :  1.12760241592906
### iteration step :  90  rmse :  1.084113043381681
Wall time: 2min 10s


In [9]:
# Wrap each predicted matrix in a DataFrame that reuses the index and columns
# of the rating matrix it was factorized from.
def _as_pred_frame(pred, template):
    """Return `pred` as a DataFrame labelled like the DataFrame `template`."""
    return pd.DataFrame(pred, index=template.index, columns=template.columns)


ratings_pred_cc_matrix = _as_pred_frame(pred_cc_matrix, ratings_cc_matrix)
ratings_pred_caddie_matrix = _as_pred_frame(pred_caddie_matrix, ratings_caddie_matrix)
ratings_pred_course_matrix = _as_pred_frame(pred_course_matrix, ratings_course_matrix)
ratings_pred_price_matrix = _as_pred_frame(pred_price_matrix, ratings_price_matrix)
ratings_pred_facility_matrix = _as_pred_frame(pred_facility_matrix, ratings_facility_matrix)

In [10]:
# Inspect one of the prediction DataFrames
ratings_pred_cc_matrix

cc_name,360도,88,H1 CLUB(구 덕평),가산노블리스,가평베네스트,강남300,고양컨트리클럽(P9),곤지암,골드,골프존카운티안성H,...,필로스,한림광릉,한림용인,한성,한양,한양파인(P9),한원,해솔리아,화성(P9),화성상록
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,9.845611,10.091221,9.520478,1.279492,1.442179,9.398067,1.288677,1.761905,9.611915,9.955155,...,9.488854,8.903050,8.380930,9.950996,8.173908,8.966549,9.434977,8.480461,9.105652,8.179935
2,8.780758,7.685725,6.873324,1.011981,1.355264,6.556269,1.171063,1.647214,7.148317,7.757074,...,6.822239,8.723217,6.827587,7.623895,6.628153,7.526558,7.514111,6.779659,6.930097,6.293206
3,8.977103,8.251648,9.052212,1.177610,1.652467,8.066685,1.098823,1.820822,10.070977,9.238317,...,9.234105,9.086352,7.647756,9.177111,7.003407,8.116991,8.461995,8.167034,8.233741,7.728338
4,9.624695,8.546612,8.201016,1.070403,1.598449,7.252983,1.257574,1.905325,8.523201,8.917147,...,8.083453,9.889289,7.798110,8.833591,7.324837,8.457893,8.539474,7.699499,8.113007,7.352678
5,2.515471,3.597230,3.330673,0.064284,0.292767,0.784729,0.134717,0.519091,1.297226,3.144224,...,2.435692,2.626282,2.962867,3.447949,2.593152,3.185839,3.125834,2.507841,3.315349,2.588106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6104,8.874357,8.666466,8.865624,1.526310,1.350217,9.058599,0.953475,1.445241,9.885306,9.195891,...,9.353288,7.382921,7.225391,9.046125,7.241418,7.978831,8.241716,8.822349,7.459064,7.354969
6105,5.921999,6.324415,6.762813,0.189794,1.113758,2.056221,0.536216,1.476807,4.528801,6.560825,...,5.465534,7.418932,6.172151,6.958431,5.017665,6.493314,6.397019,5.212540,6.819922,5.622100
6106,8.274989,7.713225,6.673016,1.229552,1.086132,6.962151,0.966934,1.300023,6.865402,7.526691,...,6.818543,7.053237,6.295258,7.360547,6.535650,7.169214,7.099868,7.140601,6.163272,5.865649
6107,7.683461,7.882538,7.759459,1.209679,1.043982,5.663897,0.525135,1.199992,6.870732,8.046649,...,7.496858,6.454168,6.538603,8.089632,6.500065,7.536502,7.256948,8.164850,6.363461,6.219051


### 6번 사용자가 평가하지 않은(미방문) 골프장 중에서 추천해보자
- user_rating이 0보다 크면 기존에 방문한 골프장이라는 점을 이용해서 계산

In [11]:
def get_unseen_cc(ratings_matrix, id_num):
    """Return the column labels that user `id_num` has no positive rating for.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        Rating matrix whose index holds user ids and whose columns hold
        course labels.
    id_num : index label
        The user to look up in ``ratings_matrix``.

    Returns
    -------
    list
        Column labels, in column order, whose rating for the user is not
        greater than 0 (this includes missing ratings).
    """
    # All ratings of this user, as a Series indexed by column label.
    ratings_of_user = ratings_matrix.loc[id_num, :]

    # A rating greater than 0 marks the course as already visited.
    visited = set(ratings_of_user.loc[ratings_of_user.gt(0)].index)

    # Keep every column that is not among the visited ones.
    return [course for course in ratings_matrix.columns.tolist() if course not in visited]

In [12]:
# pred_df     : predicted ratings per course, computed earlier
# unseen_list : courses the user has not visited
# top_n       : how many of the highest-ranked courses to return

def recomm_cc_by_id(pred_df, id_num, unseen_list, top_n=5):
    """Return the `top_n` highest predicted ratings of one user among `unseen_list`.

    The row of `pred_df` labelled `id_num` is restricted to the columns in
    `unseen_list`, sorted by predicted rating in descending order and cut to
    the first `top_n` entries. The result is a Series indexed by column label.
    """
    candidate_scores = pred_df.loc[id_num, unseen_list]
    ranked_scores = candidate_scores.sort_values(ascending=False)
    return ranked_scores[:top_n]

In [13]:
# Predicted ratings of one user, per rating category
def recommendation_golf(userId, top_n=10):
    """Print and display the top predicted unvisited courses of a user per category.

    For every pair of names in `matrices_list` / `pred_matrices_list`, the
    observed and the predicted rating matrix are looked up by name in
    ``globals()``. The courses the user has no positive rating for are ranked
    by predicted rating and the best `top_n` are displayed.

    Parameters
    ----------
    userId : index label
        The user to recommend for; must be an index label of the matrices.
    top_n : int, default 10
        Number of courses to show per category.

    Returns
    -------
    None
        The name of each prediction matrix is printed, followed by a
        one-column DataFrame (``pred_score``) shown with ``display``.
    """
    for matrix_name, pred_name in zip(matrices_list, pred_matrices_list):
        # The matrices live in the global namespace under these names.
        ratings_matrix = globals()[matrix_name]
        pred_matrix = globals()[pred_name]

        unseen_list = get_unseen_cc(ratings_matrix, userId)
        recomm_cc = recomm_cc_by_id(pred_matrix, userId, unseen_list, top_n=top_n)
        recomm_df = pd.DataFrame(data=recomm_cc.values, index=recomm_cc.index, columns=['pred_score'])

        print(pred_name)
        display(recomm_df)

In [14]:
# Show the recommendations for user 6 in every rating category
recommendation_golf(6)

ratings_pred_cc_matrix


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
골프클럽Q(구 안성Q),9.243886
금강,9.184598
88,9.111397
마에스트로,9.073253
포천아도니스,8.993815
한성,8.962297
자유,8.932702
뉴코리아,8.919423
그린힐,8.917952
페럼,8.90835


ratings_pred_caddie_matrix


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
코리아,9.474305
타이거,9.415857
사우스스프링스,9.323512
남서울,9.264853
여주,9.240304
자유,9.212473
뉴코리아,9.204668
레이크우드,9.095924
글렌로스(P9),9.08753
서서울,9.050471


ratings_pred_course_matrix


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
한성,9.274439
골프클럽Q(구 안성Q),9.126283
서서울,9.086693
글렌로스(P9),9.057952
뉴코리아,9.056809
그린힐,8.944254
뉴스프링빌,8.932742
페럼,8.918484
마에스트로,8.902285
청평마이다스,8.893285


ratings_pred_price_matrix


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
비에이비스타퍼블릭,11.212793
88,10.962698
금강,10.558849
그린힐,10.451415
몽베르,10.364889
신안(P9),10.270359
뉴스프링빌,10.214402
골프클럽Q(구 안성Q),10.072628
포레스트힐(P6),10.044076
캐슬파인,9.947545


ratings_pred_facility_matrix


Unnamed: 0_level_0,pred_score
cc_name,Unnamed: 1_level_1
레이크사이드,9.063321
자유,8.969618
뉴코리아,8.905545
금강,8.890031
마에스트로,8.880417
골프클럽Q(구 안성Q),8.876514
청평마이다스,8.829088
페럼,8.805799
푸른솔포천,8.778024
서서울,8.746014


- 변별력 있게 추천해 주지만, 평점 범위를 초과하는 예측 평점이 존재함 (예: price 카테고리의 예측 평점 11.21)