In [3]:
import pandas as pd 
import numpy as np
from sklearn.metrics import mean_squared_error

In [1]:
def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between observed ratings and their factorized predictions.

    Only the cells listed in ``non_zeros`` are compared; every other cell of
    ``R`` is ignored.

    Parameters
    ----------
    R : 2-D array-like
        Rating matrix, indexed as ``R[row, col]``.
    P : 2-D array-like
        Row (user) latent factors, one row per row of ``R``.
    Q : 2-D array-like
        Column (item) latent factors, one row per column of ``R``.
    non_zeros : sequence of tuples
        One tuple per observed rating. The first two items of each tuple are
        the row and column index into ``R``; any further items are ignored.

    Returns
    -------
    float
        Root mean squared error over the listed cells.

    Raises
    ------
    ValueError
        If ``non_zeros`` is empty (the error is undefined without samples).
    """
    if len(non_zeros) == 0:
        raise ValueError("non_zeros is empty: RMSE needs at least one observed rating")

    R = np.asarray(R)
    P = np.asarray(P)
    Q = np.asarray(Q)

    rows = np.array([entry[0] for entry in non_zeros], dtype=int)
    cols = np.array([entry[1] for entry in non_zeros], dtype=int)

    observed = R[rows, cols]
    # Row-wise dot products give the prediction for each listed cell only, so
    # the full (rows x columns) product P @ Q.T never has to be materialised.
    predicted = np.sum(P[rows] * Q[cols], axis=1)

    return np.sqrt(mean_squared_error(observed, predicted))

In [37]:
def matrix_factorization(R, K, steps, learning_rate, r_lambda):
    """Factorize a rating matrix into latent factors with SGD.

    Parameters
    ----------
    R : 2-D NumPy array
        Original user-item rating matrix. Only cells with a value ``> 0`` are
        treated as observed ratings; zeros and NaN cells are skipped.
    K : int
        Number of latent-factor dimensions. The initial factors are drawn from
        a normal distribution with standard deviation ``1 / K``.
    steps : int
        Number of SGD passes over the observed ratings.
    learning_rate : float
        SGD step size.
    r_lambda : float
        L2 regularisation coefficient applied to both factor matrices.

    Returns
    -------
    (P, Q) : tuple of NumPy arrays
        ``P`` has shape ``(num_users, K)`` and ``Q`` has shape
        ``(num_items, K)``; ``np.dot(P, Q.T)`` approximates ``R``.

    Notes
    -----
    Calls ``np.random.seed(1)``, i.e. it reseeds NumPy's global random state
    so that the initial factors are reproducible. The RMSE over the observed
    ratings is printed on every 10th step.
    """
    num_users, num_items = R.shape
    np.random.seed(1)
    P = np.random.normal(scale=1. / K, size=(num_users, K))
    Q = np.random.normal(scale=1. / K, size=(num_items, K))

    # Observed ratings as (row, col, value) in row-major order. ``NaN > 0`` is
    # False, so missing cells never enter the training set.
    rows, cols = np.nonzero(R > 0)
    non_zeros = [(i, j, R[i, j]) for i, j in zip(rows.tolist(), cols.tolist())]

    for step in range(steps):
        for i, j, r in non_zeros:
            # Prediction error for this single rating.
            eij = r - np.dot(P[i, :], Q[j, :].T)
            P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            # NOTE: this update reads the P[i, :] that was just updated above.
            Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])
        # The RMSE is only needed for the progress report, so compute it only
        # on the steps where it is printed.
        if step % 10 == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print("### iteration step : ",step, " rmse : ", rmse)
    return P, Q

In [38]:
# Load the course-rating table; the file is read with the cp949 codec.
data = pd.read_csv('data_fin_.csv', encoding='cp949')
data.head(5)

Unnamed: 0,userid,courseid,course,rating,강원도,경상북도,경상남도,전라북도,전라남도,충청도,...,나무,유적지,맛집여행,호캉스,감성카페,애견동반ㅇ,봄,여름,가을,겨울
0,1,1,"유진게찜,엘꼬시네로(스페인음식)",10,1,0,0,0,0,0,...,0,0,1,0,0.0,0,1,1,1,1
1,1,2,"동명정류장카페,바다를본돼지",10,0,0,0,0,0,0,...,0,0,1,0,1.0,0,1,1,1,1
2,1,3,"몽상드애월카페,아기해녀의집",10,0,0,0,0,0,0,...,0,0,1,0,1.0,0,1,1,1,1
3,1,4,대전운주계곡,8,0,0,0,0,0,0,...,1,0,0,0,0.0,0,1,1,1,1
4,5,5,"양양쏠비치송이(한식),호텔내미술관",10,1,0,0,0,0,0,...,0,0,0,0,0.0,0,1,1,1,1


In [39]:
# Keep only the three columns that identify a rating.
rating = data.loc[:, ['userid', 'courseid', 'rating']]
rating

Unnamed: 0,userid,courseid,rating
0,1,1,10
1,1,2,10
2,1,3,10
3,1,4,8
4,5,5,10
...,...,...,...
349,7,324,10
350,2,325,8
351,1,325,10
352,1,326,10


In [40]:
# User x course rating matrix, keyed by course name so that predictions and
# recommendations are labelled with names rather than ids. pivot_table
# averages duplicate (userid, course) ratings by default and leaves
# user/course pairs without a rating as NaN.
rating_matrix = data.pivot_table('rating', index='userid', columns='course')

In [42]:
# Learn 50-dimensional user/course factors with 200 SGD passes.
P, Q = matrix_factorization(
    rating_matrix.values, K=50, steps=200, learning_rate=0.01, r_lambda=0.01
)
# Dense matrix of predicted ratings for every (user, course) pair.
pred_matrix = P @ Q.T

### iteration step :  0  rmse :  8.549574857003222
### iteration step :  10  rmse :  1.4479801487544655
### iteration step :  20  rmse :  0.26055439074569287
### iteration step :  30  rmse :  0.17407374866918907
### iteration step :  40  rmse :  0.12310513985995265
### iteration step :  50  rmse :  0.09062536917564554
### iteration step :  60  rmse :  0.06882385362691304
### iteration step :  70  rmse :  0.053820607711668696
### iteration step :  80  rmse :  0.04392177069464552
### iteration step :  90  rmse :  0.03789566917690585
### iteration step :  100  rmse :  0.034470399725993806
### iteration step :  110  rmse :  0.032551535006767766
### iteration step :  120  rmse :  0.03141171768256271
### iteration step :  130  rmse :  0.03065127018914714
### iteration step :  140  rmse :  0.030074156443192862
### iteration step :  150  rmse :  0.029589495699793977
### iteration step :  160  rmse :  0.029155884823516994
### iteration step :  170  rmse :  0.028754245442735644
### iteration ste

In [43]:
# Re-attach the user / course labels of the rating matrix to the predictions.
rating_pred_matrix = pd.DataFrame(
    pred_matrix,
    index=rating_matrix.index,
    columns=rating_matrix.columns,
)
rating_pred_matrix.head(3)

course,"DDP,광장시장",가평선어치고개집,"가평스위스테마파크,달과6펜스,금강산숯불닭갈비","가평아침고요수목원,남이섬,제이드가든","강릉강문해변,대관령순수양떼목장","강릉강문해변,오월에초당,오리카페,해두리치킨","강릉경포대,경포호,경포해변","강릉남애항스카이워크,둔내역한우","강릉빵다방,돌체테리아,초당소나무집순두부젤라또,해미가,삼교리동치미막국수",강릉세인트존스호텔,...,"하동매암제다원,매암차박물관,토지촬영지","하동지리산,십리벚꽃길,화개장터,섬진강","한라산둘레길사려니숲길,비자림국숫집,천년의숲","해운대,더베이101,송도거북섬,오륙도스카이워크,유엔기념공원,해운대아쿠아리움,센텀시티아쿠아리움","해운대감천달빛도너츠,소행성B612","해운대다솥(한식),카페루프탑,로컬102(펍),라라랜드","해운대오륙도,해운대해수욕장,동백섬,더베이101,못골시장,이가네떡볶이","해운대이기대해안산책로,풍원장","해운대포장마차촌,전통시장","홍성남당항,홍성군조류탐사과학관,그림이있는정원"
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.961533,5.04111,4.292514,7.974104,4.377097,3.259789,4.475652,5.448745,4.462763,5.399635,...,5.031452,5.525775,5.095502,4.799038,4.259038,4.937342,4.822591,3.820609,4.829403,4.341238
2,3.879213,4.957055,3.770301,4.413302,3.435166,2.646325,3.525864,4.294427,3.565343,4.408659,...,3.971131,4.48565,4.074494,4.571089,3.540818,4.489398,4.039032,3.097382,3.856686,3.490197
3,8.96405,5.599809,4.246522,4.712993,7.968041,5.976159,7.968654,9.961013,7.968633,9.96095,...,8.974233,9.971367,8.97401,5.191935,7.97688,5.482242,8.974186,6.979906,8.974545,7.977372


In [44]:
def get_unseen_course(rating_matrix, userid):
    """Return the courses that ``userid`` has not rated yet.

    Used to exclude already-rated courses from the recommendation candidates.

    Parameters
    ----------
    rating_matrix : pandas.DataFrame
        Rating matrix with user ids as the index and courses as the columns.
    userid : label
        Index label of the user in ``rating_matrix``.

    Returns
    -------
    list
        Column labels, in column order, whose rating for this user is not
        greater than 0 (this includes NaN cells).
    """
    # Series of this user's ratings, indexed by course.
    user_rating = rating_matrix.loc[userid, :]
    # A set gives constant-time membership tests in the filter below.
    already_go = set(user_rating[user_rating > 0].index.tolist())
    return [course for course in rating_matrix.columns.tolist() if course not in already_go]

In [45]:
def recomm_course_by_userid(pred_df, userid, un_list, top_n):
    """Return the ``top_n`` highest predicted ratings among ``un_list`` for one user.

    ``pred_df`` is indexed by user id with courses as columns. The result is a
    Series of predicted scores, indexed by course, sorted in descending order.
    """
    # Predicted scores restricted to the candidate (unseen) courses.
    candidate_scores = pred_df.loc[userid, un_list]
    ranked = candidate_scores.sort_values(ascending=False)
    return ranked.iloc[:top_n]

In [46]:
# Courses that user 7 has not rated yet.
un_list = get_unseen_course(rating_matrix, userid=7)

In [47]:
# Top 10 unrated courses for user 7, ranked by the matrix-factorization predictions.
recomm_course = recomm_course_by_userid(
    rating_pred_matrix, userid=7, un_list=un_list, top_n=10
)

In [48]:
# Present the recommendations as a one-column table of predicted scores.
recomm_course = pd.DataFrame(
    recomm_course.values,
    index=recomm_course.index,
    columns=['pred_score'],
)
recomm_course

Unnamed: 0_level_0,pred_score
course,Unnamed: 1_level_1
양양쏠비치,7.870159
"서울신라호텔,더파크뷰,어반아일랜드",7.402376
"제주원앤온리카페,산방산,사계생활,뷰스트,송악산둘레길,잇뽕사계",7.300132
"제주애월백번가든,협재해수욕장,애월더선셋,김녕미로공원,만장굴,코코마마",7.187423
"잠실롯데시그니엘호텔,롯데월드",6.86139
"유진게찜,엘꼬시네로(스페인음식)",6.589973
"강릉오죽헌,경포생태저류지,선교장,경포대허브관광농원,초당순두부마을,강문해변,오리카페",6.588402
"동명정류장카페,바다를본돼지",6.540317
"제주도성산일출봉,광치기해변,대영수산직영회센타,하도1940",6.49751
"의성삼미마늘닭,사촌가루숲,의성의병기념관,제격문,빙계계곡,빙혈,빙산사지오층석탑,빙계사원",6.486432
