In [1]:
import ast
import gc
import json
import math
import os
import os.path as path
import random
import re

In [2]:
from datetime import datetime
import time

In [3]:
import numpy as np
import pandas as pd
# Render up to 50 rows and 100 columns when displaying DataFrames.
# The full option names are spelled out: an abbreviated key such as
# 'display.max_row' only resolves through pattern matching and can become
# ambiguous if pandas adds another option with a similar name.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 100)

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Base directories: the CSV dumps that are read, and the folder that receives generated files
DIR_PATH = path.join('..', 'data', 'sql_dummy')
DIR_SAVE_PATH = path.join('..', 'data', 'output')

print(DIR_PATH, DIR_SAVE_PATH, sep='\n')

..\data\sql_dummy
..\data\output


In [6]:
# Load the bean table (sql_bean.csv); the file is decoded as cp949
bean_csv_path = path.join(DIR_PATH, 'sql_bean.csv')
bean_read = pd.read_csv(bean_csv_path, encoding='cp949', low_memory=False)

print(bean_read.shape)
bean_read.iloc[:5]

(47, 8)


Unnamed: 0,idx,created_date,updated_date,name_ko,name_en,summary,thumbnail,user_grade
0,1,,,에티오피아 예가체프 G2 워시드 (중배전),에티오피아 예가체프 G2 워시드 (중배전),아이템 요약 영역 입니다!!!,default_bean.png,0
1,2,,,과테말라 안티구아 SHB 워시드 (강배전),과테말라 안티구아 SHB 워시드 (강배전),아이템 요약 영역 입니다!!!,default_bean.png,0
2,3,,,에티오피아 코케 G1 펄프드내추럴 (중배전),에티오피아 코케 G1 펄프드내추럴 (중배전),아이템 요약 영역 입니다!!!,default_bean.png,0
3,4,,,케냐 키암부 AA 워시드 (중배전),케냐 키암부 AA 워시드 (중배전),아이템 요약 영역 입니다!!!,default_bean.png,0
4,5,,,콜롬비아 콜롬비아 SUPREMO 워시드 (중배전),콜롬비아 콜롬비아 SUPREMO 워시드 (중배전),아이템 요약 영역 입니다!!!,default_bean.png,0


In [7]:
# Load the review table (sql_review.csv); the file is decoded as cp949
review_csv_path = path.join(DIR_PATH, 'sql_review.csv')
review_read = pd.read_csv(review_csv_path, encoding='cp949', low_memory=False)

print(review_read.shape)
review_read.iloc[:5]

(9459, 16)


Unnamed: 0,idx,created_date,updated_date,acidity,bitterness,body,coffeeing_note,content,expired,flavor,item_idx,item_type,like,overall,sweetness,member_idx
0,1,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,7,6,8
1,2,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,8,6,11
2,3,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,5,6,29
3,4,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,8,6,37
4,5,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,8,6,38


In [8]:
# Attach each bean's Korean name to its reviews.
# Only reviews with item_type == 'bean' are joined, matching the filter used when
# looking up a member's reviews further down: item_idx is matched against the bean
# table, so a review of any other item type would be paired with an unrelated bean.
bean_reviews = review_read[review_read['item_type'] == 'bean']
review_df = pd.merge(
    bean_reviews,
    bean_read[['idx', 'name_ko']],
    left_on='item_idx',
    right_on='idx',
    validate='many_to_one',  # fail loudly if a bean idx appears twice in bean_read
)
print(review_df.shape)
review_df.head()

(9459, 18)


Unnamed: 0,idx_x,created_date,updated_date,acidity,bitterness,body,coffeeing_note,content,expired,flavor,item_idx,item_type,like,overall,sweetness,member_idx,idx_y,name_ko
0,1,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,7,6,8,1,에티오피아 예가체프 G2 워시드 (중배전)
1,2,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,8,6,11,1,에티오피아 예가체프 G2 워시드 (중배전)
2,3,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,5,6,29,1,에티오피아 예가체프 G2 워시드 (중배전)
3,4,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,8,6,37,1,에티오피아 예가체프 G2 워시드 (중배전)
4,5,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,8,6,38,1,에티오피아 예가체프 G2 워시드 (중배전)


In [9]:
# Build the rating matrix: one row per member, one column per (item_idx, name_ko).
# pivot_table with a mean is used instead of pivot so that a member who reviewed the
# same bean more than once yields one averaged cell rather than a
# "duplicate entries" ValueError.
review_df = review_df.pivot_table(
    index='member_idx',
    columns=['item_idx', 'name_ko'],
    values='overall',
    aggfunc='mean',
)
# Beans a member has not reviewed are filled with 0, then every score is divided by 10.
review_df = review_df.fillna(0) / 10

In [10]:
# Switch to an item-based layout: one row per (item_idx, name_ko), one column per member
review_df = review_df.transpose()
review_df.iloc[:5]

Unnamed: 0_level_0,member_idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,...,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000
item_idx,name_ko,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 
101_level_1,Unnamed: 102_level_1
1,에티오피아 예가체프 G2 워시드 (중배전),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.8,0.0,0.0,0.0,0.0,0.0,0.9,0.0,0.0,0.0,0.0,0.0,0.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.0,0.9,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.8,0.0,0.9,1.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,1.0,0.0
2,과테말라 안티구아 SHB 워시드 (강배전),0.0,0.0,0.0,0.0,0.7,0.0,0.0,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.6,0.0,0.0,0.0,0.0,0.9,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.5,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.0,0.0,0.6,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,에티오피아 코케 G1 펄프드내추럴 (중배전),0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.3,0.0,0.0,0.0,0.0,0.3,0.0,0.2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,1.0,...,0.0,0.0,0.7,0.0,0.0,0.7,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.6,0.0,0.7,0.6,0.0,0.0,0.9,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.7,0.7,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.8,0.8,0.0,0.0,0.4,0.0
4,케냐 키암부 AA 워시드 (중배전),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.7,0.0,0.6,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.4,0.0,0.0,0.2,0.0,0.1,0.0,0.1,0.7,0.0,0.9,0.0,0.8,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.8,1.0,0.0,0.0,0.0,0.0,0.9,0.0,0.0,0.0,0.9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.9,0.3,0.0,0.0,1.0,0.0,0.2,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.5,0.9,0.0,0.0,0.9,0.0,0.0,0.0,0.0,0.7,0.5,0.0,0.0
5,콜롬비아 콜롬비아 SUPREMO 워시드 (중배전),0.0,0.0,0.0,0.0,0.3,0.0,0.0,0.7,0.2,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.9,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.9,0.0,0.0,0.1,0.0,...,0.4,0.0,0.2,0.0,0.6,0.0,0.8,0.0,0.0,0.5,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.7


In [11]:
%%time
# Pairwise cosine similarity between the rows of review_df (one row per bean after the
# transpose above), giving a square bean-by-bean matrix.
review_cosine_sim = cosine_similarity(review_df)

CPU times: total: 0 ns
Wall time: 996 µs


In [12]:
# Shape and dtype of the similarity matrix before the down-cast
print(review_cosine_sim.shape, review_cosine_sim.dtype, sep='\n')

# Down-cast to half precision to shrink the matrix.
# NOTE(review): float16 keeps roughly three significant digits, so similarities that
# differ only slightly can collapse to the same value and tie when ranked.
review_cosine_sim = review_cosine_sim.astype(np.float16)
gc.collect()
print(review_cosine_sim.dtype)

review_cosine_sim

(47, 47)
float64
float16


array([[1.    , 0.1575, 0.2217, ..., 0.2185, 0.1759, 0.1603],
       [0.1575, 1.    , 0.1973, ..., 0.2128, 0.1676, 0.1913],
       [0.2217, 0.1973, 1.    , ..., 0.1893, 0.1161, 0.1475],
       ...,
       [0.2185, 0.2128, 0.1893, ..., 1.    , 0.2004, 0.1764],
       [0.1759, 0.1676, 0.1161, ..., 0.2004, 1.    , 0.15  ],
       [0.1603, 0.1913, 0.1475, ..., 0.1764, 0.15  , 1.    ]],
      dtype=float16)

In [13]:
# Label the similarity matrix so a bean can be looked up by id as well as by name:
# rows carry item_idx and columns carry name_ko. Each label set is obtained by dropping
# the other level from review_df's two-level (item_idx, name_ko) row index.
review_cosine_sim_df = pd.DataFrame(
    review_cosine_sim,
    index=review_df.index.droplevel(1),
    columns=review_df.index.droplevel(0),
    dtype=np.float16,
)

In [14]:
# Quick look at the labelled similarity matrix
print(review_cosine_sim_df.shape)
review_cosine_sim_df.iloc[:5]

(47, 47)


name_ko,에티오피아 예가체프 G2 워시드 (중배전),과테말라 안티구아 SHB 워시드 (강배전),에티오피아 코케 G1 펄프드내추럴 (중배전),케냐 키암부 AA 워시드 (중배전),콜롬비아 콜롬비아 SUPREMO 워시드 (중배전),페루 피우라 MICRO LOT 워시드 (중배전),브라질 세하도 NY2 FC 내추럴 (중배전),케냐 키암부 FAQ 워시드 (중배전),파나마 보케테 SPECIALTY 워시드 (중배전),브라질 세하도 NY2 FC 펄프드내추럴 (중배전),브라질 세하도 SPECIALTY 펄프드내추럴 (중배전),코스타리카 나란조 MICRO LOT 워시드 (중배전),온두라스 엘 파라이소 SPECIALTY 워시드 (중배전),에티오피아 시다모 SPECIALTY 워시드 (중배전),과테말라 우에우에테낭고 SHB 워시드 디카페인 (강배전),콜롬비아 후일라 EXCELOS 워시드 디카페인 (강배전),인도 말라바르 AA 워시드 (중배전),코스타리카 따라주 SHB 워시드 (중배전),르완다 부산제 SPECIALTY 워시드 (중배전),브라질 카르모 데 미나스 - 워시드 디카페인 (중배전),엘살바도르 산타아 MICRO LOT 워시드 (중배전),인도네시아 수마트라 G1 웻헐드 (중배전),베트남 베트남 G1 워시드 (중배전),에티오피아 시다모 G2 워시드 (중배전),인도 치크마갈루르 AA 워시드 (중배전),에티오피아 시다모 G2 워시드 디카페인 (중배전),콜롬비아 콜롬비아 SUPREMO 워시드 디카페인 (강배전),에티오피아 예가체프 G2 워시드 (약배전),브라질 세하도 NY2 FC 네추럴 (중배전),과테말라 안티구아 SHB 워시드 (중배전),콜롬비아 후일라 SUPREMO 워시드 (중배전),베트남 다낭 G1 폴리싱 (강배전),브라질 산토스 NY2 FC 내추럴 (중배전),에티오피아 시다모 G4 내추럴 (중배전),에티오피아 예가체프 G4 내추럴 (중배전),탄자니아 음빙가 AA 워시드 (강배전),콜롬비아 메데인 SUPREMO 워시드 (강배전),코스타리카 따라주 SHB 워시드 (강배전),파푸아뉴기니 와기벨리 AA 워시드 (강배전),인도네시아 만델링 G1 워시드 (강배전),온두라스 산티아고 푸링글라 SHB 워시드 (강배전),엘 살바도르 엘 살바도르 SHG 워시드 (강배전),케냐 키암부 AA 워시드 (강배전),콜롬비아 메데인 SUPREMO 워시드 디카페인 (강배전),과테말라 안티구아 SHB 워시드 디카페인 (강배전),에티오피아 예가체프 G2 워시드 디카페인 (강배전),브라질 산토스 NY2 FC 워시드 디카페인 (강배전)
item_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1
1,1.0,0.157471,0.22168,0.151855,0.189087,0.200684,0.204102,0.194458,0.182739,0.157715,0.220825,0.200195,0.165161,0.157715,0.1698,0.147827,0.197876,0.143555,0.196167,0.171387,0.174072,0.160034,0.196533,0.157959,0.157227,0.157593,0.192993,0.151733,0.22168,0.138794,0.14917,0.156372,0.16333,0.185913,0.145386,0.19873,0.164673,0.154419,0.178345,0.181396,0.164795,0.155518,0.135986,0.199341,0.218506,0.175903,0.160278
2,0.157471,1.0,0.197266,0.178833,0.196167,0.172974,0.163818,0.125244,0.171021,0.196045,0.162231,0.173706,0.162109,0.229492,0.177246,0.160889,0.1427,0.165649,0.156494,0.19043,0.169556,0.0849,0.140259,0.151489,0.196533,0.193359,0.173096,0.196045,0.145142,0.126953,0.126831,0.180298,0.15271,0.176392,0.14502,0.146606,0.12854,0.155273,0.153442,0.19397,0.162109,0.207153,0.166626,0.245728,0.212769,0.167603,0.191284
3,0.22168,0.197266,1.0,0.167358,0.183594,0.147095,0.186035,0.179321,0.135742,0.133057,0.206543,0.161499,0.184814,0.165771,0.194702,0.174438,0.179443,0.115723,0.129883,0.182007,0.143188,0.212402,0.14978,0.186523,0.216797,0.14624,0.237915,0.198853,0.152344,0.184448,0.200684,0.220459,0.184082,0.170532,0.147095,0.197754,0.218262,0.146484,0.194214,0.133545,0.217285,0.206787,0.186035,0.148682,0.189331,0.116089,0.147461
4,0.151855,0.178833,0.167358,1.0,0.187622,0.153442,0.151489,0.151245,0.207031,0.146606,0.172974,0.177002,0.183594,0.172119,0.168335,0.163086,0.172485,0.16748,0.175049,0.224243,0.196533,0.179443,0.136841,0.218506,0.184082,0.177612,0.217041,0.140137,0.229736,0.174072,0.182007,0.210327,0.145996,0.193481,0.125977,0.209106,0.160156,0.178101,0.177612,0.170898,0.17395,0.171875,0.218018,0.184937,0.177856,0.171143,0.151978
5,0.189087,0.196167,0.183594,0.187622,1.0,0.165527,0.15918,0.21228,0.198242,0.166504,0.193237,0.147217,0.143311,0.180542,0.207031,0.151611,0.163818,0.167603,0.164673,0.192505,0.19458,0.187256,0.183105,0.175781,0.192505,0.182007,0.181641,0.152222,0.136475,0.132812,0.236816,0.224365,0.197876,0.17395,0.176514,0.193115,0.225952,0.169922,0.136963,0.151611,0.155273,0.165405,0.156128,0.09491,0.191528,0.203369,0.146118


In [15]:
# Exploratory check: ids of the ten beans ranked right after the top entry for bean 1
item1_pos = review_cosine_sim_df.index.get_indexer([1])
item1_scores = review_cosine_sim_df.iloc[:, item1_pos]
item1_scores.sort_values(by=item1_scores.columns[0], ascending=False)[1:11].index

Int64Index([3, 29, 11, 45, 7, 6, 12, 44, 36, 17], dtype='int64', name='item_idx')

In [16]:
# Item-to-item recommendation by bean id (tabular result)
def recommendations_by_id(target_id, matrix, items, k=10):
    """Return the ``k`` beans most similar to ``target_id`` as a DataFrame.

    Parameters
    ----------
    target_id
        Bean id to find neighbours for; must be a label of ``matrix.index``.
    matrix : pandas.DataFrame
        Square similarity matrix whose rows are labelled by bean id. Column labels
        are not used; the column order must match the row order.
    items : pandas.DataFrame
        Bean table with at least the columns ``idx`` and ``name_ko``.
    k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``target_id``, ``target_title``, ``recom_id``, ``recom_title``,
        ordered by decreasing similarity. Holds fewer than ``k`` rows when the
        matrix contains fewer other beans.

    Raises
    ------
    KeyError
        If ``target_id`` is not present in ``matrix.index``.
    """
    target_pos = matrix.index.get_indexer([target_id])[0]
    if target_pos == -1:
        raise KeyError(f"target_id {target_id!r} is not in the similarity matrix")

    # Similarity of every bean to the target. The target is removed by label rather
    # than by assuming it sorts first, which is not guaranteed when scores tie.
    scores = matrix.iloc[:, target_pos].drop(index=target_id)
    top = scores.sort_values(ascending=False, kind='mergesort').head(max(int(k), 0))
    recom_id = top.index.to_numpy()

    # Titles are looked up by bean id instead of by row position (id - 1), so the
    # result stays correct when ids are not contiguous or items is ordered differently.
    title_by_id = items.drop_duplicates('idx').set_index('idx')['name_ko']
    recom_title = title_by_id.reindex(recom_id).to_numpy()
    target_title = title_by_id.get(target_id)

    n_recom = len(recom_id)
    return pd.DataFrame({
        'target_id': [target_id] * n_recom,
        'target_title': [target_title] * n_recom,
        'recom_id': recom_id,
        'recom_title': recom_title,
    })

In [17]:
# Example: beans most similar to bean 3, shown as a table
recommendations_by_id(target_id=3, matrix=review_cosine_sim_df, items=bean_read)

Unnamed: 0,target_id,target_title,recom_id,recom_title
0,3,에티오피아 코케 G1 펄프드내추럴 (중배전),27,콜롬비아 콜롬비아 SUPREMO 워시드 디카페인 (강배전)
1,3,에티오피아 코케 G1 펄프드내추럴 (중배전),1,에티오피아 예가체프 G2 워시드 (중배전)
2,3,에티오피아 코케 G1 펄프드내추럴 (중배전),32,베트남 다낭 G1 폴리싱 (강배전)
3,3,에티오피아 코케 G1 펄프드내추럴 (중배전),37,콜롬비아 메데인 SUPREMO 워시드 (강배전)
4,3,에티오피아 코케 G1 펄프드내추럴 (중배전),41,온두라스 산티아고 푸링글라 SHB 워시드 (강배전)
5,3,에티오피아 코케 G1 펄프드내추럴 (중배전),25,인도 치크마갈루르 AA 워시드 (중배전)
6,3,에티오피아 코케 G1 펄프드내추럴 (중배전),22,인도네시아 수마트라 G1 웻헐드 (중배전)
7,3,에티오피아 코케 G1 펄프드내추럴 (중배전),42,엘 살바도르 엘 살바도르 SHG 워시드 (강배전)
8,3,에티오피아 코케 G1 펄프드내추럴 (중배전),11,브라질 세하도 SPECIALTY 펄프드내추럴 (중배전)
9,3,에티오피아 코케 G1 펄프드내추럴 (중배전),31,콜롬비아 후일라 SUPREMO 워시드 (중배전)


In [18]:
# Item-to-item recommendation by bean id (list-of-dicts result)
def recommendation_list_by_id(target_id, matrix, items, k=10):
    """Return the ``k`` beans most similar to ``target_id`` as a list of dicts.

    Parameters
    ----------
    target_id
        Bean id to find neighbours for; must be a label of ``matrix.index``.
    matrix : pandas.DataFrame
        Square similarity matrix whose rows are labelled by bean id. Column labels
        are not used; the column order must match the row order.
    items : pandas.DataFrame
        Bean table with at least the columns ``idx`` and ``name_ko``.
    k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of dict
        ``{'id': ..., 'title': ...}`` entries ordered by decreasing similarity.
        Ids are plain Python values (not numpy scalars) so the list has a stable
        text representation when it is written to a file.

    Raises
    ------
    KeyError
        If ``target_id`` is not present in ``matrix.index``.
    """
    target_pos = matrix.index.get_indexer([target_id])[0]
    if target_pos == -1:
        raise KeyError(f"target_id {target_id!r} is not in the similarity matrix")

    # Drop the target by label; relying on it being the first sorted row breaks on ties.
    scores = matrix.iloc[:, target_pos].drop(index=target_id)
    top = scores.sort_values(ascending=False, kind='mergesort').head(max(int(k), 0))
    recom_id = top.index.tolist()

    # Look titles up by bean id, not by row position (id - 1).
    title_by_id = items.drop_duplicates('idx').set_index('idx')['name_ko']
    recom_title = title_by_id.reindex(recom_id).tolist()

    return [dict(id=bean_id, title=title) for bean_id, title in zip(recom_id, recom_title)]

In [19]:
# Example: beans most similar to bean 3, as a list of {'id', 'title'} dicts
recommendation_list_by_id(target_id=3, matrix=review_cosine_sim_df, items=bean_read)

[{'id': 27, 'title': '콜롬비아 콜롬비아 SUPREMO 워시드 디카페인 (강배전)'},
 {'id': 1, 'title': '에티오피아 예가체프 G2 워시드 (중배전)'},
 {'id': 32, 'title': '베트남 다낭 G1 폴리싱 (강배전)'},
 {'id': 37, 'title': '콜롬비아 메데인 SUPREMO 워시드 (강배전)'},
 {'id': 41, 'title': '온두라스 산티아고 푸링글라 SHB 워시드 (강배전)'},
 {'id': 25, 'title': '인도 치크마갈루르 AA 워시드 (중배전)'},
 {'id': 22, 'title': '인도네시아 수마트라 G1 웻헐드 (중배전)'},
 {'id': 42, 'title': '엘 살바도르 엘 살바도르 SHG 워시드 (강배전)'},
 {'id': 11, 'title': '브라질 세하도 SPECIALTY 펄프드내추럴 (중배전)'},
 {'id': 31, 'title': '콜롬비아 후일라 SUPREMO 워시드 (중배전)'}]

In [20]:
# For every bean, store its recommendation list (requested with k=5).
# The copy is taken AFTER the column selection so that bean_recom is an independent
# frame; adding a column to it then neither touches bean_read nor assigns into a
# derived slice.
bean_recom = bean_read[['idx', 'name_ko']].copy()
# Only the id is needed per bean, so map over the idx column instead of a row-wise apply.
bean_recom['recommendation'] = bean_recom['idx'].map(
    lambda bean_id: recommendation_list_by_id(bean_id, review_cosine_sim_df, bean_read, k=5)
)
print(bean_recom.shape)
bean_recom.head()

(47, 3)


Unnamed: 0,idx,name_ko,recommendation
0,1,에티오피아 예가체프 G2 워시드 (중배전),"[{'id': 3, 'title': '에티오피아 코케 G1 펄프드내추럴 (중배전)'..."
1,2,과테말라 안티구아 SHB 워시드 (강배전),"[{'id': 44, 'title': '콜롬비아 메데인 SUPREMO 워시드 디카페..."
2,3,에티오피아 코케 G1 펄프드내추럴 (중배전),"[{'id': 27, 'title': '콜롬비아 콜롬비아 SUPREMO 워시드 디카..."
3,4,케냐 키암부 AA 워시드 (중배전),"[{'id': 29, 'title': '브라질 세하도 NY2 FC 네추럴 (중배전)..."
4,5,콜롬비아 콜롬비아 SUPREMO 워시드 (중배전),"[{'id': 31, 'title': '콜롬비아 후일라 SUPREMO 워시드 (중배..."


In [21]:
# Write the per-bean recommendations to the output folder (created if missing)
os.makedirs(DIR_SAVE_PATH, exist_ok=True)
recom_csv_path = path.join(DIR_SAVE_PATH, 'bean_cbf_by_review_recom.csv')
bean_recom.to_csv(recom_csv_path, sep=',', index=False)

In [22]:
# Read the saved recommendations (bean_cbf_by_review_recom.csv) back from disk
recom_csv_path = path.join(DIR_SAVE_PATH, 'bean_cbf_by_review_recom.csv')
review_recom_read = pd.read_csv(recom_csv_path, low_memory=False)

print(review_recom_read.shape)
review_recom_read.iloc[:5]

(47, 3)


Unnamed: 0,idx,name_ko,recommendation
0,1,에티오피아 예가체프 G2 워시드 (중배전),"[{'id': 3, 'title': '에티오피아 코케 G1 펄프드내추럴 (중배전)'..."
1,2,과테말라 안티구아 SHB 워시드 (강배전),"[{'id': 44, 'title': '콜롬비아 메데인 SUPREMO 워시드 디카페..."
2,3,에티오피아 코케 G1 펄프드내추럴 (중배전),"[{'id': 27, 'title': '콜롬비아 콜롬비아 SUPREMO 워시드 디카..."
3,4,케냐 키암부 AA 워시드 (중배전),"[{'id': 29, 'title': '브라질 세하도 NY2 FC 네추럴 (중배전)..."
4,5,콜롬비아 콜롬비아 SUPREMO 워시드 (중배전),"[{'id': 31, 'title': '콜롬비아 후일라 SUPREMO 워시드 (중배..."


In [23]:
# Reminder of the review table layout before the per-member lookup
print(review_read.shape)
review_read.iloc[:5]

(9459, 16)


Unnamed: 0,idx,created_date,updated_date,acidity,bitterness,body,coffeeing_note,content,expired,flavor,item_idx,item_type,like,overall,sweetness,member_idx
0,1,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,7,6,8
1,2,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,8,6,11
2,3,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,5,6,29
3,4,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,8,6,37
4,5,,,7,4,4,"꽃, 과일, 부드러운, 플로럴, 레몬, 허니",커피가 괜찮아요,N,7,1,bean,0,8,6,38


In [24]:
# Item ids of the bean reviews written by member 1
is_member1_bean = (review_read['item_type'] == 'bean') & (review_read['member_idx'] == 1)
user_reviews = list(review_read.loc[is_member1_bean, 'item_idx'].values)
user_reviews

[3, 9, 20, 23, 28, 37, 41, 46]

In [25]:
def get_recom_by_user(userIdx, data, matrix, item_type='bean', k=5):
    """Pick up to ``k`` distinct recommendations for a member from their reviewed items.

    The recommendation lists stored for every item the member reviewed are pooled,
    de-duplicated by recommended id, and a random sample is drawn from the pool.

    Parameters
    ----------
    userIdx
        Member id; matched against ``data['member_idx']``.
    data : pandas.DataFrame
        Review table with the columns ``member_idx``, ``item_type`` and ``item_idx``.
    matrix : pandas.DataFrame
        Per-item recommendations with the columns ``idx`` and ``recommendation``.
        A ``recommendation`` cell is either a list of ``{'id', 'title'}`` dicts or
        the text form of such a list (as obtained after a CSV round trip).
    item_type : str, default 'bean'
        Only reviews of this item type are considered.
    k : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of dict
        At most ``k`` distinct ``{'id', 'title'}`` dicts in random order; empty when
        the member has no matching reviews or nothing is stored for them.
    """
    is_members_review = (data['member_idx'] == userIdx) & (data['item_type'] == item_type)
    reviewed_ids = data.loc[is_members_review, 'item_idx'].unique()

    # isin() skips reviewed items that have no stored row instead of raising KeyError.
    stored = matrix.loc[matrix['idx'].isin(reviewed_ids), 'recommendation']

    # Keyed by recommended id so each item appears once; a dict keeps insertion order,
    # which makes the pool deterministic before sampling.
    pool_by_id = {}
    for entry in stored:
        if isinstance(entry, str):
            # Cells read from CSV hold the Python repr of the list. literal_eval parses
            # it safely and copes with apostrophes inside titles, which a
            # quote-replace followed by json.loads does not.
            entry = ast.literal_eval(entry)
        elif not isinstance(entry, (list, tuple)):
            continue  # missing cell (e.g. NaN)
        for recom in entry:
            pool_by_id.setdefault(recom['id'], recom)

    # NOTE(review): items the member already reviewed are not excluded from the pool;
    # confirm whether that is intended.
    pool = list(pool_by_id.values())
    # sample() draws without replacement, so the same item cannot be returned twice.
    return random.sample(pool, min(max(int(k), 0), len(pool)))

In [26]:
# Example: recommendations for member 1 based on the beans they reviewed
recom_list = get_recom_by_user(
    userIdx=1,
    data=review_read,
    matrix=review_recom_read,
    item_type='bean',
)
recom_list

[{'id': 33, 'title': '브라질 산토스 NY2 FC 내추럴 (중배전)'},
 {'id': 36, 'title': '탄자니아 음빙가 AA 워시드 (강배전)'},
 {'id': 41, 'title': '온두라스 산티아고 푸링글라 SHB 워시드 (강배전)'},
 {'id': 13, 'title': '온두라스 엘 파라이소 SPECIALTY 워시드 (중배전)'},
 {'id': 2, 'title': '과테말라 안티구아 SHB 워시드 (강배전)'}]