In [1]:
import os
import os.path as path
import gc
import re
import math
import json
import random

In [2]:
from datetime import datetime
import time

In [3]:
import numpy as np
import pandas as pd
pd.set_option('display.max_row', 50)
pd.set_option('display.max_columns', 100)

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# 디렉토리 기본 경로 설정
DIR_PATH = path.join('..', 'data', 'sql_dummy')
DIR_SAVE_PATH = path.join('..', 'data', 'output')

print(DIR_PATH)
print(DIR_SAVE_PATH)

..\data\sql_dummy
..\data\output


In [6]:
# sql_bean.csv
bean_read = pd.read_csv(path.join(DIR_PATH, 'sql_bean.csv'), low_memory=False, encoding='cp949')

print(bean_read.shape)
bean_read.head()

(47, 8)


Unnamed: 0,idx,created_date,updated_date,name_ko,name_en,summary,thumbnail,user_grade
0,1,,,에티오피아 예가체프 G2 워시드 (중배전),에티오피아 예가체프 G2 워시드 (중배전),아이템 요약 영역 입니다!!!,default_bean.png,0
1,2,,,과테말라 안티구아 SHB 워시드 (강배전),과테말라 안티구아 SHB 워시드 (강배전),아이템 요약 영역 입니다!!!,default_bean.png,0
2,3,,,에티오피아 코케 G1 펄프드내추럴 (중배전),에티오피아 코케 G1 펄프드내추럴 (중배전),아이템 요약 영역 입니다!!!,default_bean.png,0
3,4,,,케냐 키암부 AA 워시드 (중배전),케냐 키암부 AA 워시드 (중배전),아이템 요약 영역 입니다!!!,default_bean.png,0
4,5,,,콜롬비아 콜롬비아 SUPREMO 워시드 (중배전),콜롬비아 콜롬비아 SUPREMO 워시드 (중배전),아이템 요약 영역 입니다!!!,default_bean.png,0


In [7]:
# sql_member.csv
member_read = pd.read_csv(path.join(DIR_PATH, 'sql_member.csv'), low_memory=False, encoding='cp949')

print(member_read.shape)
member_read.head()

(1000, 12)


Unnamed: 0,idx,created_date,updated_date,age_range,expired,gender,hashcode,member_email,nickname,profileImg,role,sns_type
0,1,2023-03-01 12:00:00.000000,,30~39,N,male,UADIR,jungjuon8355@gmail.com,정주온,http://k.kakaocdn.net/dn/dpk9l1/btqmGhA2lKL/Oz...,ROLE_MEMBER,KAKAO
1,2,2023-03-01 12:43:12.000000,,30~39,N,male,7ILKE,pjc8001@naver.com,박종찬,http://k.kakaocdn.net/dn/dpk9l1/btqmGhA2lKL/Oz...,ROLE_MEMBER,KAKAO
2,3,2023-03-01 13:26:24.000000,,20~29,N,male,Y4WZC,taehwan279@gmail.com,최태환,http://k.kakaocdn.net/dn/dpk9l1/btqmGhA2lKL/Oz...,ROLE_MEMBER,KAKAO
3,4,2023-03-01 14:09:36.000000,,30~39,N,female,TK7SR,minyeon6103@naver.com,신민연,http://k.kakaocdn.net/dn/dpk9l1/btqmGhA2lKL/Oz...,ROLE_MEMBER,KAKAO
4,5,2023-03-01 14:52:48.000000,,40~49,N,female,OHG9F,kwonyiyun8189@gmail.com,권예윤,http://k.kakaocdn.net/dn/dpk9l1/btqmGhA2lKL/Oz...,ROLE_MEMBER,KAKAO


In [8]:
# sql_like_list.csv
like_list_read = pd.read_csv(path.join(DIR_PATH, 'sql_like_list.csv'), low_memory=False, encoding='cp949')

print(like_list_read.shape)
like_list_read.head()

(9366, 7)


Unnamed: 0,idx,created_date,updated_date,expired,item_idx,item_type,member_idx
0,1,,,N,1,bean,3
1,2,,,N,1,bean,6
2,3,,,N,1,bean,10
3,4,,,N,1,bean,13
4,5,,,N,1,bean,14


In [9]:
member_like_df = pd.merge(like_list_read, member_read[['idx', 'age_range', 'gender']], left_on='member_idx', right_on='idx')
member_like_df = pd.merge(member_like_df, bean_read[['idx', 'name_ko']], left_on='item_idx', right_on='idx')
member_like_df = member_like_df[['item_idx', 'name_ko', 'member_idx', 'age_range', 'gender']]
member_like_df['count'] = 1
print(member_like_df.shape)
member_like_df.head()

(9366, 6)


Unnamed: 0,item_idx,name_ko,member_idx,age_range,gender,count
0,1,에티오피아 예가체프 G2 워시드 (중배전),3,20~29,male,1
1,1,에티오피아 예가체프 G2 워시드 (중배전),6,40~49,female,1
2,1,에티오피아 예가체프 G2 워시드 (중배전),10,20~29,male,1
3,1,에티오피아 예가체프 G2 워시드 (중배전),13,40~49,female,1
4,1,에티오피아 예가체프 G2 워시드 (중배전),14,40~49,male,1


In [10]:
member_like_df.loc[(member_like_df['age_range'] == '20~29') & (member_like_df['item_idx'] == 1)]['count'].sum()

104

In [11]:
def get_top_k(loc_index, matrix, items, axis=0, k=10):
    top_k_idx = []
    
    if axis == 0:
        top_k_idx = matrix.loc[loc_index].sort_values()[-k:].index
    else:
        top_k_idx = matrix.loc[:, loc_index].sort_values()[-k:].index
        
    try:
        top_k_idx = top_k_idx-1
        recom_id = items.iloc[top_k_idx, :].idx.values
        recom_title = items.iloc[top_k_idx, :].name_ko.values
    except:
        print(top_k_idx)
        print(recom_id, recom_title)
    
    recom_list = [dict(id = id, title = title) for id, title in zip(recom_id, recom_title)]
    
    return recom_list

In [12]:
gender_like_df = member_like_df.pivot_table(index=['gender'], columns=['item_idx'], values='count', aggfunc='sum')
gender_like_df.fillna(0, inplace=True)
gender_like_df.head()

item_idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1
female,107,103,85,110,108,98,120,100,86,109,98,99,107,110,104,99,90,105,117,89,103,89,108,122,101,94,110,107,98,98,95,112,99,104,97,112,104,109,105,103,86,93,115,107,110,101,114
male,101,98,96,87,106,88,76,92,103,85,85,100,97,97,90,79,98,100,97,94,91,87,108,103,116,108,93,88,95,99,101,101,97,96,96,99,92,90,118,88,101,89,93,113,94,96,105


In [13]:
get_top_k('male', gender_like_df, bean_read, k=5)

[{'id': 26, 'title': '에티오피아 시다모 G2 워시드 디카페인 (중배전)'},
 {'id': 23, 'title': '베트남 베트남 G1 워시드 (중배전)'},
 {'id': 44, 'title': '콜롬비아 메데인 SUPREMO 워시드 디카페인 (강배전)'},
 {'id': 25, 'title': '인도 치크마갈루르 AA 워시드 (중배전)'},
 {'id': 39, 'title': '파푸아뉴기니 와기벨리 AA 워시드 (강배전)'}]

In [14]:
# 좋아요 합계 기준으로 성별 추천 원두의 상위 5개를 출력
recom_df = pd.DataFrame([[i + 1, age_range] for i, age_range in enumerate(gender_like_df.index)], columns=['idx', 'gender'])
recom_df['recommendation'] = recom_df.apply(lambda x: get_top_k(x.gender, gender_like_df, bean_read, k=5), axis=1)
print(recom_df.shape)
recom_df.head()

(2, 3)


Unnamed: 0,idx,gender,recommendation
0,1,female,"[{'id': 47, 'title': '브라질 산토스 NY2 FC 워시드 디카페인 ..."
1,2,male,"[{'id': 26, 'title': '에티오피아 시다모 G2 워시드 디카페인 (중..."


In [15]:
# 파일 저장
os.makedirs(DIR_SAVE_PATH, exist_ok=True)
recom_df.to_csv(path.join(DIR_SAVE_PATH, 'like_recom_by_gender.csv'), sep=',', index=False)

In [16]:
# like_recom_by_gender.csv
recom_read = pd.read_csv(path.join(DIR_SAVE_PATH, 'like_recom_by_gender.csv'), low_memory=False)
print(recom_read.shape)
recom_read.head()

(2, 3)


Unnamed: 0,idx,gender,recommendation
0,1,female,"[{'id': 47, 'title': '브라질 산토스 NY2 FC 워시드 디카페인 ..."
1,2,male,"[{'id': 26, 'title': '에티오피아 시다모 G2 워시드 디카페인 (중..."


In [17]:
def get_recom_by_gender(gender, matrix, k=5):
    try:
        recom_list = matrix.set_index('gender').loc[gender]['recommendation']
        recom_list = json.loads(recom_list.replace('\'', '\"'))
        recom_list = [dict(t) for t in {tuple(d.items()) for d in recom_list}]
        
    except:
        print(itemIdx)
        print(recom_list)
        
    return recom_list[:k]

In [18]:
recom_list = get_recom_by_gender('male', recom_read)
recom_list

[{'id': 26, 'title': '에티오피아 시다모 G2 워시드 디카페인 (중배전)'},
 {'id': 39, 'title': '파푸아뉴기니 와기벨리 AA 워시드 (강배전)'},
 {'id': 44, 'title': '콜롬비아 메데인 SUPREMO 워시드 디카페인 (강배전)'},
 {'id': 25, 'title': '인도 치크마갈루르 AA 워시드 (중배전)'},
 {'id': 23, 'title': '베트남 베트남 G1 워시드 (중배전)'}]