# Amazon food data 를 이용해 food 추천모델 만들기
# 데이터 url: https://nijianmo.github.io/amazon/index.html

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import os


# Locate the ratings CSV under the user's home directory.
# os.path.expanduser('~') is used instead of os.getenv('HOME') + '...':
# getenv returns None when HOME is unset, which made the concatenation
# raise TypeError.
rating_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'H3', 'foodrating', 'amazondata.csv'
)

# The file is read with header=None, so the four columns are named by hand.
data1 = pd.read_csv(rating_file_path, header=None, index_col=None)
data1.columns = ['ProductId', 'UserId', 'Rating', 'TimeStamp']
data1.head()
data1.tail()

Unnamed: 0,ProductId,UserId,Rating,TimeStamp
5074155,B01HJHSVG6,AGOPF2VTEOP57,5.0,1538611200
5074156,B01HJHSVG6,A5X5TI4JCH9CN,3.0,1538524800
5074157,B01HJHSVG6,A1WMQQLC3TVAFI,4.0,1538438400
5074158,B01HJHSVG6,ALVP6JZRTEDY6,5.0,1538265600
5074159,B01HJHSVG6,AZFPVUZOVGBYR,5.0,1538179200


In [2]:
# Count the distinct ProductId values in the ratings data.
data1["ProductId"].nunique()

283507

# 크게 두가지 모델로 나눌수 있습니다 : 
# 1) 아이템 기반 추천시스템 2) 유저 기반 추천시스템


# 1. 아이템(ProductId) 기반 추천

In [3]:
# Insert a copy of the ProductId column as a new column 'A' at position 2 (in place).
# NOTE(review): DataFrame.insert raises ValueError when the column already
# exists, so re-running this cell fails; 'A' also appears to be unused
# afterwards — confirm it is needed.
data1.insert(loc=2, column = 'A', value = data1['ProductId'])
data1.tail()

Unnamed: 0,ProductId,UserId,A,Rating,TimeStamp
5074155,B01HJHSVG6,AGOPF2VTEOP57,B01HJHSVG6,5.0,1538611200
5074156,B01HJHSVG6,A5X5TI4JCH9CN,B01HJHSVG6,3.0,1538524800
5074157,B01HJHSVG6,A1WMQQLC3TVAFI,B01HJHSVG6,4.0,1538438400
5074158,B01HJHSVG6,ALVP6JZRTEDY6,B01HJHSVG6,5.0,1538265600
5074159,B01HJHSVG6,AZFPVUZOVGBYR,B01HJHSVG6,5.0,1538179200


In [4]:
# Average the numeric columns (Rating, TimeStamp) per product.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops the
# string columns (UserId, A) silently and raises TypeError instead.
data1 = data1.groupby(['ProductId'], as_index=False).mean(numeric_only=True)
data1

Unnamed: 0,ProductId,Rating,TimeStamp
0,0681727810,5.000000,1.451477e+09
1,0853347867,5.000000,1.508069e+09
2,1888861118,4.375000,1.427944e+09
3,1888861207,4.000000,1.337429e+09
4,1888861339,5.000000,1.322698e+09
...,...,...,...
283502,B01HJF6FRA,4.761905,1.490412e+09
283503,B01HJFBN9A,4.500000,1.526947e+09
283504,B01HJFC704,3.500000,1.517918e+09
283505,B01HJFRDDA,5.000000,1.520770e+09


In [5]:
# Give every product row a sequential integer id starting at 1.
data1['newId'] = range(1, len(data1) + 1)
data1

Unnamed: 0,ProductId,Rating,TimeStamp,newId
0,0681727810,5.000000,1.451477e+09,1
1,0853347867,5.000000,1.508069e+09,2
2,1888861118,4.375000,1.427944e+09,3
3,1888861207,4.000000,1.337429e+09,4
4,1888861339,5.000000,1.322698e+09,5
...,...,...,...,...
283502,B01HJF6FRA,4.761905,1.490412e+09,283503
283503,B01HJFBN9A,4.500000,1.526947e+09,283504
283504,B01HJFC704,3.500000,1.517918e+09,283505
283505,B01HJFRDDA,5.000000,1.520770e+09,283506


# 끝으로 가면 알파벳과 숫자가 섞인 상품 일련번호들이 나옵니다. 이것들을 숫자로 변경해줍니다.
def is_integer(n):
    """Return True if ``int(n)`` succeeds, False otherwise.

    Both ValueError (e.g. ``"B"``) and TypeError (e.g. ``None``) count as
    "not an integer", so the function never raises for unconvertible input.
    """
    try:
        int(n)
    except (TypeError, ValueError):
        return False
    return True


def string2int(a):
    """Encode an alphanumeric ID string as a single integer.

    Each non-digit character is replaced by its code point (``ord``) and each
    digit character is kept as its numeric value. The encoded non-digit
    characters come first, followed by the digits, each group in its original
    order; the pieces are concatenated as decimal text and parsed as an int.

    The encoding is not injective: for example ``"A1B"`` and ``"AB1"`` both
    map to ``65661``.

    Args:
        a: the ID string to encode.

    Returns:
        The integer encoding of ``a``.

    Raises:
        ValueError: if ``a`` is empty.
    """
    letter_codes = []
    digits = []
    for ch in a:
        # Test the raw character. The previous code called int(ch) before the
        # check, which raised ValueError on every letter.
        if is_integer(ch):
            digits.append(int(ch))
        else:
            letter_codes.append(ord(ch))
    pieces = letter_codes + digits
    if not pieces:
        raise ValueError("cannot encode an empty ID")
    return int(''.join(str(piece) for piece in pieces))


#for i in data1["ProductId"]:
   # a= []
   # if is_integer(int(data1["ProductId"][i])) == True:
        # a.append(data1["ProductId"])
# print(len(a))

from tqdm import tqdm

# Convert every ProductId to its integer encoding, with a progress bar.
# The column is rebuilt in one assignment: the previous per-row chained write
# (data1['ProductId'][i] = ...) triggers SettingWithCopyWarning and does not
# modify the frame when pandas copy-on-write is enabled.
data1['ProductId'] = [string2int(pid) for pid in tqdm(data1['ProductId'])]

In [6]:
# Drop the ProductId and TimeStamp columns from the per-product frame.
data1 = data1.drop(columns = ['ProductId', 'TimeStamp'])

# Similar items = items with similar ratings

from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): with a single input, cosine_similarity returns a dense
# n_samples x n_samples array; for the ~283k product rows shown above that is
# roughly 8e10 entries — confirm this is feasible before running.
product_based = cosine_similarity(data1)
product_based

In [7]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds  # NOTE(review): not used in the visible code

# Project every row of data1 onto a single component with truncated SVD.
SVD = TruncatedSVD(n_components = 1)
matrix = SVD.fit_transform(data1)
matrix.shape

(283507, 1)

In [8]:
# Show the reduced representation of the first row.
matrix[0]

array([1.00011053])

In [9]:
# Keep only the rows whose (mean) Rating is exactly 5.
data2 = data1.loc[data1['Rating'].eq(5)]
data2

Unnamed: 0,Rating,newId
0,5.0,1
1,5.0,2
4,5.0,5
6,5.0,7
7,5.0,8
...,...,...
283490,5.0,283491
283494,5.0,283495
283496,5.0,283497
283501,5.0,283502


In [10]:
#item_based = cosine_similarity

# 2. 유저기반 추천시스템

In [11]:
# Reload the raw ratings for the user-based part; the file is read with
# header=None, so the four columns are named by hand.
user_df = pd.read_csv(rating_file_path, header = None, index_col = None)
user_df.columns = ['ProductId','UserId','Rating','TimeStamp']
user_df.head()

Unnamed: 0,ProductId,UserId,Rating,TimeStamp
0,1888861614,ALP49FBWT4I7V,5.0,1370304000
1,1888861614,A1KPIZOCLB9FZ8,4.0,1400803200
2,1888861614,A2W0FA06IYAYQE,4.0,1399593600
3,1888861614,A2PTZTCH2QUYBC,5.0,1397952000
4,1888861614,A2VNHGJ59N4Z90,4.0,1397606400


In [12]:
# Count the distinct UserId values in the ratings data.
user_df["UserId"].nunique()

2695974

# 인기 많은 음식들 (top 30)

In [13]:
# Number of non-null UserId entries (i.e. ratings) per product, then the 30 largest counts.
product_count = user_df.groupby('ProductId')['UserId'].count()
product_count.sort_values(ascending = False).head(30)

ProductId
B00BUKL666    11526
B00542YXFW     9083
B008QMX2SG     8903
B00D3M2QP4     8880
B000YN2GVY     7400
B000X3TPHS     7310
B01E5XTW24     7001
B000F4DKAI     6862
B0001LO3FG     6858
B000EVMNMI     6323
B002HQCWYM     6179
B00DS842HS     6131
B000Z93FQC     5980
B00CPZPYLS     5591
B000H2XXRS     5562
B00PFDH0IC     5507
B00EDHW7K2     5393
B00C1LXBFC     5311
B00M2OGS08     5251
B003OGKCDC     5012
B00XA8XWGS     4870
B005K4Q1T0     4838
B007JINB0W     4738
B006CNTR6W     4670
B00KCCKV8W     4635
B006IOKA9S     4631
B00WBUX2UM     4622
B00R7PWK7W     4617
B0014WYXYW     4550
B0010BQB6A     4320
Name: UserId, dtype: int64

# ----------------------------------------------------------------------

# 1 위 https://www.amazon.com/dp/B00BUKL666
# 2 위 https://www.amazon.com/dp/B00542YXFW
# 3 위 https://www.amazon.com/dp/B008QMX2SG

# ----------------------------------------------------------------------

# 3. 내가 선호하는 식품 5가지 골라서 rating 에 추가

In [14]:

# Five products I like; they are added to the ratings as a synthetic user.
my_favorite = ['B00WBUX2UM', 'B00BUKL666', 'B008QMX2SG',
               'B00PFDH0IC', 'B000H2XXRS']

# One 5-star rating per favorite, built from my_favorite so the two lists
# cannot drift apart.
my_foodlist = pd.DataFrame({
    'UserId': ['kyuhwan'] * len(my_favorite),
    'Rating': [5] * len(my_favorite),
    'TimeStamp': [12345678] * len(my_favorite),
    'ProductId': my_favorite,
})

# Add the rows only once, so re-running the cell does not duplicate them.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
if not user_df['UserId'].eq('kyuhwan').any():
    user_df = pd.concat([user_df, my_foodlist])
user_df.tail(10)

Unnamed: 0,ProductId,UserId,Rating,TimeStamp
5074155,B01HJHSVG6,AGOPF2VTEOP57,5.0,1538611200
5074156,B01HJHSVG6,A5X5TI4JCH9CN,3.0,1538524800
5074157,B01HJHSVG6,A1WMQQLC3TVAFI,4.0,1538438400
5074158,B01HJHSVG6,ALVP6JZRTEDY6,5.0,1538265600
5074159,B01HJHSVG6,AZFPVUZOVGBYR,5.0,1538179200
0,B00WBUX2UM,kyuhwan,5.0,12345678
1,B00BUKL666,kyuhwan,5.0,12345678
2,B008QMX2SG,kyuhwan,5.0,12345678
3,B00PFDH0IC,kyuhwan,5.0,12345678
4,B000H2XXRS,kyuhwan,5.0,12345678


In [15]:
# Collect the distinct users and products.
user_unique = user_df['UserId'].unique()
product_unique = user_df['ProductId'].unique()

# Map each raw ID to its position in the unique array ("idx" is short for index).
user_to_idx = dict(zip(user_unique, range(len(user_unique))))
product_to_idx = dict(zip(product_unique, range(len(product_unique))))

print(user_to_idx['kyuhwan'])

2695974


In [16]:
# CSR Matrix 를 직접 만들어 보기


# Replace the raw ID values in the data columns with their integer indices.
# Map every UserId through user_to_idx.get to obtain a Series of indices.

temp_user_data = user_df['UserId'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(user_df):   # every row was indexed successfully
    print('UserId column indexing OK!!')
    user_df['UserId'] = temp_user_data   # replace the UserId column with the indexed Series
else:
    print('UserId column indexing Fail!!')

# Index the ProductId column the same way using product_to_idx.

temp_product_data = user_df['ProductId'].map(product_to_idx.get).dropna()
if len(temp_product_data) == len(user_df):
    print('Product column indexing OK!!')
    user_df['ProductId'] = temp_product_data
else:
    print('Product column indexing Fail!!')
    

user_df

UserId column indexing OK!!
Product column indexing OK!!


Unnamed: 0,ProductId,UserId,Rating,TimeStamp
0,0,0,5.0,1370304000
1,0,1,4.0,1400803200
2,0,2,4.0,1399593600
3,0,3,5.0,1397952000
4,0,4,4.0,1397606400
...,...,...,...,...
0,52430,2695974,5.0,12345678
1,32660,2695974,5.0,12345678
2,36542,2695974,5.0,12345678
3,48734,2695974,5.0,12345678


# Compressed Sparse Row Matrix

유저 x 아이템 평가행렬 -> 어림잡아도 엄청난 메모리가 필요하다 (600GB 이상)

유저가 좋아하지 않는 아이템에 대한 정보까지 모두 행렬에 포함되어 계산되기 때문.

평가행렬 내의 대부분의 공간은 0으로 채워짐. 이를 Sparse Matrix 라고 부름.

이런 메모리 낭비를 최소화하기 위해 유저가 좋아하는 아이템에 대한 정보만을 저장하면서

전체 행렬 형태를 유추할 수 있는 데이터 구조가 필요하다.

# Sparse Matrix 에 관한 URL :

1) https://stackoverflow.com/questions/53254104/cant-understand-scipy-sparse-csr-matrix-example/62118005#62118005

2) https://lovit.github.io/nlp/machine%20learning/2018/04/09/sparse_mtarix_handling/#csr-matrix

In [17]:
# Build the CSR matrix
from scipy.sparse import csr_matrix

# Matrix dimensions: number of distinct users x number of distinct products.
num_user = user_df['UserId'].nunique()
num_product = user_df['ProductId'].nunique()

# Rows are user indices, columns are product indices, values are ratings.
# NOTE(review): scipy sums entries that share the same (row, col) pair, so
# repeated ratings of one product by one user would be added together —
# confirm whether the data should be de-duplicated first.
csr_data = csr_matrix((user_df['Rating'], (user_df.UserId, user_df.ProductId)), shape= (num_user, num_product))
csr_data

<2695975x283507 sparse matrix of type '<class 'numpy.float64'>'
	with 4889629 stored elements in Compressed Sparse Row format>

# Matrix Factorization 모델 학습

implicit 이라는 패키지를 이용

ALS(AlternatingLeastSquares) 모델을 사용. Matrix Factorization 에서 쪼개진 두 Feature Matrix 를 한꺼번에 훈련하는 것은 잘 수렴하지 않기 때문에,
한쪽을 고정시키고 다른 쪽을 학습하는 방식을 번갈아 수행하는 ALS 방식이 효과적임

In [32]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# Thread-related settings recommended by the implicit library.
# NOTE(review): these variables are presumably read when the BLAS library is
# loaded, so they may need to be set before numpy is first imported — verify.
# The OpenBLAS variable is OPENBLAS_NUM_THREADS; the previous spelling
# 'OPENBLAS_NUM_THREAD' is not a name OpenBLAS looks up.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
os.environ['MKL_NUM_THREADS'] = '1'

# Declare the implicit AlternatingLeastSquares model.
#
# Constructor parameters used here:
#   factors        - dimensionality of the user and item vectors
#   regularization - regularization strength used to limit overfitting
#   use_gpu        - whether to train on the GPU
#   iterations     - number of passes over the data (comparable to epochs)
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    use_gpu=False,
    iterations=15,
    dtype=np.float32,
)

# The ALS model is fed an item x user matrix here, so the user x item matrix
# is transposed.
# NOTE(review): newer implicit releases expect user x item in fit() — confirm
# against the installed version.
csr_data_transpose = csr_data.transpose()
csr_data_transpose

<283507x2695975 sparse matrix of type '<class 'numpy.float64'>'
	with 4889629 stored elements in Compressed Sparse Column format>

In [20]:
# Train the ALS model on the transposed (item x user) matrix.
als_model.fit(csr_data_transpose)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [21]:
# Print the integer index assigned to my synthetic user.
print(user_to_idx['kyuhwan'])

2695974


# 4.  내가 선호하는 5가지 음식 중 하나와 그 외의 음식 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해보기

In [22]:
# Look up the indices of my user and of product B00BUKL666 ...
kyuhwan = user_to_idx['kyuhwan']
kindbar = product_to_idx['B00BUKL666']

# ... and fetch the corresponding factor vectors from the trained model.
kyuhwan_vector = als_model.user_factors[kyuhwan]
kindbar_vector = als_model.item_factors[kindbar]

In [23]:
# Display my user factor vector.
kyuhwan_vector

array([ 0.01129489,  0.04096334, -0.00577985, -0.00590205,  0.07755878,
       -0.06322526, -0.00510072, -0.00418894,  0.11207666,  0.01770949,
       -0.01975668, -0.01212848,  0.00103267,  0.0133176 , -0.03333454,
       -0.0363686 , -0.00903931, -0.07347597, -0.04957177,  0.00277478,
        0.01253075,  0.05073196,  0.03608559,  0.05735427, -0.02375843,
        0.01799873,  0.01340177,  0.02693825, -0.01712429, -0.0433847 ,
        0.03641789, -0.01008456, -0.00330016, -0.00847558, -0.00263662,
       -0.00396724,  0.00656064, -0.00924345,  0.04775057,  0.02061106,
        0.00468099,  0.03317811, -0.01896046,  0.00097972, -0.04447523,
        0.00487354,  0.0083517 ,  0.02071826,  0.06002178, -0.01761264,
       -0.00336663, -0.00068119, -0.0123321 ,  0.00154238, -0.01348539,
       -0.03668587,  0.01025478, -0.01293186,  0.02776764, -0.01906144,
       -0.04083011,  0.00432826, -0.01512842, -0.05008803, -0.01569166,
       -0.00785131, -0.004356  , -0.02313512, -0.00979655,  0.00

In [24]:
# Display the item factor vector of product B00BUKL666.
kindbar_vector

array([ 0.57114923, -0.09055092,  0.44896844, -0.5552083 ,  1.3209987 ,
       -0.4251691 , -1.271683  , -0.2908415 ,  1.35349   , -0.07170092,
        0.4280137 , -0.30975282, -0.20632899, -0.25358474, -0.5318306 ,
        0.4074585 , -0.79447085, -0.9576449 , -0.6319869 ,  0.5025351 ,
        0.45968324,  0.7326911 ,  0.68397427, -0.5509796 , -0.42640266,
        0.29005158,  0.8404033 ,  0.760751  , -0.09723096,  0.1352186 ,
        0.7184506 ,  0.30158418,  0.16671288,  0.08700465, -0.1442102 ,
        0.25285956,  0.7402699 ,  0.01570844,  0.10847622,  0.89266086,
        0.59716946,  0.705307  , -0.25042567, -0.47973546, -1.3944639 ,
        0.5355415 , -0.15178743,  0.5796125 ,  0.313731  , -1.0108156 ,
        0.32240164,  0.63729715,  0.21237078, -0.11833993,  0.20344071,
       -0.00333385, -0.5645638 ,  0.5032024 ,  0.30253834, -0.2504076 ,
       -0.5174136 , -0.03737095, -1.073693  , -0.04242508, -0.8200378 ,
       -1.3057884 ,  0.07404641, -0.52277184, -0.25162068, -1.29

In [25]:
# Dot product of the kyuhwan user vector and the Kind Bar item vector

np.dot(kyuhwan_vector, kindbar_vector)

0.90133846

In [26]:
# How does the model score my preference for Coconut Oil?

coconut = product_to_idx['B000H2XXRS']
coconut_vector = als_model.item_factors[coconut]
np.dot(kyuhwan_vector, coconut_vector)

0.83609337

# 5. 내가 좋아하는 음식과 비슷한 음식을 추천받아 봅시다.

In [27]:
# Find foods similar to coconut oil

favorite_food = 'B000H2XXRS'
product_id = product_to_idx[favorite_food]

# Ask the model for the 15 items most similar to this product index.
similar_food = als_model.similar_items(product_id, N=15)
similar_food

[(3548, 0.9999998),
 (100039, 0.99617106),
 (89510, 0.9961059),
 (134233, 0.9960929),
 (235822, 0.9955393),
 (207697, 0.99551946),
 (76432, 0.9954799),
 (176812, 0.9954417),
 (153831, 0.9954204),
 (61598, 0.99511987),
 (194043, 0.99485576),
 (157048, 0.9948324),
 (138715, 0.99476665),
 (125536, 0.9947575),
 (225286, 0.9947118)]

In [28]:
# Show the ASINs: invert product_to_idx, then translate each
# (index, score) pair back to its product ID.

idx_to_product = dict(zip(product_to_idx.values(), product_to_idx.keys()))
[idx_to_product[item_idx] for item_idx, _score in similar_food]

['B000H2XXRS',
 'B003ZFL40W',
 'B001SAU20E',
 'B007Z91P62',
 'B00SVS6VR4',
 'B00KO9APFW',
 'B0012AOJ22',
 'B00FA8NKCM',
 'B00BOUQHK2',
 'B0000D9N8U',
 'B00IBS0H0Q',
 'B00C87I7U8',
 'B0094EPG2A',
 'B006J2T3C4',
 'B00ONGQ4IO']

# 그럴싸한 추천들:
# https://www.amazon.com/dp/B007Z91P62
# https://www.amazon.com/dp/B01D6CW1SW
# https://www.amazon.com/dp/B006J2T3C4


# 다소 이상한 추천들:
# https://www.amazon.com/dp/B0094EPG2A
# https://www.amazon.com/dp/B00FA8NKCM
# https://www.amazon.com/dp/B00EMZZHWK
# https://www.amazon.com/dp/B00IBS0H0Q


# 6. 내가 좋아할만한 음식들을 추천받아보기

In [29]:

# Get food recommendations

user = user_to_idx['kyuhwan']

# recommend takes the user x item CSR matrix.

# Ask for 20 items, excluding the ones I have already rated.
food_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items = True)
food_recommended

[(34929, 0.7286428),
 (49753, 0.57969815),
 (44800, 0.5617294),
 (50708, 0.35929188),
 (36231, 0.27760273),
 (42423, 0.23773493),
 (53622, 0.22054072),
 (40653, 0.21935567),
 (10085, 0.21340473),
 (3283, 0.20488684),
 (44736, 0.20325615),
 (24928, 0.18187878),
 (8487, 0.17946078),
 (40282, 0.15845677),
 (60627, 0.15500343),
 (2145, 0.14880371),
 (14073, 0.1423859),
 (22958, 0.14209083),
 (55248, 0.13939375),
 (39084, 0.1367101)]

In [30]:
# Translate each recommended (index, score) pair back to its product ID.
[idx_to_product[item_idx] for item_idx, _score in food_recommended]

['B00D3M2QP4',
 'B00R7PWK7W',
 'B00KSN9TME',
 'B00TBUHRRO',
 'B00E1P4L1Y',
 'B00IZL255O',
 'B00ZEA5ESY',
 'B00HNTPF7E',
 'B001E5E0D8',
 'B000GAT6NG',
 'B00KRFLDBS',
 'B006IOKA9S',
 'B0015DGDR0',
 'B00H889MGK',
 'B01G0S3Y44',
 'B000EDDS6Q',
 'B0027Z5J6G',
 'B005K4Q1YA',
 'B014LT0712',
 'B00G6I14Y6']

# 그럴싸한 추천들:

# https://www.amazon.com/dp/B00R7PWK7W
# https://www.amazon.com/dp/B00KSN9TME
# https://www.amazon.com/dp/B00CZ2KYRU
# https://www.amazon.com/dp/B00MYRXIIS
# https://www.amazon.com/dp/B0027Z5J6G
# https://www.amazon.com/dp/B000GW0U9I




In [31]:
# How much each item contributed to this recommendation (Coconut Oil case)

coconut = product_to_idx['B000H2XXRS']
explain = als_model.explain(user, csr_data, itemid = coconut)

# explain[1] is iterated as (item index, contribution) pairs; map each index back to its product ID.
[(idx_to_product[i[0]], i[1]) for i in explain [1]]

[('B000H2XXRS', 0.836853716456697),
 ('B00BUKL666', 0.008877806120408088),
 ('B00WBUX2UM', 0.006538893398980163),
 ('B008QMX2SG', -0.005890761459414735),
 ('B00PFDH0IC', -0.011400553289685271)]