In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazonfooddataset/Grocery_and_Gourmet_Food.csv


# This notebook is for the people who want to get a basic concept of recommendation system. 
# I used the Amazon Food rating data (from 1996 to 2018)


# Dataset used : https://nijianmo.github.io/amazon/index.html

* There are many types of datasets on the website. The dataset I used was a simple version which only contains the ratings. 
* This 'Grocery and Gourmet food' data includes reviews in the range May 1996 - Oct 2018.
* Please read the description on the website above for more details.

In [2]:
data = pd.read_csv('/kaggle/input/amazonfooddataset/Grocery_and_Gourmet_Food.csv')
data

Unnamed: 0,1888861614,ALP49FBWT4I7V,5.0,1370304000
0,1888861614,A1KPIZOCLB9FZ8,4.0,1400803200
1,1888861614,A2W0FA06IYAYQE,4.0,1399593600
2,1888861614,A2PTZTCH2QUYBC,5.0,1397952000
3,1888861614,A2VNHGJ59N4Z90,4.0,1397606400
4,1888861614,ATQL0XOLZNHZ4,1.0,1392940800
...,...,...,...,...
5074154,B01HJHSVG6,AGOPF2VTEOP57,5.0,1538611200
5074155,B01HJHSVG6,A5X5TI4JCH9CN,3.0,1538524800
5074156,B01HJHSVG6,A1WMQQLC3TVAFI,4.0,1538438400
5074157,B01HJHSVG6,ALVP6JZRTEDY6,5.0,1538265600


Since the raw data has no column names, I put the column names (based on the data description page) manually. 

In [3]:
data = pd.read_csv('/kaggle/input/amazonfooddataset/Grocery_and_Gourmet_Food.csv', header = None, index_col = None)
data.columns = ['ProductId', 'UserId', 'Rating', 'Timestamp']
data

Unnamed: 0,ProductId,UserId,Rating,Timestamp
0,1888861614,ALP49FBWT4I7V,5.0,1370304000
1,1888861614,A1KPIZOCLB9FZ8,4.0,1400803200
2,1888861614,A2W0FA06IYAYQE,4.0,1399593600
3,1888861614,A2PTZTCH2QUYBC,5.0,1397952000
4,1888861614,A2VNHGJ59N4Z90,4.0,1397606400
...,...,...,...,...
5074155,B01HJHSVG6,AGOPF2VTEOP57,5.0,1538611200
5074156,B01HJHSVG6,A5X5TI4JCH9CN,3.0,1538524800
5074157,B01HJHSVG6,A1WMQQLC3TVAFI,4.0,1538438400
5074158,B01HJHSVG6,ALVP6JZRTEDY6,5.0,1538265600


In [4]:
# Number of unique product id - 283,507 

data['ProductId'].nunique()

283507

There could be two kinds of recommendation systems for the data like this:

# 1. ProductId-based recommendation
* We can use cosine_similarity package which is provided from sklearn (from sklearn.metrics.pairwise import cosine_similarity)
* In this case, the similarity of the products would be calculated by each product's rating. In other words, the recommendation based on the ProductId is not a good approach since a rating doesn't represent the whole product feature.

# 2. UserId-based recommendation

* So we use 'UserId-based recommendation' for this data

In [5]:
# Number of UserId - 2,695,974 

data['UserId'].nunique()

2695974

# Top 30 Amazon Grocery & Gourmet food (1996~2018)
* ProductId is ASIN number of Amazon product. Since the 'metadata' is too large to open here (and also, this notebook is just for showing the concept of recommendation system) I manually show the product by typing the ProductId on www.amazon.com/dp/ProductId

In [6]:
product_count = data.groupby('ProductId')['UserId'].count()
product_count.sort_values(ascending = False).head(30)

ProductId
B00BUKL666    11526
B00542YXFW     9083
B008QMX2SG     8903
B00D3M2QP4     8880
B000YN2GVY     7400
B000X3TPHS     7310
B01E5XTW24     7001
B000F4DKAI     6862
B0001LO3FG     6858
B000EVMNMI     6323
B002HQCWYM     6179
B00DS842HS     6131
B000Z93FQC     5980
B00CPZPYLS     5591
B000H2XXRS     5562
B00PFDH0IC     5507
B00EDHW7K2     5393
B00C1LXBFC     5311
B00M2OGS08     5251
B003OGKCDC     5012
B00XA8XWGS     4870
B005K4Q1T0     4838
B007JINB0W     4738
B006CNTR6W     4670
B00KCCKV8W     4635
B006IOKA9S     4631
B00WBUX2UM     4622
B00R7PWK7W     4617
B0014WYXYW     4550
B0010BQB6A     4320
Name: UserId, dtype: int64

# FYI

* Number 1 product : www.amazon.com/dp/B00BUKL666 (KIND healthy grains bar. Not suprised)

* Number 2 product : www.amazon.com/dp/B00542YXFW (a bag of some tea. Interesting)

* Number 3 product : www.amazon.com/dp/B008QMX2SG (another category for KIND healthy snack)

* Number 4 product : www.amazon.com/dp/B00D3M2QP4 (again, another category for KIND healthy snack. Please note that I'm not related with the brand at all.)

* Number 5 product : www.amazon.com/dp/B000YN2GVY (Organic unfiltered apple cider vinegar. Many people(especially Americans) like apple cider vinegar. Not surprised)




# Now, I am adding myself to the data and will see how the model recommends foods for me

In [7]:
# First, I have to lookup the product I want to add before I add it into the data (because the data don't have product launched after 2018)
# I used the ASIN code for Monster Energy Zero Ultra(B00MEFXEB6) and it was in the data.

np.where(data.ProductId=='B00MEFXEB6')

(array([4734160, 4734161, 4734162, 4734163, 4734164, 4734165, 4734166,
        4734167]),)

In [8]:
# My top 5 favorite foods converted to ASIN number (most of those selected from Top 30)

# Energy drink, Coconut Oil,Sparkling Juice, Matcha Green Tea Powder, Beef Jerkey

my_favorite = ['B00MEFXEB6', 'B000H2XXRS', 'B0014WYXYW','B00PFDH0IC', 'B000GW0U9I']
my_foodlist = pd.DataFrame({'UserId' : ['yohann'] * 5, 'Rating' : [5] * 5 , 'Timestamp' : [12345678] *5, 'ProductId': ['B00MEFXEB6', 'B000H2XXRS', 'B0014WYXYW','B00PFDH0IC', 'B000GW0U9I']})


In [9]:
if not data.isin({'UserId' : ['yohann']})['UserId'].any():
    data = data.append(my_foodlist)
    
data.tail(10)

Unnamed: 0,ProductId,UserId,Rating,Timestamp
5074155,B01HJHSVG6,AGOPF2VTEOP57,5.0,1538611200
5074156,B01HJHSVG6,A5X5TI4JCH9CN,3.0,1538524800
5074157,B01HJHSVG6,A1WMQQLC3TVAFI,4.0,1538438400
5074158,B01HJHSVG6,ALVP6JZRTEDY6,5.0,1538265600
5074159,B01HJHSVG6,AZFPVUZOVGBYR,5.0,1538179200
0,B00MEFXEB6,yohann,5.0,12345678
1,B000H2XXRS,yohann,5.0,12345678
2,B0014WYXYW,yohann,5.0,12345678
3,B00PFDH0IC,yohann,5.0,12345678
4,B000GW0U9I,yohann,5.0,12345678


In [10]:
# Finding an unique user and product

user_unique = data['UserId'].unique()
product_unique = data['ProductId'].unique()

# indexing the users and products

user_to_idx = {v:k for k,v in enumerate(user_unique)}
product_to_idx = {v:k for k,v in enumerate(product_unique)}

print(user_to_idx['yohann'])

2695974


# building a CSR Matrix

In [11]:
# Indexing

temp_user_data = data['UserId'].map(user_to_idx.get).dropna()

if len(temp_user_data) == len(data):   
    print('UserId Column Indexing Completed.')
    data['UserId'] = temp_user_data   
else:
    print('UserId Column Indexing Failed.')

# Same on the product

temp_product_data = data['ProductId'].map(product_to_idx.get).dropna()
if len(temp_product_data) == len(data):
    print('Product Column Indexing Completed.')
    data['ProductId'] = temp_product_data
else:
    print('Product Column Indexing Failed.')
    

data

UserId Column Indexing Completed.
Product Column Indexing Completed.


Unnamed: 0,ProductId,UserId,Rating,Timestamp
0,0,0,5.0,1370304000
1,0,1,4.0,1400803200
2,0,2,4.0,1399593600
3,0,3,5.0,1397952000
4,0,4,4.0,1397606400
...,...,...,...,...
0,215176,2695974,5.0,12345678
1,3548,2695974,5.0,12345678
2,8407,2695974,5.0,12345678
3,48734,2695974,5.0,12345678


# Compressed Sparse Row Matrix

UserId x Product matrix requires huge memories. 

Furthermore, the calculation will also contains the product information that the user doesn't like.

So, in order to minimize the memory loss, I only used the product that the user likes. 


In [12]:
#CSR MATRIX

from scipy.sparse import csr_matrix

num_user = data['UserId'].nunique()
num_product = data['ProductId'].nunique()

csr_data = csr_matrix((data['Rating'], (data.UserId, data.ProductId)), shape= (num_user, num_product))
csr_data

<2695975x283507 sparse matrix of type '<class 'numpy.float64'>'
	with 4889629 stored elements in Compressed Sparse Row format>

# Using ALS (Alternating Least Squares) model

I used the package called 'implicit' (als model)

Two feature matrices came from Matrix Factorization are hard to train all at once. So this ALS model fixes one side and train the other side alternately.

In [13]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# recommended part from the implicit package:

os.environ['OPENBLAS_NUM_THREAD'] = '1'
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
os.environ['MKL_NUM_THREADS'] = '1'


# ALS class's __init__ parameters:

# 1. factors : the dimension of UserId x Product vector.
# 2. regularization : for preventing overfitting problem. Using regularization. 0.01 is okay in general.
# 3. use_gpu : usage of GPU 
# 4. iterations : it is like 'epoch' 

als_model = AlternatingLeastSquares(factors = 100, regularization = 0.01, use_gpu = False, iterations = 15,
                                   dtype = np.float32)

# ALS model takes 'Product x User' matrix as an input, so we have to convert it (Transpose)

csr_data_transpose = csr_data.T
csr_data_transpose

<283507x2695975 sparse matrix of type '<class 'numpy.float64'>'
	with 4889629 stored elements in Compressed Sparse Column format>

# Train ALS model (it will take few mins)

In [14]:
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

# The expected preference of a new random product

In [15]:
yohann, monsterUltra  = user_to_idx['yohann'], product_to_idx['B00MEFXEB6']
yohann_vector, monster_vector = als_model.user_factors[yohann], als_model.item_factors[monsterUltra]

In [16]:
yohann_vector

array([ 0.0280412 ,  0.00104707, -0.00725823,  0.0004568 ,  0.01394059,
        0.02044821,  0.02471047, -0.05008356,  0.00011146,  0.01103988,
       -0.0115184 ,  0.01607864, -0.02289671, -0.0829011 , -0.02958436,
       -0.02477126,  0.01996651, -0.01480245, -0.07213052,  0.01306678,
        0.07175541,  0.05391318,  0.01183824, -0.02164719,  0.00419191,
        0.01187771,  0.00639431,  0.0021899 , -0.01308006,  0.0035989 ,
       -0.02674981,  0.03976754,  0.02036567, -0.00060093, -0.0131316 ,
        0.00552269, -0.01331506, -0.01058685, -0.02853989, -0.02680918,
        0.0579425 , -0.06159671, -0.02601128, -0.00120092, -0.0089854 ,
        0.06422527,  0.03516239,  0.00461997,  0.02568302,  0.06078439,
       -0.09329107,  0.00976146, -0.02413942, -0.03849553, -0.029462  ,
        0.00942253, -0.01624532,  0.03458072,  0.02458128, -0.00832511,
        0.02390402, -0.0147893 ,  0.03759838, -0.0675895 ,  0.00762251,
       -0.0125618 , -0.01111676, -0.02301624,  0.00684169, -0.02

In [17]:
monster_vector

array([ 2.25261529e-03,  4.11143003e-04,  4.51144646e-04,  1.58105313e-03,
        9.09029390e-04,  3.19979386e-03,  1.66182965e-03, -1.01429340e-03,
       -3.76790937e-04,  6.50849310e-04,  1.30004797e-03,  9.88679007e-04,
       -2.88301148e-04, -8.30486068e-04,  2.00724593e-04,  2.36375665e-04,
        1.46773667e-03,  1.45195983e-03, -5.64904010e-04,  4.06403502e-04,
        2.11686641e-03,  2.66070385e-03,  4.08441818e-04, -2.38084438e-04,
        6.27423229e-04,  1.74646033e-04,  4.75943059e-04,  9.11708863e-04,
       -8.08945857e-04,  1.37352129e-03, -1.65675185e-04,  1.44594896e-03,
        3.74452560e-04,  1.54924998e-03, -2.67698022e-04, -1.50516746e-04,
        2.96741400e-05,  8.30553297e-04,  7.40485557e-04, -2.96874117e-04,
        2.38940725e-03, -5.98842860e-04,  1.00615245e-04,  6.97613214e-05,
        1.16786272e-04,  2.83348467e-03,  2.16831267e-03,  3.74045601e-04,
       -3.80468380e-04,  1.32308435e-03, -1.63133699e-03,  1.38906692e-03,
       -6.81236270e-04, -

In [18]:
# the dot product of Yohann and Monster vector (my preference for Monster drink)

np.dot(yohann_vector, monster_vector)

0.002155561

In [19]:
# the expected preference of 'Yohann and Kind bar' (which was not added on my favorite list)

kindbar = product_to_idx['B00BUKL666']
kindbar_vector = als_model.item_factors[kindbar]
np.dot(yohann_vector, kindbar_vector)

0.0011508199

# Recommendation from the model 1 (based on the similar product)

In [20]:
# Get recommendation similar to 'Kindbar'

kindbar = 'B00BUKL666'
product_id = product_to_idx[kindbar]

similar_food = als_model.similar_items(product_id, N= 15)
similar_food

[(32660, 1.0000001),
 (175993, 0.96449995),
 (198644, 0.96158445),
 (122781, 0.96071243),
 (89471, 0.9596692),
 (200497, 0.9567569),
 (256278, 0.9521734),
 (106983, 0.9497872),
 (39371, 0.9488897),
 (198561, 0.9472875),
 (166749, 0.9461572),
 (65118, 0.94591475),
 (273168, 0.9458501),
 (252960, 0.9455037),
 (85820, 0.94547856)]

In [21]:
# Convert the indices to ASIN numbers

idx_to_product = {v:k for k,v in product_to_idx.items()}
[idx_to_product[i[0]] for i in similar_food]

['B00BUKL666',
 'B00F6UH8JK',
 'B00J1NRROE',
 'B005ZC3KSM',
 'B001SAQ2IA',
 'B00JD7MUJ0',
 'B014INNQXW',
 'B001F0BEZE',
 'B00GFYV9WI',
 'B00J1NLACE',
 'B00DPEBFWA',
 'B000E39T8W',
 'B00IGHJGOU',
 'B012BMQI28',
 'B001JTIG48']

# --------------------------------------------------------

# Some recommendations seem reasonable:

* https://www.amazon.com/dp/B00F6UH8JK - Hershey Special Dark Chocolate Topping
* https://www.amazon.com/dp/B00F78I7ZK - Candy coated almonds


# Some are not:

* https://www.amazon.com/dp/B007NJFQ1O - Badia Mojo Marina 
* https://www.amazon.com/dp/B001F0BEZE - Biscuit cut Ham


# ---------------------------------------------------------

# Recommendation from the model 2 (the food I might like) 

In [22]:
user = user_to_idx['yohann']

food_recommended = als_model.recommend(user, csr_data, N= 20, filter_already_liked_items = True)
food_recommended

[(2502, 0.29457158),
 (3275, 0.2847087),
 (3283, 0.26859477),
 (21737, 0.2562136),
 (25583, 0.2234787),
 (53061, 0.22289169),
 (38624, 0.21818003),
 (8529, 0.20274155),
 (59187, 0.18623903),
 (27493, 0.17907582),
 (14273, 0.17361183),
 (12153, 0.16576426),
 (3761, 0.16268213),
 (13597, 0.15615922),
 (7400, 0.15563159),
 (54668, 0.14837888),
 (13871, 0.14748618),
 (7644, 0.14602089),
 (5745, 0.14441697),
 (9919, 0.14408919)]

In [23]:
[idx_to_product[i[0]] for i in food_recommended]

['B000EVMNMI',
 'B000GARX3G',
 'B000GAT6NG',
 'B0058AN1N0',
 'B006VXU6ZO',
 'B00XTVD9JG',
 'B00FRTS2CW',
 'B0015P54R8',
 'B01CUW7HPG',
 'B00819SSGK',
 'B0029JU5SM',
 'B001LG940E',
 'B000HRS7OM',
 'B001XSMANI',
 'B000YGISMC',
 'B012YAV43A',
 'B0025WGHEE',
 'B0010BQB6A',
 'B000PDY3P0',
 'B001E55ZQO']

# --------------------------------------------------------

# Some recommendations seem reasonable:

* https://www.amazon.com/dp/B00HNTPF7E - organic coconut oil (different brand)
* https://www.amazon.com/dp/B014LT0712 - organic matcha powder (different type)
* https://www.amazon.com/dp/B001XUO8AY - tonic water

# Some are not:

https://www.amazon.com/dp/B0015DGDR0 - mentos mint flavor

# --------------------------------------------------------

In [24]:
# Contribution of the Coconut Oil for this recommendation

coconut = product_to_idx['B000H2XXRS']
explain = als_model.explain(user, csr_data, itemid = coconut)

[(idx_to_product[i[0]], i[1]) for i in explain [1]]

[('B000H2XXRS', 0.8383105034926566),
 ('B00PFDH0IC', 0.002106246797757572),
 ('B00MEFXEB6', 0.00043891626462188937),
 ('B000GW0U9I', -0.003198341451185146),
 ('B0014WYXYW', -0.005469194224204648)]

# Overall

So as you can see, the recommendation model does work. However, it is not very accurate since I didn't add enough data for the user 'yohann'. Besides, the model was only working on 'ratings'. Please note that this notebook is written for a part of the general concept of recommendation system. 
