In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
from gensim.models import Word2Vec

In [72]:
data = pd.read_csv("./../data/purchase_history_dec_2019.csv")

In [73]:
data.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-12-01 00:00:00 UTC,remove_from_cart,5712790,1487580005268456287,,f.o.x,6.27,576802932,51d85cb0-897f-48d2-918b-ad63965c12dc
1,2019-12-01 00:00:00 UTC,view,5764655,1487580005411062629,,cnd,29.05,412120092,8adff31e-2051-4894-9758-224bfa8aec18
2,2019-12-01 00:00:02 UTC,cart,4958,1487580009471148064,,runail,1.19,494077766,c99a50e8-2fac-4c4d-89ec-41c05f114554
3,2019-12-01 00:00:05 UTC,view,5848413,1487580007675986893,,freedecor,0.79,348405118,722ffea5-73c0-4924-8e8f-371ff8031af4
4,2019-12-01 00:00:07 UTC,view,5824148,1487580005511725929,,,5.56,576005683,28172809-7e4a-45ce-bab0-5efa90117cd5


In [74]:
data.shape

(3533286, 9)

In [75]:
data.columns

Index(['event_time', 'event_type', 'product_id', 'category_id',
       'category_code', 'brand', 'price', 'user_id', 'user_session'],
      dtype='object')

In [76]:
len(data.user_id.unique())

370154

In [77]:
data.isnull().sum()

event_time             0
event_type             0
product_id             0
category_id            0
category_code    3474821
brand            1510289
price                  0
user_id                0
user_session         779
dtype: int64

In [78]:
data.drop(columns=['brand'], axis=0, inplace=True)

In [79]:
data.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,price,user_id,user_session
0,2019-12-01 00:00:00 UTC,remove_from_cart,5712790,1487580005268456287,,6.27,576802932,51d85cb0-897f-48d2-918b-ad63965c12dc
1,2019-12-01 00:00:00 UTC,view,5764655,1487580005411062629,,29.05,412120092,8adff31e-2051-4894-9758-224bfa8aec18
2,2019-12-01 00:00:02 UTC,cart,4958,1487580009471148064,,1.19,494077766,c99a50e8-2fac-4c4d-89ec-41c05f114554
3,2019-12-01 00:00:05 UTC,view,5848413,1487580007675986893,,0.79,348405118,722ffea5-73c0-4924-8e8f-371ff8031af4
4,2019-12-01 00:00:07 UTC,view,5824148,1487580005511725929,,5.56,576005683,28172809-7e4a-45ce-bab0-5efa90117cd5


In [80]:
data.isnull().sum()

event_time             0
event_type             0
product_id             0
category_id            0
category_code    3474821
price                  0
user_id                0
user_session         779
dtype: int64

In [81]:
data = data.dropna(axis=0, subset=['user_session'])

In [110]:
len(data.product_id.unique())

44620

In [82]:
data = data[['event_time', 'event_type', 'product_id', 'category_id', 'price', 'user_id', 'user_session']]

In [83]:
data.shape

(3532507, 7)

In [84]:
len(data.user_id.unique())

370108

In [85]:
len(data.user_session.unique())

839812

In [86]:
unique_sessions = data.user_session.unique()

In [87]:
unique_sessions

array(['51d85cb0-897f-48d2-918b-ad63965c12dc',
       '8adff31e-2051-4894-9758-224bfa8aec18',
       'c99a50e8-2fac-4c4d-89ec-41c05f114554', ...,
       '5ecf66ed-634c-465e-a6b5-6b601c4fef19',
       '2dde9867-9e71-4a64-880d-aa68b66aae6d',
       '4c6d80bb-5dd3-4fbb-b592-187b51db2753'], dtype=object)

In [88]:
len(unique_sessions)

839812

In [100]:
random.shuffle(unique_sessions)

train_session_ids = [unique_sessions[i] for i in range(round(0.02*len(unique_sessions)))]
data_train = data[data['user_session'].isin(train_session_ids)]
data_test = data[-data['user_session'].isin(train_session_ids)]

In [101]:
product_sequence_train = []

for i in tqdm(train_session_ids):
    tmp = data_train[data_train['user_session']==i]['product_id'].tolist()
    product_sequence_train.append(tmp)

100%|████████████████████████████████████| 16796/16796 [01:44<00:00, 160.66it/s]


In [105]:
model = Word2Vec(window=10, sg=0, hs=0, 
                negative=10, alpha=0.03,
                min_alpha=0.0007, seed=14)

In [106]:
model.build_vocab(product_sequence_train, progress_per=5)

In [107]:
model.train(product_sequence_train, total_examples=model.corpus_count, epochs=10, report_delay=1)

(469721, 727620)

In [108]:
model.init_sims(replace=True)

  model.init_sims(replace=True)


In [109]:
print(model)

Word2Vec<vocab=4217, vector_size=100, alpha=0.03>


In [111]:
X = model.wv.vectors
vocab = model.wv.index_to_key
X.shape

(4217, 100)

In [115]:
def similar_products(v, n = 6):
    # extract most similar products for the input vector
    ms = model.wv.most_similar(v, topn= n+1)[1:]
    # extract name and similarity score of the similar products
    new_ms = []
    for j in ms:
        pair = (products_dict[j[0]][0], j[1])
        new_ms.append(pair)
    return new_ms

array([[ 0.13509724,  0.18907428,  0.01293003, ...,  0.07044698,
        -0.03016715,  0.16180661],
       [ 0.02747949,  0.20090538, -0.16633743, ..., -0.0615028 ,
         0.06487887,  0.03131498],
       [-0.03875175, -0.05321369, -0.01213351, ..., -0.03571969,
         0.08463014,  0.04992935],
       ...,
       [ 0.02380967,  0.17560537, -0.05225698, ...,  0.02176874,
         0.10282256,  0.11810097],
       [ 0.03596873,  0.1654074 , -0.05040731, ...,  0.01581946,
         0.10784441,  0.11830115],
       [ 0.02698806,  0.18417692, -0.06003487, ...,  0.01638652,
         0.10710956,  0.12110864]], dtype=float32)

### These are the embeddings for the cosmetics product generated using the Word2Vec model. These values can be used to predict/recommend items