In [1]:
import pandas as pd
import numpy as np
from gensim.models import doc2vec, word2vec, keyedvectors
# Notebook-wide switch: turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.set_option('chained_assignment',None)

Using Theano backend.


### corpus

In [2]:
# Order lines: one row per (order, product); the 'reordered' flag is not needed here.
order_products = pd.read_csv('../data/driver/driver_order_products.csv')
order_products = order_products.drop('reordered', axis=1)

# Order headers: only the order -> user mapping is kept.
orders = pd.read_csv('../data/driver/driver_order.csv').loc[:, ['order_id', 'user_id']]

# Attach the user to every order line, then order the lines by basket position
# within each order.
data = pd.merge(orders, order_products, on='order_id')
data = data.sort_values(by=['order_id', 'add_to_cart_order'])

print(order_products.shape, orders.shape, data.shape)
data.head()

(34010012, 5) (3421083, 2) (34010012, 6)


Unnamed: 0,order_id,user_id,product_id,aisle_id,department_id,add_to_cart_order
18590058,1,112108,49302,120,16,1
18590059,1,112108,11109,108,16,2
18590060,1,112108,10246,83,4,3
18590061,1,112108,49683,83,4,4
18590062,1,112108,43633,95,15,5


In [3]:
# Turn every numeric product id into a word-like token ('p_<id>').
# Plain pandas string concatenation replaces the original
# np.core.defchararray.add(..., astype(np.str)) call: the `np.str` alias no
# longer exists in current NumPy releases, and no per-row prefix array is needed.
data['product_id'] = 'p_' + data['product_id'].astype(str)
data = data[['order_id','user_id','product_id']]

# Collapse each order into one comma-separated "sentence" of product tokens.
# groupby keeps the incoming row order inside every group, so tokens appear in
# whatever order the rows of `data` already have.
# NOTE: this cell overwrites `data`; re-run the loading cell before re-running it.
data = data.groupby(['user_id','order_id'])['product_id'].apply(','.join).reset_index()
data.head()

Unnamed: 0,user_id,order_id,product_id
0,1,431534,"p_196,p_12427,p_10258,p_25133,p_10326,p_17122,..."
1,1,473747,"p_196,p_12427,p_10258,p_25133,p_30450"
2,1,550135,"p_196,p_10258,p_12427,p_25133,p_13032"
3,1,1187899,"p_196,p_25133,p_38928,p_26405,p_39657,p_10258,..."
4,1,2254736,"p_196,p_12427,p_10258,p_25133,p_26405"


### wordvec model

In [17]:
# One "sentence" per order: split each comma-joined product string back into
# its list of product tokens.
corpus = [basket.split(',') for basket in data['product_id'].values]

In [6]:
# Product master data, used to map every product id to its department id.
products = pd.read_csv('../data/driver/driver_product.csv')
lookup = (
    products
    .set_index('product_id')
    ['department_id']
    .to_dict()
)

def accuracy(key, values, departments=None):
    """Return the share of neighbours that belong to the query product's department.

    Parameters
    ----------
    key : str
        Query token. Everything after its first two characters is parsed as an
        integer product id (e.g. ``'p_196'`` -> ``196``).
    values : sequence
        Neighbour entries. Only the first element of each entry is used; it must
        be a token in the same format as ``key`` (the training cell passes the
        result of ``most_similar`` here).
    departments : mapping, optional
        Product id -> department id. When omitted, the notebook-level ``lookup``
        dictionary is used.

    Returns
    -------
    float
        Number of neighbours whose department equals the query's department,
        divided by ``len(values)``. Neighbours whose product id is missing from
        the mapping count as misses. ``0.0`` is returned for an empty ``values``
        instead of raising ``ZeroDivisionError``.

    Raises
    ------
    KeyError
        If the query product itself is not present in the mapping.
    """
    table = lookup if departments is None else departments
    target = table[int(key[2:])]

    # Nothing to compare against: define the score as 0 rather than dividing by zero.
    if len(values) == 0:
        return 0.0

    hits = 0
    for value in values:
        neighbour_id = int(value[0][2:])
        # Unknown neighbours are simply not counted as correct.
        if neighbour_id in table and table[neighbour_id] == target:
            hits += 1
    return float(hits) / len(values)

In [22]:
# Word2Vec hyper-parameters (gensim 3.x keyword names).
# The dict must be created here: the original line was commented out, so this
# cell raised NameError on a fresh kernel.
params = {}
params['size'] = 16
params['window'] = 3 #tuned
params['min_count'] = 0
params['sample'] = 1e-3 #tuned
params['negative'] = 20
params['workers'] = 4
params['hs'] = 0
params['seed'] = 108

model = word2vec.Word2Vec(**params, iter=1)
model.build_vocab(corpus)

# Fixed evaluation sample (drawn with replacement) so that scores printed by
# different runs refer to the same query products.
words = list(model.wv.vocab.keys())
sample = np.random.RandomState(params['seed']).choice(words, 5000)


def _neighbour_scores():
    """Department-match accuracy of the current model for every sampled product."""
    # Query through model.wv, the same keyed-vectors object this notebook uses
    # for the vocabulary and for saving, instead of the deprecated
    # model.most_similar alias.
    return [accuracy(x, model.wv.most_similar([x])) for x in sample]


# Baseline before any training.
score = np.mean(_neighbour_scores())
print('start score:', score)

alpha = 0.1

# One pass over the corpus per iteration with a constant learning rate inside
# the pass (start_alpha == end_alpha); the rate is decayed by 5% between passes.
for epoch in range(7):
    model.train(corpus, total_examples=model.corpus_count, epochs=1, start_alpha=alpha, end_alpha=alpha)
    scores = _neighbour_scores()
    print('alpha:', round(alpha,4),'score:', np.round([np.mean(scores), np.std(scores)],2))
    alpha = alpha * 0.95

start score: 0.0782
alpha: 0.1 score: [ 0.4   0.33]
alpha: 0.095 score: [ 0.44  0.34]
alpha: 0.0902 score: [ 0.46  0.34]
alpha: 0.0857 score: [ 0.47  0.34]
alpha: 0.0815 score: [ 0.48  0.35]
alpha: 0.0774 score: [ 0.48  0.35]
alpha: 0.0735 score: [ 0.49  0.35]


In [23]:
# Write the trained vectors to disk in the text (binary=False) word2vec format;
# the next cell reads this file back with pandas.
model.wv.save_word2vec_format('../data/gensim/wordvectors1.txt', binary=False)

In [24]:
# Read the exported vectors back as a table. skiprows=1 skips the first line of
# the file; every remaining line is '<token> <v0> <v1> ...'.
wordvecs = pd.read_csv('../data/gensim/wordvectors1.txt', sep=' ', header = None, skiprows=1)

# Derive the vector width from the file instead of hard-coding 16, so the cell
# keeps working if the model's `size` parameter is changed.
n_dims = wordvecs.shape[1] - 1
wordvecs.columns = ['id'] + ['pv_' + str(x) for x in range(n_dims)]

# Keep product tokens only. The explicit copy makes the column assignment below
# act on an independent frame rather than on a slice of `wordvecs`.
prodvecs = wordvecs[wordvecs['id'].str[:2] == 'p_'].copy()
prodvecs['product_id'] = prodvecs['id'].map(lambda x : int(x.split('_')[1]))

# Drop the string token column before averaging: it cannot be averaged, and
# recent pandas versions raise instead of silently discarding it.
prodvecs = prodvecs.drop('id', axis=1).groupby('product_id').mean().reset_index()
prodvecs.columns = ['product_id'] + ['prdwv_' + str(x) for x in range(n_dims)]
print(prodvecs.shape)
prodvecs.to_csv('../data/gensim/prodvecs1.csv', index=False)

(49686, 17)
