In [1]:
from __future__ import division
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.contrib.layers import fully_connected

In [2]:
# Load the user/item interaction table.
# The CSV is GBK-encoded and its first row holds the column names.
USER_ITEM_PAIR_CSV = '/Users/snakepointid/Documents/project/JDproj/save/UserItemPair.csv'
UserItemPair = pd.read_csv(USER_ITEM_PAIR_CSV, encoding='gbk', header=0)

In [3]:
# Build contiguous integer indices for users and items.
def _build_index(values):
    """Count each distinct value and assign it a contiguous integer index.

    Args:
        values: iterable of hashable raw ids.

    Returns:
        A tuple ``(count, value2idx, idx2value)``:
        ``count`` maps each distinct value to its number of occurrences,
        ``idx2value`` lists the distinct values in ``count``'s iteration
        order, and ``value2idx`` maps each value to its position in
        ``idx2value``.
    """
    count = dict()
    for value in values:
        count[value] = count.get(value, 0) + 1
    # Indices follow the dict's own iteration order, so idx2value[value2idx[v]] == v.
    idx2value = list(count)
    value2idx = dict((value, idx) for idx, value in enumerate(idx2value))
    return count, value2idx, idx2value


user_count, user2idx, idx2user = _build_index(UserItemPair['user_id'])
item_count, item2idx, idx2item = _build_index(UserItemPair['sku_id'])


In [4]:
# Translate raw ids into integer indices and split the rows into train / test.
users = np.array([user2idx[raw_user] for raw_user in UserItemPair['user_id']],
                 dtype=np.int32)
items = np.array([item2idx[raw_item] for raw_item in UserItemPair['sku_id']],
                 dtype=np.int32)
# Labels become a float32 column vector of shape (n_rows, 1) to match the `y` placeholder.
labels = np.array(list(UserItemPair['label']), dtype=np.float32)[:, np.newaxis]

# The first 80% of the rows (in file order) are used for training, the rest for testing.
trainNum = int(len(users) * 0.8)

train_users, test_users = users[:trainNum], users[trainNum:]
train_items, test_items = items[:trainNum], items[trainNum:]
train_labels, test_labels = labels[:trainNum], labels[trainNum:]

In [5]:
# Embedding tables: one trainable row per distinct item and per distinct user,
# initialised with uniform random values between -1.0 and 1.0.
item_Num, user_Num = len(idx2item), len(idx2user)
item_embedding_size = user_embedding_size = 100

# Items are created before users so the variable creation order stays the same.
item_embeddings = tf.Variable(
    tf.random_uniform(shape=[item_Num, item_embedding_size], minval=-1.0, maxval=1.0))
user_embeddings = tf.Variable(
    tf.random_uniform(shape=[user_Num, user_embedding_size], minval=-1.0, maxval=1.0))

In [6]:
# Graph inputs: a batch of item indices, the matching user indices, and the targets.
n_outputs = 1
itemInputIdx = tf.placeholder(dtype=tf.int32, shape=[None])
userInputIdx = tf.placeholder(dtype=tf.int32, shape=[None])
y = tf.placeholder(dtype=tf.float32, shape=[None, n_outputs])

# Fetch the embedding row for every index in the batch.
item_embed = tf.nn.embedding_lookup(params=item_embeddings, ids=itemInputIdx)
user_embed = tf.nn.embedding_lookup(params=user_embeddings, ids=userInputIdx)

# Item and user vectors are joined side by side (axis 1) to form the MLP input.
MLPinput = tf.concat(values=[item_embed, user_embed], axis=1)

In [7]:
# Construct the MLP: one tanh hidden layer followed by a linear output layer.
n_hidden = 100
with tf.name_scope("mlp"):
    hidden = fully_connected(
        inputs=MLPinput,
        num_outputs=n_hidden,
        activation_fn=tf.tanh,
        scope="hidden")
    # No activation here: the raw logits feed the sigmoid cross-entropy loss.
    logits = fully_connected(
        inputs=hidden,
        num_outputs=n_outputs,
        activation_fn=None,
        scope="outputs")

In [8]:
# Loss: sigmoid cross-entropy between logits and targets, averaged over all elements.
with tf.name_scope("loss"):
    xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

In [9]:
# Optimizer: Adam with a fixed learning rate, minimising the mean cross-entropy loss.
learning_rate = 1e-5
with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

In [10]:
# Evaluation: fraction of examples whose rounded sigmoid output equals the target.
with tf.name_scope("eval"):
    probability = tf.sigmoid(logits)
    prediction = tf.round(probability)
    # 1.0 where the prediction matches the target, 0.0 elsewhere.
    correct = tf.cast(tf.equal(prediction, y), tf.float32)
    accuracy = tf.reduce_mean(correct)

In [11]:
# Create the variable-initializer op and the checkpoint saver.
# `init` is run once at the start of the training session; `saver` is used
# after training to write the session's variables to a checkpoint file.
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [12]:
# Train with mini-batches, then evaluate, checkpoint and export the embeddings.
n_epochs   = 20
batch_size = 2000
# Ceiling division, so that no empty trailing batch is scheduled when the
# training-set size is an exact multiple of batch_size.
batch_num  = (len(train_users) + batch_size - 1) // batch_size
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for batch in range(batch_num):
            # Slice out the rows belonging to this mini-batch (the last one may be shorter).
            start = batch * batch_size
            stop  = start + batch_size
            item_batch = train_items [start:stop]
            user_batch = train_users [start:stop]
            y_batch    = train_labels[start:stop]
            # One optimizer step on this mini-batch.
            sess.run(training_op, feed_dict={ itemInputIdx: item_batch,userInputIdx:user_batch,y:y_batch})
    # Accuracy is computed in a single pass over the full train and test sets.
    train_acc = accuracy.eval(feed_dict={ itemInputIdx: train_items,userInputIdx:train_users,y:train_labels})
    Test_acc  = accuracy.eval(feed_dict={ itemInputIdx: test_items, userInputIdx:test_users, y:test_labels})
    print(train_acc,Test_acc)
    # Checkpoint the session's variables, then pull the embedding matrices out
    # as numpy arrays while the session is still open.
    save_path = saver.save(sess,"/Users/snakepointid/Documents/project/JDproj/save/user_item_embed/userEmbed.ckpt")
    itemEmbed = item_embeddings.eval()
    userEmbed = user_embeddings.eval()

(0.89830625, 0.89741957)


In [13]:
# Wrap the trained embedding matrices in DataFrames and attach the raw ids.
# Row i of each matrix belongs to idx2user[i] / idx2item[i], so the id column
# is assigned positionally.
userEmbed = pd.DataFrame(userEmbed)
userEmbed["user_id"] = np.array(idx2user)

itemEmbed = pd.DataFrame(itemEmbed)
# idx2item holds values from the 'sku_id' column, so the id column is named
# "sku_id" (it was previously mislabelled "user_id").
itemEmbed["sku_id"] = np.array(idx2item)

In [14]:
# Write both embedding tables to GBK-encoded CSV files, omitting the row index.
USER_EMBED_CSV = "/Users/snakepointid/Documents/project/JDproj/save/UserEmbeddings.csv"
ITEM_EMBED_CSV = "/Users/snakepointid/Documents/project/JDproj/save/itemEmbeddings.csv"
userEmbed.to_csv(USER_EMBED_CSV, index=False, encoding='gbk')
itemEmbed.to_csv(ITEM_EMBED_CSV, index=False, encoding='gbk')

In [15]:
# Number of rows in the item embedding table (one per distinct item).
itemEmbed.shape[0]

4377