In [2]:
from __future__ import division
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.contrib.layers import fully_connected

In [3]:
#load data
UserItemPair = pd.read_csv('/Users/snakepointid/Documents/project/JDproj/save/UserItemPairNums.csv',
                      header=0,encoding='gbk' )

In [4]:
#construct user ,item index
user_count=dict()
user2idx=dict()
idx2user=[]
for user_id in UserItemPair['user_id']:
    user_count.setdefault(user_id,0)
    user_count[user_id]+=1
idx=0   
for user_id in user_count.keys():
    user2idx[user_id]=idx
    idx+=1
    idx2user.append(user_id)
    
item_count=dict()
item2idx=dict()
idx2item=[]
for item in UserItemPair['sku_id']:
    item_count.setdefault(item,0)
    item_count[item]+=1
idx=0   
for item in item_count.keys():
    item2idx[item]=idx
    idx+=1
    idx2item.append(item)


In [5]:
#transfer the raw to index
users  = np.array([user2idx[user_id] for user_id in UserItemPair['user_id']],dtype=np.int32)
items  = np.array([item2idx[item]    for item    in UserItemPair['sku_id']] ,dtype=np.int32)
labels = np.array([label             for label   in UserItemPair['nums']]   ,dtype=np.float32)
types  = np.array(UserItemPair.loc[:,["type_1","type_2","type_3","type_4","type_5","type_6"]],dtype=np.float32)
labels = labels.reshape((labels.shape[0],1))
trainNum = int(len(users)*0.8)

train_users = users[:trainNum]
test_users  = users[trainNum:]

train_items = items[:trainNum]
test_items  = items[trainNum:]

train_types = types[:trainNum]
test_types  = types[trainNum:]

train_labels= labels[:trainNum] 
test_labels = labels[trainNum:]

In [6]:
##initial embeddings
item_Num = len(idx2item)
item_embedding_size = 100
user_Num = len(idx2user)
user_embedding_size =100
item_embeddings  = tf.Variable(tf.random_uniform([item_Num,item_embedding_size], -1.0, 1.0))
user_embeddings  = tf.Variable(tf.random_uniform([user_Num,user_embedding_size], -1.0, 1.0))

In [7]:
#initial input and output
n_outputs = 1
itemInputIdx  = tf.placeholder(tf.int32, shape=[None])
userInputIdx  = tf.placeholder(tf.int32, shape=[None])
typesInput    = tf.placeholder(tf.float32,[None,6])
y             = tf.placeholder(tf.float32,[None,n_outputs])

item_embed    = tf.nn.embedding_lookup(item_embeddings , itemInputIdx)
user_embed    = tf.nn.embedding_lookup(user_embeddings , userInputIdx)

MLPinput    = tf.concat([item_embed,user_embed,typesInput],1) 

In [8]:
#construnct mlp
n_hidden = 100
with tf.name_scope("mlp"):
    hidden = fully_connected(MLPinput, n_hidden,  scope="hidden",activation_fn=tf.nn.elu)
    logits = fully_connected(hidden  , n_outputs, scope="outputs",activation_fn=None)

In [9]:
#define loss function 
with tf.name_scope("loss"):
    loss   = tf.reduce_mean(tf.square(y-logits), name="loss")

In [10]:
#learning and optimizer
learning_rate = 0.0001
with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

In [11]:
#initial and save
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [12]:
#start training
n_epochs   = 20
batch_size = 2000
batch_num  = len(train_users)//batch_size+1
assess_size= len(train_users)//n_epochs
with tf.Session() as sess:    
    init.run()
    for epoch in range(n_epochs):
        for batch in range(batch_num):
            #feed rawdata
            item_batch       = train_items [batch*batch_size:(batch+1)*batch_size]
            user_batch       = train_users [batch*batch_size:(batch+1)*batch_size]
            type_batch       = train_types [batch*batch_size:(batch+1)*batch_size]
            y_batch          = train_labels[batch*batch_size:(batch+1)*batch_size]
            #training
            sess.run(training_op, feed_dict={ itemInputIdx: item_batch,userInputIdx:user_batch,typesInput:type_batch,y:y_batch})            
        train_loss = loss.eval(feed_dict={itemInputIdx: train_items[epoch*assess_size:(epoch+1)*assess_size],
                                          userInputIdx:train_users[epoch*assess_size:(epoch+1)*assess_size],
                                          typesInput:train_types[epoch*assess_size:(epoch+1)*assess_size],
                                          y:train_labels[epoch*assess_size:(epoch+1)*assess_size]}) 
        test_loss  = loss.eval(feed_dict={ itemInputIdx: test_items, userInputIdx:test_users, typesInput:test_types,y:test_labels}) 
        print(train_loss,test_loss)
    save_path = saver.save(sess,"/Users/snakepointid/Documents/project/JDproj/save/user_item_embed/userEmbed.ckpt")
    itemEmbed = item_embeddings.eval()
    userEmbed = user_embeddings.eval()

(0.18251908, 0.18393567)
(0.17585729, 0.17769624)


KeyboardInterrupt: 

In [None]:
#save embedding
userEmbed = pd.DataFrame(userEmbed)
userEmbed["user_id"]=np.array(idx2user)

itemEmbed = pd.DataFrame(itemEmbed)
itemEmbed["user_id"]=np.array(idx2item)

In [None]:
userEmbed.to_csv("/Users/snakepointid/Documents/project/JDproj/save/UserEmbeddings.csv",encoding='gbk',index=False)
itemEmbed.to_csv("/Users/snakepointid/Documents/project/JDproj/save/itemEmbeddings.csv",encoding='gbk',index=False)

In [None]:
len(itemEmbed)