In [1]:
from __future__ import division
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.contrib.layers import fully_connected

In [2]:
#load data
UserItemPair = pd.read_csv('/Users/snakepointid/Documents/project/JDproj/save/UserItemPairNums.csv',
                      header=0,encoding='gbk' )

In [3]:
#construct user ,item index
user_count=dict()
user2idx=dict()
idx2user=[]
for user_id in UserItemPair['user_id']:
    user_count.setdefault(user_id,0)
    user_count[user_id]+=1
idx=0   
for user_id in user_count.keys():
    user2idx[user_id]=idx
    idx+=1
    idx2user.append(user_id)
    
item_count=dict()
item2idx=dict()
idx2item=[]
for item in UserItemPair['sku_id']:
    item_count.setdefault(item,0)
    item_count[item]+=1
idx=0   
for item in item_count.keys():
    item2idx[item]=idx
    idx+=1
    idx2item.append(item)


In [4]:
#transfer the raw to index
users  = np.array([user2idx[user_id] for user_id in UserItemPair['user_id']],dtype=np.int32)
items  = np.array([item2idx[item]    for item    in UserItemPair['sku_id']] ,dtype=np.int32)
labels = np.array([label             for label   in UserItemPair['nums']]   ,dtype=np.float32)
types  = np.array(UserItemPair.loc[:,["type_1","type_2","type_3","type_4","type_5","type_6"]],dtype=np.float32)
labels = labels.reshape((labels.shape[0],1))
trainNum = int(len(users)*0.8)

train_users = users[:trainNum]
test_users  = users[trainNum:]

train_items = items[:trainNum]
test_items  = items[trainNum:]

train_types = types[:trainNum]
test_types  = types[trainNum:]

train_labels= labels[:trainNum] 
test_labels = labels[trainNum:]

In [5]:
##initial embeddings
item_Num = len(idx2item)
item_embedding_size = 100
user_Num = len(idx2user)
user_embedding_size =100
item_embeddings  = tf.Variable(tf.random_uniform([item_Num,item_embedding_size], -1.0, 1.0))
user_embeddings  = tf.Variable(tf.random_uniform([user_Num,user_embedding_size], -1.0, 1.0))

In [6]:
#initial input and output
n_outputs = 1
itemInputIdx  = tf.placeholder(tf.int32, shape=[None])
userInputIdx  = tf.placeholder(tf.int32, shape=[None])
typesInput    = tf.placeholder(tf.float32,[None,6])
y             = tf.placeholder(tf.float32,[None,n_outputs])

item_embed    = tf.nn.embedding_lookup(item_embeddings , itemInputIdx)
user_embed    = tf.nn.embedding_lookup(user_embeddings , userInputIdx)

MLPinput    = tf.concat([item_embed,user_embed,typesInput],1) 

In [7]:
#construnct mlp
n_hidden = 100
with tf.name_scope("mlp"):
    hidden = fully_connected(MLPinput, n_hidden,  scope="hidden",activation_fn=tf.nn.tanh)
    logits = fully_connected(hidden  , n_outputs, scope="outputs",activation_fn=None)

In [8]:
#define loss function 
with tf.name_scope("loss"):
    loss   = tf.reduce_mean(tf.square(y-logits), name="loss")

In [9]:
#learning and optimizer
learning_rate = 0.0001
with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

In [10]:
#initial and save
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [11]:
#start training
n_epochs   = 20
batch_size = 2000
batch_num  = len(train_users)//batch_size+1
assess_size= len(train_users)//n_epochs
with tf.Session() as sess:    
    init.run()
    for epoch in range(n_epochs):
        for batch in range(batch_num):
            #feed rawdata
            item_batch       = train_items [batch*batch_size:(batch+1)*batch_size]
            user_batch       = train_users [batch*batch_size:(batch+1)*batch_size]
            type_batch       = train_types [batch*batch_size:(batch+1)*batch_size]
            y_batch          = train_labels[batch*batch_size:(batch+1)*batch_size]
            #training
            sess.run(training_op, feed_dict={ itemInputIdx: item_batch,userInputIdx:user_batch,typesInput:type_batch,y:y_batch})            
        train_loss = loss.eval(feed_dict={itemInputIdx: train_items[epoch*assess_size:(epoch+1)*assess_size],
                                          userInputIdx:train_users[epoch*assess_size:(epoch+1)*assess_size],
                                          typesInput:train_types[epoch*assess_size:(epoch+1)*assess_size],
                                          y:train_labels[epoch*assess_size:(epoch+1)*assess_size]}) 
        test_loss  = loss.eval(feed_dict={ itemInputIdx: test_items, userInputIdx:test_users, typesInput:test_types,y:test_labels}) 
        print(train_loss,test_loss)
    save_path = saver.save(sess,"/Users/snakepointid/Documents/project/JDproj/save/user_item_embed/userEmbed.ckpt")
    itemEmbed = item_embeddings.eval()
    userEmbed = user_embeddings.eval()

(0.18300074, 0.18299101)
(0.17566587, 0.17839517)
(0.17371337, 0.17574927)
(0.1707603, 0.17318141)
(0.1659361, 0.17050454)
(0.16284586, 0.1681896)
(0.15980235, 0.16643195)
(0.15777014, 0.16509221)
(0.15520877, 0.16404602)
(0.15307763, 0.16320838)
(0.15163481, 0.16251843)
(0.15012817, 0.16193706)
(0.14765762, 0.16143754)
(0.14563501, 0.16100483)
(0.14485998, 0.16062562)
(0.14199436, 0.16029236)
(0.14032802, 0.15999819)
(0.13836527, 0.15973625)
(0.13709384, 0.15950187)
(0.13464798, 0.15929215)


In [12]:
#save embedding
userEmbed = pd.DataFrame(userEmbed)
userEmbed["user_id"]=np.array(idx2user)

itemEmbed = pd.DataFrame(itemEmbed)
itemEmbed["user_id"]=np.array(idx2item)

In [13]:
userEmbed.to_csv("/Users/snakepointid/Documents/project/JDproj/save/UserEmbeddings.csv",encoding='gbk',index=False)
itemEmbed.to_csv("/Users/snakepointid/Documents/project/JDproj/save/itemEmbeddings.csv",encoding='gbk',index=False)

In [16]:
itemEmbed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,user_id
0,0.107861,0.205072,-0.121795,0.184556,0.868169,-0.749836,0.259710,-0.866092,-0.643823,-0.437111,...,-0.445880,0.435901,0.086872,-0.300827,0.419813,-0.472423,0.776982,-0.317849,-0.111887,65537
1,0.606037,0.524309,0.211047,0.442302,0.862887,-0.474269,0.415867,-0.043359,-0.592753,-0.588306,...,0.894998,-0.102745,-0.351779,-0.320390,-0.323327,1.014345,0.328849,-1.130879,-0.564725,40964
2,-0.281618,-1.091332,-0.169875,-0.830113,-0.505022,-0.074612,-0.030425,-0.382330,-0.736239,0.220831,...,-0.036719,-0.730057,0.394122,-0.907919,1.028824,-0.287280,-0.915467,-0.753911,0.281060,32774
3,0.306860,-0.466174,0.293665,0.041817,-0.272989,-0.654526,-0.164059,0.891474,-0.437584,-0.595758,...,0.801500,-1.105792,0.602443,-0.736566,0.021939,-0.805647,-0.460280,0.498275,-0.128446,49160
4,0.304092,-0.613534,-0.534608,0.059799,0.303784,0.449727,0.942327,0.455352,0.750357,-0.519371,...,-0.222234,0.664942,0.531256,0.510251,-0.932895,0.684507,-0.585223,0.237393,-0.532093,46423
5,-0.497953,-0.799793,0.638912,-0.302728,-0.078103,0.521434,-0.362792,0.329770,-0.401232,0.215051,...,-0.670143,-0.873207,-0.180752,0.678832,0.717543,0.531926,0.046940,-0.330452,-0.385107,106513
6,-0.406029,-0.855153,-0.191454,0.926666,0.564816,-0.003620,0.705480,-0.701398,0.166559,-0.579331,...,0.530384,0.277044,-0.410498,0.338661,-0.377168,-0.417636,0.545417,-0.282226,0.798228,8210
7,0.524204,0.437932,-0.527541,-0.020710,0.804460,-0.302405,0.718280,0.273529,0.240670,0.731438,...,0.153502,-0.034721,-0.728660,-0.646923,-0.893896,-0.842766,-0.110208,-0.397967,-0.210493,139287
8,0.356644,0.193557,-0.639793,-0.942852,-0.665802,0.603049,-0.985933,-0.207198,0.565578,0.271859,...,0.050766,0.381736,0.633412,0.089888,0.515030,-0.166530,0.234653,1.049916,0.122101,40984
9,-0.047564,0.144053,-0.356202,-0.544693,0.618599,0.681544,0.457813,-0.840682,-0.108157,-0.051587,...,0.138681,0.353251,0.177618,0.286086,-0.030358,-0.236077,0.472652,0.229672,0.488872,8218
