In [65]:
%load_ext autoreload
%autoreload 2

import tensorflow as tf
import numpy as np

In [92]:
from helper.dataset_helper import get_ml_100k_dataset_simple
from helper.dataset_helper import get_sampled_ml_100k_data

Final user selected: 733
Final ratings selected: 28979


In [144]:
# A helper function to prepare the training data
def get_training_data(num_movies=800, min_num_ratings=10, num_neg=50, num_movie_id=5):
    """Build the arrays that are fed into the Embedding-NN network.

    Because the network consumes embeddings, a training example is not a
    dense feature vector but a short list of movie ids:
    ``[id1, id2, ..., idn]`` with ``n == num_movie_id``.

    Args:
        num_movies: Forwarded to ``get_sampled_ml_100k_data``; also the width
            of every label row.
        min_num_ratings: Forwarded to ``get_sampled_ml_100k_data``.
        num_neg: Forwarded to ``get_sampled_ml_100k_data``.
        num_movie_id: How many leading ids of each record are used as the
            network input; the remaining ids go to ``testing_data``.

    Returns:
        A tuple ``(training_data, training_labels, testing_data)``:

        * ``training_data``: int32 array of shape ``(num_users, num_movie_id)``
          holding the first ``num_movie_id`` ids of every record.
        * ``training_labels``: float array of shape ``(num_users, num_movies)``
          with a 1 in every column named by that record's input ids.
        * ``testing_data``: list of ``(record[0], remaining_ids, record[2])``
          tuples, one per record.

    Raises:
        ValueError: If a record has fewer than ``num_movie_id`` ids.
    """
    from helper.dataset_helper import get_sampled_ml_100k_data

    # The second return value (a movie indexer) is not needed here.
    data, _ = get_sampled_ml_100k_data(
        num_movies=num_movies,
        min_num_ratings=min_num_ratings,
        num_neg=num_neg
    )

    num_users = len(data)

    # Ids are used as integer indices (label columns here, embedding rows in
    # the network), so keep them in an integer array rather than float64.
    training_data = np.zeros((num_users, num_movie_id), dtype=np.int32)
    training_labels = np.zeros((num_users, num_movies))
    testing_data = []
    for i in range(num_users):
        # NOTE(review): each record presumably is (user id, movie ids,
        # negative samples) -- confirm against helper.dataset_helper.
        record = data[i]
        movie_ids = record[1]
        input_ids = np.asarray(movie_ids[:num_movie_id], dtype=np.int32)
        if input_ids.shape[0] != num_movie_id:
            # Without this guard a single id would be silently broadcast
            # across the whole row.
            raise ValueError(
                "record %d has only %d movie ids, need at least %d"
                % (i, input_ids.shape[0], num_movie_id))

        training_data[i, :] = input_ids
        training_labels[i, input_ids] = 1
        testing_data.append((record[0], movie_ids[num_movie_id:], record[2]))
    return training_data, training_labels, testing_data

# Build the arrays once at notebook scope; training_data and training_labels
# are read by the later batching and network cells.
training_data, training_labels, testing_data = get_training_data()

Final user selected: 892
Final ratings selected: 46791


In [107]:
# Sanity-check the array shapes. The call form of print matches the
# print("Initialized") used in the training cell and emits the same text
# under Python 2 while also being valid Python 3.
print(training_data.shape)
print(training_labels.shape)

(714, 5)
(714, 500)


In [132]:
def get_training_batch(training_data, training_labels, batch_size=20):
    """Draw a random mini-batch of aligned rows from the data and label arrays.

    Row indices are sampled without replacement, so no example appears twice
    in one batch, and the same indices are applied to both arrays.

    Args:
        training_data: 2-D array with one example per row.
        training_labels: 2-D array whose rows line up with ``training_data``.
        batch_size: Number of rows to draw.

    Returns:
        A tuple ``(batch_data, batch_labels)`` holding the selected rows.
    """
    num_examples = training_data.shape[0]
    picked_rows = np.random.choice(num_examples, size=batch_size, replace=False)
    batch_data = training_data[picked_rows, :]
    batch_labels = training_labels[picked_rows, :]
    return batch_data, batch_labels

In [162]:
"""
Set up the neural network, the overall workflow is:

    fetch_embeddings -> concat -> fc -> relu -> fc -> relu
                     -> logits (tied to the embedding matrix) -> softmax loss
"""
embedding_dim = 128
n_hidden_1 = 256
n_hidden_2 = 128   # must equal embedding_dim: the output layer reuses `embeddings`
num_movie_id = 5
num_classes = 800
batch_size = 20
training_step = 100

# One row of output logits per training example, filled in after training.
final_embeddings = np.zeros((training_data.shape[0], num_classes))

graph = tf.Graph()

with graph.as_default():

    # The leading dimension is left open so the same graph can evaluate the
    # final, smaller-than-batch_size slice of the training set.
    x = tf.placeholder(tf.int32, shape=[None, num_movie_id])
    y = tf.placeholder(tf.int32, shape=[None, num_classes])

    embeddings = tf.Variable(
        tf.random_uniform([num_classes, embedding_dim], -1.0, 1.0))

    # Look up one embedding per input id and concatenate them per example.
    embed = tf.nn.embedding_lookup(embeddings, x)
    reshaped_embed = tf.reshape(embed, [-1, embedding_dim * num_movie_id])

    W1 = tf.Variable(tf.random_normal([embedding_dim * num_movie_id, n_hidden_1]))
    b1 = tf.Variable(tf.random_normal([n_hidden_1]))

    layer_1 = tf.add(tf.matmul(reshaped_embed, W1), b1)
    layer_1 = tf.nn.relu(layer_1)

    W2 = tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2]))
    b2 = tf.Variable(tf.random_normal([n_hidden_2]))

    layer_2 = tf.add(tf.matmul(layer_1, W2), b2)
    layer_2 = tf.nn.relu(layer_2)

    # TODO: make sure this step is correct
    # Alternative with an independent output layer:
    #     Wout = tf.Variable(tf.random_normal([n_hidden_2, num_classes]))
    #     bout = tf.Variable(tf.random_normal([num_classes]))
    #     out = tf.add(tf.matmul(layer_2, Wout), bout)
    out = tf.matmul(layer_2, tf.transpose(embeddings))

    # `y` holds multi-hot rows of shape [batch, num_classes]. The sparse
    # cross-entropy op expects a single class index per example (rank-1
    # labels) and rejects rank-2 labels, so use the dense op instead.
    # The dense op expects every label row to be a probability distribution,
    # hence each row is divided by its number of positives (guarded against
    # all-zero rows).
    y_float = tf.cast(y, tf.float32)
    positives_per_row = tf.maximum(tf.reduce_sum(y_float, 1), 1.0)
    y_dist = y_float / tf.expand_dims(positives_per_row, 1)

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=y_dist))
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

    init = tf.initialize_all_variables()

with tf.Session(graph=graph) as session:
    init.run()
    print("Initialized")

    for i in range(training_step):
        batch_data, batch_labels = get_training_batch(training_data, training_labels, batch_size)
        # Cast explicitly: the arrays may be float while the placeholders are int32.
        _, loss_val = session.run([optimizer, loss], feed_dict={
            x: batch_data.astype(np.int32),
            y: batch_labels.astype(np.int32),
        })

    # Stride over every row. The previous ceil(n / batch_size) count used
    # integer division under Python 2, so the trailing partial batch was never
    # evaluated and its rows stayed zero.
    num_examples = training_data.shape[0]
    for start in range(0, num_examples, batch_size):
        stop = start + batch_size  # slicing past the end is clipped by numpy
        batch_data = training_data[start:stop, :].astype(np.int32)

        # `out` does not depend on `y`, so only `x` has to be fed.
        final_embeddings[start:stop, :] = out.eval(feed_dict={x: batch_data})

    print(final_embeddings[:10, :])
    

Initialized
[[ 3581.89819336  4085.59887695  6174.83203125 ..., -2070.44580078
   2527.50073242   317.35189819]
 [ 4339.84277344  1314.72180176  4335.30224609 ..., -2849.47167969
   2786.47363281  1166.46813965]
 [ 2509.38964844   199.05534363  3796.55297852 ..., -2327.47363281
   2860.08911133   374.29153442]
 ..., 
 [ 2635.72460938  -856.71337891  2433.45947266 ...,  -592.17401123
   2783.60180664   766.546875  ]
 [ 2531.4765625   1285.96337891   503.64266968 ..., -1359.64868164
    136.55072021 -1672.65246582]
 [ 2803.03637695  2634.57714844  2380.50439453 ...,  -541.58630371
   1162.68566895  -698.96185303]]


In [167]:
# Call form of print: same output under Python 2, and valid Python 3.
print(final_embeddings.shape)

(892, 800)
