<a href="https://colab.research.google.com/github/ethanstykes/ncf/blob/master/Neural_Collaborative_Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Collaborative Filtering

Neural network based collaborative filtering for recommending new products by analyzing feedbacks from users. Intended to be utilized in areas including movies, music, news, books, and products in general. In this project, I demonstrate movie recommandation using the Netflix Prize dataset, learning from both explicit and implcit feedbacks.

In [0]:
import numpy as np
import os
import tensorflow as tf
import time

  return f(*args, **kwds)


### Extracting data from files to create a user-movie matrix
The datasets contain over 100 million ratings from 480 thousand
randomly-chosen, anonymous Netflix customers over 17 thousand movie titles.

The "training_set" directory contains 17770 files, one
per movie.  The first line of each file contains the movie id followed by a
colon.  Each subsequent line in the file corresponds to a rating from a customer
and its date in the following format:

CustomerID,Rating,Date

- MovieIDs range from 1 to 17770 sequentially.
- CustomerIDs range from 1 to 2649429, with gaps. There are 480189 users.
- Ratings are on a five star (integral) scale from 1 to 5.
- Dates have the format YYYY-MM-DD.

We ignore the dates and extract user id's and corresponding movie ratings to form a user-movie matrix.

In [0]:
tic = time.time()
num_movies = 17770
num_user_ids = 2649429 
num_users = 480189
user_movies = np.zeros((num_users, num_movies))
user_dict = {} # user_id -> user_row
movie_ids = []

user_count = 0
file_count = 0

for filename in os.listdir("dataset/training_set/"):
    movie_file = open("dataset/training_set/"+filename)
    movie_data = movie_file.read().split("\n")
    movie_id = int(movie_data[0].strip(":"))
    movie_ids.append(movie_id)
    #print(movie_id)
    for i in range(1, len(movie_data) -1 ):
        user_rating = movie_data[i].split(",")
        user_id = user_rating[0]
        rating = user_rating[1]
        #print(user_id)
        if user_id not in user_dict:
            user_dict[user_id] = user_count
            user_movies[user_count, movie_id - 1] = rating
            user_count += 1
        else:
            user_movies[user_dict[user_id], movie_id - 1] = rating
    if movie_id > 17770: 
        break
    if file_count%1000 == 0:
        print("Files loaded:", file_count)
    file_count+=1
    
toc = time.time()
print("time elapsed:",(toc - tic))
print("number of users:", user_count)

Files loaded: 0
Files loaded: 1000
Files loaded: 2000
Files loaded: 3000
Files loaded: 4000
Files loaded: 5000
Files loaded: 6000
Files loaded: 7000
Files loaded: 8000
Files loaded: 9000
Files loaded: 10000
Files loaded: 11000
Files loaded: 12000
Files loaded: 13000
Files loaded: 14000
Files loaded: 15000
Files loaded: 16000
Files loaded: 17000
time elapsed: 705.0603220462799
number of users: 480189


In [0]:
#analyze the data
user_id = 1488844
movie_id = 1

print(user_movies[user_dict[str(user_id)], movie_id - 1])
j=0
for rating in user_movies[user_dict["1956732"]]:
    if rating>0:
        j+=1
print(j)

3.0
167


In [0]:
DON'T RUN!
#shuffle the user-movie matrix (unused)
user_movies_train_dup = user_movies[:336132]
np.random.shuffle(user_movies_train_dup)
user_movies_train_dup

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [0]:
DON'T RUN!
#test the shuffle
user_movies_train_users = user_movies[:10000]
nonzero_indices = np.nonzero(user_movies_train_users)
nonzero_indices = np.array([nonzero_indices[0], nonzero_indices[1]])
print(nonzero_indices)
np.random.shuffle(nonzero_indices.T)
print(nonzero_indices.shape)
print(nonzero_indices[:, :10100])

[[    0     0     0 ...  9999  9999  9999]
 [   17    29   142 ... 17559 17626 17708]]
(2, 8952158)
[[ 3650  8360  1715 ...  2592  8772  4846]
 [11637  6892 10428 ... 17338  7369 14605]]


### Processing the user-movie matrix to create an input matrix with sparse vectors as rows 

Each row of the input matrix will contain a concatenation of feature vectors of users and movies. Corresponding ratings are stored in a different vector.

In [0]:
tic = time.time()

user_movies_train_users = user_movies[:10000]
#print(user_movies_train_users.shape)
#user_movies_test = user_movies[336132:]

nonzero_indices = np.nonzero(user_movies_train_users)
nonzero_indices = np.array([nonzero_indices[0], nonzero_indices[1]]) #do the shuffle after this

#shuffle
np.random.shuffle(nonzero_indices.T)
count_nonzero_indices = 10100
nonzero_indices = nonzero_indices[:, :count_nonzero_indices]

users = nonzero_indices[0]
movies = nonzero_indices[1]

print("Number of ratings:", count_nonzero_indices)
user_movies_train = np.zeros((count_nonzero_indices, num_users + num_movies))
#print(user_movies_train.shape)
ratings = np.zeros((count_nonzero_indices))
#print(ratings.shape)
#user_movies_log = np.empty((count_nonzero_indices, 2))
#user_movies_train[0] = np.ones((num_users + num_movies, 1))

for i in range(count_nonzero_indices):
    rating = user_movies_train_users[users[i], movies[i]]
    ratings[i] = rating
    user_vector = np.expand_dims(user_movies[users[i]], axis =1)
    #print(user_vector.shape)
    movie_vector = np.expand_dims(user_movies[:, movies[i]] , axis=1)
    #print(movie_vector.shape)
    user_movies_train[i] = np.concatenate((user_vector, movie_vector), axis=0)[:,0]
    #user_movies_log[i][0], user_movies_log[i][1] = (users[i], movies[i])
    if(i%100 == 0):
        print("completed:", i)
    
print(ratings)
toc = time.time()
print("time elapsed:",(toc - tic))

Number of ratings: 10100
completed: 0
completed: 100
completed: 200
completed: 300
completed: 400
completed: 500
completed: 600
completed: 700
completed: 800
completed: 900
completed: 1000
completed: 1100
completed: 1200
completed: 1300
completed: 1400
completed: 1500
completed: 1600
completed: 1700
completed: 1800
completed: 1900
completed: 2000
completed: 2100
completed: 2200
completed: 2300
completed: 2400
completed: 2500


In [0]:
#print(np.count_nonzero(user_movies_train[:,0]))
#print(user_movies_log[1])
nonzero_indices

In [0]:
DON'T RUN!
#for getting x to test the graph
training_sample_size = 10
for user_row in range(0,training_sample_size):
    for movie_id, rating in enumerate(user_movies[user_row], start = 0):
        X=[]
        if rating>0:
            X = np.expand_dims(np.concatenate((user_movies[user_row], user_movies[:, movie_id])), axis=1)
            #print(X.shape)
            Y = rating
            break

## Building the model

In [0]:
def create_placeholders(n_x, n_y, m):
    
    x = tf.placeholder(tf.float32, [n_x,m])#497959
    y = tf.placeholder(tf.float32, [n_y,m])
    
    return x,y

def initialize_parameters(n_x, n_y):
    
    W1 = tf.get_variable("W1", [25, n_x], initializer = tf.contrib.layers.xavier_initializer(seed=1))
    b1 = tf.get_variable("b1", [25,1], initializer = tf.zeros_initializer())
    W2 = tf.get_variable("W2", [12, 25], initializer = tf.contrib.layers.xavier_initializer(seed=1))
    b2 = tf.get_variable("b2", [12,1], initializer = tf.zeros_initializer())
    W3 = tf.get_variable("W3", [n_y, 12], initializer = tf.contrib.layers.xavier_initializer(seed=1))
    b3 = tf.get_variable("b3", [n_y,1], initializer = tf.zeros_initializer())
    
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2,
                  "W3": W3,
                  "b3": b3}
    
    return parameters

def forward_propagation(x, parameters): 
    
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]
    
    Z1 = tf.add(tf.matmul(W1, x), b1)
    A1 = tf.nn.relu(Z1)
    Z2 = tf.add(tf.matmul(W2, A1), b2)
    A2 = tf.nn.relu(Z2)
    Z3 = tf.add(tf.matmul(W3, A2), b3)
    y_hat = Z3 #tf.minimum(5.0, tf.maximum(0.0, Z3))
    return y_hat

In [0]:
def stochastic_gradient_descent_model(num_epochs, training_sample_size, use_train_matrix):
    
    tf.reset_default_graph()

    x, y = create_placeholders(num_users + num_movies, 1, 1) #497959

    parameters = initialize_parameters(num_users + num_movies, 1) #497959

    y_hat = forward_propagation(x, parameters)

    #cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=tf.transpose(y_hat), labels=tf.transpose(y)))
    #print(y_hat.shape, y.shape)
    cost = tf.losses.mean_squared_error(y, y_hat)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-08,
        use_locking=False,
        name='Adam').minimize(cost)

    init = tf.global_variables_initializer()

    with tf.Session() as sess:

        sess.run(init)

        for epoch in range(num_epochs):
            
            print("epoch",epoch+1)
            epoch_cost = 0
            divisor = 0
            
            if use_train_matrix == False:
                for user_row in range(0,training_sample_size):
                    #print("user", user_row + 1)
                    for movie_id, rating in enumerate(user_movies[user_row]):
                        X=[]
                        if rating>0:
                            user_matrix = user_movies[user_row]
                            movie_matrix = user_movies[:, movie_id]
                            X = np.expand_dims(np.concatenate((user_matrix, movie_matrix)), axis=1)
                            #print(X.shape)
                            Y = np.expand_dims(np.expand_dims(rating, axis=1),axis=1)
                            #print(Y)
                            _ , cost_ = sess.run([optimizer,cost], feed_dict={x:X,y:Y})
                            epoch_cost += cost_
                            divisor += 1
                            break
                            
            elif use_train_matrix == True:
                for index, sparse_vector in enumerate(user_movies_train):
                    X = np.expand_dims(sparse_vector, axis=1)
                    Y = np.expand_dims(np.expand_dims(ratings[index], axis=1), axis=1)
                    _ , cost_ = sess.run([optimizer,cost], feed_dict={x:X,y:Y})
                    epoch_cost += cost_
                    divisor += 1
                    if (index>training_sample_size):
                        break
                        #pass
            
            epoch_cost /= divisor
            print("training loss:", epoch_cost,"\n")
            
        parameters = sess.run(parameters)
        return parameters
    
def test_stochastic_gradient_descent_model(parameters, test_sample_size, show_predictions, use_train_matrix):
    
    cost = 0
    test_sample_users = np.random.randint(count_nonzero_indices - training_sample_size, size=(test_sample_size,1)) + training_sample_size
    print(test_sample_users)
    
    for i in range(test_sample_size):
        
        j = test_sample_users[i][0]
        
        if use_train_matrix == False:
            
            user_vector = np.expand_dims(user_movies[users[j]], axis =1)
            #print(user_vector.shape)
            movie_vector = np.expand_dims(user_movies[:, movies[j]] , axis=1)
            #print(movie_vector.shape)
            X_predict = np.concatenate((user_vector, movie_vector), axis=0)
            X_predict = tf.cast(X_predict, tf.float32)
            
        elif use_train_matrix == True:
            
            X_predict = np.expand_dims(user_movies_train[j], axis=1)
            X_predict = tf.cast(X_predict, tf.float32)
            
        prediction = forward_propagation(X_predict, parameters)
        #actual_rating = user_movies_train_users[users[j], movies[j]]
        actual_rating = ratings[j]
        #actual_rating = expand_dims(actual_rating, axis=1)
        
        if show_predictions == 1:
            sess = tf.Session()
            print("prediction:", round(sess.run(prediction)[0][0]))
            sess.close()
            print("actual rating:", actual_rating,"\n")
        
        cost += tf.losses.mean_squared_error(actual_rating, prediction[0,0])
        
    cost /= test_sample_size
    return cost

In [0]:
#train
training_sample_size = 90
tic = time.time()
parameters = stochastic_gradient_descent_model(num_epochs = 20, training_sample_size = training_sample_size, use_train_matrix = True)
toc = time.time()
print("time elapsed:",(toc - tic))

epoch 1




training loss: 250.7673367938753 

epoch 2
training loss: 1226.9629463429521 

epoch 3
training loss: 78.28658373067763 

epoch 4
training loss: 31.603092637229405 

epoch 5
training loss: 18.061954540416085 

epoch 6
training loss: 12.418651474472743 

epoch 7
training loss: 9.896452686276975 

epoch 8
training loss: 9.599579187624444 

epoch 9
training loss: 7.889800747633791 

epoch 10
training loss: 6.978886456989288 

epoch 11
training loss: 6.271999527572897 

epoch 12
training loss: 5.721557129388004 

epoch 13
training loss: 5.962992019298405 

epoch 14
training loss: 6.121886809753841 

epoch 15
training loss: 5.342385108542816 

epoch 16
training loss: 4.885407329256692 

epoch 17
training loss: 4.512238105420218 

epoch 18
training loss: 4.245417038778016 

epoch 19
training loss: 3.9966724224251937 

epoch 20
training loss: 3.735746755173908 

time elapsed: 77.02656197547913


In [0]:
#test
cost = test_stochastic_gradient_descent_model(parameters, test_sample_size = 10, show_predictions = True, use_train_matrix = True)
sess = tf.Session()
print("test loss:", sess.run(cost))
sess.close()

[[90]
 [93]
 [90]
 [90]
 [98]
 [98]
 [95]
 [90]
 [98]
 [91]]
prediction: 5.0
actual rating: 5.0 

prediction: 3.0
actual rating: 3.0 

prediction: 5.0
actual rating: 5.0 

prediction: 5.0
actual rating: 5.0 

prediction: 2.0
actual rating: 4.0 

prediction: 2.0
actual rating: 4.0 

prediction: 1.0
actual rating: 4.0 

prediction: 5.0
actual rating: 5.0 

prediction: 2.0
actual rating: 4.0 

prediction: 1.0
actual rating: 3.0 

test loss: 3.2305684


In [0]:
#predict
user_id = 1025579
movie_id = 1

X_predict = np.expand_dims(np.concatenate((user_movies[user_dict[str(user_id)]], user_movies[:, movie_id - 1])), axis=1)
X_predict = tf.cast(X_predict, tf.float32)
prediction = forward_propagation(X_predict, parameters)

sess = tf.Session()
predicted_rating = sess.run(prediction)[0,0]
actual_rating = user_movies[user_dict[str(user_id)], movie_id - 1]
print("predicted rating:", predicted_rating)
print("actual rating:", actual_rating)
print("cost:", sess.run(tf.losses.mean_squared_error(actual_rating, predicted_rating)))
sess.close()

predicted rating: 0.6307114
actual rating: 4.0
cost: 11.352106


[4. 0. 0. ... 0. 0. 0.]


In [0]:
def minibatch_gradient_descent_model(minibatch_size = 50):
    
    tf.reset_default_graph()

    x, y = create_placeholders(num_users + num_movies, 1, minibatch_size)

    parameters = initialize_parameters()

    y_hat = forward_propagation(x, parameters)

    #cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=tf.transpose(y_hat), labels=tf.transpose(y)))
    print(y_hat.shape, y.shape)
    cost = tf.losses.mean_squared_error(y, y_hat)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-08,
        use_locking=False,
        name='Adam').minimize(cost)

    init = tf.global_variables_initializer()

    with tf.Session() as sess:

        sess.run(init)
        num_epochs = 10
        training_sample_size = 10

        for epoch in range(num_epochs):
            print("epoch",epoch)
            for user_row in range(0,training_sample_size):
                #print("user", user_row + 1)
                for movie_id, rating in enumerate(user_movies[user_row]):
                    X=[]
                    if rating>0:
                        user_matrix = np.transpose(user_movies[user_row:user_row + minibatch_size])
                        movie_matrix = user_movies[:, movie_id:movie_id + minibatch_size]
                        #print(movie_matrix)
                        print(user_matrix.shape, movie_matrix.shape)
                        X = np.concatenate((user_matrix, movie_matrix), axis=0)
                        #print(X.shape)
                        Y = np.expand_dims(np.expand_dims(rating, axis=1),axis=1)
                        #print(Y)
                        _ , cost_ = sess.run([optimizer,cost], feed_dict={x:X,y:Y})
                        break
            print("cost:", cost_)
        parameters = sess.run(parameters)
        return parameters

In [0]:
parameters = minibatch_gradient_descent_model(minibatch_size = 50)

(1, 50) (1, 50)
epoch 0
(17770, 50) (480189, 50)




ValueError: Cannot feed value of shape (1, 1) for Tensor 'Placeholder_1:0', which has shape '(1, 50)'

In [0]:
user_id = 321111
movie_id = 2

X_test = np.expand_dims(np.concatenate((user_movies[user_dict[str(user_id)]], user_movies[:, movie_id - 1])), axis=1)
X_test = tf.cast(X_test, tf.float32)
print(X_test)
prediction = forward_propagation(X_test, parameters)
sess = tf.Session()
print(np.math.floor(sess.run(prediction)))

Tensor("Cast_15:0", shape=(497959, 1), dtype=float32)
2
