In [None]:
from theano.sandbox import cuda

In [None]:
%matplotlib inline
from __future__ import division, print_function
import math, os, json, sys, re
import numpy as np
import pandas as pd


In [None]:
# MovieLens "latest small" data set:
# http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
#path = "data/ml-20m/"
# NOTE(review): absolute, machine-specific location - adjust before running elsewhere.
path = "/Users/roland/dev/fastai/data/movielens/ml-latest-small/"
model_path = path + 'models/'
# Create the folder for saved models on first use.
if not os.path.exists(model_path):
    os.mkdir(model_path)
batch_size = 64

# Set up data

Each row contains one user's rating of one movie.

In [4]:
# Load the ratings table and preview its first rows.
ratings_file = path + 'ratings.csv'
ratings = pd.read_csv(ratings_file)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
# (rows, columns) of the ratings table
ratings.shape

(100004, 4)

We can also read in the movie names

In [6]:
# Series of movie titles indexed by movieId.
movie_names = pd.read_csv(path+'movies.csv').set_index('movieId')['title']

In [7]:
# Distinct raw user ids and movie ids that occur in the ratings table
users = ratings['userId'].unique()
movies = ratings['movieId'].unique()

In [8]:
# Embedding layers need contiguous, zero-based indices, so map every raw id to its
# position in the corresponding unique-id array.
userid2idx_dict = dict(zip(users, range(len(users))))
movieid2idx_dict = dict(zip(movies, range(len(movies))))

In [9]:
# Preview the first rows of the ratings table
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [10]:
# Replace the raw ids by their contiguous indices (element-wise dictionary lookup).
ratings['userId'] = ratings['userId'].apply(userid2idx_dict.__getitem__)
ratings['movieId'] = ratings['movieId'].apply(movieid2idx_dict.__getitem__)


In [11]:
# Preview the first rows of the ratings table
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,2.5,1260759144
1,0,1,3.0,1260759179
2,0,2,3.0,1260759182
3,0,3,2.0,1260759185
4,0,4,4.0,1260759205


In [12]:
# Smallest and largest index in each id column
user_min, user_max = ratings.userId.min(), ratings.userId.max()
movie_min, movie_max = ratings.movieId.min(), ratings.movieId.max()
(user_min, user_max, movie_min, movie_max)

(0, 670, 0, 9065)

In [13]:
# Number of distinct users and distinct movies
n_users, n_movies = ratings.userId.nunique(), ratings.movieId.nunique()
(n_users, n_movies)

(671, 9066)

In [None]:
# Constants
n_factors = 30 # number of latent factors in each embedding
# Seed NumPy's global random number generator. `np.random.seed` must be *called*:
# assigning to it (`np.random.seed = 42`) replaces the function with an int and
# leaves the generator unseeded.
np.random.seed(42)

In [15]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the ratings for validation. `random_state` pins the shuffle so the
# same rows land in each split on every run.
train, val = train_test_split(ratings, train_size=0.8, random_state=42)
train.shape, val.shape



((80003, 4), (20001, 4))

In [16]:
# Model inputs are [user index, movie index]; the target is the rating.
x_train, y_train = [train.userId, train.movieId], train.rating
x_val, y_val = [val.userId, val.movieId], val.rating

# Dot product

A basic model takes the dot product of the user embedding and the movie embedding.
Both embeddings are weight matrices that are randomly initialized and then trained to minimize the cost function.

In [17]:
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Embedding, merge, Flatten
from keras.regularizers import l2
from keras.optimizers import Adam

Using Theano backend.


In [18]:
# Each sample feeds exactly one user index, so the input has shape (1,) and the
# embedding is looked up for a sequence of length 1 (input_length=1).
user_in = Input(shape=(1,), dtype='int64', name='user_in')
user_embedding = Embedding(
    input_dim=n_users,
    output_dim=n_factors,
    input_length=1,
    W_regularizer=l2(1e-4),
    name='user_embedding',
)(user_in)

In [19]:
# Show the Python type of the input placeholder
type(user_in)

theano.tensor.var.TensorVariable

In [20]:
# Show the Python type of the embedding layer's output
type(user_embedding)

theano.tensor.var.TensorVariable

In [21]:
# Same construction for movies: one movie index per sample, embedded into
# n_factors dimensions with L2 weight regularisation.
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
movie_embedding = Embedding(
    input_dim=n_movies,
    output_dim=n_factors,
    input_length=1,
    W_regularizer=l2(1e-4),
    name='movie_embedding',
)(movie_in)

In [22]:
# Combine the user and movie embeddings with a dot product.
# `merge` (lower-case) is the functional form: it is applied to tensors (NOT layers)
# and returns a tensor, so `x` is a tensor as well - the type is displayed below.
x = merge([user_embedding, movie_embedding], mode='dot')
type(x)

theano.tensor.var.TensorVariable

In [23]:
# Flatten the dot-product tensor; the result is used as the model output
x = Flatten()(x)

In [24]:
# Show the Python type of the flattened tensor
type(x)

theano.tensor.var.TensorVariable

In [25]:
# Two-input model (user index, movie index) trained with Adam on mean squared error.
model = Model(input=[user_in, movie_in], output=x)
model.compile(loss='mse', optimizer=Adam(lr=0.001))

In [26]:
# Print the layers of the model with their output shapes and parameter counts
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
user_in (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
movie_in (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
user_embedding (Embedding)       (None, 1, 50)         33550       user_in[0][0]                    
____________________________________________________________________________________________________
movie_embedding (Embedding)      (None, 1, 50)         453300      movie_in[0][0]                   
___________________________________________________________________________________________

In [29]:
# One epoch of training, evaluated against the validation split.
model.fit(
    x_train, y_train,
    nb_epoch=1,
    validation_data=(x_val, y_val),
)

Train on 80003 samples, validate on 20001 samples
Epoch 1/1


<keras.callbacks.History at 0x1190fc8d0>

In [31]:
# Set the learning rate by updating the value of the optimizer's backend variable.
# Plain assignment (`model.optimizer.lr = ...`) only rebinds the Python attribute: the
# training function built by the earlier `fit` call keeps reading the original
# variable, so an assigned value would never reach training.
K.set_value(model.optimizer.lr, 0.001)
model.fit(x_train, y_train, nb_epoch=5, validation_data=(x_val, y_val))

Train on 80003 samples, validate on 20001 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1184bedd0>

# Bias

The result is not great. A bias term may be needed to reflect the fact that some users give much higher ratings on average ('enthusiastic users') and that some movies are rated better or worse on average.

The bias can be created with additional embeddings for each user and each movie, and adding it to the output.

In [54]:
# Inputs: one integer index per sample for the user and for the movie.
user_in = Input(shape=(1,), dtype='int64', name='user_in')
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')

# Latent-factor embeddings with L2 weight regularisation.
user_embedding = Embedding(
    input_dim=n_users, output_dim=n_factors, input_length=1,
    W_regularizer=l2(1e-4), name='user_embedding',
)(user_in)
movie_embedding = Embedding(
    input_dim=n_movies, output_dim=n_factors, input_length=1,
    W_regularizer=l2(1e-4), name='movie_embedding',
)(movie_in)

# Bias terms: a 1-dimensional embedding per user and per movie, flattened so they
# can be summed with the flattened dot product.
user_bias = Flatten()(
    Embedding(input_dim=n_users, output_dim=1, input_length=1, name='user bias')(user_in)
)
movie_bias = Flatten()(
    Embedding(input_dim=n_movies, output_dim=1, input_length=1, name='movie bias')(movie_in)
)

# Prediction = dot(user factors, movie factors) + user bias + movie bias
x = Flatten()(merge([user_embedding, movie_embedding], mode='dot'))
x = merge([x, user_bias], mode='sum')
x = merge([x, movie_bias], mode='sum')

model = Model(input=[user_in, movie_in], output=x)
model.compile(loss='mse', optimizer=Adam(lr=0.001))

In [55]:
# Print the layers of the bias model with their output shapes and parameter counts
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
user_in (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
movie_in (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
user_embedding (Embedding)       (None, 1, 100)        67100       user_in[0][0]                    
____________________________________________________________________________________________________
movie_embedding (Embedding)      (None, 1, 100)        906600      movie_in[0][0]                   
___________________________________________________________________________________________

In [56]:
# One epoch of training, evaluated against the validation split.
model.fit(
    x_train, y_train,
    nb_epoch=1,
    validation_data=(x_val, y_val),
)

Train on 80003 samples, validate on 20001 samples
Epoch 1/1


<keras.callbacks.History at 0x11c421f50>

In [57]:
# Set the learning rate by updating the value of the optimizer's backend variable.
# Plain assignment (`model.optimizer.lr = ...`) only rebinds the Python attribute: the
# training function built by the earlier `fit` call keeps reading the original
# variable, so an assigned value would never reach training.
K.set_value(model.optimizer.lr, 0.001)
model.fit(x_train, y_train, nb_epoch=5, validation_data=(x_val, y_val))

Train on 80003 samples, validate on 20001 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x117312a90>

In [58]:
# Lower the learning rate to 1e-4 for further training.
# Plain assignment (`model.optimizer.lr = ...`) only rebinds the Python attribute; the
# training function that is already compiled keeps reading the original backend
# variable, so the lower rate would be silently ignored. Change the variable's value.
current_lr = model.optimizer.lr
if hasattr(current_lr, 'set_value'):
    K.set_value(current_lr, 0.0001)
else:
    # `lr` has been rebound to a plain number, so the backend variable can no longer
    # be reached through the optimizer. Recompile with a fresh optimizer instead:
    # the model weights are kept, only Adam's running moment estimates start over.
    model.compile(optimizer=Adam(0.0001), loss='mse')
model.fit(x_train, y_train, nb_epoch=5, validation_data=(x_val, y_val))

Train on 80003 samples, validate on 20001 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x115fe8650>

In [52]:
# Train for five more epochs.
# NOTE(review): this cell's execution count is lower than that of the cells above it,
# so it was presumably run out of order - confirm it belongs here before relying on it.
model.fit(x_train, y_train, nb_epoch=5, validation_data=(x_val, y_val))

Train on 80003 samples, validate on 20001 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x119252d90>