In [1]:
import os, sys
import numpy as np
# Resolve the dataset folder relative to the directory the notebook is run from.
current_dir = os.getcwd()
DATA_HOME_DIR = '{}/data/ml-small/'.format(current_dir)

In [2]:
import importlib
import utils; importlib.reload(utils)
from utils import *
from __future__ import division, print_function
%matplotlib inline

Using cuDNN version 7003 on context None
Mapped name None to device cuda0: GeForce GTX 1080 Ti (0000:01:00.0)
Using Theano backend.


In [3]:
# Working locations for this notebook: `path` is the dataset root and
# `model_path` is the folder that trained weights are written to.
path = DATA_HOME_DIR
model_path = path+'models/'
# makedirs(exist_ok=True) also creates any missing parent folders and is a
# no-op when the directory is already there, so no separate exists() check
# (and no race between the check and the creation) is needed.
os.makedirs(model_path, exist_ok=True)
# Mini-batch size; the training cells use this same value.
batch_size = 64

In [4]:
# Load the ratings table and preview its first five rows.
ratings = pd.read_csv('{}ratings.csv'.format(path))
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
# Build a {movieId: title} lookup from the movies table.
movie_names = (
    pd.read_csv(path + 'movies.csv')
    .set_index('movieId')['title']
    .to_dict()
)

In [6]:
# Distinct raw user and movie ids that occur in the ratings table.
users = ratings['userId'].unique()
movies = ratings['movieId'].unique()

In [7]:
# Total number of rating rows.
ratings.shape[0]

100004

In [8]:
# Map every raw id to a contiguous zero-based index: its position in the
# corresponding unique-id array.
userid2idx = dict(zip(users, range(len(users))))
movieid2idx = dict(zip(movies, range(len(movies))))

In [9]:
# Replace the raw ids with their contiguous indices.
# The columns are overwritten in place, so running this cell a second time
# would look up already-remapped values in dictionaries keyed by raw ids.
ratings['userId'] = ratings['userId'].apply(lambda raw_id: userid2idx[raw_id])
ratings['movieId'] = ratings['movieId'].apply(lambda raw_id: movieid2idx[raw_id])

In [10]:
# Preview the table again now that userId/movieId hold contiguous indices.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,2.5,1260759144
1,0,1,3.0,1260759179
2,0,2,3.0,1260759182
3,0,3,2.0,1260759185
4,0,4,4.0,1260759205


In [11]:
# Smallest and largest remapped index for users and for movies.
user_min, user_max = ratings['userId'].min(), ratings['userId'].max()
movie_min, movie_max = ratings['movieId'].min(), ratings['movieId'].max()
(user_min, user_max, movie_min, movie_max)

(0, 670, 0, 9065)

In [12]:
# Number of distinct users and movies; these size the embedding tables.
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()
(n_users, n_movies)

(671, 9066)

In [13]:
# Number of latent factors, i.e. the width of each user and movie embedding vector.
n_factors = 50

In [14]:
# Seed NumPy's global random generator so the random train/validation split
# is reproducible between runs. (Assigning to `np.randomseed` merely creates
# an unused attribute on the module and seeds nothing.)
np.random.seed(42)

In [15]:
# Draw one uniform number per row; rows below 0.8 (roughly 80%) go to the
# training set and the remaining rows form the validation set.
msk = np.random.rand(ratings.shape[0]) < 0.8
trn, val = ratings[msk], ratings[~msk]

## Basic dot-product model using the Keras functional API

In [16]:
# Each model input is a single integer index; it is looked up in an
# L2-regularised embedding table with `n_factors` values per row.
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(
    input_dim=n_users,
    output_dim=n_factors,
    input_length=1,
    embeddings_regularizer=l2(1e-4),
)(user_in)

movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(
    input_dim=n_movies,
    output_dim=n_factors,
    input_length=1,
    embeddings_regularizer=l2(1e-4),
)(movie_in)

In [22]:
# The prediction is the dot product of the user and movie embeddings, flattened.
x = Flatten()(merge([u, m], mode='dot'))
# Model(inputs, outputs): two index inputs in, the flattened dot product out.
model = Model([user_in, movie_in], x)
model.compile(optimizer=Adam(lr=0.001), loss='mse')

  """Entry point for launching an IPython kernel.
  name=name)


In [23]:
 # Train the dot-product model for 10 epochs, scoring on the held-out rows after
 # each one. The batch size comes from the `batch_size` constant set with the
 # paths near the top of the notebook rather than a repeated literal.
 model.fit([trn.userId, trn.movieId], trn.rating, batch_size=batch_size, epochs=10,
           validation_data=([val.userId, val.movieId], val.rating))

Train on 79835 samples, validate on 20169 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fcbb63d1048>

## Adding bias terms

In [18]:
# Display the symbolic tensor returned by the user embedding layer.
u

Reshape{3}.0

In [15]:
def embedding_input(name, n_in, n_out, reg):
    """Create an integer-index input and an L2-regularised embedding of it.

    Args:
        name: Name given to the `Input` layer.
        n_in: Number of rows in the embedding table (distinct indices).
        n_out: Number of values in each embedding vector.
        reg: Coefficient passed to `l2` for the embedding weights.

    Returns:
        A tuple `(input_tensor, embedded_tensor)`.
    """
    inp = Input(shape=(1,), dtype='int64', name=name)
    # `embeddings_regularizer` is the Keras 2 keyword, as already used by the
    # dot-product model; the Keras 1 spelling `W_regularizer` is deprecated.
    emb = Embedding(n_in, n_out, input_length=1, embeddings_regularizer=l2(reg))(inp)
    return inp, emb

In [22]:
# Rebuild the user and movie inputs/embeddings for the bias model, this time
# with a weaker L2 penalty (1e-6).
user_in, u = embedding_input(name='user_in', n_in=n_users, n_out=n_factors, reg=1e-6)
movie_in, m = embedding_input(name='movie_in', n_in=n_movies, n_out=n_factors, reg=1e-6)

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
def create_bias(inp, n_in):
    """Return a flattened single-value embedding of `inp`, used as a learned bias.

    Args:
        inp: Input tensor holding the indices to look up.
        n_in: Number of rows in the embedding table.
    """
    bias_embedding = Embedding(n_in, 1, input_length=1)
    return Flatten()(bias_embedding(inp))

In [24]:
# One learned bias value per user and one per movie.
ub = create_bias(inp=user_in, n_in=n_users)
mb = create_bias(inp=movie_in, n_in=n_movies)

In [25]:
# Prediction = flattened dot product of the embeddings, plus the user bias,
# plus the movie bias (added in that order).
x = Flatten()(merge([u, m], mode='dot'))
x = merge([merge([x, ub], mode='sum'), mb], mode='sum')
model = Model([user_in, movie_in], x)
model.compile(optimizer=Adam(lr=0.001), loss='mse')

  """Entry point for launching an IPython kernel.
  name=name)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [33]:
# Train the bias model for 10 epochs, scoring on the held-out rows after each
# one. The batch size comes from the `batch_size` constant set with the paths
# near the top of the notebook rather than a repeated literal.
model.fit([trn.userId, trn.movieId], trn.rating, batch_size=batch_size, epochs=10,
          validation_data=([val.userId, val.movieId], val.rating))

Train on 79835 samples, validate on 20169 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fcbb2260d30>

In [34]:
# NOTE(review): this rebinds `optimizer.lr` to a plain Python float instead of
# updating the existing backend variable; confirm that the training function,
# which an earlier fit() call may already have built, really uses the new rate.
model.optimizer.lr=0.01
# `epochs` is the Keras 2 keyword, matching the earlier fit() calls; the
# Keras 1 spelling `nb_epoch` is deprecated. The batch size comes from the
# `batch_size` constant set near the top of the notebook.
model.fit([trn.userId, trn.movieId], trn.rating, batch_size=batch_size, epochs=6,
          validation_data=([val.userId, val.movieId], val.rating))

Train on 79835 samples, validate on 20169 samples
Epoch 1/6
 8192/79835 [==>...........................] - ETA: 1s - loss: 1.2661

  This is separate from the ipykernel package so we can avoid doing imports until


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fcbb1f2f9b0>

In [26]:
# NOTE(review): this rebinds `optimizer.lr` to a plain Python float instead of
# updating the existing backend variable; confirm that the training function,
# which an earlier fit() call may already have built, really uses the new rate.
model.optimizer.lr=0.001
# `epochs` is the Keras 2 keyword, matching the earlier fit() calls; the
# Keras 1 spelling `nb_epoch` is deprecated. The batch size comes from the
# `batch_size` constant set near the top of the notebook.
model.fit([trn.userId, trn.movieId], trn.rating, batch_size=batch_size, epochs=15,
          validation_data=([val.userId, val.movieId], val.rating))

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 79945 samples, validate on 20059 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f13ce53f0f0>

In [36]:
# Persist the trained bias-model weights in the models folder.
model.save_weights(filepath='{}bias.h5'.format(model_path))

In [21]:
# Restore the bias-model weights previously written to the models folder.
model.load_weights(filepath='{}bias.h5'.format(model_path))