### Packages

In [100]:
import pandas as pd
from IPython.display import display
import numpy as np
#from keras.layers import Dense, Flatten, Embedding, Input, Dropout, SpatialDropout1D, merge
from keras.layers import Dense, Flatten, Embedding, Input, Dropout 
from keras.layers import merge, dot, add, concatenate
from keras.models import Model
from keras.optimizers import SGD, RMSprop, Adam
from keras.regularizers import l2, l1 
from sklearn.metrics import mean_squared_error
from keras.callbacks import ModelCheckpoint, EarlyStopping

### Read data

In [101]:
# From http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
inputdir = "../data/ml-latest-small/"
ratings = inputdir + "ratings.csv"
movies = inputdir + "movies.csv"
modeldir = "./models/"

In [102]:
ratings_df = pd.read_csv(ratings)
movies_df = pd.read_csv(movies)

### Explore data and Prep data for embedding layer usage

In [103]:
display(ratings_df.head())
display(movies_df.head())

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [104]:
ratings_df.shape, movies_df.shape

((100004, 4), (9125, 3))

In [105]:
# Get unique users and movie ids
moviesid = ratings_df['movieId'].unique()
usersid = ratings_df['userId'].unique()

In [106]:
# Create dict of movieid to its own index, and userid to its own index
movies2idx = {o:i for i,o in enumerate(moviesid)}
users2idx = {o:i for i,o in enumerate(usersid)}
len(movies2idx), len(users2idx)

(9066, 671)

In [107]:
# Convert the ids of movies and users to be continuous integers using above dict
# backup old ids
ratings_df['old_userId'] = ratings_df['userId']
ratings_df['old_movieId'] = ratings_df['movieId']
# new ids
ratings_df['userId'] = ratings_df['userId'].apply(lambda uid: users2idx[uid]) 
ratings_df['movieId'] = ratings_df['movieId'].apply(lambda mid: movies2idx[mid])

In [108]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,old_userId,old_movieId
0,0,0,2.5,1260759144,1,31
1,0,1,3.0,1260759179,1,1029
2,0,2,3.0,1260759182,1,1061
3,0,3,2.0,1260759185,1,1129
4,0,4,4.0,1260759205,1,1172


In [109]:
num_users =  ratings_df['userId'].nunique()
num_movies =  ratings_df['movieId'].nunique()
num_users, num_movies

(671, 9066)

In [110]:
# Check if there are 0 ratings
(ratings_df['rating'] < 0.1).sum()
#ratings_df[ratings_df['rating'] < 0.1]

0

### Prep data for model - split train and test sets

In [111]:
# Seed for np random
np.random.seed(50)

In [112]:
# Num of features for users and movies
num_features = 50

In [113]:
# Split to train and test set
cv_idx = np.random.rand(len(ratings_df)) < 0.8
ratings_train = ratings_df[cv_idx]
ratings_test = ratings_df[~cv_idx]

In [114]:
len(ratings_train), len(ratings_test)

(80205, 19799)

In [115]:
# Train inputs and outputs
x_u_train = ratings_train['userId']
x_m_train = ratings_train['movieId']
y_train = ratings_train['rating']

# Test inputs and outputs
x_u_test = ratings_test['userId']
x_m_test = ratings_test['movieId']
y_test = ratings_test['rating']

In [116]:
print(x_u_train.shape, x_m_train.shape, y_train.shape)
print(x_u_test.shape, x_m_test.shape, y_test.shape)

(80205,) (80205,) (80205,)
(19799,) (19799,) (19799,)


### Prep data for baseline model

In [117]:
# Create ratings to user matrix
ratings_u2m = np.zeros((num_users, num_movies)) 
for _ , rating in ratings_df.iterrows():
    u = rating['userId']
    m = rating['movieId']
    r = rating['rating']
    ratings_u2m[int(u)][int(m)] = r

In [118]:
# Number of ratings avail vs total
num_possible_ratings = len((ratings_u2m).ravel())
num_avail_ratings = np.count_nonzero(ratings_u2m.ravel())
print("num_avail_ratings, num_possible_ratings, sparsity of matrix = ", 
                              num_avail_ratings, num_possible_ratings, 
                              100. - num_avail_ratings*100./num_possible_ratings)

num_avail_ratings, num_possible_ratings, sparsity of matrix =  100004 6083286 98.35608583913366


In [119]:
# Create ratings to user matrix for test set
ratings_u2m_test = np.zeros((num_users, num_movies)) 
for _ , rating in ratings_test.iterrows():
    u = rating['userId']
    m = rating['movieId']
    r = rating['rating']
    ratings_u2m_test[int(u)][int(m)] = r

### Dataset details so far:
Num of users: 671  
Num of movies: 9066  
Num of ratings available: 100,004   

### Performance metric

In [120]:
def rmse(y_pred, y_true, matrix=False):
    if matrix:
        y_pred = y_pred[y_true.nonzero()].flatten()
        y_true = y_true[y_true.nonzero()].flatten()
    return np.sqrt(mean_squared_error(y_pred, y_true))

### Baseline model

In [121]:
# Num of movies per user, and users per movie
num_movies_rated_per_user = np.count_nonzero(ratings_u2m, axis = 1)
num_user_rating_per_movie = np.count_nonzero(ratings_u2m, axis = 0)
num_movies_rated_per_user.shape, num_user_rating_per_movie.shape

((671,), (9066,))

#### Avg user rating on existing movies 

In [154]:
avg_rating_of_user = ratings_u2m.sum(axis = 1) / num_movies_rated_per_user
ypred_avg_user = (np.zeros((num_users, num_movies)) + 
         avg_rating_of_user.reshape(num_movies_rated_per_user.shape[0],1))

#### Avg movie rating for existing users 

In [153]:
avg_rating_of_movie = ratings_u2m.sum(axis = 0) / num_user_rating_per_movie
ypred_avg_movie = (np.zeros((num_users, num_movies)) + 
         avg_rating_of_movie.reshape(num_user_rating_per_movie.shape[0],1).T)

#### Predict Avg of avg user rating and avg movie rating for new user/movie combo

In [155]:
ypred = (ypred_avg_user + ypred_avg_movie)/2.
#np.isnan(ypred).sum()
err = rmse(ypred, ratings_u2m_test, matrix = True)
print("RMSE of avg of avg user and avg movie rating model:",err)

RMSE of avg of avg user and avg movie rating model: 0.880672553613


### Collobarative filtering - Matrix dot product
Using embedding layer to find latent user/movie features

#### Define and fit model. No regularization

In [125]:
def coll_filter_model(num_users, num_movies, include_bias = True):

    # User 
    userid_input = Input(shape=(1,), dtype = 'int64', name='user_input')
    user_features = Embedding(input_dim = num_users, 
                              output_dim = num_features, 
                              input_length =1 )(userid_input)

    
    # Movie 
    movieid_input = Input(shape=(1,), dtype = 'int64', name='movie_input')
    movie_features = Embedding(input_dim = num_movies, 
                               output_dim = num_features, 
                               input_length = 1)(movieid_input)

    
    # Prediction
    ypred = dot([user_features, movie_features], axes = (2,2))
    ypred = Flatten()(ypred)
    
    
    # Include user and movie bias?
    if include_bias:
        
        # User bias
        user_bias = Embedding(input_dim = num_users, 
                              output_dim = 1, 
                              input_length =1 )(userid_input) 
        user_bias = Flatten()(user_bias)
        
        # Movie bias
        movie_bias = Embedding(input_dim = num_movies, 
                               output_dim = 1, 
                               input_length = 1)(movieid_input)    
        movie_bias = Flatten()(movie_bias)
        
        # Prediction
        ypred = add([ypred,user_bias])
        ypred = add([ypred,movie_bias])
        
    
    # Model
    model = Model(inputs = [userid_input, movieid_input], outputs = ypred)
    model.compile(optimizer = Adam(0.001), loss = 'mse')
    model.summary()
    return model



In [126]:
# Fit the model and make prediction
# No bias terms for movie and users
model = coll_filter_model(num_users, num_movies, include_bias = False)
validation_data = ([x_u_test, x_m_test], y_test)
checkpointer = EarlyStopping(patience=3, monitor='val_loss')
model.fit([x_u_train, x_m_train], y_train,
          validation_data = validation_data,
          #validation_split = 0.05, shuffle = True,
          callbacks=[checkpointer],
          batch_size = 128, epochs = 50)
# Predict and check performance
y_pred = model.predict([x_u_test, x_m_test])
err = rmse(y_pred, y_test)
print("RMSE of coll filter without user/movie bias:",err)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 50)        33550       user_input[0][0]                 
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 50)        453300      movie_input[0][0]                
__________________________________________________________________________________________________
dot_1 (Dot

In [127]:
# Fit the model and make prediction
# Include bias terms for movie and users
model = coll_filter_model(num_users, num_movies, include_bias = True)
validation_data = ([x_u_test, x_m_test], y_test)
checkpointer = EarlyStopping(patience=3, monitor='val_loss')
model.fit([x_u_train, x_m_train], y_train,
          validation_data = validation_data,
          #validation_split = 0.05, shuffle = True,              
          callbacks=[checkpointer],
          batch_size = 128, epochs = 50)
# Predict and check performance
y_pred = model.predict([x_u_test, x_m_test])
err = rmse(y_pred, y_test)
print("RMSE of coll filter with user/movie bias:",err)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 50)        33550       user_input[0][0]                 
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1, 50)        453300      movie_input[0][0]                
__________________________________________________________________________________________________
dot_2 (Dot

#### Define and fit model. Include regularization

In [128]:
# Include regularization
def coll_filter_model_reg(num_users, num_movies, include_bias = True):

    reg_factor = 1e-4
    bias_reg_factor = 0
    
    # User 
    userid_input = Input(shape=(1,), dtype = 'int64', name='user_input')
    user_features = Embedding(input_dim = num_users, 
                              output_dim = num_features, 
                              input_length =1,
                              embeddings_regularizer=l2(reg_factor))(userid_input)

    
    # Movie 
    movieid_input = Input(shape=(1,), dtype = 'int64', name='movie_input')
    movie_features = Embedding(input_dim = num_movies, 
                               output_dim = num_features, 
                               input_length = 1,
                               embeddings_regularizer=l2(reg_factor))(movieid_input)

    
    # Prediction
    ypred = dot([user_features, movie_features], axes = (2,2))
    ypred = Flatten()(ypred)
    
    
    # Include user and movie bias?
    if include_bias:
        
        # User bias
        user_bias = Embedding(input_dim = num_users, 
                              output_dim = 1, 
                              input_length =1,
                              embeddings_regularizer=l2(bias_reg_factor))(userid_input) 
        user_bias = Flatten()(user_bias)
        
        # Movie bias
        movie_bias = Embedding(input_dim = num_movies, 
                               output_dim = 1, 
                               input_length = 1,
                               embeddings_regularizer=l2(bias_reg_factor))(movieid_input)    
        movie_bias = Flatten()(movie_bias)
        
        # Prediction
        ypred = add([ypred,user_bias])
        ypred = add([ypred,movie_bias])
        
    
    # Model
    model = Model(inputs = [userid_input, movieid_input], outputs = ypred)
    model.compile(optimizer = Adam(0.001), loss = 'mse')
    model.summary()
    return model

In [129]:
# Fit the model and make prediction
# No bias terms for movie and users
model = coll_filter_model_reg(num_users, num_movies, include_bias = False)
validation_data = ([x_u_test, x_m_test], y_test)
checkpointer = EarlyStopping(patience=3, monitor='val_loss')
model.fit([x_u_train, x_m_train], y_train,
          validation_data = validation_data,
          #validation_split = 0.05, shuffle = True,
          callbacks=[checkpointer],
          batch_size = 128, epochs = 50)
# Predict and check performance
y_pred = model.predict([x_u_test, x_m_test])
err = rmse(y_pred, y_test)
print("RMSE of coll filter without user/movie bias:",err)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 1, 50)        33550       user_input[0][0]                 
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 1, 50)        453300      movie_input[0][0]                
__________________________________________________________________________________________________
dot_3 (Dot

In [130]:
# Fit the model and make prediction
# No bias terms for movie and users
model = coll_filter_model_reg(num_users, num_movies, include_bias = True)
validation_data = ([x_u_test, x_m_test], y_test)
checkpointer = EarlyStopping(patience=3, monitor='val_loss')
model.optimizer.lr = 0.001
model.fit([x_u_train, x_m_train], y_train,
          validation_data = validation_data,
          #validation_split = 0.2, shuffle = True,
          callbacks=[checkpointer],
          batch_size = 128, epochs = 100)
# Predict and check performance
y_pred = model.predict([x_u_test, x_m_test])
err = rmse(y_pred, y_test)
print("RMSE of coll filter with user/movie bias terms:",err)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 1, 50)        33550       user_input[0][0]                 
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 1, 50)        453300      movie_input[0][0]                
__________________________________________________________________________________________________
dot_4 (Dot

Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
RMSE of coll filter with user/movie bias terms: 0.902155908516


### Dense neural network  
Using embedding layer to find latent user/movie features

In [131]:
# Include regularization
def nn_model_reg(num_users, num_movies, include_bias = True):

    reg_factor = 1e-4
    bias_reg_factor = 0
    
    # User 
    userid_input = Input(shape=(1,), dtype = 'int64', name='user_input')
    user_features = Embedding(input_dim = num_users, 
                              output_dim = num_features, 
                              input_length =1,
                              embeddings_regularizer=l2(reg_factor))(userid_input)

    
    # Movie 
    movieid_input = Input(shape=(1,), dtype = 'int64', name='movie_input')
    movie_features = Embedding(input_dim = num_movies, 
                               output_dim = num_features, 
                               input_length = 1,
                               embeddings_regularizer=l2(reg_factor))(movieid_input)

    
    # Include user and movie bias?
    if include_bias:
        
        # User bias
        user_bias = Embedding(input_dim = num_users, 
                              output_dim = 1, 
                              input_length =1,
                              embeddings_regularizer=l2(bias_reg_factor))(userid_input) 
        
        # Movie bias
        movie_bias = Embedding(input_dim = num_movies, 
                               output_dim = 1, 
                               input_length = 1,
                               embeddings_regularizer=l2(bias_reg_factor))(movieid_input)    

        # Prediction with bias terms
        ypred = concatenate([user_features, movie_features, user_bias, movie_bias])
    
    else:
        
        # Prediction without bias terms
        ypred = concatenate([user_features, movie_features])
        
    # Prediction model remaining for NN    
    ypred = Flatten()(ypred)
    ypred = Dropout(0.2)(ypred)
    ypred = Dense(64, activation = 'relu')(ypred)
    ypred = Dropout(0.2)(ypred)
    ypred = Dense(1)(ypred)
    
    # Model
    model = Model(inputs = [userid_input, movieid_input], outputs = ypred)
    model.compile(optimizer = Adam(0.001), loss = 'mse')
    model.summary()
    return model

In [132]:
# Fit the model and make prediction
# No bias terms for movie and users
model = nn_model_reg(num_users, num_movies, include_bias = False)
validation_data = ([x_u_test, x_m_test], y_test)
checkpointer = EarlyStopping(patience=3, monitor='val_loss')
model.fit([x_u_train, x_m_train], y_train,
          validation_data = validation_data,
          #validation_split = 0.05, shuffle = True,
          callbacks=[checkpointer],
          batch_size = 128, epochs = 50)
# Predict and check performance
y_pred = model.predict([x_u_test, x_m_test])
err = rmse(y_pred, y_test)
print("RMSE of coll filter without user/movie bias:",err)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_13 (Embedding)        (None, 1, 50)        33550       user_input[0][0]                 
__________________________________________________________________________________________________
embedding_14 (Embedding)        (None, 1, 50)        453300      movie_input[0][0]                
__________________________________________________________________________________________________
concatenat

In [133]:
# Fit the model and make prediction
# No bias terms for movie and users
model = nn_model_reg(num_users, num_movies, include_bias = True)
validation_data = ([x_u_test, x_m_test], y_test)
checkpointer = EarlyStopping(patience=3, monitor='val_loss')
model.fit([x_u_train, x_m_train], y_train,
          validation_data = validation_data,
          #validation_split = 0.05, shuffle = True,
          callbacks=[checkpointer],
          batch_size = 128, epochs = 50)
# Predict and check performance
y_pred = model.predict([x_u_test, x_m_test])
err = rmse(y_pred, y_test)
print("RMSE of coll filter without user/movie bias:",err)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_15 (Embedding)        (None, 1, 50)        33550       user_input[0][0]                 
__________________________________________________________________________________________________
embedding_16 (Embedding)        (None, 1, 50)        453300      movie_input[0][0]                
__________________________________________________________________________________________________
embedding_

In [135]:
# Save model:
nn_weights =  modeldir+'nn_with_bias.h5' 
model.save_weights(nn_weights)

### Make prediction

#### Load weights

In [136]:
nn_weights =  modeldir+'nn_with_bias.h5' 
model.load_weights(nn_weights)

#### Predict rating for user and movie

In [152]:
userid = 20
movieid = 1000
userid_in = np.array([userid])
movieid_in = np.array([movieid])
rating = model.predict([userid_in, movieid_in])
print("Predicted rating is:",rating[0][0])

Predicted rating is: 3.56581
