In [1]:
import numpy as np
import keras.backend as K
from keras.regularizers import l2
import keras
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense, GRU, Embedding, Bidirectional, Flatten, Dropout, Lambda
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.utils import to_categorical
from keras.callbacks import History ,ModelCheckpoint, EarlyStopping
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import add, dot, concatenate, multiply
import numpy as np
from keras.layers import Dot
from keras.initializers import Zeros
import random
import pandas as pd
from sklearn.preprocessing import scale,MinMaxScaler
from sklearn.utils import shuffle
from keras.utils import get_custom_objects
import matplotlib.pyplot as plt
% matplotlib inline
%env CUDA_VISIBLE_DEVICES=2
import pickle
import seaborn as sns
sns.set(style='darkgrid', rc={'figure.facecolor':'white'}, font_scale=1.2)


Using TensorFlow backend.


env: CUDA_VISIBLE_DEVICES=2


In [6]:
import os
# Absolute path of the directory the notebook kernel was started in; the
# dataset files are resolved relative to this location.
cur_dir = os.path.abspath(os.curdir)


In [7]:
# Load the rating tables stored under <cur_dir>/dataset.
train_data = pd.read_csv( os.path.join(cur_dir,'dataset','train.csv'), sep=',' )
test_data = pd.read_csv( os.path.join(cur_dir,'dataset','test.csv'), sep=',' )

print("train data shape",train_data.shape)
print("test data shape", test_data.shape)

# Every (UserID, MovieID) pair that appears in either split.
# The two frames have different column sets, so the ID columns are selected
# *before* concatenating: concatenating misaligned frames triggers pandas'
# "sort" FutureWarning, and the extra columns were discarded anyway.
id_columns = ["UserID","MovieID"]
all_UMpair = pd.concat([train_data[id_columns], test_data[id_columns]])

# create dictionary to encode UserID and MovieID
# Raw IDs are mapped to contiguous 0-based indices (in sorted raw-ID order, as
# returned by np.unique) so they can be used as embedding row numbers.
user2id = {raw_id: idx for idx, raw_id in enumerate(np.unique(all_UMpair["UserID"].values))}
movie2id = {raw_id: idx for idx, raw_id in enumerate(np.unique(all_UMpair["MovieID"].values))}
num_user = len(user2id)
num_movie = len(movie2id)
print("number of user",num_user)
print("number of movie",num_movie)

train data shape (899873, 4)
test data shape (100336, 3)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


number of user 6040
number of movie 3706


In [8]:
# Target values, taken straight from the "Rating" column.
train_rating = np.array(train_data["Rating"])

# Translate the raw IDs of every training row into the contiguous indices
# defined by user2id / movie2id.
train_user = np.array([user2id[raw_uid] for raw_uid in train_data["UserID"]])
train_movie = np.array([movie2id[raw_mid] for raw_mid in train_data["MovieID"]])

total_sample_size = train_rating.shape[0]
print("total train sample",total_sample_size)

total train sample 899873


In [9]:
def MF_model(n_users, n_items, latent_dim=128,bias=True,normalize=False,
             rating_mean=3.58171208, rating_std=1.116897661):
    """Build and compile a matrix-factorization rating model.

    The predicted rating is the dot product of a user embedding and an item
    embedding, optionally plus a per-user and a per-item scalar bias.

    Args:
        n_users: number of distinct user indices (rows of the user embedding).
        n_items: number of distinct item indices (rows of the item embedding).
        latent_dim: size of each embedding vector.
        bias: if True, add learned user and item bias terms to the dot product.
        normalize: if True, the ``rmse`` metric maps targets and predictions
            back with ``y * rating_std + rating_mean`` before comparing them.
        rating_mean: offset used by the metric when ``normalize`` is True.
        rating_std: scale used by the metric when ``normalize`` is True.
            NOTE(review): the defaults are the constants that were previously
            hard-coded; presumably the mean/std of the training ratings --
            confirm.

    Returns:
        A compiled Keras ``Model`` taking ``[user_index, item_index]`` (each of
        shape ``(batch, 1)``), trained with MSE loss and the Adam optimizer,
        and reporting the clipped ``rmse`` metric.

    Side effects:
        Registers the metric under the name ``"rmse"`` in Keras' global custom
        objects. Prints ``"Bias"`` when ``bias`` is True.
    """
    def rmse(y_true, y_pred):
        """Root-mean-squared error with predictions clipped to [1, 5]."""
        if normalize:
            # Undo the target scaling so the error is reported in rating units.
            y_true = y_true * rating_std + rating_mean
            y_pred = y_pred * rating_std + rating_mean
        y_pred = K.clip(y_pred, 1.0, 5.0)
        return K.sqrt(K.mean(K.square(y_true - y_pred)))

    get_custom_objects().update({"rmse": rmse})

    # NOTE: layers are created in a fixed order on purpose; other cells index
    # into `model.layers` positionally to fetch the embedding weights.
    user_input = Input(shape=[1])
    item_input = Input(shape=[1])
    user_vec = Embedding(n_users, latent_dim, embeddings_initializer="random_normal")(user_input)
    user_vec = Flatten()(user_vec)
    item_vec = Embedding(n_items, latent_dim, embeddings_initializer="random_normal")(item_input)
    item_vec = Flatten()(item_vec)

    # Interaction term: <user_vec, item_vec> per sample.
    r_hat = dot([user_vec,item_vec],axes=1)
    if bias:
        print("Bias")
        # Scalar offsets per user / per item, starting at zero.
        user_bias = Embedding(n_users,1, embeddings_initializer="zeros")(user_input)
        user_bias = Flatten()(user_bias)
        item_bias = Embedding(n_items, 1, embeddings_initializer="zeros")(item_input)
        item_bias = Flatten()(item_bias)
        r_hat = add([r_hat, user_bias, item_bias])
    model = Model([user_input,item_input],r_hat)
    model.compile(loss="mse", optimizer="adam", metrics=[rmse])
    return model

In [13]:
# Hyper-parameters for the training run.
batch_size = 256
epochs = 10

# Callbacks: `hist` records per-epoch metrics, `early_stop` watches the
# validation RMSE with a patience of 3 epochs.
hist = History()
early_stop = EarlyStopping(monitor="val_rmse", patience=3)

# Bias-free factorization model with 20 latent dimensions.
MF = MF_model(n_users=num_user, n_items=num_movie, latent_dim=20, bias=False)
MF.summary()

# Shuffle the three arrays in unison (the trailing underscore marks the
# shuffled copies) so row alignment between users, movies and ratings is kept.
train_user_, train_movie_, train_rating_ = shuffle(train_user, train_movie, train_rating)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 20)        120800      input_3[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1, 20)        74120       input_4[0][0]                    
__________________________________________________________________________________________________
flatten_3 

In [16]:
# Train on the shuffled arrays; Keras holds out the trailing 10% of the
# provided samples as the validation set.
MF.fit(
    [train_user_, train_movie_],
    train_rating_,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    # `early_stop` was constructed next to `hist` but was never registered, so
    # training always ran for the full epoch count unless interrupted by hand.
    callbacks=[hist, early_stop],
)

Train on 809885 samples, validate on 89988 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [12]:
# NOTE: this cell needs a trained `MF` model, so it must be run after the
# training cells even though its execution counter is lower.

# A multi-character separator is only supported by the Python parser; asking
# for it explicitly avoids the ParserWarning about the C-engine fallback.
movie_df = pd.read_csv( os.path.join(cur_dir,'dataset','movies.csv'), sep='::', engine='python' )
# Keep only the first listed genre of each movie ("A|B|C" -> "A").
movie_df["Genres"] = movie_df["Genres"].str.split("|").str[0]
print(movie_df.head())
# Layers 2 and 3 are the user and movie embedding layers in the model summary
# of the bias-free model built above.
# NOTE(review): positional indices may shift for other configurations
# (e.g. bias=True) -- verify against MF.summary().
# get_weights() returns a one-element list; taking [0] yields the
# (rows, latent_dim) matrix without collapsing a size-1 latent axis.
user_emb = MF.layers[2].get_weights()[0]
print("user embeddign shape:", user_emb.shape)
movie_emb = MF.layers[3].get_weights()[0]
print("movie embedding shape:", movie_emb.shape)

  """Entry point for launching an IPython kernel.


   movieID                               Title     Genres
0        1                    Toy Story (1995)  Animation
1        2                      Jumanji (1995)  Adventure
2        3             Grumpier Old Men (1995)     Comedy
3        4            Waiting to Exhale (1995)     Comedy
4        5  Father of the Bride Part II (1995)     Comedy
user embeddign shape: (6040, 20)
movie embedding shape: (3706, 20)
