In [1]:
import os
import pandas as pd
import numpy as np
PATH = os.getcwd()
os.chdir(PATH)

In [2]:
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Flatten, Dropout
from keras.layers import Input, concatenate
from keras import regularizers, initializers
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  % self._get_c_name())


In [3]:
rating=pd.read_csv("ratings.csv")
movie = pd.read_csv("movies.csv")

In [4]:
print(rating.head())
print(rating.dtypes)
print(movie.head())
print(movie.dtypes)

   userId  movieId  rating   timestamp
0       1       31     2.5  1260759144
1       1     1029     3.0  1260759179
2       1     1061     3.0  1260759182
3       1     1129     2.0  1260759185
4       1     1172     4.0  1260759205
userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
movieId     int64
title      object
genres     object
dtype: object


###### Convert attributes to right data types

In [5]:
rating = rating.merge(movie,on='movieId',how='inner')

In [6]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,7,31,3.0,851868750,Dangerous Minds (1995),Drama
2,31,31,4.0,1273541953,Dangerous Minds (1995),Drama
3,32,31,4.0,834828440,Dangerous Minds (1995),Drama
4,36,31,3.0,847057202,Dangerous Minds (1995),Drama


In [7]:
rating.userId = rating.userId.astype("category")
rating.movieId = rating.movieId.astype("category")
rating.genres = rating.genres.astype("category")

In [8]:
rating.dtypes

userId       category
movieId      category
rating        float64
timestamp       int64
title          object
genres       category
dtype: object

###### Check for Missing Values

In [9]:
rating.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64

In [10]:
np.unique(rating.movieId.values)[0:100]

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  52,  53,  54,
        55,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  68,  69,
        70,  71,  72,  73,  74,  76,  77,  78,  79,  80,  81,  82,  83,
        84,  85,  86,  87,  88,  89,  92,  93,  94,  95,  96,  97,  98,
        99, 100, 101, 102, 103, 104, 105, 107, 108])

In [11]:
np.unique(rating.genres.values)[0:10]

array(['(no genres listed)', 'Action', 'Action|Adventure',
       'Action|Adventure|Animation',
       'Action|Adventure|Animation|Children',
       'Action|Adventure|Animation|Children|Comedy',
       'Action|Adventure|Animation|Children|Comedy|Fantasy',
       'Action|Adventure|Animation|Children|Comedy|IMAX',
       'Action|Adventure|Animation|Children|Comedy|Romance',
       'Action|Adventure|Animation|Children|Comedy|Sci-Fi'], dtype=object)

In [12]:
userid = rating.userId.cat.codes.values
movieid = rating.movieId.cat.codes.values
genreid = rating.genres.cat.codes.values

In [13]:
np.unique(genreid)[0:100]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
      dtype=int16)

In [14]:
n_users = len(rating.userId.unique())
n_movies = len(rating.movieId.unique())
n_genres = len(rating.genres.unique())

###### Adding Cat Code Values to original dataframe

In [15]:
rating["userid_catcode"]=userid
rating["movieid_catcode"]=movieid
rating["genreid_catcode"]=genreid

In [16]:
embedding_dim = 50

###### Define Metrics

In [17]:
from keras import backend as K
def mape_error(y_true, y_pred): 
    return K.mean((K.abs(y_pred - y_true)/y_true), axis=0) * 100

In [18]:
def rmse_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true), axis=0)) 

In [19]:
def mse_error(y_true, y_pred):
        return K.mean(K.square(y_pred - y_true), axis=0) 

###### Split the data to Train and Test datasets

In [20]:
train_userid, test_userid, \
train_movieid, test_movieid, \
train_genreid, test_genreid, \
train_y, test_y = train_test_split(userid,
                                   movieid, 
                                   genreid,
                                   rating.rating, 
                                   test_size=0.3, random_state=2)

In [21]:
train_y.shape

(70002,)

In [22]:
train_y = train_y.reshape((-1,1))
test_y = test_y.reshape((-1,1))

  """Entry point for launching an IPython kernel.
  


In [23]:
train_y.shape
train_y[0:10]

array([[4.5],
       [3. ],
       [1. ],
       [5. ],
       [4. ],
       [2.5],
       [2. ],
       [2. ],
       [2. ],
       [5. ]])

###### Embedding UserIds

In [24]:
encoder_UserID = Sequential()
encoder_UserID.add(Embedding(n_users, embedding_dim, input_length=1,embeddings_regularizer=regularizers.l2(0.00001)))

###### Embedding MovieIds

In [25]:
encoder_MovieID = Sequential()
encoder_MovieID.add(Embedding(n_movies, embedding_dim, input_length=1,embeddings_regularizer=regularizers.l2(0.00001)))

###### Embedding Genreids

In [26]:
encoder_genreID = Sequential()
encoder_genreID.add(Embedding(n_genres, embedding_dim, input_length=1,embeddings_regularizer=regularizers.l2(0.00001)))


##### Define MLP 

In [27]:
user_inp = Input(shape=(1, ))
user_mbd = Embedding(n_users, embedding_dim)(user_inp)

movie_inp = Input(shape=(1, ))
movie_mbd = Embedding(n_movies, embedding_dim)(movie_inp)

genre_inp = Input(shape=(1, ))
genre_mbd = Embedding(n_genres, embedding_dim)(genre_inp)

merged = concatenate([user_mbd, movie_mbd, genre_mbd])
fc1 = Dense(100,activation='relu')(merged)
fc2 = Dense(1)(fc1)

model = Model(inputs=[user_inp, movie_inp, genre_inp], outputs=fc2)
model.compile(optimizer='adam', loss='mse', metrics=[mse_error,rmse_error])

###### Train and fit the model

In [28]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1, 50)        33550       input_1[0][0]                    
__________________________________________________________________________________________________
embedding_

In [29]:
model.fit([train_userid,train_movieid,train_genreid], train_y.reshape(-1,1,1), epochs=100, verbose=1,batch_size=500,validation_data=([test_userid,test_movieid,test_genreid], test_y.reshape(-1,1,1)))

Train on 70002 samples, validate on 30002 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/1

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x8e7bd50>

###### Evaluating Scores Manually

In [30]:
scores= model.evaluate([test_userid,test_movieid,test_genreid],test_y.reshape(-1,1,1))
print(scores)

[1.2928255820987973, 1.2928255820987973, 1.1235705148251118]


In [31]:
test_pred = model.predict([test_userid,test_movieid,test_genreid])


In [32]:
mse_score = np.mean(np.square(test_pred.reshape(-1,1) - test_y))
print("mse_score:",mse_score)

('mse_score:', 1.2928255815504432)


###### Calculate MSE and RMSE Manually

In [33]:
rmse_score = np.sqrt(np.mean(np.square(test_pred.reshape(-1,1) - test_y)))
print("rmse_score:",rmse_score)


('rmse_score:', 1.1370248816760533)


## Developing Recommendations

###### Extracting/Predicting user embeddings

In [34]:
encoder_UserID = Model(user_inp, user_mbd)
encoder_MovieID = Model(movie_inp, movie_mbd)

In [35]:
users_unique = np.unique(userid)
users_embeddings=encoder_UserID.predict(users_unique)

In [36]:
users_unique[0:3]
users_embeddings[0:3]

array([[[-0.14086787,  0.00214148,  0.14849137, -0.08695845,
         -0.11366415, -0.07536191,  0.03048269, -0.06757527,
         -0.1380844 , -0.12848665,  0.00480648,  0.11459301,
          0.05878794,  0.24983545,  0.03425464, -0.12878408,
          0.03148203, -0.22421207, -0.02240596, -0.02806012,
          0.00696475, -0.12247404,  0.00333499, -0.11658289,
         -0.01340949,  0.03981587, -0.0359396 , -0.03208052,
          0.08636633,  0.00800152, -0.01008686, -0.09014945,
         -0.16228247,  0.04108253, -0.07639263,  0.01992039,
         -0.04631804,  0.16495918,  0.09630899,  0.07973687,
         -0.07512137,  0.01180911, -0.0308159 , -0.10517716,
          0.1682747 ,  0.07351975, -0.09092321, -0.09659732,
          0.12119636,  0.03509335]],

       [[ 0.16984797,  0.3684395 , -0.03779903, -0.12919275,
          0.3290781 , -0.22662263,  0.01810695,  0.137647  ,
          0.31690207, -0.03722298, -0.03519651,  0.14700103,
         -0.1648044 , -0.07016513,  0.22030507,

In [37]:
users_embeddings=pd.DataFrame(users_embeddings.reshape(-1,50))
users_embeddings["userid_catcode"] = users_unique

In [38]:
users_embeddings[0:3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,userid_catcode
0,-0.140868,0.002141,0.148491,-0.086958,-0.113664,-0.075362,0.030483,-0.067575,-0.138084,-0.128487,...,0.011809,-0.030816,-0.105177,0.168275,0.07352,-0.090923,-0.096597,0.121196,0.035093,0
1,0.169848,0.368439,-0.037799,-0.129193,0.329078,-0.226623,0.018107,0.137647,0.316902,-0.037223,...,0.230675,0.060416,-0.083827,0.06191,-0.053642,0.033089,-0.025814,-0.051868,-0.234572,1
2,0.186259,0.086003,0.012189,0.055322,0.223343,0.036501,-0.072419,0.053799,0.104197,0.123442,...,0.027048,0.303685,0.032594,0.086782,-0.128121,-0.005186,0.00584,0.032437,-0.12811,2


In [39]:
movies_unique = np.unique(movieid)
movies_embeddings=encoder_MovieID.predict(movies_unique)

In [40]:
movies_embeddings=pd.DataFrame(movies_embeddings.reshape(-1,50))
movies_embeddings["movieid_catcode"] = movies_unique

##### Identify nearest neighbours for a movie based on k-Nearest Neighbours Algorithm

In [41]:
from sklearn.neighbors import NearestNeighbors

In [42]:
nbrs = NearestNeighbors(n_neighbors=10, algorithm='auto').fit(movies_embeddings.drop(["movieid_catcode"],axis=1))
movie_distances, movie_nbrs = nbrs.kneighbors(movies_embeddings.drop(["movieid_catcode"],axis=1))
movie_nbrs=pd.DataFrame(movie_nbrs)
movie_nbrs.columns=["movieid_catcode","NN1","NN2","NN3","NN4","NN5","NN6","NN7","NN8","NN9"]


###### Define Recommender Function

In [46]:
def recommender(rating,movie_nbrs,model,uid,n):
    
    # Mapping the userid to its Cat Code
    uid_catcode=rating[rating.userId==uid]["userid_catcode"].unique()
    
    #Subsetting the records of the userid
    movies=(rating[rating.userid_catcode==uid_catcode[0]])
    
    #Sorting the user movies based on given ratings
    if len(movies)>=n :
        movies=movies.sort_values("rating",ascending=False)
        movies=list((movies[0:n]["movieid_catcode"]))
    else :
        movies=list(movies["movieid_catcode"])
    
   
    Movie_nbrs=movie_nbrs[movie_nbrs.movieid_catcode.isin(movies)]
    Movie_nbrs=Movie_nbrs.drop("movieid_catcode",axis=1)
    Movie_nbrs=pd.DataFrame(np.unique(Movie_nbrs.values.flatten()))
    Movie_nbrs.columns=["movieid_catcode"]
    Movie_nbrs=Movie_nbrs[~(Movie_nbrs["movieid_catcode"].isin(list(rating[rating.userid_catcode==uid_catcode[0]]["movieid_catcode"])))]
    genre_nbrs=rating[rating.movieid_catcode.isin(Movie_nbrs.movieid_catcode)]["genreid_catcode"]
    genre_nbrs.columns=["genre_id"]
    
    
    if len(Movie_nbrs)>0:
        usid=np.repeat(uid_catcode[0], len(Movie_nbrs))
        Movie_nbrs=Movie_nbrs.movieid_catcode.values
        genre_nbrs = genre_nbrs.values
        test_pred = model.predict([usid,Movie_nbrs,genre_nbrs]).reshape(-1)
        
        r_Df=pd.DataFrame({"movieid_catcode":Movie_nbrs})
        r_Df["rating"]=test_pred
        
        Movie_catcodes=rating[["movieId","movieid_catcode","title"]]
        Movie_catcodes=Movie_catcodes.drop_duplicates()
        r_Df=pd.merge(r_Df,Movie_catcodes,how="left",on=["movieid_catcode"])
        r_Df=r_Df.sort_values("rating",ascending=False)
        r_Df=r_Df.iloc[0:n]
        r_Df["userId"] = uid
        return(r_Df)
    else :
        print("Try with another n value")
        r_Df=None
        return(r_Df)

###### Recommend movie for a user

In [50]:
uid=605
n=5
model = model
rec=recommender(rating,movie_nbrs,model,uid,n)

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ValueError: "Buffer dtype mismatch, expected 'Python object' but got 'long'" in 'pandas._libs.lib.is_bool_array' ignored


In [51]:
rec

Unnamed: 0,movieid_catcode,rating,movieId,title,userId
38,7942,5.600111,91690,Friends with Kids (2011),605
25,5425,5.498619,8511,"Immigrant, The (1917)",605
40,8034,5.083377,94939,Sound of Noise (2010),605
32,6703,4.834203,53883,"Power of Nightmares, The: The Rise of the Poli...",605
42,8282,4.67099,102123,This Is the End (2013),605
