# Movie recommender

In [1]:
#Importing libraries
import pandas as pd
import numpy as np

In [2]:
# Load the user/movie ratings table. The CSV also carries a saved index column,
# which pandas surfaces as 'Unnamed: 0'; the cells that follow use only
# userId, movieId and rating.
ratings = pd.read_csv(filepath_or_buffer='ratings.csv')

In [3]:
# Peek at the first five rows of the raw ratings.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
# Distinct raw IDs; these arrays also drive the raw-ID -> index lookup tables
# built in the next cell.
users = ratings.userId.unique()
movies = ratings.movieId.unique()

# Count users and movies from the very arrays that get enumerated into
# embedding indexes, so the embedding sizes can never disagree with the
# largest index (nunique() skips NaN by default, unique() does not).
n_users = len(users)
n_movies = len(movies)
'users =', n_users, 'movies =', n_movies

('users =', 671, 'movies =', 9066)

In [5]:
# Build raw-ID -> contiguous-index lookup tables (0, 1, 2, ... in the order the
# IDs appear in `users` / `movies`) so every user and movie can address one row
# of an embedding matrix.
userid2idx = dict(zip(users, range(len(users))))
movieid2idx = dict(zip(movies, range(len(movies))))

Update the user and movie columns with continuous (contiguous) indexes. This prepares the data for the embedding layers.

In [6]:
# Swap each raw ID for its contiguous index so the columns can be fed straight
# into the embedding layers.
# NOTE: the raw IDs are overwritten in place, so this cell is meant to run once
# per fresh load of `ratings`.
ratings['userId'] = ratings['userId'].map(userid2idx.__getitem__)
ratings['movieId'] = ratings['movieId'].map(movieid2idx.__getitem__)


In [7]:
# Result of the preprocessing needed by the embedding layers: the original
# userId and movieId values have been replaced with contiguous integer indexes.
# 'rating' is left unprocessed, and 'timestamp' is of little importance.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,2.5,1260759144
1,0,1,3.0,1260759179
2,0,2,3.0,1260759182
3,0,3,2.0,1260759185
4,0,4,4.0,1260759205


In [8]:
# Number of latent factors (embedding width) learned per user and per movie.
n_factors = 50
# Seed NumPy's global generator so the random train/validation split below is
# reproducible. `np.random.seed` has to be *called*: assigning
# `np.random.seed = 42` would replace the function and seed nothing.
# This seeds NumPy only.
np.random.seed(42)

In [9]:
# Random train/validation split: draw one uniform number per row; rows whose
# draw falls below 0.8 go to training, the rest to validation.
msk = np.random.rand(ratings.shape[0]) < 0.8
trn, val = ratings[msk], ratings[~msk]

In [10]:
from keras.layers import Dense, Input, Embedding, Dropout, merge, Flatten, concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2

Using TensorFlow backend.


In [11]:
# Model inputs: one integer index per sample for the user and for the movie.
user_in = Input(name='user_in', shape=(1,), dtype='int64')
movie_in = Input(name='movies_in', shape=(1,), dtype='int64')

In [12]:
# Embedding layers: each user / movie index looks up a trainable vector of
# n_factors values, with a small L2 penalty on the embedding weights.
# `embeddings_regularizer` is the Keras 2 name of this argument; the Keras 1
# spelling `W_regularizer` is deprecated.
u = Embedding(n_users, n_factors, input_length=1, embeddings_regularizer=l2(1e-4))(user_in)
m = Embedding(n_movies, n_factors, input_length=1, embeddings_regularizer=l2(1e-4))(movie_in)

  
  This is separate from the ipykernel package so we can avoid doing imports until


* I am using the Keras functional API rather than the Sequential model.
* Documentation is available under "Functional API" in the Keras docs.

In [13]:
# Network head: join the user and movie embeddings, flatten to one vector per
# sample, then a dropout-regularised hidden layer and a single linear output
# unit that predicts the rating.
# `concatenate` is the Keras 2 replacement for the deprecated
# `merge(..., mode='concat')`; both join along the last axis by default.
x = concatenate([u, m])
x = Flatten()(x)
x = Dropout(0.3)(x)
x = Dense(70, activation='relu')(x)
x = Dropout(0.7)(x)
x = Dense(1)(x)
nn = Model([user_in, movie_in], x)


  """Entry point for launching an IPython kernel.
  name=name)


In [14]:
# Ratings are continuous targets, so track mean absolute error next to the MSE
# loss; 'accuracy' is a classification metric and is not meaningful here.
nn.compile(optimizer=Adam(0.01), loss='mse', metrics=['mae'])


In [15]:
# Train for two epochs on the training split, scoring the held-out validation
# split after each epoch.
nn.fit(
    x=[trn.userId, trn.movieId],
    y=trn.rating,
    batch_size=64,
    epochs=2,
    validation_data=([val.userId, val.movieId], val.rating),
)

Train on 79940 samples, validate on 20064 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x222e9234cc0>

In [16]:
# Re-compile with a lower learning rate (0.001 instead of 0.01) before training
# further. Ratings are continuous targets, so report mean absolute error
# instead of 'accuracy', which is a classification metric.
nn.compile(optimizer=Adam(0.001), loss='mse', metrics=['mae'])


In [17]:
# Two more epochs at the lower learning rate, again validating on the held-out
# split after each epoch.
nn.fit(
    x=[trn.userId, trn.movieId],
    y=trn.rating,
    batch_size=64,
    epochs=2,
    validation_data=([val.userId, val.movieId], val.rating),
)

Train on 79940 samples, validate on 20064 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x222d546c588>

### Changing the learning rate gave a bump in accuracy.
    The previous model used a lot of dropout (IDK why Jeremy didn't pay attention to that),
    so I am changing the dropout values.

In [19]:
# Variant with less dropout: no dropout on the embeddings and 0.5 (instead of
# 0.7) after the hidden layer.
# Fresh embedding layers are created here: reusing the tensors `u` and `m`
# would share the already-trained embedding weights with the first model, so
# this experiment would not start from scratch.
u1 = Embedding(n_users, n_factors, input_length=1, embeddings_regularizer=l2(1e-4))(user_in)
m1 = Embedding(n_movies, n_factors, input_length=1, embeddings_regularizer=l2(1e-4))(movie_in)
# `concatenate` is the Keras 2 replacement for the deprecated
# `merge(..., mode='concat')`.
x1 = concatenate([u1, m1])
x1 = Flatten()(x1)
x1 = Dense(70, activation='relu')(x1)
x1 = Dropout(0.5)(x1)
x1 = Dense(1)(x1)
nn = Model([user_in, movie_in], x1)

  """Entry point for launching an IPython kernel.
  name=name)


In [20]:
# Compile the reduced-dropout variant: MSE loss, Adam at 0.001, no extra metrics.
nn.compile(loss='mse', optimizer=Adam(0.001))

In [23]:
# Train the reduced-dropout variant for two epochs with the same batch size and
# validation split as before.
nn.fit(
    x=[trn.userId, trn.movieId],
    y=trn.rating,
    batch_size=64,
    epochs=2,
    validation_data=([val.userId, val.movieId], val.rating),
)

Train on 79940 samples, validate on 20064 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x22301b48da0>

# It's overfitting.
The previous dropout values by Jeremy were correct. XD

In [24]:
# Persist the weights of the current model to 'mov_rec.h5'.
nn.save_weights(filepath='mov_rec.h5')

In [25]:
# Let's predict the rating for user #5 and movie #10.
# These numbers are the contiguous embedding indexes produced by the
# preprocessing above, not the raw userId / movieId values from the CSV.
user_idx, movie_idx = 5, 10
nn.predict([np.array([user_idx]), np.array([movie_idx])])

array([[ 2.61209059]], dtype=float32)