### 0. Data Loading

In [1]:
from tensorflow.keras.utils import get_file
movielens_url = ("http://files.grouplens.org/datasets/movielens/ml-latest-small.zip")
movielens_file = get_file("ml-latest-small.zip", movielens_url, extract=False)

In [2]:
from pathlib import Path
keras_path = Path(movielens_file).parents[0]
movielens_dir = keras_path / "ml-latest-small"

In [5]:
from zipfile import ZipFile
if not movielens_dir.exists():
    with ZipFile(movielens_file, "r") as zip:
        # Extract files
        print("Extracting all the files now...")
        zip.extractall(path=keras_path)
        print("Done!")

In [6]:
import pandas as pd
import numpy as np
ratings_file = movielens_dir / "ratings.csv"
df = pd.read_csv(ratings_file)
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
pd.to_datetime(df['timestamp'], unit='s')

0        2000-07-30 18:45:03
1        2000-07-30 18:20:47
2        2000-07-30 18:37:04
3        2000-07-30 19:03:35
4        2000-07-30 18:48:51
                 ...        
100831   2017-05-03 21:53:22
100832   2017-05-03 22:21:31
100833   2017-05-08 19:50:47
100834   2017-05-03 21:19:12
100835   2017-05-03 21:20:15
Name: timestamp, Length: 100836, dtype: datetime64[ns]

### 1. Data Preparation

In [8]:
user_ids = df["userId"].unique().tolist()
user2idx = {x: i for i, x in enumerate(user_ids)}
idx2user = {i: x for i, x in enumerate(user_ids)}
movie_ids = df["movieId"].unique().tolist()
movie2idx = {x: i for i, x in enumerate(movie_ids)}
idx2movie = {i: x for i, x in enumerate(movie_ids)}
df["user"] = df["userId"].map(user2idx)
df["movie"] = df["movieId"].map(movie2idx)

num_users = len(user2idx)
num_movies = len(movie2idx)
df["rating"] = df["rating"].values.astype(np.float32)
min_rating = min(df["rating"])
max_rating = max(df["rating"])

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)

Number of users: 610, Number of Movies: 9724, Min rating: 0.5, Max rating: 5.0


In [12]:
x = df[["user", "movie"]].values
y = df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values

### 2. Modeling

In [14]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Flatten, Input, concatenate, dot, Dropout, Dense, BatchNormalization
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam

In [15]:
# Matrix Factorization (Simple NCF)

embed_size = 50

# Define inputs
movie_input = Input(shape=[1],name='movie-input')
user_input = Input(shape=[1], name='user-input')

# Embeddings
movie_embed = Embedding(num_movies + 1, embed_size, name='movie-embedding')(movie_input)
movie_vec = Flatten(name='flatten-movie')(movie_embed)

user_embed = Embedding(num_users + 1, embed_size, name='user-embedding')(user_input)
user_vec = Flatten(name='flatten-user')(user_embed)

# MatrixFactorization
pred_mf = dot([movie_vec, user_vec], axes=-1, name='pred-mf')

# prediction
mf_result = Dense(1, name='result', activation='relu')(pred_mf)

mf = Model([user_input, movie_input], mf_result)

In [16]:
mf.compile(optimizer=Adam(lr=0.01), loss=BinaryCrossentropy())
mf_history = mf.fit(
    x=[x[:,0], x[:,1]],
    y=y,
    batch_size=64,
    epochs=5, 
    validation_split=0.2,
    verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# Full NCF

embed_size = 50

# Define inputs
movie_input = Input(shape=[1],name='movie-input')
user_input = Input(shape=[1], name='user-input')

# MLP Embeddings
movie_embedding_mlp = Embedding(num_movies + 1, embed_size, name='movie-embedding-mlp')(movie_input)
movie_vec_mlp = Flatten(name='flatten-movie-mlp')(movie_embedding_mlp)

user_embedding_mlp = Embedding(num_users + 1, embed_size, name='user-embedding-mlp')(user_input)
user_vec_mlp = Flatten(name='flatten-user-mlp')(user_embedding_mlp)

# MF Embeddings
movie_embedding_mf = Embedding(num_movies + 1, embed_size, name='movie-embedding-mf')(movie_input)
movie_vec_mf = Flatten(name='flatten-movie-mf')(movie_embedding_mf)

user_embedding_mf = Embedding(num_users + 1, embed_size, name='user-embedding-mf')(user_input)
user_vec_mf = Flatten(name='flatten-user-mf')(user_embedding_mf)

# MLP layers
concat = concatenate([movie_vec_mlp, user_vec_mlp], name='concat')
concat_dropout = Dropout(0.2)(concat)
fc_1 = Dense(100, name='fc-1', activation='relu')(concat_dropout)
fc_1_bn = BatchNormalization(name='batch-norm-1')(fc_1)
fc_1_dropout = Dropout(0.2)(fc_1_bn)
fc_2 = Dense(50, name='fc-2', activation='relu')(fc_1_dropout)
fc_2_bn = BatchNormalization(name='batch-norm-2')(fc_2)
fc_2_dropout = Dropout(0.2)(fc_2_bn)

# Prediction from both layers
pred_mlp = Dense(10, name='pred-mlp', activation='relu')(fc_2_dropout)
pred_mf = dot([movie_vec_mf, user_vec_mf], axes=-1, name='pred-mf')
combine_mlp_mf = concatenate([pred_mf, pred_mlp], name='combine-mlp-mf')

# Final prediction
result = Dense(1, name='result', activation='relu')(combine_mlp_mf)

NCF = Model([user_input, movie_input], result)
NCF.compile(optimizer=Adam(lr=0.01), loss=BinaryCrossentropy())

In [21]:
NCF_history = NCF.fit(
    x=[x[:,0], x[:,1]],
    y=y,
    batch_size=64,
    epochs=5, 
    validation_split=0.2,
    verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### 3. Recommendation

In [22]:
movie_df = pd.read_csv(movielens_dir / "movies.csv")
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [24]:
user_movie_array

array([[ 572, 5804],
       [ 572,    1],
       [ 572,  482],
       ...,
       [ 572, 3870],
       [ 572, 2989],
       [ 572, 7869]])

In [23]:
user_id = df.userId.sample(1).iloc[0]
movies_watched_by_user = df[df.userId == user_id]
movies_not_watched = movie_df[~movie_df["movieId"].isin(movies_watched_by_user.movieId.values)]["movieId"]
movies_not_watched = list(set(movies_not_watched).intersection(set(movie2idx.keys())))
movies_not_watched = [[movie2idx.get(x)] for x in movies_not_watched]
user_encoder = user2idx.get(user_id)
user_movie_array = np.hstack(([[user_encoder]] * len(movies_not_watched), movies_not_watched))
ratings = NCF.predict([user_movie_array[:,0],user_movie_array[:,1]]).flatten()
top_ratings_indices = ratings.argsort()[-10:][::-1]
recommended_movie_ids = [movie2idx.get(movies_not_watched[x][0]) for x in top_ratings_indices]

print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)
print("Movies with high ratings from user")
print("----" * 8)
top_movies_user = (
    movies_watched_by_user.sort_values(by="rating", ascending=False)
    .head(5)
    .movieId.values
)
movie_df_rows = movie_df[movie_df["movieId"].isin(top_movies_user)]
for row in movie_df_rows.itertuples():
    print(row.title, ":", row.genres)

print("----" * 8)
print("Top 10 movie recommendations")
print("----" * 8)
recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)]
for row in recommended_movies.itertuples():
    print(row.title, ":", row.genres)

Showing recommendations for user: 573
Movies with high ratings from user
--------------------------------
Toy Story (1995) : Adventure|Animation|Children|Comedy|Fantasy
Liar Liar (1997) : Comedy
Batman: Mask of the Phantasm (1993) : Animation|Children
Predator (1987) : Action|Sci-Fi|Thriller
Lord of the Rings: The Return of the King, The (2003) : Action|Adventure|Drama|Fantasy
--------------------------------
Top 10 movie recommendations
--------------------------------
Screamers (1995) : Action|Sci-Fi|Thriller
Black Sheep (1996) : Comedy
Flirting With Disaster (1996) : Comedy
Burnt by the Sun (Utomlyonnye solntsem) (1994) : Drama
Little Buddha (1993) : Drama
