## I created this toy movie recommender in kera to illustrate how to use collobrative filtering and neural networks to suggest relevant movies which fit specific users' tastes. I added detailed comments for each line of codes to assist understanding.

### Author: Fiona Wu

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from zipfile import ZipFile

import keras
from keras import layers
from keras import ops

In [3]:
# Download the actual data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
# Use the ratings.csv file
movielens_data_file_url = (
    "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
)
movielens_zipped_file = keras.utils.get_file(
    "ml-latest-small.zip", movielens_data_file_url, extract=False
)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-latest-small"

# Only extract the data the first time the script is run.
if not movielens_dir.exists():
    with ZipFile(movielens_zipped_file, "r") as zip:
        # Extract files
        print("Extracting all the files now...")
        zip.extractall(path=keras_datasets_path)
        print("Done!")

ratings_file = movielens_dir / "ratings.csv"
df = pd.read_csv(ratings_file)

In [4]:
# Extract unique user IDs from the DataFrame and convert to a list
user_ids = df["userId"].unique().tolist()

# Create a dictionary to map each user ID to a unique integer (encoding)
user2user_encoded = {x: i for i, x in enumerate(user_ids)}

# Create a reverse dictionary to map the unique integer (encoding) back to the user ID
userencoded2user = {i: x for i, x in enumerate(user_ids)}

# Extract unique movie IDs from the DataFrame and convert to a list
movie_ids = df["movieId"].unique().tolist()

# Create a dictionary to map each movie ID to a unique integer (encoding)
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}

# Create a reverse dictionary to map the unique integer (encoding) back to the movie ID
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}

# Add a new column 'user' to the DataFrame, mapping the original user IDs to their encoded values
df["user"] = df["userId"].map(user2user_encoded)

# Add a new column 'movie' to the DataFrame, mapping the original movie IDs to their encoded values
df["movie"] = df["movieId"].map(movie2movie_encoded)

# Calculate the number of unique users and movies
num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)

# Ensure that the 'rating' column is of type float32
df["rating"] = df["rating"].values.astype(np.float32)

# min and max ratings will be used to normalize the ratings later
min_rating = min(df["rating"])
max_rating = max(df["rating"])

# Print the number of users, number of movies, minimum rating, and maximum rating
print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)


Number of users: 610, Number of Movies: 9724, Min rating: 0.5, Max rating: 5.0


In [5]:
# Shuffle the DataFrame rows and set a random seed for reproducibility
df = df.sample(frac=1, random_state=42)

# Extract the 'user' and 'movie' columns as the input features
x = df[["user", "movie"]].values

# Normalize the targets (ratings) between 0 and 1 for easier training
y = df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values

# Assuming training on 90% of the data and validating on 10%
train_indices = int(0.9 * df.shape[0])

# Split the data into training and validation sets
x_train, x_val, y_train, y_val = (
    x[:train_indices],  # Features for training
    x[train_indices:],  # Features for validation
    y[:train_indices],  # Targets for training
    y[train_indices:],  # Targets for validation
)


In [6]:
# Size of the embedding vectors for users and movies
EMBEDDING_SIZE = 50

# Define the RecommenderNet class, a subclass of keras.Model
class RecommenderNet(keras.Model):
    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        
        # User embedding layer with regularization
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",  # He initialization for embedding weights
            embeddings_regularizer=keras.regularizers.l2(1e-6),  # L2 regularization
        )
        # User bias embedding layer
        self.user_bias = layers.Embedding(num_users, 1)
        
        # Movie embedding layer with regularization
        self.movie_embedding = layers.Embedding(
            num_movies,
            embedding_size,
            embeddings_initializer="he_normal",  # He initialization for embedding weights
            embeddings_regularizer=keras.regularizers.l2(1e-6),  # L2 regularization
        )
        # Movie bias embedding layer
        self.movie_bias = layers.Embedding(num_movies, 1)

    # Define the forward pass through the network
    def call(self, inputs):
        user_vector = self.user_embedding(inputs[:, 0])  # Embedding lookup for users
        user_bias = self.user_bias(inputs[:, 0])  # Bias lookup for users
        movie_vector = self.movie_embedding(inputs[:, 1])  # Embedding lookup for movies
        movie_bias = self.movie_bias(inputs[:, 1])  # Bias lookup for movies
        
        # Compute dot product of user and movie embeddings
        dot_user_movie = ops.tensordot(user_vector, movie_vector, 2)
        
        # Add all the components (dot product and biases)
        x = dot_user_movie + user_bias + movie_bias
        
        # Apply sigmoid activation to constrain ratings between 0 and 1
        return ops.nn.sigmoid(x)

# Instantiate the RecommenderNet model with specified parameters
model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)

# Compile the model with BinaryCrossentropy loss and Adam optimizer
model.compile(
    loss=keras.losses.BinaryCrossentropy(),  # Binary cross-entropy loss for rating prediction
    optimizer=keras.optimizers.Adam(learning_rate=0.001),  # Adam optimizer with learning rate 0.001
)


In [7]:
# Train the model on the training data
history = model.fit(
    x=x_train,  # Features for training
    y=y_train,  # Target ratings for training
    batch_size=64,  # Number of samples per gradient update
    epochs=5,  # Number of complete passes through the training dataset
    verbose=1,  # Verbosity mode, 1 = progress bar
    validation_data=(x_val, y_val),  # Data for validation
)

Epoch 1/5
[1m1418/1418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.6576 - val_loss: 0.6202
Epoch 2/5
[1m1418/1418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.6159 - val_loss: 0.6171
Epoch 3/5
[1m1418/1418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.6086 - val_loss: 0.6135
Epoch 4/5
[1m1418/1418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.6086 - val_loss: 0.6119
Epoch 5/5
[1m1418/1418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.6083 - val_loss: 0.6114


In [8]:
# Load the movies.csv file into a DataFrame
movie_df = pd.read_csv(movielens_dir / "movies.csv")

# Let us get a user and see the top recommendations
user_id = df.userId.sample(1).iloc[0]  # Sample a random user ID

# Get the movies watched by the sampled user
movies_watched_by_user = df[df.userId == user_id]

# Get the movie IDs that have not been watched by the user
movies_not_watched = movie_df[
    ~movie_df["movieId"].isin(movies_watched_by_user.movieId.values)
]["movieId"]

# Filter out movies that are not in the encoded movie dictionary
movies_not_watched = list(
    set(movies_not_watched).intersection(set(movie2movie_encoded.keys()))
)

# Convert movie IDs to their encoded values
movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]

# Get the encoded value for the sampled user ID
user_encoder = user2user_encoded.get(user_id)

# Create an array of the user-movie pairs for prediction
user_movie_array = np.hstack(
    ([[user_encoder]] * len(movies_not_watched), movies_not_watched)
)

# Predict ratings for the movies not watched by the user
ratings = model.predict(user_movie_array).flatten()

# Get the indices of the top 10 highest rated movies
top_ratings_indices = ratings.argsort()[-10:][::-1]

# Get the original movie IDs for the top 10 recommendations
recommended_movie_ids = [
    movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices
]

# Print the recommendations
print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)

# Print movies with high ratings from the user
print("Movies with high ratings from user")
print("----" * 8)
top_movies_user = (
    movies_watched_by_user.sort_values(by="rating", ascending=False)
    .head(5)
    .movieId.values
)
movie_df_rows = movie_df[movie_df["movieId"].isin(top_movies_user)]
for row in movie_df_rows.itertuples():
    print(row.title, ":", row.genres)

print("----" * 8)
print("Top 10 movie recommendations")
print("----" * 8)

# Get and print the top 10 recommended movies
recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)]
for row in recommended_movies.itertuples():
    print(row.title, ":", row.genres)

[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 455us/step
Showing recommendations for user: 220
Movies with high ratings from user
--------------------------------
Apollo 13 (1995) : Adventure|Drama|IMAX
Godfather, The (1972) : Crime|Drama
Aliens (1986) : Action|Adventure|Horror|Sci-Fi
Shrek (2001) : Adventure|Animation|Children|Comedy|Fantasy|Romance
Assassination of Jesse James by the Coward Robert Ford, The (2007) : Crime|Drama|Western
--------------------------------
Top 10 movie recommendations
--------------------------------
Schindler's List (1993) : Drama|War
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) : Comedy|War
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) : Drama|Film-Noir|Romance
Streetcar Named Desire, A (1951) : Drama
Alien (1979) : Horror|Sci-Fi
Godfather: Part II, The (1974) : Crime|Drama
Full Metal Jacket (1987) : Drama|War
Amadeus (1984) : Drama
Glory (1989) : Drama|War
Cool Hand Luke (1967) : Drama
