# 0. Imports

In [1]:
import pandas as pd
import numpy as np

# import matplotlib.pyplot as plt
# import seaborn as sns

# from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dot
from tensorflow.keras.models import Model
# from tensorflow.keras.models import load_model


# 1. Set-up

In [2]:
ratings_df = pd.read_csv('ratings_small.csv')
ratings_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
movies_df = pd.read_csv('movies_metadata.csv')
movies_df.columns
movies_df = movies_df[['id', 'original_title', 'release_date', 'vote_average', 'vote_count']]
movies_df['year'] = pd.to_datetime(movies_df['release_date'], errors='coerce').dt.year
movies_df['vote_average'] = movies_df['vote_average'] / 2
movies_df['vote_importance'] = movies_df['vote_average'] * movies_df['vote_count']
movies_df = movies_df.sort_values('vote_importance', ascending=False)
movies_df = movies_df[['id', 'original_title', 'year', 'vote_average']]
movies_df.reset_index(drop=True, inplace=True)
movies_df

  movies_df = pd.read_csv('movies_metadata.csv')


Unnamed: 0,id,original_title,year,vote_average
0,27205,Inception,2010.0,4.05
1,155,The Dark Knight,2008.0,4.15
2,157336,Interstellar,2014.0,4.05
3,24428,The Avengers,2012.0,3.70
4,19995,Avatar,2009.0,3.60
...,...,...,...,...
45461,1997-08-20,"[{'iso_639_1': 'en', 'name': 'English'}]",,
45462,122662,マルドゥック・スクランブル 排気,,
45463,2012-09-29,"[{'iso_639_1': 'ja', 'name': '日本語'}]",,
45464,249260,Avalanche Sharks,,


## 1.99. List of movies in HTML

In [4]:
import pandas as pd

movies_df = pd.read_csv('movies_metadata.csv')
ratings_df = pd.read_csv('ratings_small.csv')

movies_df['id'] = pd.to_numeric(movies_df['id'], errors='coerce').dropna().astype(int)
movies_df = movies_df[['id', 'original_title', 'release_date', 'vote_average', 'vote_count']]
movies_df['year'] = pd.to_datetime(movies_df['release_date'], errors='coerce').dt.year
movies_df['vote_average'] = movies_df['vote_average'] / 2
movies_df['vote_importance'] = movies_df['vote_average'] * movies_df['vote_count']
movies_df = movies_df.sort_values('vote_importance', ascending=False)
movies_df = movies_df[['id', 'original_title', 'year', 'vote_average']]
movies_df.reset_index(drop=True, inplace=True)


top_100_movies = movies_df[movies_df['id'].isin(ratings_df['movieId'])].head(200)

# Format the output for use in the HTML file
formatted_movies = top_100_movies[['id', 'original_title', 'year']].to_dict(orient='records')

# Print the formatted output
for movie in formatted_movies:
    print(f"{{ id: {movie['id']}, title: \"{movie['original_title']}\", year: {int(movie['year'])} }},")

{ id: 155.0, title: "The Dark Knight", year: 2008 },
{ id: 550.0, title: "Fight Club", year: 1999 },
{ id: 680.0, title: "Pulp Fiction", year: 1994 },
{ id: 603.0, title: "The Matrix", year: 1999 },
{ id: 278.0, title: "The Shawshank Redemption", year: 1994 },
{ id: 13.0, title: "Forrest Gump", year: 1994 },
{ id: 122.0, title: "The Lord of the Rings: The Return of the King", year: 2003 },
{ id: 1726.0, title: "Iron Man", year: 2008 },
{ id: 121.0, title: "The Lord of the Rings: The Two Towers", year: 2002 },
{ id: 597.0, title: "Titanic", year: 1997 },
{ id: 272.0, title: "Batman Begins", year: 2005 },
{ id: 11.0, title: "Star Wars", year: 1977 },
{ id: 22.0, title: "Pirates of the Caribbean: The Curse of the Black Pearl", year: 2003 },
{ id: 671.0, title: "Harry Potter and the Philosopher's Stone", year: 2001 },
{ id: 238.0, title: "The Godfather", year: 1972 },
{ id: 105.0, title: "Back to the Future", year: 1985 },
{ id: 1891.0, title: "The Empire Strikes Back", year: 1980 },
{ id:

  movies_df = pd.read_csv('movies_metadata.csv')


# 2. Algo

In [4]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

In [5]:
num_users = len(ratings_df['userId'].unique())
num_movies = len(ratings_df['movieId'].unique())
embedding_size = 50

# Create mappings from userId and movieId to a continuous range of indices
user_id_mapping = {id: idx for idx, id in enumerate(ratings_df['userId'].unique())}
movie_id_mapping = {id: idx for idx, id in enumerate(ratings_df['movieId'].unique())}

# Map the userId and movieId in the ratings DataFrame to the new indices
ratings_df['userId'] = ratings_df['userId'].map(user_id_mapping)
ratings_df['movieId'] = ratings_df['movieId'].map(movie_id_mapping)
train_df['userId'] = train_df['userId'].map(user_id_mapping)
train_df['movieId'] = train_df['movieId'].map(movie_id_mapping)
test_df['userId'] = test_df['userId'].map(user_id_mapping)
test_df['movieId'] = test_df['movieId'].map(movie_id_mapping)

# Define the RecommenderNet class, inheriting from tf.keras.Model
class RecommenderNet(Model):
    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super(RecommenderNet, self).__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        # Create embedding layers for users and movies
        self.user_embedding = Embedding(num_users, embedding_size, embeddings_initializer='he_normal', embeddings_regularizer=tf.keras.regularizers.l2(1e-6))
        self.movie_embedding = Embedding(num_movies, embedding_size, embeddings_initializer='he_normal', embeddings_regularizer=tf.keras.regularizers.l2(1e-6))
        # Define a dot product layer to compute the similarity between user and movie embeddings
        self.dot = Dot(axes=1)
    
    def call(self, inputs):
        # Get the embeddings for the users and movies
        user_vector = self.user_embedding(inputs[0])
        movie_vector = self.movie_embedding(inputs[1])
        # Compute the dot product of the user and movie embeddings
        dot_user_movie = self.dot([user_vector, movie_vector])
        return dot_user_movie
    
    def get_config(self):
        config = super().get_config()
        config.update({
            'num_users': self.num_users,
            'num_movies': self.num_movies,
            'embedding_size': self.embedding_size
        })
        return config
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)
    
# model = RecommenderNet(num_users, num_movies, embedding_size)
model = RecommenderNet(num_users, num_movies, embedding_size)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')

train_user_ids = train_df['userId'].values
train_movie_ids = train_df['movieId'].values
train_ratings = train_df['rating'].values

model.fit([train_user_ids, train_movie_ids], train_ratings, batch_size=64, epochs=5, verbose=1)

Epoch 1/5
[1m1251/1251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 9ms/step - loss: 12.9292
Epoch 2/5
[1m1251/1251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - loss: 2.8543
Epoch 3/5
[1m1251/1251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - loss: 1.2991
Epoch 4/5
[1m1251/1251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 36ms/step - loss: 0.9412
Epoch 5/5
[1m1251/1251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 17ms/step - loss: 0.7825


<keras.src.callbacks.history.History at 0x25b205f9e10>

In [6]:
test_user_ids = test_df['userId'].values
test_movie_ids = test_df['movieId'].values
test_ratings = test_df['rating'].values

predictions = model.predict([test_user_ids, test_movie_ids], batch_size=64, verbose=1)

rmse = np.sqrt(mean_squared_error(predictions, test_ratings))
mae = mean_absolute_error(predictions, test_ratings)
r2 = r2_score(predictions, test_ratings)

print(f'RMSE: {rmse:.2f}', f'MAE: {mae:.2f}', f'R2: {r2:.2f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
RMSE: 1.12 MAE: 0.81 R2: -0.36


In [27]:
def get_best_predictions_for_users(user_ids, model, movie_id_mapping, movies_df, num_recommendations=50):
    best_predictions = {}
    
    for user_id in user_ids:
        # Create an array of all movie IDs
        all_movie_ids = np.array(list(movie_id_mapping.values()))
        
        # Create an array of the user ID repeated for the number of movies
        user_ids_array = np.array([user_id] * len(all_movie_ids))
        
        # Predict ratings for all movies for the given user
        predictions = model.predict([user_ids_array, all_movie_ids], batch_size=64, verbose=0)
        
        # Get the top N movie indices with the highest predicted ratings
        top_indices = predictions.flatten().argsort()[-num_recommendations:][::-1]
        
        # Map the indices back to movie IDs
        top_movie_ids = [list(movie_id_mapping.keys())[i] for i in top_indices]
        
        # Get the movie titles for the top movie IDs
        top_movie_titles = movies_df[movies_df['id'].astype(str).isin(map(str, top_movie_ids))]['original_title'].values
        
        # Ensure the lengths of top_movie_ids, top_movie_titles, and predictions[top_indices] are the same
        min_length = min(len(top_movie_ids), len(top_movie_titles), len(predictions[top_indices].flatten()))
        top_movie_ids = top_movie_ids[:min_length]
        top_movie_titles = top_movie_titles[:min_length]
        top_predictions = predictions[top_indices].flatten()[:min_length]
        
        # Store the top movie IDs, their titles, and their predicted ratings for the user
        best_predictions[user_id] = pd.DataFrame({
            'movie_ids': top_movie_ids,
            'movie_titles': top_movie_titles,
            'predicted_ratings': top_predictions
        })
    
    return best_predictions

# Example usage:
user_ids = [650]  # Replace with actual user IDs
best_predictions = get_best_predictions_for_users(user_ids, model, movie_id_mapping, movies_df)
best_predictions

{650:     movie_ids                                       movie_titles  \
 0       73290                            Raiders of the Lost Ark   
 1        3181                                           Scarface   
 2         969                                Mission: Impossible   
 3        3030                                    Horrible Bosses   
 4         318                                      Before Sunset   
 5        3462                               Sleepless in Seattle   
 6        1939                                          Backdraft   
 7        2924                                       Frankenstein   
 8        7075                                              Laura   
 9          85                                     Stomp the Yard   
 10      86882                           The Million Dollar Hotel   
 11       7502                                              Dread   
 12        858  Shriek If You Know What I Did Last Friday the ...   
 13      27846               

# 99. Old