# 0. Imports

In [59]:
import pandas as pd
import numpy as np

# import matplotlib.pyplot as plt
# import seaborn as sns

# from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dot
from tensorflow.keras.models import Model
# from tensorflow.keras.models import load_model

# 1. Set-up

In [60]:
ratings_df = pd.read_csv('ratings_small.csv')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [61]:
movies_df = pd.read_csv('movies_metadata.csv')
movies_df.columns
movies_df = movies_df[['id', 'original_title', 'release_date', 'vote_average', 'vote_count']]
movies_df['year'] = pd.to_datetime(movies_df['release_date'], errors='coerce').dt.year
movies_df['vote_average'] = movies_df['vote_average'] / 2
movies_df['vote_importance'] = movies_df['vote_average'] * movies_df['vote_count']
movies_df = movies_df.sort_values('vote_importance', ascending=False)
movies_df = movies_df[['id', 'original_title', 'year', 'vote_average']]
movies_df.reset_index(drop=True, inplace=True)
movies_df.head(5)

  movies_df = pd.read_csv('movies_metadata.csv')


Unnamed: 0,id,original_title,year,vote_average
0,27205,Inception,2010.0,4.05
1,155,The Dark Knight,2008.0,4.15
2,157336,Interstellar,2014.0,4.05
3,24428,The Avengers,2012.0,3.7
4,19995,Avatar,2009.0,3.6


# 2. New user

In [75]:
# New user added
top_10_movies = movies_df[movies_df['id'].isin(ratings_df['movieId'])].head(10)
new_user_ratings = pd.DataFrame({
    'userId': [672] * 10,
    'movieId': top_10_movies['id'],
    # 'rating': [5, 4.5, 4, 4.5, 5, 3.5, 4, 4.5, 5, 4]
    # 'rating': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
    # 'rating': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    'rating': [5] * 10
})
ratings_df = pd.concat([ratings_df, new_user_ratings], ignore_index=True)
ratings_df.tail(5)

Unnamed: 0,userId,movieId,rating,timestamp
100039,672,5.0,5.0,
100040,672,8012.0,5.0,
100041,672,451.0,5.0,
100042,672,902.0,5.0,
100043,672,63.0,5.0,


In [78]:
# top_10_movies = movies_df[movies_df['id'].isin(ratings_df['movieId'])].head(10)
# # top_10_movies = movies_df.head(10)
# top_10_movies

# 3. Algo

In [67]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dot

class RecommenderNet(Model):
    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super(RecommenderNet, self).__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        # Create embedding layers for users and movies
        self.user_embedding = Embedding(num_users, embedding_size, embeddings_initializer='he_normal', embeddings_regularizer=tf.keras.regularizers.l2(1e-6))
        self.movie_embedding = Embedding(num_movies, embedding_size, embeddings_initializer='he_normal', embeddings_regularizer=tf.keras.regularizers.l2(1e-6))
        # Define a dot product layer to compute the similarity between user and movie embeddings
        self.dot = Dot(axes=1)
    
    def call(self, inputs):
        # Get the embeddings for the users and movies
        user_vector = self.user_embedding(inputs[0])
        movie_vector = self.movie_embedding(inputs[1])
        # Compute the dot product of the user and movie embeddings
        dot_user_movie = self.dot([user_vector, movie_vector])
        return dot_user_movie
    
    def get_config(self):
        config = super().get_config()
        config.update({
            'num_users': self.num_users,
            'num_movies': self.num_movies,
            'embedding_size': self.embedding_size
        })
        return config
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)
    
    def recommend_top_n(self, user_id, movie_ids, n=20):
        user_array = tf.constant([user_id] * len(movie_ids))
        movie_array = tf.constant(movie_ids)
        predictions = self.call([user_array, movie_array])
        top_n_indices = tf.argsort(predictions, axis=0, direction='DESCENDING')[:n]
        top_n_movie_ids = tf.gather(movie_ids, top_n_indices)
        top_n_ratings = tf.gather(predictions, top_n_indices)
        return top_n_movie_ids.numpy().flatten(), top_n_ratings.numpy().flatten()

In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the ratings and movies metadata
ratings_df = pd.read_csv('ratings_small.csv')
movies_df = pd.read_csv('movies_metadata.csv')

# Ensure the movie IDs are integers
movies_df['id'] = pd.to_numeric(movies_df['id'], errors='coerce').dropna().astype(int)

# New user added
top_10_movies = movies_df[movies_df['id'].isin(ratings_df['movieId'])].head(10)
new_user_ratings = pd.DataFrame({
    'userId': [672] * 10,
    'movieId': top_10_movies['id'],
    # 'rating': [5, 4.5, 4, 4.5, 5, 3.5, 4, 4.5, 5, 4]
    # 'rating': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
    # 'rating': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    'rating': [5] * 10
})
ratings_df = pd.concat([ratings_df, new_user_ratings], ignore_index=True)

# Select relevant columns from movies_df
movies_df = movies_df[['id', 'original_title', 'release_date']]
movies_df['year'] = pd.to_datetime(movies_df['release_date'], errors='coerce').dt.year
movies_df = movies_df[['id', 'original_title', 'year']]

# Map userId and movieId to continuous range of indices
user_id_mapping = {id: idx for idx, id in enumerate(ratings_df['userId'].unique())}
movie_id_mapping = {id: idx for idx, id in enumerate(ratings_df['movieId'].unique())}

# Apply the mappings to the ratings DataFrame
ratings_df['userId'] = ratings_df['userId'].map(user_id_mapping)
ratings_df['movieId'] = ratings_df['movieId'].map(movie_id_mapping)

# Filter movies_df to include only movies that exist in ratings_df
movies_df = movies_df[movies_df['id'].isin(movie_id_mapping.keys())]

# Add mapped_id to movies_df
movies_df['mapped_id'] = movies_df['id'].map(movie_id_mapping)

# Split the data into training and testing sets
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

print("Data preparation completed.")
print("Movies DataFrame with Mapped IDs:")
print(movies_df.head())

  movies_df = pd.read_csv('movies_metadata.csv')


Data preparation completed.
Movies DataFrame with Mapped IDs:
        id         original_title    year  mapped_id
5    949.0                   Heat  1995.0       2285
9    710.0              GoldenEye  1995.0       6584
14  1408.0       Cutthroat Island  1995.0        446
15   524.0                 Casino  1995.0        451
16  4584.0  Sense and Sensibility  1995.0       6037


In [69]:
# Initialize and compile the model
num_users = len(ratings_df['userId'].unique())
num_movies = len(ratings_df['movieId'].unique())
embedding_size = 50

model = RecommenderNet(num_users, num_movies, embedding_size)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
train_user_ids = train_df['userId'].values
train_movie_ids = train_df['movieId'].values
train_ratings = train_df['rating'].values

model.fit([train_user_ids, train_movie_ids], train_ratings, batch_size=64, epochs=5, verbose=1)

print("Model training completed.")

Epoch 1/5
[1m1251/1251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 9ms/step - loss: 12.9895
Epoch 2/5
[1m1251/1251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - loss: 2.9858
Epoch 3/5
[1m1251/1251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - loss: 1.3094
Epoch 4/5
[1m1251/1251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - loss: 0.9169
Epoch 5/5
[1m1251/1251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - loss: 0.7541
Model training completed.


In [72]:
def recommend_movies(user_id, model, movies_df, movie_id_mapping, num_recommendations=20):
    all_movie_ids = movies_df['id'].map(movie_id_mapping).dropna().astype(int).values
    print("All Movie IDs:", all_movie_ids)
    top_movie_ids, top_ratings = model.recommend_top_n(user_id, all_movie_ids, n=num_recommendations)
    print("Top Movie IDs:", top_movie_ids)
    print("Top Ratings:", top_ratings)
    recommended_movies = movies_df[movies_df['id'].map(movie_id_mapping).isin(top_movie_ids)]
    recommended_movies['predicted_rating'] = top_ratings
    return recommended_movies

# Example usage
user_id = 671  # Replace with the actual user ID
recommended_movies = recommend_movies(user_id, model, movies_df, movie_id_mapping)
print("Recommended Movies:")
recommended_movies.head(20)

All Movie IDs: [2285 6584  446 ... 3932 6544 5056]
Top Movie IDs: [4236 4214 2135  431 2499   99  685 4407  702   79 1945  505 2207   53
  241  157  724 5959  783   69]
Top Ratings: [4.14564   4.076818  4.0461617 3.968922  3.9674306 3.9643285 3.891684
 3.8756282 3.8738122 3.861242  3.8494825 3.8408532 3.8393838 3.802633
 3.8000803 3.7832808 3.77717   3.7727046 3.7726388 3.760411 ]
Recommended Movies:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_movies['predicted_rating'] = top_ratings


Unnamed: 0,id,original_title,year,mapped_id,predicted_rating
286,527.0,Once Were Warriors,1994.0,79,4.14564
334,2064.0,While You Were Sleeping,1995.0,241,4.076818
534,858.0,Sleepless in Seattle,1993.0,157,4.046162
550,319.0,True Romance,1993.0,53,3.968922
915,1939.0,Laura,1944.0,4407,3.967431
1052,475.0,Bonnie and Clyde,1967.0,2135,3.964329
1172,766.0,Army of Darkness,1992.0,4236,3.891684
2216,73.0,American History X,1998.0,2499,3.875628
2647,745.0,The Sixth Sense,1999.0,431,3.873812
4020,318.0,The Million Dollar Hotel,2000.0,99,3.861242


In [80]:
def show_ratings_for_user(user_id, ratings_df, user_id_mapping):
    # Map the user_id to the internal user index
    internal_user_id = user_id_mapping.get(user_id)
    if internal_user_id is None:
        return f"User ID {user_id} not found."

    # Filter the ratings dataframe for the given user ID
    user_ratings = ratings_df[ratings_df['userId'] == internal_user_id]
    return user_ratings

# Example usage
user_id = 672  # Replace with the actual user ID
user_ratings = show_ratings_for_user(user_id, ratings_df, user_id_mapping)
print("User Ratings:")
print(user_ratings)


User Ratings:
        userId  movieId  rating  timestamp
100004     671   2285.0     5.0        NaN
100005     671   6584.0     5.0        NaN
100006     671    446.0     5.0        NaN
100007     671    451.0     5.0        NaN
100008     671   6037.0     5.0        NaN
100009     671    651.0     5.0        NaN
100010     671   4703.0     5.0        NaN
100011     671   2132.0     5.0        NaN
100012     671   2154.0     5.0        NaN
100013     671   2091.0     5.0        NaN


# 4. As of Copilot