In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Read the four CSV tables that sit next to the notebook, one DataFrame each.
df_links, df_movies, df_ratings, df_tags = (
    pd.read_csv('./links.csv'),
    pd.read_csv('./movies.csv'),
    pd.read_csv('./ratings.csv'),
    pd.read_csv('./tags.csv'),
)

In [3]:
# Reshape the long ratings table to wide form: one row per userId, one column
# per movieId, cell = rating (pivot_table averages duplicate user/movie pairs).
ratings_wide = df_ratings.pivot_table(values='rating', index='userId', columns='movieId')

# User/movie pairs without a rating are NaN after the pivot; store them as 0.
user_item_matrix = ratings_wide.fillna(0)

In [4]:
# Factorise the user-item matrix into two non-negative matrices with 50
# components; the fixed random_state makes the random initialisation repeatable.
nmf_model = NMF(
    n_components=50,
    init='random',
    random_state=0,
)
W = nmf_model.fit_transform(user_item_matrix)  # transformed data: one row per user
H = nmf_model.components_                      # learned components: one column per movie




In [5]:
# Reconstruct the full rating matrix as the product of the two NMF factors.
predicted_ratings = W @ H


In [6]:
# Wrap the reconstructed matrix in a DataFrame that reuses the row and column
# labels of user_item_matrix, so predictions can be looked up by id.
predicted_ratings_df = pd.DataFrame(
    data=predicted_ratings,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)


In [7]:
def recommend_movies_nmf(user_id, top_n=10):
    """Return the ``top_n`` not-yet-rated movies with the highest predicted rating.

    Movies the user has already rated (a value > 0 in ``user_item_matrix``)
    are removed from the candidates before ranking.

    Args:
        user_id: Row label of the user in ``user_item_matrix`` and
            ``predicted_ratings_df``.
        top_n: Maximum number of movies to return.

    Returns:
        The matching rows of ``df_movies`` (original index labels and columns
        kept), ordered from highest to lowest predicted rating.

    Raises:
        KeyError: If ``user_id`` is not a row label of the rating matrices.
    """
    user_ratings = user_item_matrix.loc[user_id]
    already_rated = user_ratings[user_ratings > 0].index

    candidate_scores = predicted_ratings_df.loc[user_id].drop(already_rated)
    top_n_movie_ids = candidate_scores.sort_values(ascending=False).head(top_n).index

    recommended_movies = df_movies[df_movies['movieId'].isin(top_n_movie_ids)]

    # The boolean filter above yields rows in df_movies order, which throws the
    # ranking away. Reorder the rows so they follow top_n_movie_ids (best first).
    rank_of = {movie_id: rank for rank, movie_id in enumerate(top_n_movie_ids)}
    ranks = recommended_movies['movieId'].map(rank_of).to_numpy()
    return recommended_movies.iloc[ranks.argsort(kind='stable')]


In [14]:
# Show the ten recommendations for the user with id 2.
recommend_movies_nmf(user_id=2, top_n=10)

Unnamed: 0,movieId,title,genres
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War
461,527,Schindler's List (1993),Drama|War
510,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
1939,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
2226,2959,Fight Club (1999),Action|Crime|Drama|Thriller
3638,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
4137,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
4800,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
4909,7361,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi
7039,68954,Up (2009),Adventure|Animation|Children|Drama


In [8]:
# Flatten both matrices to 1-D (row-major) so that entry i of `actual` and
# entry i of `predicted` refer to the same user/movie cell.
actual = user_item_matrix.to_numpy().flatten()
predicted = predicted_ratings_df.to_numpy().flatten()

In [9]:
# Keep only the positions whose actual value is positive, and take the
# predictions at exactly those positions.
mask = actual > 0
actual_filtered, predicted_filtered = actual[mask], predicted[mask]

In [10]:
# Root-mean-squared error between the positive actual ratings and the
# predictions at the same positions (the same matrix the model was fitted on).
mse = mean_squared_error(actual_filtered, predicted_filtered)
rmse = sqrt(mse)
print(f'RMSE: {rmse}')

RMSE: 2.1635191183667617


In [11]:
# Display the actual ratings that went into the RMSE (NumPy abbreviates long arrays).
actual_filtered

array([4., 4., 4., ..., 5., 5., 3.])

In [12]:
# Display the predictions at the same positions, for comparison with actual_filtered.
predicted_filtered

array([1.94825985, 0.52688935, 2.6951439 , ..., 4.95291658, 4.96417   ,
       2.97993529])