In [10]:
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from surprise import Dataset, Reader

In [11]:
def load_data(file_path):
    df = pd.read_csv(file_path, usecols=['userId', 'movieId', 'rating'])
    return df

def prepare_data_for_surprise(dataframe):
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(dataframe[['userId', 'movieId', 'rating']], reader)
    return data

traindf = load_data('./datasets/training_data.csv')

data = prepare_data_for_surprise(traindf)

In [12]:
ratings_matrix_preprocessed = traindf.groupby(['userId', 'movieId'])['rating'].mean().reset_index()

ratings_matrix = ratings_matrix_preprocessed.pivot(index='userId', columns='movieId', values='rating').fillna(0)
R = ratings_matrix.values

In [13]:
n_factors = 15
n_epochs = 30

In [14]:
nmf_sklearn = NMF(n_components=n_factors, max_iter=n_epochs, init='random', random_state=0)
W = nmf_sklearn.fit_transform(R)
H = nmf_sklearn.components_

predicted_ratings = np.dot(W, H)
actual_ratings = R



In [15]:
mse = mean_squared_error(actual_ratings, predicted_ratings)
rmse_sklearn = np.sqrt(mse)

print(f"Scikit-learn NMF RMSE: {rmse_sklearn}")

Scikit-learn NMF RMSE: 0.5573695035192058


In [16]:
def calculate_threshold_accuracy(actual_ratings, predicted_ratings, threshold=0.5):
    differences = np.abs(actual_ratings - predicted_ratings)
    correct_predictions = np.sum(differences <= threshold)
    total_predictions = np.product(differences.shape)
    accuracy = correct_predictions / total_predictions
    return accuracy

threshold = 0.5

threshold_accuracy = calculate_threshold_accuracy(actual_ratings, predicted_ratings, threshold=0.5)
print(f"Accuracy (within ±{threshold} of actual rating): {threshold_accuracy:.2%}")


Accuracy (within ±0.5 of actual rating): 92.03%


  threshold_accuracy = calculate_threshold_accuracy(actual_ratings, predicted_ratings, threshold=0.5)


In [17]:
df_movies = pd.read_csv('./datasets/Movies.csv')

In [18]:
def show_user_recommendations_sklearn(user_id, predicted_ratings, ratings_matrix, df_movies, n_recommendations=5):
    user_predicted_ratings = predicted_ratings[user_id]

    top_movie_indices = np.argsort(-user_predicted_ratings)[:n_recommendations]

    top_movie_ids = ratings_matrix.columns[top_movie_indices].tolist()

    recommended_movies_df = df_movies[df_movies['movieId'].isin(top_movie_ids)]

    return recommended_movies_df

user_id = 3
n_recommendations = 5
recommended_movies_df = show_user_recommendations_sklearn(user_id, predicted_ratings, ratings_matrix, df_movies, n_recommendations)

recommended_movies_df

Unnamed: 0,movieId,title,genres
659,858,"Godfather, The (1972)","['Crime', 'Drama']"
686,904,Rear Window (1954),"['Mystery', 'Thriller']"
863,1136,Monty Python and the Holy Grail (1975),"['Adventure', 'Comedy', 'Fantasy']"
946,1247,"Graduate, The (1967)","['Comedy', 'Drama', 'Romance']"
2145,2858,American Beauty (1999),"['Drama', 'Romance']"
