In [56]:
from surprise import NMF, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
from collections import defaultdict

In [57]:
def load_data(file_path):
    df = pd.read_csv(file_path, usecols=['userId', 'movieId', 'rating'])
    return df

In [58]:
def prepare_data_for_surprise(dataframe):
    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(dataframe[['userId', 'movieId', 'rating']], reader)
    return data

In [59]:
traindf = load_data('./datasets/training_data.csv')

data = prepare_data_for_surprise(traindf)

In [60]:
param_grid = {
    'n_factors': [5, 10, 15],
    'n_epochs': [10, 30, 50],
    'lr_bu': [0.005, 0.01],
    'lr_bi': [0.005, 0.01],
    'reg_qi': [0.02, 0.05],
    'reg_bu': [0.02],
    'reg_bi': [0.02]
}

gs = GridSearchCV(NMF, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

print(f"Best RMSE score achieved: {gs.best_score['rmse']}")

print(f"Best parameters: {gs.best_params['rmse']}")

Best RMSE score achieved: 0.9518364736897685
Best parameters: {'n_factors': 15, 'n_epochs': 30, 'lr_bu': 0.01, 'lr_bi': 0.01, 'reg_pu': 0.05, 'reg_qi': 0.05, 'reg_bu': 0.02, 'reg_bi': 0.02}


In [61]:
testdf = load_data('./datasets/testing_data.csv')
trainset = data.build_full_trainset()
testset = [tuple(row) for row in testdf.itertuples(index=False)]

In [62]:
algo = NMF(**gs.best_params['rmse'])

algo.fit(trainset)

predictions = algo.test(testset)

In [63]:
def calculate_accuracy(predictions, threshold=0.5):
    correct_predictions = sum(abs(pred.est - pred.r_ui) <= threshold for pred in predictions)
    accuracy = correct_predictions / len(predictions)
    return accuracy

In [64]:
threshold = 0.5
test_accuracy = calculate_accuracy(predictions, threshold)

print(f"Accuracy (within ±{threshold} of actual rating): {test_accuracy:.2%}")

Accuracy (within ±0.5 of actual rating): 45.49%


In [65]:
def create_predictions_dataframe(predictions):
    predictions_data = [{'uid': pred.uid, 'iid': pred.iid, 'og_rating': pred.r_ui, 'nmf_rating': pred.est} for pred in predictions]
    return pd.DataFrame(predictions_data)

test_pred_df = create_predictions_dataframe(predictions)

In [66]:
df_movies = pd.read_csv('./datasets/Movies.csv')

In [67]:
def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    org_ratings = defaultdict(list)
    
    for row in predictions.itertuples(index=False):
        uid, iid, og_rating, nmf_rating = row.uid, row.iid, row.og_rating, row.nmf_rating
        top_n[uid].append((iid, nmf_rating))
        org_ratings[uid].append((iid, og_rating))
    
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)  
        top_n[uid] = user_ratings[:n]
    
    return top_n, org_ratings

In [68]:
def show_user_recommended_movies_in_df(predictions, user_id, n=10):
    top_n, org_ratings = get_top_n(predictions, n)
    
    if user_id in top_n:
        recommended_movies_ids = [movie_id for movie_id, _ in top_n[user_id]]
    else:
        return pd.DataFrame()
    
    recommended_movies_df = df_movies.loc[df_movies['movieId'].isin(recommended_movies_ids)]
    
    return recommended_movies_df

In [70]:
show_user_recommended_movies_in_df(test_pred_df, 550, 5)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4159,5989,Catch Me If You Can (2002),Crime|Drama
6710,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
7693,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
8569,116797,The Imitation Game (2014),Drama|Thriller|War
