In [1]:
from surprise import NMF, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split

import time
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    """Turn train/test rating DataFrames into Surprise train and test sets.

    Both frames must contain ``userId``, ``movieId`` and ``rating`` columns;
    only those three columns are handed to Surprise, using a reader declared
    with a 0-5 rating scale.

    Args:
        training_dataframe: ratings used to build the trainset.
        testing_dataframe: ratings used to build the testset.

    Returns:
        ``(trainset, testset)`` -- the results of ``construct_trainset`` and
        ``construct_testset`` applied to each dataset's raw ratings.
    """
    rating_columns = ['userId', 'movieId', 'rating']
    reader = Reader(rating_scale=(0, 5))

    train_data = Dataset.load_from_df(training_dataframe[rating_columns], reader)
    test_data = Dataset.load_from_df(testing_dataframe[rating_columns], reader)

    return (
        train_data.construct_trainset(train_data.raw_ratings),
        test_data.construct_testset(test_data.raw_ratings),
    )

In [3]:
# Load the separate training and testing rating files and convert them into
# the trainset/testset objects used by the Surprise algorithm below.
# Both CSVs need userId, movieId and rating columns (selected by the converter).
traindf = pd.read_csv('./datasets/training_data.csv')
testdf= pd.read_csv('./datasets/testing_data.csv')
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)

In [4]:
def recommendation(algo, trainset, testset):
    """Train ``algo`` on ``trainset`` and evaluate it on ``testset``.

    Args:
        algo: object exposing ``fit(trainset)`` and ``test(testset)``.
        trainset: data passed to ``algo.fit``.
        testset: data passed to ``algo.test``.

    Returns:
        ``(rmse, mae, predictions)`` where ``predictions`` is whatever
        ``algo.test`` returned and the two scores come from
        ``accuracy.rmse`` / ``accuracy.mae`` (which also print them).
    """
    algo.fit(trainset)
    predictions = algo.test(testset)

    # RMSE is computed (and printed) before MAE.
    return accuracy.rmse(predictions), accuracy.mae(predictions), predictions

In [5]:
# Fixed seed so the model's random initialisation -- and therefore the
# reported RMSE/MAE and the recommendations -- is repeatable across runs.
SEED = 42
algo = NMF(random_state=SEED)

# No fit/test in this cell: recommendation() fits the model and scores the
# test set itself, so training here as well would only repeat that work.

In [6]:
# Fit NMF on the training set and score it on the test set; keeps the RMSE,
# the MAE and the list of per-rating predictions for the cells below.
test_nmf_rmse, test_nmf_mae, test_nmf_pred = recommendation(algo, trainset, testset)

RMSE: 0.9272
MAE:  0.7124


In [7]:
def calculate_accuracy(predictions, threshold=0.5):
    """Return the fraction of predictions whose absolute error is <= ``threshold``.

    Args:
        predictions: iterable of objects with ``est`` (predicted rating) and
            ``r_ui`` (actual rating) attributes. Any iterable works,
            including generators; it is consumed once.
        threshold: largest absolute difference between ``est`` and ``r_ui``
            that still counts as a correct prediction (inclusive).

    Returns:
        A float in ``[0, 1]``. An empty ``predictions`` yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    # Count in a single pass so the input does not need to support len().
    # The locals are deliberately not called ``accuracy`` so they cannot be
    # confused with the ``surprise.accuracy`` module imported by the notebook.
    total = 0
    within_threshold = 0
    for pred in predictions:
        total += 1
        if abs(pred.est - pred.r_ui) <= threshold:
            within_threshold += 1

    if total == 0:
        return 0.0
    return within_threshold / total

In [8]:
# Share of test predictions whose absolute error is at most `threshold`.
# NOTE(review): 0.9224 is a hard-coded magic number. It is close to, but not
# equal to, the RMSE printed by the evaluation cell (0.9272) -- confirm what
# this tolerance is meant to represent.
threshold = 0.9224
test_accuracy = calculate_accuracy(test_nmf_pred, threshold)

print(f"Accuracy (within ±{threshold} of actual rating): {test_accuracy:.2%}")

Accuracy (within ±0.9224 of actual rating): 71.07%


In [9]:
# Empty table with the four columns used for the per-prediction results.
test_pred_df = pd.DataFrame(columns=['uid', 'iid', 'og_rating', 'nmf_rating'])

In [10]:
# Build the whole prediction table with a single DataFrame call. Concatenating
# one single-row frame per prediction is quadratic in the number of rows, and
# because it extended the existing table, re-running the cell duplicated rows;
# assigning a freshly built frame makes the cell idempotent.
num_test = len(test_nmf_pred)

# Rows go in reverse test order (last prediction first) so the table keeps
# the row order it had when each new row was prepended.
test_pred_df = pd.DataFrame(
    [(pred.uid, pred.iid, pred.r_ui, pred.est) for pred in reversed(test_nmf_pred)],
    columns=['uid', 'iid', 'og_rating', 'nmf_rating'],
)

In [11]:
#test_pred_df.to_csv('test_prediction_HP.csv')

In [12]:
# Show the per-prediction table (the notebook elides the middle rows).
test_pred_df

Unnamed: 0,uid,iid,og_rating,nmf_rating
0,610,166534,4.0,3.774134
1,610,160341,2.5,3.500961
2,610,160080,3.0,2.898985
3,610,152081,4.0,4.119883
4,610,152077,4.0,3.472899
...,...,...,...,...
20163,1,296,3.0,4.899373
20164,1,231,5.0,3.617588
20165,1,110,4.0,4.957850
20166,1,3,4.0,4.442357


In [13]:
# Movie lookup table; its movieId column is used below to turn recommended
# ids back into movie rows (title, genres).
df_movies = pd.read_csv('./datasets/Movies.csv')

In [14]:
def get_top_n(predictions, n=10):
    """Group predictions per user and keep each user's ``n`` best estimates.

    Args:
        predictions: DataFrame with ``uid``, ``iid``, ``og_rating`` and
            ``nmf_rating`` columns, one row per (user, item) prediction.
        n: how many items to keep per user.

    Returns:
        ``(top_n, org_ratings)``, both ``defaultdict(list)`` keyed by ``uid``:
        ``top_n[uid]`` holds up to ``n`` ``(iid, nmf_rating)`` pairs sorted by
        estimated rating, highest first; ``org_ratings[uid]`` holds every
        ``(iid, og_rating)`` pair for that user in input row order.
    """
    estimated_by_user = defaultdict(list)
    actual_by_user = defaultdict(list)

    for record in predictions.itertuples(index=False):
        estimated_by_user[record.uid].append((record.iid, record.nmf_rating))
        actual_by_user[record.uid].append((record.iid, record.og_rating))

    # Rank each user's items by estimated rating and truncate to the best n.
    for user in list(estimated_by_user):
        ranked = sorted(estimated_by_user[user], key=lambda pair: pair[1], reverse=True)
        estimated_by_user[user] = ranked[:n]

    return estimated_by_user, actual_by_user


In [15]:
def show_user_recommended_movies_in_df(predictions, user_id, n=10, movies=None):
    """Return the movie rows for a user's top-``n`` predicted ratings, best first.

    Args:
        predictions: DataFrame accepted by ``get_top_n`` (columns ``uid``,
            ``iid``, ``og_rating``, ``nmf_rating``).
        user_id: user whose recommendations are requested.
        n: maximum number of movies to return.
        movies: movie table with a ``movieId`` column. Defaults to the
            notebook-level ``df_movies`` frame when omitted.

    Returns:
        The rows of ``movies`` whose ``movieId`` is among the user's top-``n``
        items, ordered from highest to lowest predicted rating and keeping
        their original index labels. If ``user_id`` has no predictions, an
        empty frame with the same columns as ``movies`` is returned.
    """
    if movies is None:
        movies = df_movies

    top_n, _ = get_top_n(predictions, n)

    if user_id not in top_n:
        # Keep the movie columns so callers see one schema on both paths.
        return movies.iloc[0:0]

    recommended_movies_ids = [movie_id for movie_id, _ in top_n[user_id]]
    # Position of each movie in the ranking (0 = best predicted rating).
    rank = {movie_id: position for position, movie_id in enumerate(recommended_movies_ids)}

    recommended_movies_df = movies.loc[movies['movieId'].isin(recommended_movies_ids)]

    # isin() yields rows in the movie table's own order; re-sort them by rank
    # so the result actually reads as a ranked recommendation list.
    return recommended_movies_df.sort_values(
        by='movieId', key=lambda ids: ids.map(rank), kind='stable'
    )

In [16]:
# Example: the 5 highest-predicted movies for user 550, drawn from that
# user's rows in the test prediction table.
show_user_recommended_movies_in_df(test_pred_df, 550, 5)

Unnamed: 0,movieId,title,genres
4159,5989,Catch Me If You Can (2002),Crime|Drama
6710,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
7646,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War
7693,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
8569,116797,The Imitation Game (2014),Drama|Thriller|War
