In [1]:
from surprise import Dataset, Reader
import pandas as pd
import numpy as np
from collections import defaultdict

from utils.SurpriseBasedNMF import UserDefinedNMF

In [2]:
def load_data(file_path):
    df = pd.read_csv(file_path, usecols=['userId', 'movieId', 'rating'])
    return df

In [3]:
def prepare_data_for_surprise(dataframe):
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(dataframe[['userId', 'movieId', 'rating']], reader)
    return data

In [4]:
traindf = load_data('./datasets/training_data.csv')

data = prepare_data_for_surprise(traindf)

In [5]:
user_ids = traindf['userId'].unique()
user_id_map = {old: new for new, old in enumerate(user_ids)}
item_ids = traindf['movieId'].unique()
item_id_map = {old: new for new, old in enumerate(item_ids)}

traindf['userId'] = traindf['userId'].map(user_id_map)
traindf['movieId'] = traindf['movieId'].map(item_id_map)

In [6]:
model = UserDefinedNMF()

model.fit(data.build_full_trainset())

In [7]:
u, i = 1, 2
prediction = model.predict(u, i)
print(f"Predicted rating for user {u} and item {i} is {prediction}")

Predicted rating for user 1 and item 2 is 4.2318336277485


In [8]:
testdf = load_data('./datasets/testing_data.csv')
testdf['userId'] = testdf['userId'].map(user_id_map, na_action='ignore')
testdf['movieId'] = testdf['movieId'].map(item_id_map, na_action='ignore')
trainset = data.build_full_trainset()
testset = [tuple(row) for row in testdf.itertuples(index=False)]

In [9]:
predictions = model.test(testset)

In [10]:
predictions_df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est'])
predictions_df.describe()

Unnamed: 0,uid,iid,rui,est
count,18043.0,18043.0,18043.0,18043.0
mean,321.772377,1099.526354,3.563847,3.74422
std,182.610173,878.233695,1.030476,0.5
min,0.0,0.0,0.5,1.451125
25%,169.0,360.0,3.0,3.426255
50%,317.0,868.0,4.0,3.78352
75%,476.0,1696.0,4.0,4.095901
max,609.0,3298.0,5.0,5.0


In [11]:
def calculate_accuracy(predictions, threshold=0.5):
    correct_predictions = sum(abs(estimated_rating - true_rating) <= threshold for _, _, true_rating, estimated_rating in predictions)
    accuracy = correct_predictions / len(predictions)
    return accuracy

In [12]:
threshold = 0.5
test_accuracy = calculate_accuracy(predictions, threshold)

print(f"Accuracy (within ±{threshold} of actual rating): {test_accuracy:.2%}")

Accuracy (within ±0.5 of actual rating): 48.31%


In [13]:
def create_predictions_dataframe(predictions):
    predictions_data = [{'uid': pred[0], 'iid': pred[1], 'og_rating': pred[2], 'nmf_rating': pred[3]} for pred in predictions]
    return pd.DataFrame(predictions_data)

test_pred_df = create_predictions_dataframe(predictions)

In [14]:
df_movies = pd.read_csv('./datasets/Movies.csv')

In [15]:
def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    
    for row in predictions.itertuples(index=False):
        uid, iid, nmf_rating = row.uid, row.iid, row.nmf_rating
        top_n[uid].append((iid, nmf_rating))
    
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)  
        top_n[uid] = user_ratings[:n]
    
    return top_n

In [16]:
def show_user_recommended_movies_in_df(predictions, user_id, n=10):
    top_n = get_top_n(predictions, n)
    
    if user_id in top_n:
        recommended_movies_ids = [movie_id for movie_id, _ in top_n[user_id]]
    else:
        return pd.DataFrame()
    
    recommended_movies_df = df_movies.loc[df_movies['movieId'].isin(recommended_movies_ids)]
    
    return recommended_movies_df

In [17]:
for i in range(1,2):
    print("\t\t\t\t*****{}*****\n".format(i))
    print(show_user_recommended_movies_in_df(test_pred_df, i, 5))
    print("\n")

				*****1*****

     movieId                                              title  \
143      171                                     Jeffrey (1995)   
168      199  Umbrellas of Cherbourg, The (Parapluies de Che...   
580      711                                     Flipper (1996)   

                              genres  
143              ['Comedy', 'Drama']  
168  ['Drama', 'Musical', 'Romance']  
580        ['Adventure', 'Children']  




In [18]:
show_user_recommended_movies_in_df(test_pred_df, 403, 5)

Unnamed: 0,movieId,title,genres
24,25,Leaving Las Vegas (1995),"['Drama', 'Romance']"
299,341,Double Happiness (1994),['Drama']
331,373,Red Rock West (1992),['Thriller']
389,448,Fearless (1993),['Drama']


In [19]:
def calculate_rmse(predictions):
    mse = np.mean([(true_r - est_r) ** 2 for _, _, true_r, est_r in predictions])
    return np.sqrt(mse)

In [20]:
predictions = model.test(testset)

rmse = calculate_rmse(predictions)
print(f"Model RMSE: {rmse}")


Model RMSE: 0.8803950466395636
