In [27]:
from surprise import NMF, Dataset, Reader, SVD, SVDpp
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
from collections import defaultdict

from UserDefinedNMF import UserDefinedNMF

In [28]:
def load_data(file_path):
    df = pd.read_csv(file_path, usecols=['userId', 'movieId', 'rating'])
    return df

In [29]:
def prepare_data_for_surprise(dataframe):
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(dataframe[['userId', 'movieId', 'rating']], reader)
    return data

In [30]:
traindf = load_data('./datasets/training_data.csv')

data = prepare_data_for_surprise(traindf)

In [31]:
user_ids = traindf['userId'].unique()
user_id_map = {old: new for new, old in enumerate(user_ids)}
item_ids = traindf['movieId'].unique()
item_id_map = {old: new for new, old in enumerate(item_ids)}

traindf['userId'] = traindf['userId'].map(user_id_map)
traindf['movieId'] = traindf['movieId'].map(item_id_map)

In [32]:
model = UserDefinedNMF()
model.fit(data.build_full_trainset())
u, i = 1, 2
prediction = model.predict(u, i)
print(f"Predicted rating for user {u} and item {i} is {prediction}")

Predicted rating for user 1 and item 2 is 3.7821834775461167


In [33]:
#param_grid = {
#    'n_factors': [15],
#    'n_epochs': [30],
#    #'init_mean': [0, 0.05],  #SVD feature 
#    #'init_std_dev': [0.05, 0.1, 0.15],  #SVD feature
#    #'lr_all': [0.005, 0.01],  #SVD feature 
#    #'reg_all': [0.02, 0.05],  #SVD feature 
#    'reg_pu' : [0.12],  #NMF feature
#    'reg_qi' : [0.12],  #NMF feature
#    'reg_bu' : [0.04],  #NMF feature
#    'reg_bi' : [0.02],  #NMF feature
#    'lr_bu' : [0.01],  #NMF feature
#    'lr_bi' : [0.01]  #NMF feature
#}
#
##gs = GridSearchCV(SVD, param_grid, measures = ['rmse', 'mae'], cv = 5)
#gs = GridSearchCV(NMF, param_grid, measures = ['rmse', 'mae'], cv = 5)
#
#gs.fit(data)
#
#print(f"Best RMSE score achieved: {gs.best_score['rmse']}")
#
#print(f"Best parameters: {gs.best_params['rmse']}")

In [34]:
testdf = load_data('./datasets/testing_data.csv')
testdf['userId'] = testdf['userId'].map(user_id_map, na_action='ignore')
testdf['movieId'] = testdf['movieId'].map(item_id_map, na_action='ignore')
trainset = data.build_full_trainset()
testset = [tuple(row) for row in testdf.itertuples(index=False)]

In [35]:
predictions = model.test(testset)

In [36]:
#algo = SVD(**gs.best_params['rmse'])
#algo = NMF(**gs.best_params['rmse'])
#
#algo.fit(trainset)
#
#predictions = algo.test(testset)

In [37]:
#def calculate_accuracy(predictions, threshold=0.5):
#    correct_predictions = sum(abs(pred.est - pred.r_ui) <= threshold for pred in predictions)
#    accuracy = correct_predictions / len(predictions)
#    return accuracy

In [38]:
def calculate_accuracy(predictions, threshold=0.5):
    correct_predictions = sum(abs(estimated_rating - true_rating) <= threshold for _, _, true_rating, estimated_rating in predictions)
    accuracy = correct_predictions / len(predictions)
    return accuracy

In [39]:
threshold = 0.5
test_accuracy = calculate_accuracy(predictions, threshold)

print(f"Accuracy (within ±{threshold} of actual rating): {test_accuracy:.2%}")

Accuracy (within ±0.5 of actual rating): 46.06%


In [40]:
def create_predictions_dataframe(predictions):
    predictions_data = [{'uid': pred[0], 'iid': pred[1], 'og_rating': pred[2], 'nmf_rating': pred[3]} for pred in predictions]
    return pd.DataFrame(predictions_data)

test_pred_df = create_predictions_dataframe(predictions)

In [41]:
df_movies = pd.read_csv('./datasets/Movies.csv')

In [42]:
def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    org_ratings = defaultdict(list)
    
    for row in predictions.itertuples(index=False):
        uid, iid, og_rating, nmf_rating = row.uid, row.iid, row.og_rating, row.nmf_rating
        top_n[uid].append((iid, nmf_rating))
        org_ratings[uid].append((iid, og_rating))
    
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)  
        top_n[uid] = user_ratings[:n]
    
    return top_n, org_ratings

In [43]:
def show_user_recommended_movies_in_df(predictions, user_id, n=10):
    top_n, org_ratings = get_top_n(predictions, n)
    
    if user_id in top_n:
        recommended_movies_ids = [movie_id for movie_id, _ in top_n[user_id]]
    else:
        return pd.DataFrame()
    
    recommended_movies_df = df_movies.loc[df_movies['movieId'].isin(recommended_movies_ids)]
    
    return recommended_movies_df

In [51]:
for i in range(1,500):
    print("**{}**".format(i))
    print(show_user_recommended_movies_in_df(test_pred_df, i, 5))

**1**
      movieId                                       title genres
519       606      Candyman: Farewell to the Flesh (1995)     []
630       802                           Phenomenon (1996)     []
1342     1825                     The Players Club (1998)     []
1483     2008       This World, Then the Fireworks (1997)     []
1759     2357  Central Station (Central do Brasil) (1998)     []
**2**
      movieId                                 title genres
22         23                      Assassins (1995)     []
147       175                           Kids (1995)     []
222       258  Kid in King Arthur's Court, A (1995)     []
1068     1388                         Jaws 2 (1978)     []
1205     1603                          Mimic (1997)     []
**3**
      movieId                                 title genres
45         49          When Night Is Falling (1995)     []
1198     1596                   Career Girls (1997)     []
1999     2660  Thing from Another World, The (1951)     []
**

In [50]:
show_user_recommended_movies_in_df(test_pred_df, 1, 5)

Unnamed: 0,movieId,title,genres
519,606,Candyman: Farewell to the Flesh (1995),[]
630,802,Phenomenon (1996),[]
1342,1825,The Players Club (1998),[]
1483,2008,"This World, Then the Fireworks (1997)",[]
1759,2357,Central Station (Central do Brasil) (1998),[]
