In [46]:
from surprise import NMF, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split

import time
import pandas as pd
import numpy as np

In [2]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    reader = Reader(rating_scale=(0, 5))
    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    testset = testset.construct_testset(testset.raw_ratings)
    return trainset, testset

In [3]:
traindf = pd.read_csv('./datasets/training_data.csv')
testdf= pd.read_csv('./datasets/testing_data.csv')
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)

In [7]:
def recommendation(algo, trainset, testset):
    #start_fit = time.time()
    algo.fit(trainset)
    #end_fit = time.time()
    #fit_time = end_fit - start_fit
    
    #start_test = time.time()
    test_predictions = algo.test(testset)
    #end_test = time.time()
    #test_time = end_test - start_test
    
    test_rmse = accuracy.rmse(test_predictions)
    test_mae = accuracy.mae(test_predictions)
    
    return test_rmse, test_mae , test_predictions #, fit_time, test_time

In [8]:
algo = NMF()

algo.fit(trainset)
predictions = algo.test(testset)

accuracy.rmse(predictions)
accuracy.mse(predictions)

RMSE: 0.9147
MSE: 0.8367


0.8366899803617482

In [9]:
test_nmf_rmse, test_nmf_mae, test_nmf_pred = recommendation(algo, trainset, testset)

RMSE: 0.9170
MAE:  0.7015


In [10]:
test_pred_df = pd.DataFrame(columns=['uid', 'iid', 'og_rating', 'nmf_rating'])

In [11]:
test_nmf_df = pd.DataFrame(columns=['uid', 'iid', 'og_rating', 'est_rating'])

In [12]:
num_test = len(test_nmf_pred)
for i in range(num_test):
    nmf = test_nmf_pred[i]
    df = pd.DataFrame([[nmf.uid, nmf.iid, nmf.r_ui, nmf.est]],
                  columns=['uid', 'iid', 'og_rating', 'nmf_rating'])
    df_nmf = pd.DataFrame([[nmf.uid, nmf.iid, nmf.r_ui, nmf.est]],
                          columns=['uid', 'iid', 'og_rating', 'est_rating'])
    test_pred_df = pd.concat([df, test_pred_df], ignore_index=True)
    test_nmf_df = pd.concat([df_nmf, test_nmf_df], ignore_index=True)

In [13]:
#test_pred_df.to_csv('test_prediction_HP.csv')
#test_nmf_df.to_csv('test_predictions_nmf.csv')

In [14]:
test_pred_df

Unnamed: 0,uid,iid,og_rating,nmf_rating
0,610,170875,3.0,2.213481
1,610,164179,5.0,4.112897
2,610,163937,3.5,3.501574
3,610,162350,3.5,3.933006
4,610,158238,5.0,4.078605
...,...,...,...,...
20163,1,349,4.0,4.510272
20164,1,333,5.0,3.67655
20165,1,235,4.0,4.653805
20166,1,47,5.0,4.784134


In [15]:
test_nmf_df

Unnamed: 0,uid,iid,og_rating,est_rating
0,610,170875,3.0,2.213481
1,610,164179,5.0,4.112897
2,610,163937,3.5,3.501574
3,610,162350,3.5,3.933006
4,610,158238,5.0,4.078605
...,...,...,...,...
20163,1,349,4.0,4.510272
20164,1,333,5.0,3.67655
20165,1,235,4.0,4.653805
20166,1,47,5.0,4.784134


In [16]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    org_ratings = defaultdict(list)
    
    for row in predictions.itertuples(index=False):
        uid, iid, og_rating, est_rating = row.uid, row.iid, row.og_rating, row.est_rating
        top_n[uid].append((iid, est_rating))
        org_ratings[uid].append((iid, og_rating))
    
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)  
        top_n[uid] = user_ratings[:n]
    
    return top_n, org_ratings


In [17]:
d = get_top_n(test_nmf_df, 5)

In [18]:
#d

In [19]:
type(d)

tuple

In [20]:
type(d[0])

collections.defaultdict

In [21]:
type(d[0][0])

list

In [22]:
d[0][609]

[(296, 3.7833211665862243),
 (150, 3.4150392922048365),
 (892, 3.2515057489582717),
 (339, 3.1319069187952655),
 (185, 2.846682398284024)]

In [23]:
d[0][5][0]

(475, 4.261943465858062)

In [24]:
df_movies = pd.read_csv('./datasets/Movies.csv')

In [25]:
recommended_movies = [296, 150, 892, 339, 185]

In [26]:
df_movies.loc[df_movies['movieId'].isin(recommended_movies)]

Unnamed: 0,movieId,title,genres
123,150,Apollo 13 (1995),"['Adventure', 'Drama', 'IMAX']"
156,185,"Net, The (1995)","['Action', 'Crime', 'Thriller']"
257,296,Pulp Fiction (1994),"['Comedy', 'Crime', 'Drama', 'Thriller']"
297,339,While You Were Sleeping (1995),"['Comedy', 'Romance']"
676,892,Twelfth Night (1996),"['Comedy', 'Drama', 'Romance']"


In [39]:
uid = []
rat = []

def combine_dicts(d, col1, col2):
    
    for x in d:
        for key, value in x.items():
            col1.append(key)
            col2.append(value)
    return col1, col2

In [42]:
uid, rat = combine_dicts(d, uid, rat)

In [49]:
uid = pd.Series(uid)

In [50]:
type(uid)

pandas.core.series.Series

In [54]:
uid.drop_duplicates

<bound method Series.drop_duplicates of 0       610
1       609
2       608
3       607
4       606
       ... 
2437      5
2438      4
2439      3
2440      2
2441      1
Length: 2442, dtype: int32>

In [59]:
uid.sort_values(ascending=True, kind="mergesort")

610       0
1831      0
609       1
1220      1
1830      1
       ... 
1833    609
0       610
611     610
1221    610
1832    610
Length: 2442, dtype: int32