In [30]:
import json
import pandas as pd
import numpy as np
import scipy.sparse as sp
import pickle
import seaborn as sns

In [31]:
df = pd.read_csv("../data/user_rate_movie_35k.csv")
df.head()

Unnamed: 0,user_id,movie_name,year,movie_id,rating
0,13680,torso,1973,torso+1973,4
1,206902,broadway damage,1997,broadway+damage+1997,4
2,19046,terminator 2 judgment day,1991,terminator+2+judgment+day+1991,3
3,2342,light is calling,2004,light+is+calling+2004,4
4,129260,the devil and holy water,1983,the+devil+and+holy+water+1983,4


In [32]:
df.nunique()

user_id       3612
movie_name    4617
year           105
movie_id      4717
rating           5
dtype: int64

In [33]:
idx_to_mid = dict()
with open('../data/idx_to_mid_1500k.pkl', 'rb') as f:
    idx_to_mid = pickle.load(f)

mid_to_idx = dict((v,k) for k,v in idx_to_mid.items())

In [38]:
df_clean = df.copy()
df_clean['movie_num'] = df.apply (lambda row: mid_to_idx[row['movie_id']], axis=1)
df_clean = df_clean[['user_id', 'movie_num', 'rating']]
df_clean = df_clean.astype({'movie_num': 'int32'})

In [39]:
df_clean

Unnamed: 0,user_id,movie_num,rating
0,13680,13139,4
1,206902,12199,4
2,19046,292,3
3,2342,11067,4
4,129260,14031,4
...,...,...,...
35966,97978,1460,5
35967,97978,1598,4
35968,97978,1788,4
35969,197728,1838,2


In [40]:
df_clean.to_csv('../data/user_rate_movie_35k_clean.csv', index=False, header=False)

In [None]:
df_clean

In [58]:
uid_clean = df_clean['user_id'].unique()

In [59]:
with open('../data/uid_35k.npy', 'wb') as f:
    np.save(f, np.array(uid_clean))

In [None]:
df_clean['user'].to_csv

## Surprise

In [24]:
from surprise import BaselineOnly, Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import accuracy, Dataset
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms.matrix_factorization import SVD, SVDpp, NMF
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
import surprise.dump as sdump

In [41]:
reader = Reader(line_format="user item rating", sep=",")
data = Dataset.load_from_file("../data/user_rate_movie_35k_clean.csv", reader=reader)
trainset, testset = train_test_split(data, test_size=0.25)

In [26]:
algo = NMF()

In [50]:
%%timeit

algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)
accuracy.fcp(predictions)

RMSE: 1.1210
FCP:  0.5650
RMSE: 1.1153
FCP:  0.5654
RMSE: 1.1144
FCP:  0.5643
RMSE: 1.1165
FCP:  0.5521
RMSE: 1.1241
FCP:  0.5494
RMSE: 1.1219
FCP:  0.5471
RMSE: 1.1191
FCP:  0.5583
RMSE: 1.1216
FCP:  0.5613
1.43 s ± 26 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [60]:
allset = data.build_full_trainset()
algo = NMF()
algo.fit(allset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f8bd44f58e0>

### Save model

In [61]:
sdump.dump("nmf.pkl", algo=algo)

In [62]:
_, alg = sdump.load("nmf.pkl")

In [63]:
predictions = alg.test(testset)
accuracy.rmse(predictions)
accuracy.fcp(predictions)

RMSE: 0.3403
FCP:  0.7637


0.7637384711628414