In [1]:
import numpy as np
import matplotlib.pylab as plt
import scipy.stats as stat
import pandas as pd

%matplotlib inline

In [2]:
import common
import em

In [3]:
X = np.loadtxt('test_incomplete.txt')
X_true = np.loadtxt('test_complete.txt')

K = 4
n, d = X.shape
seed = 0

mixture, post = common.init(X, K, seed)
print(mixture.mu)
print(mixture.var)
print(mixture.p)

[[2. 4. 5. 5. 0.]
 [3. 5. 0. 4. 3.]
 [2. 5. 4. 4. 2.]
 [0. 5. 3. 3. 3.]]
[5.93 4.87 3.99 4.51]
[0.25 0.25 0.25 0.25]


In [4]:
mixture, post = common.init(X, K, seed)
mixture, post, cost = em.run(X, mixture, post)

In [5]:
X_fill = em.fill_matrix(X, mixture)
common.rmse(X_fill, X_true)

0.3152301205749674

In [6]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

In [7]:
# Load the test data set
# unstack so that each row has a single review
df = pd.read_csv('netflix_incomplete.txt', sep=' ', header=None)
df = df.unstack().reset_index()
df.columns=['Movie', 'User', 'Rating']
df = df[['User', 'Movie', 'Rating']]

In [8]:
# split the data set so that train has all nonzero ratings
# and test has the rest
df_train = df[df['Rating'] != 0]
df_test = df[df['Rating'] == 0].copy()

In [9]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
train = Dataset.load_from_df(df_train, reader)

In [38]:
trainset = train.build_full_trainset()

# Build an algorithm, and train it.
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x25348dd8438>

In [18]:
testset = df_test.values
pred = algo.test(testset)

In [19]:
df_test_pred = pd.DataFrame(pred)
df_test_pred = df_test_pred.rename(index=str, columns={'uid':'User', 'iid':'Movie','est':'Rating'})
df_test_pred = df_test_pred[['User', 'Movie', 'Rating']]

In [20]:
X_SVD = pd.concat([df_train, df_test_pred]).set_index(['User', 'Movie']).unstack().values
X_true = np.loadtxt('netflix_complete.txt')
common.rmse(X_SVD, X_true)

0.5031110934478024

In [36]:
pred = algo.test(testset)
df_test_pred = pd.DataFrame(pred)
df_test_pred = df_test_pred.rename(index=str, columns={'uid':'User', 'iid':'Movie','est':'Rating'})
df_test_pred = df_test_pred[['User', 'Movie', 'Rating']]
X_SVD = pd.concat([df_train, df_test_pred]).set_index(['User', 'Movie']).unstack().values
common.rmse(X_SVD, X_true)

0.5014348232450729

In [44]:
from surprise.model_selection import GridSearchCV

param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=4, refit=True)

gs.fit(train)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.8887194472854804
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}


In [45]:
pred = gs.test(testset)
df_test_pred = pd.DataFrame(pred)
df_test_pred = df_test_pred.rename(index=str, columns={'uid':'User', 'iid':'Movie','est':'Rating'})
df_test_pred = df_test_pred[['User', 'Movie', 'Rating']]
X_SVD = pd.concat([df_train, df_test_pred]).set_index(['User', 'Movie']).unstack().values
common.rmse(X_SVD, X_true)

0.4353934182101176