In [1]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
import nimfa
import pickle

In [2]:
# Location of the review dump on the author's machine (absolute, machine-specific path).
data_file = 'D:/Datasets/amazon_reviews/Video_Games_5.json'
# lines=True tells pandas to read the file as one JSON object per line.
df = pd.read_json(path_or_buf=data_file, lines=True)

In [3]:
# Split the reviews into a training frame and a test frame: 25% of the rows go
# to df_test, and random_state=42 fixes the shuffle so the split is repeatable.
df_train, df_test = train_test_split(df, test_size=0.25, random_state=42)

In [4]:
# Rebind df_train to an independent copy of itself.
# NOTE(review): presumably done so that later in-place edits do not trigger
# pandas' SettingWithCopyWarning; no such edit is visible in this part of the
# notebook - confirm the copy is still needed.
df_train = df_train.copy()

In [5]:
# Build the reviewer x product rating matrix from the training split.
#
# Rows are reviewers (reviewerID) and columns are products (asin). The unique
# ids are sorted before being turned into ordered categories, so every id gets
# a stable integer code that is used directly as its row / column index.

# csr_matrix adds together the values of repeated (row, col) coordinates, so a
# reviewer rating the same product twice would silently produce a summed
# "rating". Fail loudly instead of factorising corrupted values.
duplicate_pairs = df_train.duplicated(subset=["reviewerID", "asin"])
assert not duplicate_pairs.any(), (
    "%d duplicate (reviewerID, asin) pairs in df_train; csr_matrix would sum "
    "their ratings" % duplicate_pairs.sum()
)

asin = CategoricalDtype(sorted(df_train.asin.unique()), ordered=True)
rev_id = CategoricalDtype(sorted(df_train.reviewerID.unique()), ordered=True)

row_cat = df_train.reviewerID.astype(rev_id).cat
col_cat = df_train.asin.astype(asin).cat

# The integer category codes double as matrix coordinates.
row = row_cat.codes
col = col_cat.codes

# dtype 'd' is the NumPy type code for float64.
sparse_matrix = csr_matrix(
    (df_train["overall"].values, (row, col)),
    shape=(rev_id.categories.size, asin.categories.size),
    dtype='d',
)

In [6]:
# Alias the reviewer x product matrix as `ratings`; no copy is made, both
# names refer to the same sparse matrix object.
ratings = sparse_matrix

In [7]:
# Show the matrix repr (shape, dtype, number of stored entries) as a quick check.
ratings

<24293x10671 sparse matrix of type '<class 'numpy.float64'>'
	with 173835 stored elements in Compressed Sparse Row format>

In [None]:
# Set up a nimfa Psmf factorisation of the rating matrix with rank 50 and at
# most 20 iterations.
# NOTE(review): the variable is named `pmf` but the class used is Psmf, and
# `rel_error` looks like an option documented for nimfa.Pmf rather than Psmf -
# confirm which model is intended and whether Psmf honours this keyword.
# NOTE(review): no seed is passed here; confirm whether results are reproducible.
pmf = nimfa.Psmf(ratings, rank=50, max_iter=20, rel_error=1e-5)
# Calling the model object runs the factorisation; the returned fit object is
# what the following cells query for the basis and mixture matrices.
pmf_fit = pmf()

In [None]:

def _pickle_to(path, matrix_getter):
    """Pickle the object returned by ``matrix_getter()`` into the file at ``path``.

    The file is opened in binary write mode first and the getter is invoked
    only afterwards; the highest available pickle protocol is used.
    """
    with open(path, 'wb') as handle:
        pickle.dump(matrix_getter(), handle, protocol=pickle.HIGHEST_PROTOCOL)


# Persist both fitted factors: first the basis matrix, then the mixture matrix.
_pickle_to(r'D:/Datasets/amazon_reviews/nonnorm_basis_mx.pickle', pmf_fit.basis)
_pickle_to(r'D:/Datasets/amazon_reviews/nonnorm_mixture_mx.pickle', pmf_fit.coef)

In [None]:
# Basis factor of the fit, printed in full.
W = pmf_fit.basis()
print('Basis matrix:\n%s' % (W,))

# Mixture (coefficient) factor of the fit, printed in full.
H = pmf_fit.coef()
print('Mixture matrix:\n%s' % (H,))

# Euclidean distance reported by the fit object.
euclidean_dist = pmf_fit.distance(metric='euclidean')
print('Euclidean distance: %5.3f' % euclidean_dist)

# Summary of the run; only the sparseness pair and iteration count are shown.
sm = pmf_fit.summary()
basis_sparseness = sm['sparseness'][0]
mixture_sparseness = sm['sparseness'][1]

print('Sparseness Basis: %5.3f  Mixture: %5.3f' % (basis_sparseness, mixture_sparseness))
print('Iterations: %d' % sm['n_iter'])
#print('Target estimate:\n%s' % np.dot(W, H))

Basis matrix:
  (0, 5)	0.023156722217591585
  (1, 20)	0.7610743557452117
  (2, 4)	0.2992633945594552
  (3, 17)	0.24382222577928242
  (4, 27)	0.19421075821283817
  (5, 39)	0.21971577307553888
  (6, 22)	0.49939520838331686
  (7, 38)	1.6806536784754793
  (8, 1)	0.5479486827884723
  (9, 6)	1.8097367730884228
  (10, 47)	0.13499468148379706
  (11, 22)	1.378917837826323
  (12, 35)	0.7391256854302848
  (13, 37)	0.13628858828728982
  (14, 16)	0.07506045508560898
  (15, 47)	0.7864263301166464
  (16, 16)	0.41231159233822995
  (17, 31)	0.9637507382257064
  (18, 17)	1.0492960026101936
  (19, 13)	0.5465059326552174
  (20, 4)	0.4284801184498929
  (21, 3)	0.4849341005306441
  (22, 20)	0.7244761857325567
  (23, 47)	1.143193366043764
  (24, 3)	1.0308296879293206
  :	:
  (24269, 24)	2.4188285700149534
  (24270, 46)	0.6226047954224774
  (24271, 39)	0.5002874601258571
  (24272, 43)	0.03831876720786253
  (24273, 3)	0.6906479794319914
  (24274, 10)	0.0999041406963953
  (24275, 8)	0.1202608832937059
  (24276,