# Data loading & preprocessing


In [1]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

In [2]:
data_file = 'D:/Datasets/amazon_reviews/Video_Games_5.json'

In [3]:
df = pd.read_json(data_file, lines = True)

In [4]:
df.head(2)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,700099867,"[8, 12]",1,Installing the game was a struggle (because of...,"07 9, 2012",A2HD75EMZR8QLN,123,Pay to unlock content? I don't think so.,1341792000
1,700099867,"[0, 0]",4,If you like rally cars get this game you will ...,"06 30, 2013",A3UR8NLLY1ZHCX,"Alejandro Henao ""Electronic Junky""",Good rally game,1372550400


In [5]:
df['asin'].nunique()

10672

In [6]:
df['reviewerID'].nunique()

24303

In [19]:
df_train, df_test = train_test_split(df, test_size=0.25, random_state=42)

In [20]:
df_train = df_train.copy()

In [21]:
df_train.size

1564515

In [22]:
df_test.size

521505

In [23]:
indexes = df_train.overall > 0

In [24]:
mean_ = df_train[indexes].groupby('asin')['overall'].mean()

In [25]:
df_train['overall'] = df_train.apply(lambda x: x.overall - mean_[x.asin] if x.overall != 0 else 0, axis = 1)

In [27]:
df_train.head(5)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
164069,B003XDRIYC,"[0, 0]",0.3,These Game reminds of Different Games from the...,"01 15, 2013",ALG4EPVIH8MNY,"Cindi Svoboda ""Daniel J""",A Fun and Awesome Game.,1358208000
190366,B0056WJA6M,"[1, 1]",0.885714,Def as good as the first one. It was a nice ch...,"03 16, 2013",A2T7VQOECHJ5JW,Tammi Timmons,Great game,1363392000
10576,B00004SVTL,"[0, 0]",0.75,I legitimately remember saying that to my moth...,"03 9, 2014",A26W7B1A24LBJM,OhNoMyTurn,"""I want to be a Rad Racer!""",1394323200
66439,B000F1WDHW,"[0, 0]",-0.224138,Not a bad option for 360 headset. Plug is not ...,"10 18, 2009",A17LDNW8GIFID1,J. Moore,Serviceable,1255824000
169601,B004CVQUGQ,"[0, 0]",-0.705882,Champion mode is a complete joke. It's like t...,"03 21, 2012",A1H5XDZZQLFLFT,Super,Expected so much more,1332288000


# Simple model with CF

In [28]:
asin = CategoricalDtype(sorted(df_train.asin.unique()), ordered=True)
rev_id = CategoricalDtype(sorted(df_train.reviewerID.unique()), ordered=True)

row_cat = df_train.reviewerID.astype(rev_id).cat
col_cat = df_train.asin.astype(asin).cat

row = row_cat.codes
col = col_cat.codes

sparse_matrix = csr_matrix((df_train["overall"], (row, col)), \
                           shape=(rev_id.categories.size, asin.categories.size), dtype = 'd')

In [29]:
rating_mx = sparse_matrix

In [14]:
rating_mx[rating_mx > 0] = rating_mx[rating_mx > 0] - user_ratings_mean

ValueError: shape mismatch in assignment

In [13]:
#rating_mx = df_pivot.values
user_ratings_mean =  rating_mx.sum(1) / (rating_mx > 0).sum(1)
#rating_mx = rating_mx - user_ratings_mean # - user_ratings_mean.reshape(-1, 1)

In [60]:
print("Mean user rating: \n" + str(user_ratings_mean))

Mean user rating: 
[[5.        ]
 [4.28571429]
 [4.66666667]
 ...
 [3.4       ]
 [4.33333333]
 [4.28571429]]


In [30]:
U, sigma, Vt = svds(rating_mx, k = 50)
sigma = np.diag(sigma)

In [31]:
print ('Shapes: ')
print ('U \t {0}'.format(U.shape))
print ('sigma \t {0}'.format(sigma.shape))
print ('Vt \t {0}'.format(Vt.shape))


Shapes: 
U 	 (24293, 50)
sigma 	 (50, 50)
Vt 	 (50, 10671)


# Make predictions

In [32]:
all_user_predicted_ratings = csr_matrix(U) * csr_matrix(sigma) * csr_matrix(Vt)
#all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [None]:
all_user_predicted_ratings = all_user_predicted_ratings + user_ratings_mean

In [33]:
reviewers = df_train.reviewerID.astype(rev_id).cat.categories
asins = df_train.asin.astype(asin).cat.categories

preds_df = pd.DataFrame(all_user_predicted_ratings.todense(), columns = asins, index = reviewers)

In [34]:
preds_df.index.name = 'reviewerID'
preds_df.columns.name = 'asin'

preds_df.head(2)

asin,0700099867,6050036071,7100027950,7293000936,8176503290,907843905X,9625990674,9861019731,9882155456,B000003SQQ,...,B00J128FPA,B00J226358,B00J6DLPLK,B00J9P3KBS,B00JM3R6M6,B00JQ8YH6A,B00JQHU9RC,B00JXW6GE0,B00KAI3KW2,B00KHECZXO
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00263941WP7WCIL7AKWL,-1.889716e-07,4.796539e-08,3.580005e-07,-3.177841e-08,-3.37307e-08,-4.712821e-08,2.988064e-08,3.547024e-22,2.967329e-07,4.054175e-22,...,-5.208379e-07,5e-06,3.586771e-07,6.892489e-08,3e-06,-8.733546e-08,2.452015e-07,1.906143e-07,3e-06,-1.032144e-07
A005481137I9SCAWEF7ON,-0.00187073,0.0001284753,0.0008128109,-0.0001451596,-8.0253e-05,-7.605974e-05,0.0001346054,4.382931e-19,0.0002535152,4.547477e-19,...,-0.0003029652,0.002961,5.642068e-05,-0.0003238605,0.000887,-1.62222e-05,-0.0003670354,0.0002608258,-0.004388,-5.659587e-05


In [35]:
preds = preds_df.reset_index().melt('reviewerID', var_name='asin')

In [38]:
preds.head(2)

Unnamed: 0,reviewerID,asin,value
0,A00263941WP7WCIL7AKWL,700099867,-1.889716e-07
1,A005481137I9SCAWEF7ON,700099867,-0.00187073


In [40]:
preds.shape

(259230603, 3)

In [41]:
preds['value_meaned'] = 0

MemoryError: 

In [45]:
preds['value_meaned'] = preds.apply(lambda x: x.value + mean_[x.asin], axis = 1)

KeyboardInterrupt: 

In [None]:
preds.head(10)

In [None]:
preds.to_json('D:/Datasets/amazon_reviews/preds.json')

In [60]:
df_test.shape

(57945, 10)

In [74]:
col_cat_map = dict((v, k) for k, v in enumerate(col_cat.categories))
row_cat_map = dict((v, k) for k, v in enumerate(row_cat.categories))

In [45]:
df_test['pred'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [73]:
i = 0

def xexe(x):
    _asin = x.asin
    _revID = x.reviewerID
    
    row_ind = row_cat_map[_revID]
    col_ind = col_cat_map[_asin]
    
    rate = U[row_ind] * sigma * Vt[:, col_ind]
    
    return rate

df_test.apply(xexe, axis = 1)


KeyError: ('A142CS0PXYWZSM', 'occurred at index 146182')

In [75]:
all_user_predicted_ratings.shape

NameError: name 'all_user_predicted_ratings' is not defined

In [85]:
'A2T7VQOECHJ5JW' in df_train.reviewerID

False

In [89]:
df_train.reviewerID.astype(str)

164069     ALG4EPVIH8MNY
190366    A2T7VQOECHJ5JW
10576     A26W7B1A24LBJM
66439     A17LDNW8GIFID1
169601    A1H5XDZZQLFLFT
116846    A2FTHCGH06O4Y5
69845      AO9DI9XIIMY5A
14875      A96JD9312DHWC
210471     A1K97WR63DBXV
22114     A2BH04B9G9LOYA
140175    A2FYX0BC3D214U
165472    A3BEELRT5V9REJ
44847     A3T3C3B8POTJ8M
75136     A3KQE3BCGDP75D
94173     A27Z9FV4D0EYHT
183721    A1QUF25ZMY25V3
215096     AVEFL9T9PJF1T
139695    A2NH6XSE79X3VY
30317      AZ3SMS383Q9ZD
202065    A2KT5MP35EMH7Q
109004    A3DPA4KUC2E7HZ
138065    A1510WM4V9MICG
217581    A29OLG6AJZLKNN
114459    A25NSTXNN16KG8
107182    A3V6Z4RCDGRC44
104712    A1AQ03L7J8MG4T
66053     A1JV3E2M4AOOOV
190417    A26DQMYF5E0GL6
214190    A208G0AAI1XURB
194873    A3GTHY4BJ334L1
               ...      
85305     A1P9RKER6DKZWU
184779     A4GSQJBIIWK5I
214176    A2AXJVLQ5O5S20
103355    A1YLB3KK98OXNL
5311       A7I064UDWGSYE
199041    A1TR1KNR3VKPHY
64925      AXWPSHRQL910I
194027    A3VIKS0DB6M2V1
59735     A367OB1E03A8QM


In [67]:
reviewers = df_train.reviewerID.astype(rev_id).cat.categories
asins = df_train.reviewerID.astype(asin).cat.categories
preds_df = pd.DataFrame(all_user_predicted_ratings.todense(), columns = asins, index = reviewers)

In [76]:
preds_df.index.name = 'reviewerID'
preds_df.columns.name = 'asin'
preds_df.head(2)

asin,0700099867,6050036071,7100027950,7293000936,8176503290,907843905X,9625990674,9861019731,9882155456,B000003SQQ,...,B00J128FPA,B00J226358,B00J6DLPLK,B00J9P3KBS,B00JM3R6M6,B00JQ8YH6A,B00JQHU9RC,B00JXW6GE0,B00KAI3KW2,B00KHECZXO
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00263941WP7WCIL7AKWL,-6.4e-05,5e-06,8.9e-05,4e-05,4.584626e-07,-1e-06,1.9e-05,2.3e-05,-5.3e-05,-4.3e-05,...,-2e-05,8.3e-05,-9.5e-05,5.6e-05,-7.6e-05,-3e-06,0.000322,8.8e-05,7.2e-05,6e-06
A005481137I9SCAWEF7ON,-0.004291,-0.001798,-0.000342,-0.001443,0.0004256465,-0.00034,-0.00215,-0.000423,0.004522,0.004788,...,0.00212,-0.000838,-0.000335,0.003907,-0.003498,-7.2e-05,0.005431,0.00667,0.003296,0.001924


In [77]:
preds = preds_df.reset_index().melt('reviewerID', var_name='asin')

In [78]:
preds.head(2)

Unnamed: 0,reviewerID,asin,value
0,A00263941WP7WCIL7AKWL,700099867,-6.4e-05
1,A005481137I9SCAWEF7ON,700099867,-0.004291


In [79]:
np.sum(all_user_predicted_ratings > 3)

8332

In [72]:
np.sum(df_pivot.values > 3)

NameError: name 'df_pivot' is not defined

In [62]:
# test
r = np.array([[0, 3, 0, 5], [3, 2, 1, 0], [0, 4, 5, 5], [2, 0, 1, 0]], dtype = 'd')
m = np.mean(r, axis = 1).reshape(-1, 1)
_U, _sigma, _Vt = svds(r-m, k = 2)

In [63]:
_sigma = np.diag(_sigma)
np.dot(np.dot(_U, _sigma), _Vt) + m

array([[-5.26723614e-03,  3.01980648e+00, -2.54126749e-03,
         4.98800202e+00],
       [ 3.19639590e+00,  1.26148892e+00,  1.09475453e+00,
         4.47360644e-01],
       [ 6.02917758e-02,  3.77328374e+00,  5.02908879e+00,
         5.13733570e+00],
       [ 1.87926551e+00,  4.54000098e-01,  9.41749597e-01,
        -2.75015207e-01]])

# Metrics

In [46]:
%reset Out

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
Flushing output cache (1 entries)


In [43]:
df_merged = df_test.merge(preds, on=['reviewerID', 'asin'])

In [47]:
df_merged.head(10)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,value
0,B0047TLIBU,"[31, 43]",5,"Been playing this for two days now, and I am v...","03 10, 2011",A17M1HL6U2GS7M,Storylover,It feels like hanging out with old friends aga...,1299715200,-0.0008699664
1,B0053BCP40,"[0, 0]",4,"I really enjoyed playing this game, although i...","02 27, 2014",A1FGU7F9UJ264N,Kelly Randall,Fun game,1393459200,0.02010041
2,B000WCQWR6,"[0, 0]",3,This was kind of a gamble and was looking for ...,"07 7, 2013",A3R7M2EP1EYNVT,Thomas B. Hileman Jr.,Shoot em up,1373155200,7.363616e-05
3,B00498T500,"[4, 4]",4,***Updated 7/7/2011 - update marked below***I ...,"04 10, 2011",A369KP0JV77JYT,Jon,"Fun for a group, don't pay too much for it",1302393600,-0.0003260203
4,B00005BIG7,"[3, 3]",5,It looks like we've come a long way from past ...,"07 13, 2001",ABIKTKAWOMY8,Chance Farley,One of the best fps's to ever hit a console,994982400,0.006254228
5,B002CYWJWO,"[7, 8]",4,"Tellurye, here with another review brought to ...","10 30, 2009",A2SYA663GK3EEJ,"R. Stevens ""Fedora Technologies""","Does the Saw universe justice, but controls ma...",1256860800,0.0003067033
6,B001V9PTVE,"[2, 2]",4,Fallout 2 is worth the price alone... but if y...,"09 6, 2009",A39AUQ3IP3C8KI,J. Nicholson,Worth it (even if it was just FO2),1252195200,-1.380123e-05
7,B00DBDPOZ4,"[1, 1]",5,"As I'm not a fan of AA batteries, this accesso...","12 4, 2013",A22N09ZP9D77DM,D. Alvarado,Great upgrade over the Xbox 360 version.,1386115200,-0.05349924
8,B0009A4EV2,"[4, 4]",5,"Next to the Final Fantasy series, the Dragon Q...","03 22, 2007",AJKWF4W7QD4NS,"N. Durham ""Big Evil""","One of the best games, ever, for the PS2",1174521600,0.08465834
9,B00002EQAP,"[0, 0]",5,"This game is so great, not just for the crazy ...","08 20, 2013",A1T4HGVX32QIYC,Will Ramirez,Great Game,1376956800,3.304022e-07


In [49]:
df_merged['value_meaned'] = df_merged.apply(lambda x: x.value + mean_[x.asin], axis = 1)

In [50]:
df_merged.head(10)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,value,value_meaned
0,B0047TLIBU,"[31, 43]",5,"Been playing this for two days now, and I am v...","03 10, 2011",A17M1HL6U2GS7M,Storylover,It feels like hanging out with old friends aga...,1299715200,-0.0008699664,3.66042
1,B0053BCP40,"[0, 0]",4,"I really enjoyed playing this game, although i...","02 27, 2014",A1FGU7F9UJ264N,Kelly Randall,Fun game,1393459200,0.02010041,4.487492
2,B000WCQWR6,"[0, 0]",3,This was kind of a gamble and was looking for ...,"07 7, 2013",A3R7M2EP1EYNVT,Thomas B. Hileman Jr.,Shoot em up,1373155200,7.363616e-05,3.975074
3,B00498T500,"[4, 4]",4,***Updated 7/7/2011 - update marked below***I ...,"04 10, 2011",A369KP0JV77JYT,Jon,"Fun for a group, don't pay too much for it",1302393600,-0.0003260203,4.666341
4,B00005BIG7,"[3, 3]",5,It looks like we've come a long way from past ...,"07 13, 2001",ABIKTKAWOMY8,Chance Farley,One of the best fps's to ever hit a console,994982400,0.006254228,4.373601
5,B002CYWJWO,"[7, 8]",4,"Tellurye, here with another review brought to ...","10 30, 2009",A2SYA663GK3EEJ,"R. Stevens ""Fedora Technologies""","Does the Saw universe justice, but controls ma...",1256860800,0.0003067033,3.714592
6,B001V9PTVE,"[2, 2]",4,Fallout 2 is worth the price alone... but if y...,"09 6, 2009",A39AUQ3IP3C8KI,J. Nicholson,Worth it (even if it was just FO2),1252195200,-1.380123e-05,3.588221
7,B00DBDPOZ4,"[1, 1]",5,"As I'm not a fan of AA batteries, this accesso...","12 4, 2013",A22N09ZP9D77DM,D. Alvarado,Great upgrade over the Xbox 360 version.,1386115200,-0.05349924,4.397481
8,B0009A4EV2,"[4, 4]",5,"Next to the Final Fantasy series, the Dragon Q...","03 22, 2007",AJKWF4W7QD4NS,"N. Durham ""Big Evil""","One of the best games, ever, for the PS2",1174521600,0.08465834,4.474489
9,B00002EQAP,"[0, 0]",5,"This game is so great, not just for the crazy ...","08 20, 2013",A1T4HGVX32QIYC,Will Ramirez,Great Game,1376956800,3.304022e-07,3.333334


In [52]:
mean_squared_error(df_merged['overall'], df_merged['value_meaned'])  

1.3006931297897357

# optimizations
50 dim = mse 17.454677326070133
150 dim = mse 17.465798414170884