In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cmx
import matplotlib.colors as colors
import pandas as pd
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.decomposition import PCA
%matplotlib inline
import random
from sklearn import preprocessing
import scipy.sparse as sparse
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import graphlab as gl

## Overview

Used graphLab Library to use Matrix Factorization.  Best Test RMSE achieved was .76 with a regularization parameter of 1e-03

In [2]:
filename = "datasets/frequent_reviews.txt"
df = pd.read_csv(filename)

In [3]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,unixReviewTime,reviewText,overall,reviewTime,summary
0,A9GPEIJ6JS3UE,B000JMLBHU,Amazon Customer,"[0, 0]",1350432000,"This classic is a must read, especial for anyo...",4.0,"10 17, 2012",Jules Verne is arguably one of the great writters
1,ABDR6IJ93HFIO,B000JMLBHU,"Daisy ""Daisy S""","[0, 0]",1394409600,"Read this book years ago and enjoyed it then, ...",5.0,"03 10, 2014","A story of determination, a group finding answ..."
2,AU58Q4M7NCYC9,B000JMLBHU,"David G ""science fiction and fantasy fanatic""","[0, 0]",1404000000,"Every child should read 20,000 leagues under t...",4.0,"06 29, 2014",a classic every child should read - sequel to ...
3,A32A056Q9OYP7D,B000JMLBHU,Ricky KImsey,"[0, 0]",1388016000,A group of men escape by balloon a Confederate...,5.0,"12 26, 2013",Island Mystery
4,A71W7G4TCTH3T,B002AJ7X2C,"BBMoreB ""Heather Coulter""","[1, 2]",1245110400,This short story about 2 serial killers is bot...,4.0,"06 16, 2009",3 Distrubing Chapters - Wish it was longer!


In [4]:
df.shape

(14433, 9)

In [5]:
sf = gl.SFrame({'user_id': df.reviewerID,
                       'item_id': df.asin,
                       'rating': df.overall})

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1481115585.log


This non-commercial license of GraphLab Create for academic use is assigned to atsoroka@g.harvard.edu and will expire on December 03, 2017.


In [6]:
sf

item_id,rating,user_id
B000JMLBHU,4.0,A9GPEIJ6JS3UE
B000JMLBHU,5.0,ABDR6IJ93HFIO
B000JMLBHU,4.0,AU58Q4M7NCYC9
B000JMLBHU,5.0,A32A056Q9OYP7D
B002AJ7X2C,4.0,A71W7G4TCTH3T
B002AJ7X2C,4.0,A30YDY4XKVUBUY
B002AJ7X2C,5.0,A2KWQ64TRHB3YH
B002AJ7X2C,1.0,ASA5RSXOAVT12
B002AJ7X2C,3.0,A3KI93AT8FAXI1
B002AJ7X2C,2.0,A3A7FF87LEVCQ1


In [7]:
train, valid = gl.recommender.util.random_split_by_user(sf, max_num_users=None, item_test_proportion=0.4)

In [8]:
train

item_id,rating,user_id
B000JMLBHU,4.0,A9GPEIJ6JS3UE
B000JMLBHU,5.0,ABDR6IJ93HFIO
B002AJ7X2C,1.0,ASA5RSXOAVT12
B002D48NBO,2.0,A14R9XMZVJ6INB
B002D48NBO,5.0,A3CNWYFJPBXT2N
B002D48NBO,5.0,A2U8YWPP1PYHJM
B002D48NBO,4.0,A31L0XOY4GVRR8
B002D48NBO,2.0,AE41TLMIZPAE7
B0031Y9CPG,5.0,A195CNOUUIT4SU
B0031Y9CPG,4.0,A1SPKCPR0BAZMY


In [9]:
valid

item_id,rating,user_id
B000JMLBHU,4.0,AU58Q4M7NCYC9
B000JMLBHU,5.0,A32A056Q9OYP7D
B002AJ7X2C,4.0,A71W7G4TCTH3T
B002AJ7X2C,4.0,A30YDY4XKVUBUY
B002AJ7X2C,5.0,A2KWQ64TRHB3YH
B002AJ7X2C,3.0,A3KI93AT8FAXI1
B002AJ7X2C,2.0,A3A7FF87LEVCQ1
B002AJ7X2C,1.0,AVTOKS08HOEXG
B002AJ7X2C,5.0,A14QJKY3DA19AP
B0031Y9CPG,3.0,A1RRMF5XW5NZDD


In [10]:
m1 = gl.factorization_recommender.create(train, target='rating')

In [11]:
eval1 = m1.evaluate(valid)


Precision and recall summary statistics by cutoff
+--------+------------------+------------------+
| cutoff |  mean_precision  |   mean_recall    |
+--------+------------------+------------------+
|   1    | 0.00663129973475 | 0.00419982316534 |
|   2    | 0.00596816976127 | 0.0045352077413  |
|   3    | 0.00574712643678 | 0.00512725495138 |
|   4    | 0.0053050397878  | 0.00622141940761 |
|   5    | 0.00450928381963 | 0.0062582599617  |
|   6    | 0.00486295313882 | 0.00684625010895 |
|   7    | 0.00454717696097 | 0.00706155204839 |
|   8    | 0.0058023872679  | 0.00825976111131 |
|   9    | 0.0053050397878  | 0.00859132609805 |
|   10   | 0.00503978779841 | 0.00888281180068 |
+--------+------------------+------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.7630496561739826)

Per User RMSE (best)
+---------------+-------+------------------+
|    user_id    | count |       rmse       |
+---------------+-------+------------------+
| AT7O1ONKZSY3X |   1   | 0.00733930283244

In [12]:
m2 = gl.factorization_recommender.create(train, target='rating',regularization=1e-03 )

In [13]:
eval2 = m2.evaluate(valid)


Precision and recall summary statistics by cutoff
+--------+------------------+-------------------+
| cutoff |  mean_precision  |    mean_recall    |
+--------+------------------+-------------------+
|   1    | 0.0053050397878  | 0.000335384575957 |
|   2    | 0.00596816976127 |  0.0045352077413  |
|   3    | 0.00574712643678 |  0.00512725495138 |
|   4    | 0.0053050397878  |  0.00622141940761 |
|   5    | 0.00557029177719 |  0.00680940955487 |
|   6    | 0.00508399646331 |  0.00702471149431 |
|   7    | 0.00644183402804 |  0.00822292055723 |
|   8    | 0.0058023872679  |  0.00855448554397 |
|   9    | 0.0053050397878  |  0.00859132609805 |
|   10   | 0.0053050397878  |  0.0100754741339  |
+--------+------------------+-------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.7590078511771912)

Per User RMSE (best)
+----------------+-------+------------------+
|    user_id     | count |       rmse       |
+----------------+-------+------------------+
| A103M7E0BSFC83 |   2   

In [14]:
m3 = gl.factorization_recommender.create(train, target='rating',regularization=1e-06 )

In [15]:
eval3 = m3.evaluate(valid)


Precision and recall summary statistics by cutoff
+--------+------------------+------------------+
| cutoff |  mean_precision  |   mean_recall    |
+--------+------------------+------------------+
|   1    | 0.00663129973475 | 0.00419982316534 |
|   2    | 0.00596816976127 | 0.0045352077413  |
|   3    | 0.00574712643678 | 0.00512725495138 |
|   4    | 0.0053050397878  | 0.00622141940761 |
|   5    | 0.00557029177719 | 0.00680940955487 |
|   6    | 0.00486295313882 | 0.00684625010895 |
|   7    | 0.00454717696097 | 0.00706155204839 |
|   8    | 0.0058023872679  | 0.00825976111131 |
|   9    | 0.00515767757147 | 0.00825976111131 |
|   10   | 0.00490716180371 | 0.00869334609397 |
+--------+------------------+------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.7632082877281781)

Per User RMSE (best)
+----------------+-------+------------------+
|    user_id     | count |       rmse       |
+----------------+-------+------------------+
| A1DA6E4FNRSAWN |   1   | 0.0055496940

In [16]:
m4 = gl.factorization_recommender.create(train, target='rating',regularization=1e-01 )

In [17]:
eval4 = m4.evaluate(valid)


Precision and recall summary statistics by cutoff
+--------+-----------------+------------------+
| cutoff |  mean_precision |   mean_recall    |
+--------+-----------------+------------------+
|   1    | 0.0198938992042 | 0.00200063609293 |
|   2    | 0.0225464190981 | 0.00440916441343 |
|   3    | 0.0190097259063 | 0.00567683333109 |
|   4    | 0.0159151193634 | 0.00615553350553 |
|   5    | 0.0153846153846 | 0.00770089472966 |
|   6    | 0.0152519893899 | 0.00979401234002 |
|   7    | 0.0170519136036 |  0.012352256971  |
|   8    | 0.0160809018568 | 0.0134535088482  |
|   9    | 0.0160624815797 | 0.0150572610985  |
|   10   | 0.0153846153846 | 0.0157257376754  |
+--------+-----------------+------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.79414476574277)

Per User RMSE (best)
+----------------+-------+----------------+
|    user_id     | count |      rmse      |
+----------------+-------+----------------+
| A30YDY4XKVUBUY |   1   | 0.110632149189 |
+----------------

In [18]:
m5 = gl.factorization_recommender.create(train, target='rating',regularization=1e-04 )

In [19]:
eval4 = m4.evaluate(valid)


Precision and recall summary statistics by cutoff
+--------+-----------------+------------------+
| cutoff |  mean_precision |   mean_recall    |
+--------+-----------------+------------------+
|   1    | 0.0198938992042 | 0.00200063609293 |
|   2    | 0.0225464190981 | 0.00440916441343 |
|   3    | 0.0190097259063 | 0.00567683333109 |
|   4    | 0.0159151193634 | 0.00615553350553 |
|   5    | 0.0153846153846 | 0.00770089472966 |
|   6    | 0.0152519893899 | 0.00979401234002 |
|   7    | 0.0170519136036 |  0.012352256971  |
|   8    | 0.0160809018568 | 0.0134535088482  |
|   9    | 0.0160624815797 | 0.0150572610985  |
|   10   | 0.0153846153846 | 0.0157257376754  |
+--------+-----------------+------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.79414476574277)

Per User RMSE (best)
+----------------+-------+----------------+
|    user_id     | count |      rmse      |
+----------------+-------+----------------+
| A30YDY4XKVUBUY |   1   | 0.110632149189 |
+----------------