In [2]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cmx
import matplotlib.colors as colors
import pandas as pd
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.decomposition import PCA
%matplotlib inline
import random
from sklearn import preprocessing
import scipy.sparse as sparse
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import graphlab as gl

## Overview

Used graphLab Library to use Matrix Factorization.  Best Test RMSE achieved was .86 with a regularization parameter of 1e-03

In [3]:
filename = "datasets/reviews.csv"
df = pd.read_csv(filename)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200


In [5]:
df.shape

(982619, 10)

In [6]:
sf = gl.SFrame({'user_id': df.reviewerID,
                       'item_id': df.asin,
                       'rating': df.overall})

This non-commercial license of GraphLab Create for academic use is assigned to atsoroka@g.harvard.edu and will expire on December 03, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1481584627.log


In [7]:
sf

item_id,rating,user_id
B000F83SZQ,5,A1F6404F1VG29J
B000F83SZQ,4,AN0N05A9LIJEQ
B000F83SZQ,4,A795DMNCJILA6
B000F83SZQ,5,A1FV0SX13TWVXQ
B000F83SZQ,4,A3SPTOKDG7WBLN
B000F83SZQ,4,A1RK2OCZDSGC6R
B000F83SZQ,4,A2HSAKHC3IBRE6
B000F83SZQ,4,A3DE6XGZ2EPADS
B000FA64PA,5,A1UG4Q4D3OAH3A
B000FA64PA,4,AQZH7YTWQPOBE


In [8]:
train, valid = gl.recommender.util.random_split_by_user(sf, max_num_users=None, item_test_proportion=0.4)

In [9]:
train

item_id,rating,user_id
B000F83SZQ,4,A795DMNCJILA6
B000F83SZQ,4,A3SPTOKDG7WBLN
B000F83SZQ,4,A2HSAKHC3IBRE6
B000FA64PA,5,A1UG4Q4D3OAH3A
B000FA64PA,4,AQZH7YTWQPOBE
B000FA64PA,5,A1ZT7WV0ZUA0OJ
B000FA64PA,4,A2ZFR72PT054YS
B000FA64PA,3,A2QK1U70OJ74P
B000FA64PK,3,A3SZMGJMV0G16C
B000FA64PK,5,A2EN84QHDRZLP2


In [10]:
valid

item_id,rating,user_id
B000F83SZQ,5,A1F6404F1VG29J
B000F83SZQ,4,AN0N05A9LIJEQ
B000F83SZQ,5,A1FV0SX13TWVXQ
B000F83SZQ,4,A1RK2OCZDSGC6R
B000F83SZQ,4,A3DE6XGZ2EPADS
B000FA64PK,5,A3H8PE1UFK04JZ
B000FA64PK,5,A1UG4Q4D3OAH3A
B000FA64PK,3,A38Z3Q6DTDIH9J
B000FA64PK,4,A22CW0ZHY3NJH8
B000FA64QO,2,A3SZMGJMV0G16C


In [11]:
m1 = gl.factorization_recommender.create(train, target='rating')

In [12]:
eval1 = m1.evaluate(valid)


Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff |   mean_precision  |    mean_recall    |
+--------+-------------------+-------------------+
|   1    | 0.000120782063864 | 1.56137740119e-05 |
|   2    | 0.000166075337812 | 5.34454688656e-05 |
|   3    | 0.000156010165824 | 7.52188255549e-05 |
|   4    | 0.000143428700838 | 9.71824687158e-05 |
|   5    |  0.00013286027025 |  0.00011266715987 |
|   6    | 0.000130847235852 | 0.000141705615979 |
|   7    | 0.000138036644415 | 0.000164926326531 |
|   8    | 0.000154752019325 | 0.000195527143501 |
|   9    | 0.000150977579829 | 0.000210421906419 |
|   10   | 0.000152487355628 | 0.000241537198824 |
+--------+-------------------+-------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.9663450445826951)

Per User RMSE (best)
+----------------+-------+-------------------+
|    user_id     | count |        rmse       |
+----------------+-------+-------------------+
| A3ANIY

In [13]:
m2 = gl.factorization_recommender.create(train, target='rating',regularization=1e-03 )

In [14]:
eval2 = m2.evaluate(valid)


Precision and recall summary statistics by cutoff
+--------+------------------+-------------------+
| cutoff |  mean_precision  |    mean_recall    |
+--------+------------------+-------------------+
|   1    | 0.00187212198988 | 0.000532304264993 |
|   2    | 0.00173624216804 |  0.00103515046465 |
|   3    | 0.00150977579829 |  0.00133061041632 |
|   4    | 0.00152109911678 |  0.00179932054593 |
|   5    | 0.00131954404771 |  0.00193413014567 |
|   6    | 0.00123298356861 |  0.0021383664727  |
|   7    | 0.00123154568689 |  0.00234883713099 |
|   8    | 0.00118517400166 |  0.0024941795279  |
|   9    | 0.00125311391258 |  0.00277302630111 |
|   10   | 0.00124858458519 |  0.00303297210357 |
+--------+------------------+-------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.8603735738806225)

Per User RMSE (best)
+----------------+-------+------------------+
|    user_id     | count |       rmse       |
+----------------+-------+------------------+
| A3KEOLR9OKKB1W |   1   

In [15]:
m3 = gl.factorization_recommender.create(train, target='rating',regularization=1e-06 )

In [16]:
eval3 = m3.evaluate(valid)


Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff |   mean_precision  |    mean_recall    |
+--------+-------------------+-------------------+
|   1    | 0.000181173095795 | 2.82836366218e-05 |
|   2    | 0.000196270853778 | 6.78315982934e-05 |
|   3    | 0.000226466369744 | 0.000108636236648 |
|   4    | 0.000215143051257 | 0.000130900161666 |
|   5    | 0.000214388163358 |  0.00017442247557 |
|   6    | 0.000216401197755 |  0.00021572539085 |
|   7    | 0.000215682256899 | 0.000235884971065 |
|   8    | 0.000211368611761 |   0.000260125651  |
|   9    | 0.000209691083096 |  0.00030840342491 |
|   10   | 0.000217407714954 | 0.000346146790489 |
+--------+-------------------+-------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.9463132478326678)

Per User RMSE (best)
+---------------+-------+-------------------+
|    user_id    | count |        rmse       |
+---------------+-------+-------------------+
| A7WED2NNB

In [17]:
m4 = gl.factorization_recommender.create(train, target='rating',regularization=1e-01 )

In [18]:
eval4 = m4.evaluate(valid)


Precision and recall summary statistics by cutoff
+--------+------------------+------------------+
| cutoff |  mean_precision  |   mean_recall    |
+--------+------------------+------------------+
|   1    | 0.00705065297803 | 0.00222729856788 |
|   2    | 0.00471050049068 | 0.00296296999819 |
|   3    | 0.00412672051534 | 0.00344140852491 |
|   4    | 0.00367252962935 | 0.00421864316611 |
|   5    | 0.00325809617272 | 0.00465181009588 |
|   6    | 0.00291135099771 | 0.00496870615068 |
|   7    | 0.00257308932481 | 0.00510986234403 |
|   8    | 0.00261191213105 | 0.00562211227329 |
|   9    | 0.00261191213105 | 0.00619825612554 |
|   10   | 0.00260285347626 | 0.00671826788424 |
+--------+------------------+------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.9518764766962728)

Per User RMSE (best)
+----------------+-------+----------------+
|    user_id     | count |      rmse      |
+----------------+-------+----------------+
| A2TEZE9WMSUB5E |   1   | 0.267068258172 |
+

In [19]:
m5 = gl.factorization_recommender.create(train, target='rating',regularization=1e-04 )

In [20]:
eval4 = m4.evaluate(valid)


Precision and recall summary statistics by cutoff
+--------+------------------+------------------+
| cutoff |  mean_precision  |   mean_recall    |
+--------+------------------+------------------+
|   1    | 0.00705065297803 | 0.00222729856788 |
|   2    | 0.00471050049068 | 0.00296296999819 |
|   3    | 0.00412672051534 | 0.00344140852491 |
|   4    | 0.00367252962935 | 0.00421864316611 |
|   5    | 0.00325809617272 | 0.00465181009588 |
|   6    | 0.00291135099771 | 0.00496870615068 |
|   7    | 0.00257308932481 | 0.00510986234403 |
|   8    | 0.00261191213105 | 0.00562211227329 |
|   9    | 0.00261191213105 | 0.00619825612554 |
|   10   | 0.00260285347626 | 0.00671826788424 |
+--------+------------------+------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.9518764766962728)

Per User RMSE (best)
+----------------+-------+----------------+
|    user_id     | count |      rmse      |
+----------------+-------+----------------+
| A2TEZE9WMSUB5E |   1   | 0.267068258172 |
+