In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the reviews CSV into a DataFrame (relative path: resolved against the
# kernel's current working directory).
review = pd.read_csv('amazon_baby.csv')

In [4]:
# Preview the first rows to check that the name / review / rating columns loaded.
review.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [6]:
# (rows, columns) of the full dataset
review.shape

(13724, 3)

In [8]:
# How many reviews fall under each star rating.
review['rating'].value_counts()

5    7826
4    2558
1    1247
3    1227
2     866
Name: rating, dtype: int64

In [9]:
# Keep only the two extremes: 5-star and 1-star reviews.

best_worst_reviews = review[review.rating.isin([1, 5])]

In [10]:
# (rows, columns) left after keeping only the 1-star and 5-star reviews
best_worst_reviews.shape

(9073, 3)

In [None]:
# define X and y and we will split into training and testing sets..
# the text of the reviews will be our feature and the ratings will be our response

In [11]:
# Feature: the review text. Response: the star rating.
X, y = best_worst_reviews['review'], best_worst_reviews['rating']

In [13]:
# Show both shapes on one line. Written with %-formatting inside a single
# print argument so the cell runs, and prints the same text, on both
# Python 2 and Python 3 (the bare `print a, b` statement is Python 2 only).
print("%s %s" % (X.shape, y.shape))

(9073,) (9073,)


In [None]:
# split X and y into training and testing datasets

In [14]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Prefer the new
# location and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [15]:
# Split text and ratings together so rows stay aligned; the fixed seed keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=20,
)

In [16]:
# number of reviews in the training split
X_train.shape

(6804,)

In [17]:
# number of reviews in the testing split
X_test.shape

(2269,)

In [19]:
# Response shapes for both splits. Single-argument print with %-formatting
# so the cell runs, and prints the same text, on Python 2 and Python 3.
print("%s %s" % (y_train.shape, y_test.shape))

(6804,) (2269,)


In [None]:
# lets use countvectorizer to create our document term matrices from X_train and X_test

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# Instantiate CountVectorizer with its default settings; it is fitted on the
# training text in a later cell.

vector = CountVectorizer()

In [24]:
# Learn the vocabulary from the training reviews and build the training
# document-term matrix in a single step.
#
# Passing X_train directly raises an error, so the values are cast to unicode
# strings first (astype(str) would work as well).
# NOTE(review): the cast presumably turns missing reviews into the literal
# text 'nan' -- confirm whether those rows should be dropped instead.

X_train_dtmatrix = vector.fit_transform(raw_documents=X_train.values.astype('U'))


In [25]:
# (training reviews, vocabulary size)
X_train_dtmatrix.shape

(6804, 14125)

In [None]:
# transform X_test into X_test_dtm (doc. term matrix)

In [27]:
# Transform only (no fitting): the test reviews are encoded with the
# vocabulary learned from the training reviews.
X_test_dtmatrix = vector.transform(raw_documents=X_test.values.astype('U'))

In [28]:
# (testing reviews, vocabulary size) -- column count matches the training matrix
X_test_dtmatrix.shape

(2269, 14125)

In [None]:
# let's use Naive bayes to predict rating in testing dataset

In [29]:
# import and instantiate MultinomialNB

In [32]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier with default hyperparameters
nb_model = MultinomialNB()

In [None]:
# train model with X_train_dtmatrix

In [33]:
# Fit on the training document-term matrix and the matching ratings.
nb_model.fit(X=X_train_dtmatrix, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# predict for X_test_dtmatrix

In [36]:
# Predicted rating for every review in the testing split.
y_pred = nb_model.predict(X=X_test_dtmatrix)

In [None]:
# let's calculate the accuracy of our prediction

In [38]:
from sklearn import metrics

# Fraction of test reviews whose predicted rating equals the true rating.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

0.92111062141912736

In [39]:
# Confusion matrix: rows are the true ratings, columns the predicted ratings,
# both ordered 1-star then 5-star.
#   top-right   = 140 false positives (1-star reviews predicted as 5-star)
#   bottom-left =  39 false negatives (5-star reviews predicted as 1-star)

metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 165,  140],
       [  39, 1925]])

In [None]:
# lets see some of the false positives which are 1 star ratings but incorrectly classified as 
# 5 star ratings

In [40]:
# True rating lower than predicted rating: 1-star reviews labelled 5-star.
X_test.loc[y_test < y_pred].head(n=15)

10345    These are NOT the same nipples as in the hospi...
10986    Every child is going to be different in terms ...
2553     When we received these we followed the easy di...
3296     This is the worst child safety equipment there...
9043     This ball is huge and heavy for my 5 month old...
6858     This didn\'t work for my 23 month old. Althoug...
362      Can\'t get the smell of rank diapers out of th...
12924    Harness is all messed up. Its hard to get my s...
2612     I have  two of these covers for my changing ta...
8150     Squishy to an adults hands but the plastic and...
6150     I was hoping my grandson would be really stari...
7284     I\'ve heard good things about this system, but...
11558    The stamp didn\'t coat well so the image was v...
7023     Very nice design, but no action. I have to mea...
865      These were impossible to install.  We ended up...
Name: review, dtype: object

In [41]:
# Number of false positives: count the True entries of the boolean mask.
int((y_test < y_pred).sum())

140

In [43]:
# our model is looking at the word 'nice'
# (7023 is an index label taken from the false-positive listing)

X_test.loc[7023]

'Very nice design, but no action. I have to measure in another bottle and empty here, more dirty. very helpful staff'

In [46]:
# Another false positive, looked up by its index label.
X_test.loc[8150]

'Squishy to an adults hands but the plastic and the foam are too stiff for what we wanted,went with memory foam instead...ssssoooooooo much better'

In [None]:
# lets see some of the false negatives which are 5 star ratings but incorrectly classified as 
# 1 star ratings

In [47]:
# True rating higher than predicted rating: 5-star reviews labelled 1-star.
X_test.loc[y_test > y_pred].head(n=15)

2073     I have read the past few reviews given for the...
1834     I have overused this product and it\'s still g...
897      I\'d have to suggest that the other reviews we...
5618     Just wanted to say I had a very very very posi...
2159     I have been using these bottles for about 5 mo...
3207     With my first child, I used a Medela single el...
4886     I would like to state that I take issue with t...
8111     Really like this bumper pad. It is divided int...
9015     It looks beautiful. The price is very fair for...
1857     I tried a hospital pump & a highly advertised ...
8726     It\'s simple to use, and it WON\'T degrade the...
4047     I bought this from petsmart for $95.  So when ...
366      With my second child I wanted something better...
10296    This battery does what it is supposed to. We p...
10222    These make pumping so much better.  No leaning...
Name: review, dtype: object

In [None]:
# let's calculate some 10 or 15 tokens which are most predictive of 5 stars and 1 star rating

In [None]:
# naive bayes counts the number of times each token appears in each class and the number
# of observations in each class. use feature_count_ and class_count_

In [50]:
# Store the vocabulary learned from X_train, in matrix-column order.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides and
# normalise the result to a plain list.

if hasattr(vector, 'get_feature_names_out'):
    X_train_tokens = list(vector.get_feature_names_out())
else:
    X_train_tokens = vector.get_feature_names()

In [51]:
# size of the learned vocabulary
len(X_train_tokens)

14125

In [52]:
# (classes, tokens): the first row holds the one-star counts and the second
# row the five-star counts

nb_model.feature_count_.shape

(2, 14125)

In [None]:
# lets store the number of times each word (token) appears across each class

In [53]:
# feature_count_ has one row per class (one-star first, then five-star);
# unpack the two rows of per-token counts.
one_star_word, five_star_word = nb_model.feature_count_

In [54]:
# Build a DataFrame indexed by token, holding each token's separate
# one-star and five-star counts.

words = pd.DataFrame(
    {'word': X_train_tokens, 'one_star': one_star_word, 'five_star': five_star_word}
).set_index('word')

In [55]:
# Add 1 to every count so that later ratios never divide by zero.
# The columns are rebuilt from the raw per-class counts rather than from
# their own current values, so re-running this cell does not keep adding 1.

words['one_star'] = one_star_word + 1
words['five_star'] = five_star_word + 1

In [56]:
# Number of training reviews seen per class.
nb_model.class_count_   # 1st number is one star and 2nd number is five stars

array([  942.,  5862.])

In [58]:
from __future__ import division

In [59]:
# Convert the add-one-smoothed counts into per-class frequencies by dividing
# by the number of training reviews in each class.
# Computed from the raw per-class counts (not from the columns' current
# values) so that re-running this cell cannot divide a second time.

words['one_star'] = (one_star_word + 1) / nb_model.class_count_[0]
words['five_star'] = (five_star_word + 1) / nb_model.class_count_[1]

In [60]:
# Ratio of five-star frequency to one-star frequency for every token.

words['five_to_one_ratio'] = words['five_star'] / words['one_star']

In [61]:
# Highest ratio first: tokens that lean most strongly towards five-star reviews.

words.sort_values(by='five_to_one_ratio', ascending=False).head(n=15)

Unnamed: 0_level_0,five_star,one_star,five_to_one_ratio
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
guy,0.013647,0.001062,12.855681
trips,0.010747,0.001062,10.123849
chicco,0.010577,0.001062,9.963153
sleeps,0.010406,0.001062,9.802456
bedding,0.010406,0.001062,9.802456
loves,0.154555,0.016985,9.099411
awesome,0.027636,0.003185,8.677584
love,0.318321,0.037155,8.567393
birthday,0.009041,0.001062,8.516888
comfy,0.01723,0.002123,8.115148


In [62]:
# Lowest ratio first: tokens that lean most strongly towards one-star reviews.
words.sort_values(by='five_to_one_ratio', ascending=True).head(n=15)

Unnamed: 0_level_0,five_star,one_star,five_to_one_ratio
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
refund,0.000171,0.024416,0.006987
unusable,0.000171,0.011677,0.014609
junk,0.000853,0.04034,0.021144
mum,0.000171,0.007431,0.022957
worst,0.001024,0.043524,0.023516
hinge,0.000171,0.006369,0.026783
freestyle,0.000171,0.006369,0.026783
slug,0.000171,0.006369,0.026783
glued,0.000171,0.006369,0.026783
chump,0.000171,0.006369,0.026783
