In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



## Load Amazon datasets

In [6]:
# Read the raw reviews from a CSV file in the working directory.
products = pd.read_csv('amazon_baby.csv')
# Preview the first two rows.
products.head(2)

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5


In [7]:
products.shape

(183531, 3)

In [8]:
products.dtypes

name      object
review    object
rating     int64
dtype: object

## Perform text cleaning

In [9]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Works for Python 2 byte strings as well as Python 2 ``unicode`` and
    Python 3 ``str`` objects. The two-argument ``translate(None, chars)``
    form used originally exists only on Python 2 byte strings and raises
    ``TypeError`` for the other string types.
    """
    import string
    try:
        # Python 2 byte-string form: second argument lists characters to delete.
        return text.translate(None, string.punctuation)
    except TypeError:
        # unicode / Python 3 str: translate takes a single mapping from
        # code point to replacement; mapping to None deletes the character.
        return text.translate({ord(ch): None for ch in string.punctuation})

*In this notebook, we remove all punctuation for the sake of simplicity. A smarter approach to punctuation would preserve phrases such as "I'd", "would've", "hadn't" and so forth.*

In [11]:
# Blank out missing review text first: the clean-up function calls a string
# method on every value, so each review must be a string.
products = products.fillna(value={'review': ''})
products['review_clean'] = products['review'].map(remove_punctuation)
products.head(2)

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...


## Extract Sentiments

We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment

In [12]:
# Drop neutral (3-star) reviews. Take an explicit copy so that adding the
# 'sentiment' column afterwards writes to an independent frame rather than to
# a slice of the original (avoids pandas' SettingWithCopyWarning).
products = products[products.rating != 3].copy()

In [13]:
# Ratings of 4 or 5 become +1 (positive); every other remaining rating becomes -1 (negative).
products['sentiment'] = products['rating'].apply(lambda x: +1 if x >=4 else -1)
# Peek at two of the negative examples.
products[products.rating<4].head(2)

Unnamed: 0,name,review,rating,review_clean,sentiment
21,Nature's Lullabies Second Year Sticker Calendar,I only purchased a second-year calendar for my...,2,I only purchased a secondyear calendar for my ...,-1
41,"SoftPlay Giggle Jiggle Funbook, Happy Bear",This bear is absolutely adorable and I would g...,2,This bear is absolutely adorable and I would g...,-1


## Split into training and test sets

In [14]:
train_idx = pd.read_json('./module-2-assignment-train-idx.json')
test_idx = pd.read_json('./module-2-assignment-test-idx.json')
test_idx.head(5)

Unnamed: 0,0
0,8
1,9
2,14
3,18
4,24


In [15]:
# train_idx[0] / test_idx[0] select the single column (labelled 0) of the index
# frames loaded from JSON; .iloc uses those values as positional row numbers
# into the filtered `products` frame.
products_train = products.iloc[train_idx[0]]
products_test = products.iloc[test_idx[0]]
# Compare the (rows, columns) shapes of the full frame and of the two splits.
products.shape, products_train.shape, products_test.shape

((166752, 5), (133416, 5), (33336, 5))

In [16]:
products_test.head(5) #since we removed rating==3, some of the indices were removed previously ...

Unnamed: 0,name,review,rating,review_clean,sentiment
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1
16,Nature's Lullabies First Year Sticker Calendar,"I love this little calender, you can keep trac...",5,I love this little calender you can keep track...,1
20,Nature's Lullabies Second Year Sticker Calendar,I had a hard time finding a second year calend...,5,I had a hard time finding a second year calend...,1
28,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,One of babys first and favorite books and it i...,1


## Bag of words

We will now compute the word count for each word that appears in the reviews. A vector consisting of word counts is often referred to as bag-of-words features. Since most words occur in only a few reviews, word count vectors are sparse. For this reason, scikit-learn and many other tools use sparse matrices to store a collection of word count vectors.

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Use this token pattern to keep single-letter words
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
# (the column values are cast to a NumPy unicode array before vectorizing)
train_matrix = vectorizer.fit_transform(products_train['review_clean'].values.astype('U'))
# Second, convert the test data into a sparse matrix, using the same word-column mapping
# (transform only: the vocabulary is not re-learned from the test reviews)
test_matrix = vectorizer.transform(products_test['review_clean'].values.astype('U'))

In [18]:
train_matrix.shape #133416 rows of data, 121712 are the features/words

(133416, 121712)

In [19]:
test_matrix.shape

(33336, 121712)

In [20]:
# Display the sparse matrix summary (shape, dtype, stored elements, format).
# The bare attribute `train_matrix.toarray` only showed a bound-method repr,
# and actually calling .toarray() would build a dense 133416 x 121712 array.
train_matrix

<bound method csr_matrix.toarray of <133416x121712 sparse matrix of type '<type 'numpy.int64'>'
	with 7326618 stored elements in Compressed Sparse Row format>>

## Train a sentiment classifier with logistic regression

In [36]:
from sklearn import linear_model
# Pin the hyper-parameters this notebook was run with (the fitted model's repr
# shows penalty='l2', C=1.0, solver='liblinear') instead of relying on library
# defaults, which differ between scikit-learn releases.
sentiment_model = linear_model.LogisticRegression(penalty='l2', C=1.0, solver='liblinear')

In [37]:
sentiment_model.fit(train_matrix, products_train['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [40]:
sentiment_model.coef_.shape

(1, 121712)

In [41]:
sentiment_model.intercept_, sentiment_model.coef_

(array([ 1.37712037]),
 array([[ -1.23643972e+00,   2.05963522e-04,   2.59016487e-02, ...,
           1.14053962e-02,   3.20394181e-03,  -7.15396558e-05]]))

In [42]:
# How many words carry a non-negative weight, and how many a negative weight
np.count_nonzero(sentiment_model.coef_ >= 0), np.count_nonzero(sentiment_model.coef_ < 0)

(85811, 35901)

In [43]:
# coef_ is 2-D with a single row (shape (1, n_features)); coef_[0] is that row
# as a 1-D array. print(...) with one argument behaves the same on Python 2 and 3.
print(sentiment_model.coef_)
print(sentiment_model.coef_[0])

[[ -1.23643972e+00   2.05963522e-04   2.59016487e-02 ...,   1.14053962e-02
    3.20394181e-03  -7.15396558e-05]]
[ -1.23643972e+00   2.05963522e-04   2.59016487e-02 ...,   1.14053962e-02
   3.20394181e-03  -7.15396558e-05]


In [44]:
# Coefficients sorted in ASCENDING order: the most negative weights come first,
# so head(10) shows the 10 most negative coefficients (not the most positive).
# The row index is each coefficient's position in coef_[0], i.e. its column in
# the word-count matrix.
d = {'Coefficients': sentiment_model.coef_[0]}
sentiment_model_coefs = pd.DataFrame(d).sort_values(by='Coefficients', ascending=True)
sentiment_model_coefs.head(10)

Unnamed: 0,Coefficients
35045,-2.744271
120295,-2.680873
81346,-2.622521
120276,-2.61533
106711,-2.451176
34484,-2.431423
114476,-2.326682
81609,-2.298355
27609,-2.292919
81599,-2.285524


In [45]:
# Invert vectorizer.vocabulary_ (word -> column index) into column index -> word.
# .items() works on both Python 2 and Python 3 (iteritems is Python 2 only).
vocab = {v: k for k, v in vectorizer.vocabulary_.items()}
# Word with the most negative coefficient: the first row of the ascending sort
# (looked up from the frame instead of hard-coding its column index).
print(vocab[sentiment_model_coefs.index[0]])

dissapointed


### Top 10 negative words

In [28]:
# The frame is sorted by ascending coefficient, so its first 10 index values
# are the column indices of the 10 most negative words.
for key in sentiment_model_coefs.index[0:10]:
    print(vocab[key])

dissapointed
worthless
pointless
worst
theory
disappointing
useless
poorly
concept
poor


### Top 10 positive words

In [29]:
# The last 10 index values belong to the 10 largest coefficients; they are
# printed in ascending order, so the most positive word comes last.
# %-formatting yields the same "index word" line on Python 2 and Python 3.
for key in sentiment_model_coefs.index[-10:]:
    print('%s %s' % (key, vocab[key]))

61811 lifesaver
76453 outstanding
89901 rich
78982 perfect
91891 saves
81177 ply
36155 downside
40372 excellent
80934 pleasantly
10112 amazed


## Making predictions with logistic regression

### Let's explore this in the context of 3 data points in the test data.

In [30]:
# Rows at positions 10, 11 and 12 of the test set (positional slice); the same
# positions are sliced from test_matrix when predicting below.
sample_test_data = products_test[10:13]
sample_test_data

Unnamed: 0,name,review,rating,review_clean,sentiment
59,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1
91,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1


In [31]:
sample_test_data.iloc[0]['review'] #sounds positive

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [32]:
sample_test_data.iloc[1]['review'] #sounds negative

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

In [33]:
sample_test_data.iloc[2]['review'] #sounds negative

"Was so excited to get this product for my baby girls bedroom!  When I got it the back is NOT STICKY at all!  Every time I walked into the bedroom I was picking up pieces off of the floor!  Very very frustrating!  Ended up having to super glue it to the wall...very disappointing.  I wouldn't waste the time or money on it."

In [34]:
sentiment_model.predict(test_matrix[10:13,:]) #Dang! It got it right!

array([ 1, -1, -1])

In [35]:
sentiment_model.decision_function(test_matrix[10:13,:]) #outputs scores, greater than 0 is +1, less than 0 is negative

array([  5.60351902,  -3.13388392, -10.40529952])

In [36]:
# One row per review, one column per class. The second column is the
# probability of the +1 (positive) class: it equals the sigmoid of the
# decision_function score, as reproduced by hand in the following cell.
sentiment_model.predict_proba(test_matrix[10:13,:])

array([[  3.67134530e-03,   9.96328655e-01],
       [  9.58268986e-01,   4.17310142e-02],
       [  9.99969729e-01,   3.02707158e-05]])

In [37]:
# Recompute the positive-class probability by hand: sigmoid of the raw score
scores = sentiment_model.decision_function(test_matrix[10:13,:])
probabilities = 1.0 / (1.0 + np.exp(-scores))
probabilities

array([  9.96328655e-01,   4.17310142e-02,   3.02707158e-05])

## Find the most positive (and negative) review

In [48]:
sentiment_model.predict(test_matrix)

array([1, 1, 1, ..., 1, 1, 1])

In [49]:
predict_prob = sentiment_model.predict_proba(test_matrix)[:,1]
predict_score = sentiment_model.decision_function(test_matrix)

In [50]:
# One row per test review, in the same order as the rows of test_matrix; the
# default integer index therefore doubles as the row position in the test set.
# NOTE: the key 'Probabilties' (sic) is the column name exactly as it appears
# in the displayed table.
d = {'Probabilties': predict_prob, 'Scores': predict_score, 'Predictions': sentiment_model.predict(test_matrix)}
# Highest score (most confidently positive) first.
sentiment_model_predictions = pd.DataFrame(d).sort_values(by='Scores', ascending=False)
sentiment_model_predictions.head(20)

Unnamed: 0,Predictions,Probabilties,Scores
18112,1,1.0,53.480208
15732,1,1.0,51.983377
24286,1,1.0,48.224638
25554,1,1.0,46.451638
24899,1,1.0,44.278855
9125,1,1.0,43.153104
21531,1,1.0,41.884756
32782,1,1.0,40.784127
9555,1,1.0,40.509339
30535,1,1.0,40.381127


In [51]:
sentiment_model_predictions.shape, products_test.shape

((33336, 3), (33336, 5))

In [52]:
sentiment_model_predictions.index[0:10]

Int64Index([18112, 15732, 24286, 25554, 24899, 9125, 21531, 32782, 9555,
            30535],
           dtype='int64')

### Top 10 positive reviews

In [53]:
# The index values of sentiment_model_predictions are row positions in the
# test set, so .iloc fetches the 10 reviews with the highest scores.
products_test.iloc[sentiment_model_predictions.index[0:10]]

Unnamed: 0,name,review,rating,review_clean,sentiment
100166,"Infantino Wrap and Tie Baby Carrier, Black Blu...",I bought this carrier when my daughter was abo...,5,I bought this carrier when my daughter was abo...,1
87017,Baby Einstein Around The World Discovery Center,I am so HAPPY I brought this item for my 7 mon...,5,I am so HAPPY I brought this item for my 7 mon...,1
133651,"Britax 2012 B-Agile Stroller, Red",[I got this stroller for my daughter prior to ...,4,I got this stroller for my daughter prior to t...,1
140816,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall (38in) and thin...,5,I bought this seat for my tall 38in and thin 2...,1
137034,Graco Pack 'n Play Element Playard - Flint,My husband and I assembled this Pack n' Play l...,4,My husband and I assembled this Pack n Play la...,1
50315,"P'Kolino Silly Soft Seating in Tias, Green",I've purchased both the P'Kolino Little Reader...,4,Ive purchased both the PKolino Little Reader C...,1
119182,Roan Rocco Classic Pram Stroller 2-in-1 with B...,Great Pram Rocco!!!!!!I bought this pram from ...,5,Great Pram RoccoI bought this pram from Europe...,1
180646,Mamas &amp; Papas 2014 Urbo2 Stroller - Black,After much research I purchased an Urbo2. It's...,4,After much research I purchased an Urbo2 Its e...,1
52631,Evenflo X Sport Plus Convenience Stroller - Ch...,After seeing this in Parent's Magazine and rea...,5,After seeing this in Parents Magazine and read...,1
168081,Buttons Cloth Diaper Cover - One Size - 8 Colo...,"We are big Best Bottoms fans here, but I wante...",4,We are big Best Bottoms fans here but I wanted...,1


### Top 10 Negative reviews

In [56]:
# Scores are sorted in descending order, so the last 20 index values are the
# 20 lowest-scoring (most negative) reviews, with the most negative one last.
products_test.iloc[sentiment_model_predictions.index[-20:]]

Unnamed: 0,name,review,rating,review_clean,sentiment
31741,"Regalo My Cot Portable Bed, Royal Blue",If I could give this product zero stars I woul...,1,If I could give this product zero stars I woul...,-1
83234,"Thirsties Hemp Inserts 2 Pack, Small 6-18 Lbs",My Experience: Babykicks Inserts failure vs RA...,5,My Experience Babykicks Inserts failure vs RAV...,1
1116,Safety 1st Deluxe 4-in-1 Bath Station,This item is junk. I originally chose it beca...,1,This item is junk I originally chose it becau...,-1
154878,VTech Communications Safe &amp; Sound Digital ...,"First, the distance on these are no more than ...",1,First the distance on these are no more than 7...,-1
149987,NUK Cook-n-Blend Baby Food Maker,It thought this would be great. I did a lot of...,1,It thought this would be great I did a lot of ...,-1
40079,Chicco Cortina KeyFit 30 Travel System in Adve...,My wife and I have used this system in two car...,1,My wife and I have used this system in two car...,-1
75994,"Peg-Perego Tatamia High Chair, White Latte",I can see why there are so many good reviews o...,2,I can see why there are so many good reviews o...,-1
172090,Belkin WeMo Wi-Fi Baby Monitor for Apple iPhon...,I read so many reviews saying the Belkin WiFi ...,2,I read so many reviews saying the Belkin WiFi ...,-1
59546,Ellaroo Mei Tai Baby Carrier - Hershey,This is basically an overpriced piece of fabri...,1,This is basically an overpriced piece of fabri...,-1
9915,Cosco Alpha Omega Elite Convertible Car Seat,I bought this car seat after both seeing the ...,1,I bought this car seat after both seeing the ...,-1


Looks like one of the reviews is wrongly classified above!

## Compute Accuracy, Precision, Recall

In [57]:
prediction = sentiment_model.predict(test_matrix)

In [58]:
from sklearn.metrics import accuracy_score
accuracy_score(products_test['sentiment'], prediction)

0.93226541876649871

#### Confusion matrix
*By definition, a confusion matrix $C$ is such that $C_{i,j}$ is equal to the number of observations known to be in group $i$ and predicted to be in group $j$.
Thus, in binary classification, the count of true negatives is $C_{0,0}$, false negatives is $C_{1,0}$, true positives is $C_{1,1}$ and false positives is $C_{0,1}$.*

In [59]:
from sklearn.metrics import confusion_matrix
# Rows are true labels, columns are predicted labels. The displayed counts are
# consistent with the label order (-1, +1): 27290 / (27290 + 1453) and
# 27290 / (27290 + 805) reproduce the precision and recall reported below.
confusion_matrix(products_test['sentiment'], prediction)

array([[ 3788,  1453],
       [  805, 27290]])

In [60]:
import sklearn.metrics
sklearn.metrics.precision_score(products_test['sentiment'], prediction)

0.94944856138885991

In [61]:
sklearn.metrics.recall_score(products_test['sentiment'], prediction)

0.97134721480690511

In [62]:
sentiment_model.C

1.0

## Learn another classifier with fewer words

In [None]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']
significant_words

In [None]:
# Fixed vocabulary: column i of the resulting matrices counts significant_words[i].
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
# NOTE(review): unlike the full vectorizer, the review column is passed here
# without .values.astype('U') and the default token pattern is used — confirm
# this difference is intentional.
train_matrix_word_subset = vectorizer_word_subset.fit_transform(products_train['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(products_test['review_clean'])

## Train a logistic regression model on the subset of words

In [None]:
# Use the same explicit hyper-parameters that the full sentiment model's repr
# reports (penalty='l2', C=1.0, solver='liblinear') so both models stay
# comparable regardless of the installed scikit-learn release's defaults.
simple_model = linear_model.LogisticRegression(penalty='l2', C=1.0, solver='liblinear')

In [None]:
simple_model.fit(train_matrix_word_subset, products_train['sentiment'])

In [None]:
d = {'Coefficients': simple_model.coef_[0]}
simple_model_coefs = pd.DataFrame(d)
simple_model_coefs

In [None]:
# Column index -> word for the 20-word vectorizer. Kept under its own name so
# the full-vocabulary `vocab` mapping built earlier is not overwritten.
vocab_word_subset = {v: k for k, v in vectorizer_word_subset.vocabulary_.items()}
# Row i of simple_model_coefs holds the coefficient of column i, so list the
# words in column order explicitly instead of relying on dict iteration order.
simple_model_coefs['significant_word'] = [vocab_word_subset[i] for i in range(len(vocab_word_subset))]
# Largest (most positive) coefficient first.
simple_model_coefs.sort_values(by='Coefficients', ascending=False)

## Calculate accuracy and other metrics

In [None]:
import sklearn.metrics
prediction = simple_model.predict(test_matrix_word_subset)

In [None]:
sklearn.metrics.accuracy_score(products_test['sentiment'], prediction)

In [None]:
sklearn.metrics.confusion_matrix(products_test['sentiment'], prediction)

In [None]:
sklearn.metrics.precision_score(products_test['sentiment'], prediction)

In [None]:
sklearn.metrics.recall_score(products_test['sentiment'], prediction)

## Baseline: Majority class prediction

In [None]:
# Count the positive (+1) and negative (-1) reviews in the test set.
num_positive  = (products_test['sentiment'] == +1).sum()
num_negative = (products_test['sentiment'] == -1).sum()
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(num_positive)
print(num_negative)

In [None]:
accuracy = num_positive/float(num_positive+num_negative) #since we predict all datapoints as positive since it is majority
accuracy

## Logistic Regression with Cross Validation

In [1]:
from sklearn import linear_model
# Logistic regression whose regularization strength C is picked by cross-validation.
# NOTE(review): Cs, cv and solver are left at library defaults (the fitted
# repr shows Cs=10, cv=None, solver='lbfgs'); these defaults may differ in
# other scikit-learn versions — confirm before comparing results.
sentiment_model_cv = linear_model.LogisticRegressionCV()

In [21]:
sentiment_model_cv.fit(train_matrix, products_train['sentiment'])

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [22]:
sentiment_model_cv.C_ #this is the chosen C

array([ 0.35938137])

In [23]:
sentiment_model_cv.Cs_

array([  1.00000000e-04,   7.74263683e-04,   5.99484250e-03,
         4.64158883e-02,   3.59381366e-01,   2.78255940e+00,
         2.15443469e+01,   1.66810054e+02,   1.29154967e+03,
         1.00000000e+04])

In [26]:
sentiment_model_cv.coef_.shape

(1, 121712)

In [31]:
prediction = sentiment_model_cv.predict(test_matrix)

In [29]:
import sklearn.metrics

In [32]:
sklearn.metrics.accuracy_score(products_test['sentiment'], prediction)

0.93316534677225826

In [34]:
sklearn.metrics.precision_score(products_test['sentiment'], prediction)

0.94828602128175798

In [35]:
sklearn.metrics.recall_score(products_test['sentiment'], prediction)

0.97380316782345611

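Both models use L2 regularization; the cross-validated model simply selects a stronger penalty (C ≈ 0.36 instead of the default C = 1.0). Compared with the default model, accuracy went up slightly (0.9323 → 0.9332), precision went down slightly (0.9494 → 0.9483), and recall went up by a tiny bit (0.9713 → 0.9738).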