In [None]:
import sframe
products = sframe.SFrame('amazon_baby.gl/')

In [3]:
# performing text cleaning for punctuation characters
def remove_punctuation(text):
    """Return `text` with every ASCII punctuation character removed.

    Works on Python 2 (`str` and `unicode`) as well as Python 3 (`str`).
    The previous `text.translate(None, string.punctuation)` form only works
    for Python 2 byte strings; it raises TypeError for `unicode` values and
    on Python 3.

    Parameters
    ----------
    text : str or None
        Raw review text. None (a missing review) is treated as empty text.

    Returns
    -------
    str
        `text` without any character from `string.punctuation`.
    """
    import re
    import string
    if text is None:
        return ''
    # Character class built from string.punctuation; re.escape keeps
    # characters such as ']', '\\', '^' and '-' literal inside the brackets.
    return re.sub('[' + re.escape(string.punctuation) + ']', '', text)

products['review_clean'] = products['review'].apply(remove_punctuation)

In [7]:
products[0:2]

name,review,rating,review_clean
Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3.0,These flannel wipes are OK but in my opinion not ...
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0,it came early and was not disappointed i love ...


In [11]:
# Fill in N/A's in the review column.
products = products.fillna('review', '')
# 'review_clean' was derived from 'review' in an earlier cell, and SArray.apply
# leaves missing values as None by default (skip_undefined=True), so the cleaned
# column can still contain None. Fill it as well so that CountVectorizer never
# receives a None document.
if 'review_clean' in products.column_names():
    products = products.fillna('review_clean', '')

In [12]:
# ignore all neutral reviews, i.e. those with a rating of 3
products = products[products['rating'] != 3]

In [27]:
# making reviews categorical with labels +1, -1
# (rows with a rating of exactly 3 were filtered out in the previous cell,
#  so the else branch is only reached by ratings below 3)
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

products[products['rating'] < 3][0:1]

name,review,rating,review_clean,sentiment
Nature's Lullabies Second Year Sticker Calendar ...,I only purchased a second-year calendar for ...,2.0,I only purchased a secondyear calendar for ...,-1


In [28]:
products[products['rating'] > 3][0:1]

name,review,rating,review_clean,sentiment
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0,it came early and was not disappointed i love ...,1


In [29]:
# splitting into training and testing data
train_data, test_data = products.random_split(.8, seed=1)

In [30]:
# build a word count vector for each review
from sklearn.feature_extraction.text import CountVectorizer

# Use this token pattern to keep single-letter words
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

# First, learn vocabulary from the training data and assign columns to words
train_matrix = vectorizer.fit_transform(train_data['review_clean'])

# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

In [34]:
train_matrix.shape

(133416, 121712)

In [35]:
# train a sentiment classifier with logistic regression
from sklearn.linear_model import LogisticRegression

sentiment_model = LogisticRegression()

sentiment_model.fit(train_matrix, train_data['sentiment'])


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [44]:
# number of weights >= 0 (non-negative coefficients; zero-valued weights are counted too)
sum(x>=0 for x in sentiment_model.coef_[0])

85810

In [45]:
# making predictions with the learned model

In [49]:
# Take three reviews from the test set to inspect individual predictions
sample_test_data = test_data[10:13]
# Parenthesised single-argument print behaves the same on Python 2 and Python 3
print(sample_test_data[0]['review_clean'])

Absolutely love it and all of the Scripture in it  I purchased the Baby Boy version for my grandson when he was born and my daughterinlaw was thrilled to receive the same book again


In [50]:
# Second sample review (print call form is valid on both Python 2 and Python 3)
print(sample_test_data[1]['review_clean'])

Would not purchase again or recommend The decals were thick almost plastic like and were coming off the wall as I was applying them The would NOT stick Literally stayed stuck for about 5 minutes then started peeling off


In [51]:
# Third sample review (print call form is valid on both Python 2 and Python 3)
print(sample_test_data[2]['review_clean'])

Was so excited to get this product for my baby girls bedroom  When I got it the back is NOT STICKY at all  Every time I walked into the bedroom I was picking up pieces off of the floor  Very very frustrating  Ended up having to super glue it to the wallvery disappointing  I wouldnt waste the time or money on it


In [52]:
# Vectorize the three sample reviews with the vocabulary learned on the training
# data, then ask the model for its raw decision scores (one score per review)
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
# Parenthesised single-argument print behaves the same on Python 2 and Python 3
print(scores)

[  5.60283428  -3.14916172 -10.41701265]


In [54]:
# predicting on a negative review
sentiment_model.predict(sample_test_matrix[1])

array([-1], dtype=int64)

In [60]:
import numpy as np
def calculate_prob(scores):
    """Map raw decision scores to probabilities with the sigmoid 1 / (1 + exp(-score)).

    Evaluated in a numerically stable way: only exp(-|score|) is ever
    computed, so very negative scores no longer overflow `np.exp` (the
    naive form emits an overflow RuntimeWarning for scores below about -709).

    Parameters
    ----------
    scores : float or array-like of float
        Decision-function output(s); plain lists and tuples are accepted too.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Sigmoid of each score, in [0, 1], with the same shape as `scores`.
    """
    scores = np.asarray(scores, dtype=float)
    # exp(-|s|) lies in (0, 1]: it may underflow to 0 but can never overflow.
    decay = np.exp(-np.abs(scores))
    # s >= 0: 1 / (1 + e^-s);  s < 0: e^s / (1 + e^s) -- algebraically identical.
    probs = np.where(scores >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves arrays as arrays.
    return probs[()]

In [61]:
# Recompute the raw decision scores for the sample reviews ...
scores = np.array(sentiment_model.decision_function(sample_test_matrix))
# ... and pass them through calculate_prob.
# NOTE: `scores` is rebound here -- from this point on it holds the
# sigmoid-transformed values (probabilities), not the raw decision scores.
scores = calculate_prob(scores)
scores

array([  9.96326149e-01,   4.11243216e-02,   2.99182300e-05])

In [62]:
# 3rd one is the most negative review