In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
products = pd.read_csv('amazon_baby.csv')

Preprocess of table

In [3]:
products = products.fillna({'review':''})  # fill in N/A's in the review column

remove punctuations from reviews

In [4]:
def remove_punctuation(text):
    # define punctuation
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~+='''

    # remove punctuation from the string
    no_punct = ""
    for char in text:
        if char not in punctuations:
            no_punct = no_punct + char
        else:
            no_punct = no_punct + " "
            
    return no_punct
 

In [5]:
index=0
review_without_puctuation = pd.Series([])
for i in products['review']:
    review_without_puctuation[index] = remove_punctuation(i)
    index = index + 1

products.insert(products.shape[1], "review_clean", review_without_puctuation) 

In [6]:
products = products[products['rating'] != 3]
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [7]:
products

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I ...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried non stop when I trie...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we did...,1
6,A Tale of Baby's Days with Peter Rabbit,"Lovely book, it's bound tightly so you may not...",4,Lovely book it s bound tightly so you may not...,1
7,"Baby Tracker&reg; - Daily Childcare Journal, S...",Perfect for new parents. We were able to keep ...,5,Perfect for new parents We were able to keep ...,1
8,"Baby Tracker&reg; - Daily Childcare Journal, S...",A friend of mine pinned this product on Pinter...,5,A friend of mine pinned this product on Pinter...,1
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1


In [10]:
#products.to_csv("products.csv", index = False, header = True)      #save to file to use later

In [2]:
#products = pd.read_csv('products.csv')      #load the saved db

In [89]:
with open('module-2-assignment-test-idx.json') as test_data_file:    
    test_data_idx = json.load(test_data_file)
    
with open('module-2-assignment-train-idx.json') as train_data_file:    
    train_data_idx = json.load(train_data_file)
    
train_data = products.iloc[train_data_idx]
test_data = products.iloc[test_data_idx]

In [64]:
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_data['review_clean'].values.astype('U'))
test_matrix = vectorizer.transform(test_data['review_clean'].values.astype('U'))

In [66]:
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data['sentiment'].values.astype('U'))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [67]:
print ("So the number of positive coefficients are: ",np.sum(sentiment_model.coef_ >= 0))

So the number of positive coefficients are:  39458


In [68]:
sample_test_data = test_data[10:13]
print (sample_test_data)

                                                 name  \
53                          Our Baby Girl Memory Book   
64  Wall Decor Removable Decal Sticker - Colorful ...   
82  New Style Trailing Cherry Blossom Tree Decal R...   

                                               review  rating  \
53  Absolutely love it and all of the Scripture in...       5   
64  Would not purchase again or recommend. The dec...       2   
82  Was so excited to get this product for my baby...       1   

                                         review_clean  sentiment  
53  Absolutely love it and all of the Scripture in...          1  
64  Would not purchase again or recommend  The dec...         -1  
82  Was so excited to get this product for my baby...         -1  


In [69]:
print ("1. ",sample_test_data.iloc[0]['review'],"\n")
print ("2. ",sample_test_data.iloc[1]['review'],"\n")
print ("3. ",sample_test_data.iloc[2]['review'],"\n")

1.  Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again. 

2.  Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off. 

3.  Was so excited to get this product for my baby girls bedroom!  When I got it the back is NOT STICKY at all!  Every time I walked into the bedroom I was picking up pieces off of the floor!  Very very frustrating!  Ended up having to super glue it to the wall...very disappointing.  I wouldn't waste the time or money on it. 



In [70]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)

print ("The scores for 1st, 2nd and 3rd reviews are as follow: ", scores)
print ("So they will classify as: ", sentiment_model.predict(sample_test_matrix))

The scores for 1st, 2nd and 3rd reviews are as follow:  [  4.97330872  -3.03750274 -10.70228509]
So they will classify as:  ['1' '-1' '-1']


In [71]:
probabilities = [1./(1+np.exp(-x)) for x in scores]
print ("probabilities :", probabilities)

probabilities : [0.9931273471331543, 0.045760092589294234, 2.2492973646374006e-05]


Of the three data points in sample_test_data, the third point has the lowest probability of being classified as a positive review.

In [72]:
test_scores = sentiment_model.decision_function(test_matrix)
positive_idx = np.argsort(-test_scores)[:20]

print ("20 most positive reviews ids: \n", positive_idx)
print ("\nThe most positive reviews id is :{} that its score is about {}".format(positive_idx[0], test_scores[positive_idx[0]]))

test_data.iloc[positive_idx]["review"]

20 most positive reviews ids: 
 [15732 25554 24286 18112  9555 17558 21531 30634 24899 14482  9125 26830
 33060  4140 16502 11923 32782 30076 27048 21203]

The most positive reviews id is :15732 that its score is about 50.11438011201274


79040     I am so HAPPY I brought this item for my 7 mon...
127998    I bought this seat for my tall (38in) and thin...
121469    [I got this stroller for my daughter prior to ...
90999     I bought this carrier when my daughter was abo...
47873     After seeing this in Parent's Magazine and rea...
88410     I absolutely love this product.  I work as a C...
108269    Great Pram Rocco!!!!!!I bought this pram from ...
153287    Graco's FastAction Jogging Stroller definitely...
124541    My husband and I assembled this Pack n' Play l...
72823     I just tried this hands free breastpump bra, a...
45751     I've purchased both the P'Kolino Little Reader...
134472    Amazing, Love, Love, Love it !!! All 5 STARS a...
165422    I love this baby monitor.  I can compare this ...
20612     I researched a few different seats to put in o...
83090     I recently bought this to replace a Chicco Cor...
60039     It's always fun to write a review on those pro...
164117    After much research I purchase

In [73]:
test_scores = sentiment_model.decision_function(test_matrix)
positive_idx = np.argsort(test_scores)[:20]

print ("20 most negative reviews ids: \n", positive_idx)
print ("\nThe most negative reviews id is :{} that its score is about {}".format(positive_idx[0], test_scores[positive_idx[0]]))

test_data.iloc[positive_idx]["review"]

20 most negative reviews ids: 
 [21700 28184  2931 17069  8818 13939  1810  9655  1942 20594 13751 14711
 15062 31928 30373 27231   205 17222 10814 11986]

The most negative reviews id is :21700 that its score is about -34.34005704515967


109216    This is the first review I have ever written o...
141135    This is my second video monitoring system, the...
14637     We have not had ANY luck with Fisher-Price pro...
85925     Note: we never installed batteries in these un...
44284     I will try to write an objective review of the...
70004     I thought it sounded great to have different t...
9072      I bought this car seat after both seeing  the ...
48396     We bought this baby monitor to replace a diffe...
9768      It's 3am in the morning and needless to say, t...
103564    DO NOT BUY THIS BABY MONITOR!I purchased this ...
69021     I can see why there are so many good reviews o...
73893     I bought this sprayer out of desperation durin...
75624     My Experience: Babykicks Inserts failure vs RA...
159929    I really wanted to love this seat; however, I ...
151972    Reviewers. You failed me!This thing worked for...
136305    It thought this would be great. I did a lot of...
1009      This item is junk.  I original

In [79]:
predicted_y = sentiment_model.predict(test_matrix)

index = 0 
correct_num = 0 
for i in test_data['sentiment']:
    if(int(predicted_y[index]) == i):
        correct_num = correct_num + 1
        
total_num = len(test_data['sentiment'])
print ("correct pedictions: {}, total number of sentiments: {}".format(correct_num, total_num))

accuracy = correct_num * 1./ total_num
print ("\nAccuracy of the sentiment_model on the test_data is : {0:.2f}".format(accuracy))

correct pedictions: 28095, total number of sentiments: 33336

Accuracy of the sentiment_model on the test_data is : 0.84


In [80]:
significant_words = ['love', 'great'  , 'easy' , 'old'    , 'little', 'perfect', 'loves', 
                     'well', 'able'   , 'car'  , 'broke'  , 'less'  , 'even'   , 'waste',
                     'disappointed'   , 'work' , 'product', 'money' , 'would'  , 'return']

In [81]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'].values.astype('U'))
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'].values.astype('U'))

In [82]:
simple_model = LogisticRegression()
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [83]:
simple_model_coef_table = pd.DataFrame({'word':significant_words,
                                         'coefficient':simple_model.coef_.flatten()})

simple_model_coef_table.sort_values(['coefficient'], ascending=False)

Unnamed: 0,coefficient,word
6,1.67763,loves
5,1.506878,perfect
0,1.356421,love
2,1.183238,easy
1,0.943508,great
7,0.530188,well
4,0.514632,little
8,0.193079,able
3,0.082776,old
9,0.056467,car


In [86]:
print ("Coefficients which are positive for the simple_model are",len(simple_model_coef_table[simple_model_coef_table['coefficient']>0]))

Coefficients which are positive for the simple_model are 10


In [100]:
train_predicted_y = sentiment_model.predict(train_matrix)

index = 0 
correct_num = 0
for i in train_data['sentiment']:
    if(int(train_predicted_y[index]) == i):
        correct_num += 1
    index += 1
      
total_num = len(train_data['sentiment'])
print ("correct_num: {}, total_num: {}".format(correct_num, total_num))
train_accuracy = correct_num * 1./ total_num
print ("sentiment_model training accuracy: {}".format(train_accuracy))

train_predicted_y = simple_model.predict(train_matrix_word_subset)

correct_num = np.sum(train_predicted_y == train_data['sentiment'])
total_num = len(train_data['sentiment'])
print ("correct_num: {}, total_num: {}".format(correct_num, total_num))
train_accuracy = correct_num * 1./ total_num
print ("simple_model training accuracy: {}".format(train_accuracy))

correct_num: 128511, total_num: 133416
sentiment_model training accuracy: 0.9632352941176471
correct_num: 115701, total_num: 133416
simple_model training accuracy: 0.8672198237093003


So sentiment_model has higher accuracy on the TRAINING set

In [99]:
test_predicted_y = sentiment_model.predict(test_matrix)

index = 0 
correct_num = 0
for i in test_data['sentiment']:
    if(int(test_predicted_y[index]) == i):
        correct_num += 1
    index += 1
        
total_num = len(test_data['sentiment'])
print ("correct_num: {}, total_num: {}".format(correct_num, total_num))
test_accuracy = correct_num * 1./ total_num
print ("sentiment_model test accuracy: {}".format(test_accuracy))

test_predicted_y = simple_model.predict(test_matrix_word_subset)
correct_num = np.sum(test_predicted_y == test_data['sentiment'])
total_num = len(test_data['sentiment'])
print ("correct_num: {}, total_num: {}".format(correct_num, total_num))
test_accuracy = correct_num * 1./ total_num
print ("simple_model test accuracy: {}".format(test_accuracy))

correct_num: 31084, total_num: 33336
sentiment_model test accuracy: 0.9324454043676506
correct_num: 29009, total_num: 33336
simple_model test accuracy: 0.8702003839692825


And also sentiment_model has higher accuracy on the TEST set

In [114]:
baseline_accuracy = len(test_data[test_data['sentiment']>0]) / (len(test_data[test_data['sentiment']>0]) + len(test_data[test_data['sentiment']<0]))
print ("The accuracy of the majority class classifier model on the test_data is {0:.2f}".format(baseline_accuracy))

The accuracy of the majority class classifier model on the test_data is 0.84


The sentiment_model definitely is better than the majority class classifier (the baseline) due to its higher accuracy