In [1]:
import pandas as pd
from matplotlib.pyplot import plot as plt
import numpy as np

# Read some product review data

In [2]:
products=pd.read_csv('data_set/amazon_baby.csv')
products.columns

Index(['name', 'review', 'rating'], dtype='object')

# Building a sentiment classifier

In [3]:
print(products.groupby('rating').size())

rating
1     15183
2     11310
3     16779
4     33205
5    107054
dtype: int64


## Define what's a positive and a negative sentiment

In [4]:
#ignore 3* rating
products=products[products['rating']!=3]

In [5]:
#add a sentiment column with 4* and 5* rating
products['sentiment'] = products['rating']>=4 

In [6]:
products['sentiment']=products['sentiment'].replace(to_replace=True,value=int(1))
products

Unnamed: 0,name,review,rating,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,1.0
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,1.0
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,1.0
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1.0
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,1.0
6,A Tale of Baby\'s Days with Peter Rabbit,"Lovely book, it\'s bound tightly so you may no...",4,1.0
7,"Baby Tracker&reg; - Daily Childcare Journal, S...",Perfect for new parents. We were able to keep ...,5,1.0
8,"Baby Tracker&reg; - Daily Childcare Journal, S...",A friend of mine pinned this product on Pinter...,5,1.0
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,1.0
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,1.0


# Building the word count vector for each review

## Tokenizing text with `scikit-learn`

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect=CountVectorizer()
X_products_wc=count_vect.fit_transform(products['review'].values.astype(str))

In [8]:
products.columns    # use of df[].values.astype('U') as countvectorizer accepts only str or file or unicode data

Index(['name', 'review', 'rating', 'sentiment'], dtype='object')

In [9]:
x_counts=X_products_wc
x_count=np.sum(x_counts,axis=0)
x_count.item((0,6500)),count_vect.get_feature_names()[6500]

(3892, 'awesome')

# Exploring the most commom product

In [10]:
products_common=products.groupby('name').review.size().sort_values(ascending = False)

In [11]:
products_common

name
Vulli Sophie the Giraffe Teether                                               723
Simple Wishes Hands-Free Breastpump Bra, Pink, XS-L                            547
Baby Einstein Take Along Tunes                                                 526
Infant Optics DXR-5 2.4 GHz Digital Video Baby Monitor with Night Vision       519
Cloud b Twilight Constellation Night Light, Turtle                             490
Fisher-Price Booster Seat, Blue/Green/Gray                                     473
Fisher-Price Rainforest Jumperoo                                               437
Graco Nautilus 3-in-1 Car Seat, Matrix                                         386
Leachco Snoogle Total Body Pillow                                              366
Regalo Easy Step Walk Thru Gate, White                                         333
Baby Trend Diaper Champ                                                        298
Skip Hop Zoo Pack Little Kid Backpack, Dog                                     276

In [12]:
giraffe_review=products[products['name']=='Vulli Sophie the Giraffe Teether']
giraffe_review_rate=giraffe_review.groupby('rating').size().sort_index(ascending=False)
giraffe_review_rate,giraffe_review

(rating
 5    535
 4     95
 2     37
 1     56
 dtype: int64,                                     name  \
 34313   Vulli Sophie the Giraffe Teether   
 34314   Vulli Sophie the Giraffe Teether   
 34315   Vulli Sophie the Giraffe Teether   
 34316   Vulli Sophie the Giraffe Teether   
 34317   Vulli Sophie the Giraffe Teether   
 34318   Vulli Sophie the Giraffe Teether   
 34319   Vulli Sophie the Giraffe Teether   
 34320   Vulli Sophie the Giraffe Teether   
 34321   Vulli Sophie the Giraffe Teether   
 34323   Vulli Sophie the Giraffe Teether   
 34324   Vulli Sophie the Giraffe Teether   
 34325   Vulli Sophie the Giraffe Teether   
 34326   Vulli Sophie the Giraffe Teether   
 34327   Vulli Sophie the Giraffe Teether   
 34328   Vulli Sophie the Giraffe Teether   
 34329   Vulli Sophie the Giraffe Teether   
 34330   Vulli Sophie the Giraffe Teether   
 34332   Vulli Sophie the Giraffe Teether   
 34333   Vulli Sophie the Giraffe Teether   
 34334   Vulli Sophie the Giraffe Teet

# Training a classifier

In [13]:
Y_products = products['sentiment']

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X_products_wc,Y_products,test_size=0.2,train_size=0.8,random_state=42)

In [14]:
from sklearn.linear_model import LogisticRegression
sentiment_model=LogisticRegression()
sentiment_model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Evaluating the model

In [15]:
y_pred=sentiment_model.predict(x_test)
confidence_val=sentiment_model.predict_proba(x_test)
from sklearn.metrics import confusion_matrix,accuracy_score
tn,fp,fn,tp=confusion_matrix(y_test,y_pred).ravel()
accuracy=accuracy_score(y_test,y_pred)
tn,fp,fn,tp,accuracy

(3875, 1399, 843, 27234, 0.9327756289166742)

# Applying the learned model to understand sentiment for the Giraffe

In [16]:
giraffe_review_data=count_vect.transform(giraffe_review['review'].values.astype(str))
giraffe_pred=sentiment_model.predict(giraffe_review_data)
giraffe_review['predict_sentiment']=giraffe_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
giraffe_review.tail(5).sort_values(by='predict_sentiment',ascending=False)

Unnamed: 0,name,review,rating,sentiment,predict_sentiment
159649,Vulli Sophie the Giraffe Teether,My baby loves her Sophie Chew Toy. She can che...,5,1.0,1.0
159650,Vulli Sophie the Giraffe Teether,Sophie the Giraffe was a big hit at the baby s...,5,1.0,1.0
159651,Vulli Sophie the Giraffe Teether,quick shipping and perfect product. I would pu...,5,1.0,1.0
159652,Vulli Sophie the Giraffe Teether,My baby who is currently teething love his Sop...,5,1.0,1.0
159653,Vulli Sophie the Giraffe Teether,I know several baits that are enjoying the sof...,5,1.0,1.0


# Assignment

In [18]:
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']

In [19]:
for word in selected_words:
    print(word,count_vect.vocabulary_.get(word))

awesome 6500
great 25994
fantastic 22274
amazing 4672
love 33879
horrible 28143
bad 7067
terrible 56272
awful 6509
wow 63486
hate 27054


# Building feature word vector based on selected words

In [35]:
assign_vect = CountVectorizer(token_pattern=r'awesome|great|fantastic|amazing|love|horrible|bad|terrible|awful|wow|hate')
x_assign_data=assign_vect.fit_transform(products['review'].values.astype(str))
x_assign_data.shape

(166752, 11)

## Counting the entries for each word

In [36]:
assign_vect.get_feature_names()

['amazing',
 'awesome',
 'awful',
 'bad',
 'fantastic',
 'great',
 'hate',
 'horrible',
 'love',
 'terrible',
 'wow']

In [39]:
x_data_array=x_assign_data.toarray()

In [126]:
x_data_array.sum(axis=0)

array([ 2890,  3916,   726,  4523,  1716, 56522,  3644,  1112, 73428,
        1147,   435], dtype=int64)

# New sentiment model based on selected_words

In [47]:
x_train_a,x_test_a,y_train_a,y_test_a=train_test_split(x_assign_data,Y_products,train_size=0.8,test_size=0.2,random_state=42)

In [48]:
selected_model = LogisticRegression()
selected_model.fit(x_train_a,y_train_a)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Coefficients/weights for each feature

In [52]:
selected_model.coef_,assign_vect.get_feature_names()

(array([[ 1.06988849,  1.1077068 , -1.96539839, -1.0116976 ,  0.95896157,
          0.85713743, -0.77760037, -2.073764  ,  1.23847433, -2.22990885,
         -0.06498879]]),
 ['amazing',
  'awesome',
  'awful',
  'bad',
  'fantastic',
  'great',
  'hate',
  'horrible',
  'love',
  'terrible',
  'wow'])

 # Evaluate the accuracy of the model

In [53]:
y_assign_pred=selected_model.predict(x_test_a)

In [54]:
accuracy_assign=accuracy_score(y_test_a,y_assign_pred)
accuracy_assign

0.8490599982009535

## Interpreting the difference between the models

In [57]:
diaper_champ_reviews=products[products['name']=='Baby Trend Diaper Champ'].review.values

In [58]:
diaper_champ_reviews

array(["Ok - newsflash.  Diapers are just smelly.  We\\'ve had this pail for 2.5 years now.  It was our first and primary one.  There were no major smell problems until after one year, when our son started eating solids.  Also, we change the bag twice weekly as 3 days is about the max for smell-containment.  Around 20-22 months we started shopping for a container that would be less smelly and didn\\'t find one as good.  (We have a cheaper one upstairs which broke immediately and always stunk!)  We finally just put the Diaper Champ in the attic a few months ago and use the cheap one with the flip-up lid - mainly since the cheapo fits inside the cabinet and we didn\\'t notice a big difference in smell-control.  (The most helpful action is to tie the dirty diapers inside a small plastic bag before putting them in the pail.)A couple of our friends have this pail and were pleased until the children started eating solid food and things got stinkier - but that\\'s pretty much the consensus ac

In [60]:
baby_products.head(4)

Unnamed: 0,name,review,rating,sentiment
312,Baby Trend Diaper Champ,Ok - newsflash. Diapers are just smelly. We\...,4,1.0
314,Baby Trend Diaper Champ,"My husband and I selected the Diaper ""Champ"" m...",1,0.0
315,Baby Trend Diaper Champ,Excellent diaper disposal unit. I used it in ...,5,1.0
316,Baby Trend Diaper Champ,We love our diaper champ. It is very easy to ...,5,1.0


## Predicting the sentiment for the reviews on this data

In [100]:
diaper_test_select=assign_vect.transform(diaper_champ_reviews)
diaper_test_sent=count_vect.transform(diaper_champ_reviews)

In [101]:
y_predict_diaper_sent=sentiment_model.predict(diaper_test_sent)
y_predict_diaper_select=selected_model.predict(diaper_test_select)

In [103]:
y_predict_diaper_sent=sentiment_model.predict_proba(diaper_test_sent)

In [106]:
y_predict_diaper_sent

array([[1.19172234e-01, 8.80827766e-01],
       [9.25214021e-01, 7.47859786e-02],
       [3.02569378e-04, 9.99697431e-01],
       [3.51639822e-03, 9.96483602e-01],
       [2.54171098e-02, 9.74582890e-01],
       [1.09166985e-03, 9.98908330e-01],
       [9.14299070e-01, 8.57009299e-02],
       [8.20925328e-10, 9.99999999e-01],
       [2.31430259e-02, 9.76856974e-01],
       [7.72832974e-04, 9.99227167e-01],
       [9.59076228e-01, 4.09237721e-02],
       [7.32199770e-02, 9.26780023e-01],
       [8.34670190e-01, 1.65329810e-01],
       [7.74101814e-07, 9.99999226e-01],
       [3.88891227e-05, 9.99961111e-01],
       [6.14710511e-06, 9.99993853e-01],
       [1.06514359e-01, 8.93485641e-01],
       [8.20595265e-01, 1.79404735e-01],
       [1.83521128e-01, 8.16478872e-01],
       [6.89162767e-05, 9.99931084e-01],
       [1.69922232e-01, 8.30077768e-01],
       [4.88640914e-03, 9.95113591e-01],
       [1.52967360e-06, 9.99998470e-01],
       [4.74589498e-01, 5.25410502e-01],
       [1.530306

In [107]:
type(y_predict_diaper_sent)

numpy.ndarray

In [108]:
np.argmax(y_predict_diaper_sent[:,1])

58

In [109]:
y_predict_diaper_sent[58]

array([8.43987102e-11, 1.00000000e+00])

In [110]:
diaper_champ_reviews[58]

"This is absolutely, by far, the best diaper pail money can buy.  Never do we detect a diaper odor (and my husband has a very sensitive sense of smell and is usually very quick to complain about such things).  For those who say they have a problem with the Diaper Champ getting stuck...the ONLY time this ever happens to us is when the bag is full and needs to be changed.  We love that it uses regular kitchen trash bags, makes it much more economical.  We have not found that we need to worry about frequent emptying or cleaning.  We just leave the Champ to do its job until the mechanism begins to feel like it\\'s getting stuck...then we change the bag.  For us this means about once a week.  Not only is the Champ EASY to use, it\\'s kind of fun.  Before our daughter was born we really worried about whether the diaper pail we chose would be effective enough for us because my husband is so sensitive to smells.  But she\\'s two months old now and we still just can\\'t say enough good things a

In [112]:
y_predict_diaper_select=selected_model.predict_proba(diaper_test_select)

In [113]:
y_predict_diaper_select[58]

array([0.02464053, 0.97535947])

## Majority class classifier

In [115]:
products.shape

(166752, 4)

In [121]:
products.groupby('sentiment').size()

sentiment
0.0     26493
1.0    140259
dtype: int64

In [124]:
accuracy_majority=140259/166752
accuracy_majority

0.8411233448474381