# Analyzing product sentiment

In [1]:
import pandas as pd
import sklearn
import numpy as np

In [3]:
# Sentiment-bearing words whose per-review counts are used as features below.
selected_words = [
    'awesome', 'great', 'fantastic', 'amazing', 'love',
    'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate',
]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(selected_words)

['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']


In [97]:
# Load the raw reviews and keep them under their own name so the filtered
# frame below can always be rebuilt from it.
products_raw = pd.read_csv('amazon_baby.csv')
# 3-star reviews are treated as neutral and dropped. The explicit .copy() makes
# `products` an independent frame, so adding a column below cannot trigger
# pandas' SettingWithCopyWarning.
products = products_raw[products_raw['rating'] != 3].copy()
# positive sentiment = 4* or 5* reviews (1), everything else that remains = 0
products['sentiment'] = (products['rating'] >= 4).astype(int)
print(products.shape)
products.head(2)

(166752, 4)


Unnamed: 0,name,review,rating,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,1


In [98]:
# Full text of the first remaining review ('review' is the column at position 1).
products['review'].iloc[0]

'it came early and was not disappointed. i love planet wise bags and now my wipe holder. it keps my osocozy wipes moist and does not leak. highly recommend it.'

In [99]:
from sklearn.feature_extraction.text import CountVectorizer

# Count only the words in selected_words; passing them as a fixed vocabulary
# keeps the matrix columns in the same order as that list.
# NOTE(review): lowercase=False makes the matching case-sensitive, so "Great"
# or "LOVE" are not counted -- confirm this is intended before trusting the totals.
vectorizer = CountVectorizer(vocabulary=selected_words, lowercase=False)
# astype('U') coerces every entry to text (a missing review becomes 'nan').
review_texts = products['review'].values.astype('U')
selected_word_count = vectorizer.fit_transform(review_texts)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it, and the old method otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
feature_names

['awesome',
 'great',
 'fantastic',
 'amazing',
 'love',
 'horrible',
 'bad',
 'terrible',
 'awful',
 'wow',
 'hate']

In [105]:
# Densify the sparse count matrix: one row per review, one column per selected word.
word_count_array = selected_word_count.toarray()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(word_count_array)
print(word_count_array.shape)

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 2 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
(166752, 11)


### 1. Out of the 11 words in selected_words, which one is most used in the reviews in the dataset? *great*
### 2. Out of the 11 words in selected_words, which one is least used in the reviews in the dataset? *wow*

In [101]:
# Total occurrences of each selected word across all reviews
# (column order follows selected_words).
np.sum(word_count_array, axis=0)

array([ 3189, 46903,  1468,  2207, 33208,   956,  4010,   998,   589,
         109,   984])

### Running regression model

#### Split data into train and test set

In [108]:
# Target vector as a plain NumPy array. Dropping the 3-star rows left gaps in
# the index labels of `products`, while word_count_array is purely positional;
# stripping the labels keeps features and targets aligned by position only.
output = products['sentiment'].values
output

[1 1 1 ..., 1 1 1]


In [112]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the older sklearn.cross_validation module was removed in 0.20. Try the
# current location first and fall back for legacy installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the reviews for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    word_count_array, output, test_size=0.2, random_state=5)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(X_train[0, :])
print(X_test[0, :])
print(y_train[0:5])

[0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0]
[1 1 1 1 1]


#### Run logistic regression

In [119]:
from sklearn import linear_model

# Logistic regression with the library's default settings, trained on the
# word-count features of the training split.
logistic = linear_model.LogisticRegression()
logistic.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [122]:
# The coefficients (one weight per entry of selected_words, in the same order)
# and the intercept. The text is formatted into a single string first: under
# Python 2, print('label', value) would print a tuple repr instead.
print('Coefficients: \n%s' % (logistic.coef_,))
print(logistic.intercept_)

('Coefficients: \n', array([[ 0.99844491,  0.7671545 ,  0.7877985 ,  0.95887133,  1.30626623,
        -2.18334046, -0.92386687, -1.97783683, -1.87907151, -0.63992155,
        -1.31852361]]))
[ 1.41822074]
0


### 3. Out of the 11 words in selected_words, which one got the most positive weight in the selected_words_model? *love*
### 4. Out of the 11 words in selected_words, which one got the most negative weight in the selected_words_model? *horrible*

### Apply the model on test data and get accuracy

In [128]:
# Predicted sentiment label (0 or 1) for every review in the test split.
prediction = logistic.predict(X_test)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(prediction)

[1 1 1 ..., 1 1 1]


In [129]:
from sklearn.metrics import accuracy_score

# Fraction of test reviews whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=prediction)

0.84567179394920688

#### Confusion matrix
*By definition a confusion matrix C is such that C_{i, j} is equal to the number of observations known to be in group i but predicted to be in group j.
Thus in binary classification, the count of true negatives is C_{0,0}, false negatives is C_{1,0}, true positives is C_{1,1} and false positives is C_{0,1}.*

In [130]:
# Rows are the true class, columns the predicted class (0 = negative, 1 = positive).
sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=prediction)

array([[  299,  5045],
       [  102, 27905]])

True negatives = 299
False negatives = 102
False positives = 5045
True positives = 27905

#### Precision and Recall

In [133]:
# Precision: of the reviews predicted positive, the share that really are positive.
sklearn.metrics.precision_score(y_true=y_test, y_pred=prediction)

0.84688922610015172

In [134]:
# Recall: of the truly positive reviews, the share the model predicted as positive.
sklearn.metrics.recall_score(y_true=y_test, y_pred=prediction)

0.99635805334380689