In [1]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

amazon_df = pd.read_csv('amazon_baby.csv')
amazon_df.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [2]:
def cleanNaN(value):
    if pd.isnull(value):
        return ""
    else:
        return value

amazon_df['review'] = amazon_df['review'].apply(cleanNaN)
amazon_df['review'][30:50]

30    Beautiful little book.  A great little short s...
31    This book is so worth the money. It says 9+ mo...
32    we just got this book for our one-year-old and...
33    The book is colorful and is perfect for 6month...
34    The book is cute, and we are huge fans of Lama...
35    What a great book for babies!  I\'d been looki...
36    My son loved this book as an infant.  It was p...
37    Our baby loves this book & has loved it for a ...
38                                                     
39    My son likes brushing elmo\'s teeth. Almost to...
40    This was a birthday present for my 2 year old ...
41    This bear is absolutely adorable and I would g...
42    My baby absolutely loves Elmo and so this book...
43    I bought two for recent baby showers!  The boo...
44    We wanted to get another book like the Big Bir...
45    This is a cute little peek-a-boo story book.  ...
46    My 3 month old son loves this book. We read it...
47    Very cute interactive book! My son loves t

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']

vect = CountVectorizer(vocabulary=selected_words)
features = vect.fit_transform(amazon_df['review'])

In [4]:
word_frequency = [sum(x) for x in zip(*features.toarray())]
word_count_dict = {word:freq for word, freq in zip(selected_words, word_frequency)}
word_count_dict

{'amazing': 2726,
 'awesome': 4075,
 'awful': 753,
 'bad': 4950,
 'fantastic': 1765,
 'great': 59536,
 'hate': 1285,
 'horrible': 1245,
 'love': 43867,
 'terrible': 1282,
 'wow': 461}

In [5]:
import operator

word_count_dict_sorted = sorted(word_count_dict.items(),key=operator.itemgetter(1),reverse=True)

## Question 1

Out of the 11 words in selected_words, which one is most used in the reviews in the dataset?

In [6]:
word_count_dict_sorted[0]

('great', 59536)

## Question 2

Out of the 11 words in selected_words, which one is least used in the reviews in the dataset?

In [7]:
word_count_dict_sorted[-1]

('wow', 461)

## Question 3

Out of the 11 words in selected_words, which one got the most positive weight in the selected_words_model? (Tip: when printing the list of coefficients, make sure to use print_rows(rows=12) to print ALL coefficients.)

In [8]:
#ignore all 3* reviews
amazon_df = amazon_df[amazon_df['rating'] != 3]

amazon_df['sentiment'] = amazon_df['rating'] >= 4
amazon_df.head()

Unnamed: 0,name,review,rating,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,True
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,True
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,True
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,True
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,True


In [9]:
# split into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(amazon_df['review'], amazon_df['sentiment'], test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)

(133401,)
(33351,)


In [10]:
# learn training data vocabulary, then create document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_train_dtm

<133401x11 sparse matrix of type '<class 'numpy.int64'>'
	with 75836 stored elements in Compressed Sparse Row format>

In [11]:
from sklearn.linear_model import LogisticRegression
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train_dtm, y_train)
coefs = log_reg_model.coef_

word_weight = {word:weight for word, weight in zip(selected_words, coefs[0])}
word_weight

{'amazing': 1.0524892380956241,
 'awesome': 1.1135398768836622,
 'awful': -2.0296938674660914,
 'bad': -1.0043273875256358,
 'fantastic': 0.96018765173885146,
 'great': 0.86380225545203271,
 'hate': -1.4478419303507317,
 'horrible': -2.0875720193941434,
 'love': 1.3903963481964721,
 'terrible': -2.2388671752661335,
 'wow': -0.11816722488650841}

In [12]:
import operator

word_weight_sorted = sorted(word_weight.items(),key=operator.itemgetter(1),reverse=True)
word_weight_sorted

[('love', 1.3903963481964721),
 ('awesome', 1.1135398768836622),
 ('amazing', 1.0524892380956241),
 ('fantastic', 0.96018765173885146),
 ('great', 0.86380225545203271),
 ('wow', -0.11816722488650841),
 ('bad', -1.0043273875256358),
 ('hate', -1.4478419303507317),
 ('awful', -2.0296938674660914),
 ('horrible', -2.0875720193941434),
 ('terrible', -2.2388671752661335)]

In [13]:
word_weight_sorted[0]

('love', 1.3903963481964721)

## Question 4

Out of the 11 words in selected_words, which one got the most negative weight in the selected_words_model? (Tip: when printing the list of coefficients, make sure to use print_rows(rows=12) to print ALL coefficients.)

In [14]:
word_weight_sorted[-1]

('terrible', -2.2388671752661335)

## Question 5
Which of the following ranges contains the accuracy of the selected_words_model on the test_data?

In [15]:
X_test_dtm = vect.transform(X_test)

# class predictions and predicted probabilities
y_pred_class = log_reg_model.predict(X_test_dtm)
y_pred_prob = log_reg_model.predict_proba(X_test_dtm)[:, 1]

In [16]:
# calculate accuracy
from sklearn import metrics
print("Accuracy score:",metrics.accuracy_score(y_test, y_pred_class))

Accuracy score: 0.849000029984


## Question 6

Which of the following ranges contains the accuracy of the sentiment_model in the IPython Notebook from lecture on the test_data?

In [17]:
vect_sentiment = CountVectorizer()
vect_sentiment.fit(X_train)
X_train_dtm_sentiment = vect_sentiment.transform(X_train)

sentiment_model = LogisticRegression()
sentiment_model.fit(X_train_dtm_sentiment, y_train)

X_test_dtm_sentiment = vect_sentiment.transform(X_test)

# class predictions
y_pred_class_sentiment = sentiment_model.predict(X_test_dtm_sentiment)

print("Accuracy score:",metrics.accuracy_score(y_test, y_pred_class_sentiment))

Accuracy score: 0.932445803724


## Question 7
Which of the following ranges contains the accuracy of the majority class classifier, which simply predicts the majority class on the test_data?

In [18]:
float(amazon_df.sentiment.value_counts()[True])/float(amazon_df.sentiment.value_counts().sum())


0.8411233448474381

## Question 8
How do you compare the different learned models with the baseline approach where we are just predicting the majority class? 

The model learned using all words performed much better than the other two. The other two approaches performed about the same.

## Question 9
Which of the following ranges contains the ‘predicted_sentiment’ for the most positive review for ‘Baby Trend Diaper Champ’, according to the sentiment_model from the IPython Notebook from lecture?

In [19]:
champ = amazon_df[amazon_df['name'] == 'Baby Trend Diaper Champ']
champ.head()

Unnamed: 0,name,review,rating,sentiment
312,Baby Trend Diaper Champ,Ok - newsflash. Diapers are just smelly. We\...,4,True
314,Baby Trend Diaper Champ,"My husband and I selected the Diaper ""Champ"" m...",1,False
315,Baby Trend Diaper Champ,Excellent diaper disposal unit. I used it in ...,5,True
316,Baby Trend Diaper Champ,We love our diaper champ. It is very easy to ...,5,True
317,Baby Trend Diaper Champ,Two girlfriends and two family members put me ...,5,True


In [105]:
champ_X_train_dtm = vect_sentiment.transform(champ['review'])
champ_y_predict = sentiment_model.predict_proba(champ_X_train_dtm)[:, 1]
champ['predicted sentiment'] = champ_y_predict

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [106]:
champ.head()

Unnamed: 0,name,review,rating,sentiment,predicted sentiment
312,Baby Trend Diaper Champ,Ok - newsflash. Diapers are just smelly. We\...,4,True,0.880751
314,Baby Trend Diaper Champ,"My husband and I selected the Diaper ""Champ"" m...",1,False,0.07382
315,Baby Trend Diaper Champ,Excellent diaper disposal unit. I used it in ...,5,True,0.999702
316,Baby Trend Diaper Champ,We love our diaper champ. It is very easy to ...,5,True,0.996548
317,Baby Trend Diaper Champ,Two girlfriends and two family members put me ...,5,True,0.974003


In [107]:
champ.sort_values(by=['predicted sentiment'], ascending = False).iloc[0]['predicted sentiment']


0.99999999991473065

## Question 10

Consider the most positive review for ‘Baby Trend Diaper Champ’ according to the sentiment_model from the IPython Notebook from lecture. Which of the following ranges contains the predicted_sentiment for this review, if we use the selected_words_model to analyze it?

In [111]:
review_champ = champ.sort_values(by=['predicted sentiment'], ascending = False).iloc[1]
review_champ

name                                             Baby Trend Diaper Champ
review                 I originally put this item on my baby registry...
rating                                                                 5
sentiment                                                           True
predicted sentiment                                                    1
Name: 320, dtype: object

In [112]:
review_champ_dtm = vect.transform(pd.Series(review_champ['review']))

review_pred_prob = log_reg_model.predict_proba(review_champ_dtm)[:, 1]

review_pred_prob[0]

0.79080389746087387

## Question 11
Why is the value of the predicted_sentiment for the most positive review found using the sentiment_model much more positive than the value predicted using the selected_words_model?

None of the selected_words appeared in the text of this review. 

In [113]:
vect.fit_transform(pd.Series(review_champ['review'])).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])