In [9]:
import graphlab
from __future__ import division
import graphlab
import math
import string

In [13]:
# Load the review data from CSV. The loader infers the column types from the
# first lines of the file (here: name=str, review=str, rating=int).
products = graphlab.SFrame('amazon_baby.csv')
# Preview the first rows.
products.head()

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


name,review,rating
Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4


In [14]:
# Inspect a single example row (index 269) to see the raw fields of a review.
products[269]

{'name': 'The First Years Massaging Action Teether',
 'rating': 5,
 'review': 'A favorite in our house!'}

In [15]:
#Remove punctuation using Python's built-in string functionality.
#Transform the reviews into word-counts.

def remove_punctuation(text):
    """Return `text` with every ASCII punctuation character removed.

    The two-argument form `text.translate(None, chars)` is only valid for
    byte strings (Python 2 `str`, Python 3 `bytes`). Text strings
    (Python 2 `unicode`, Python 3 `str`) accept a single mapping argument,
    so both cases are handled explicitly.

    Args:
        text: the string to clean.

    Returns:
        A string of the same type as `text` containing no character from
        `string.punctuation`.
    """
    if isinstance(text, bytes):
        # Byte strings: delete the punctuation bytes via the two-argument form.
        return text.translate(None, string.punctuation.encode('ascii'))
    # Text strings: mapping a code point to None deletes that character.
    return text.translate(dict.fromkeys(map(ord, string.punctuation)))

# Strip punctuation from every review, then store the per-review word counts
# (a dict of word -> count) in a new 'word_count' column.
review_without_punctuation = products['review'].apply(remove_punctuation)
products['word_count'] = graphlab.text_analytics.count_words(review_without_punctuation)

In [16]:
# Check the word counts computed for example row 269.
products[269]['word_count']

{'a': 1, 'favorite': 1, 'house': 1, 'in': 1, 'our': 1}

In [17]:
# Drop reviews with a rating of exactly 3; the remaining ratings are mapped
# onto +1 / -1 sentiment labels in the following cell.
products = products[products['rating'] != 3]
# Number of reviews left after filtering.
len(products)

39073

In [18]:
# Label each review: +1 when rating > 3, otherwise -1.
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)
products

name,review,rating,word_count,sentiment
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5,"{'and': 3, 'love': 1, 'it': 3, 'highly': 1, ...",1
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5,"{'and': 2, 'quilt': 1, 'it': 1, 'comfortable': ...",1
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5,"{'and': 3, 'ingenious': 1, 'love': 2, 'what': 1, ...",1
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5,"{'and': 2, 'all': 2, 'help': 1, 'cried': 1, ...",1
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5,"{'and': 2, 'this': 2, 'her': 1, 'help': 2, ...",1
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4,"{'shop': 1, 'noble': 1, 'is': 1, 'it': 1, 'as': ...",1
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5,"{'and': 2, 'all': 1, 'right': 1, 'had': 1, ...",1
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5,"{'and': 1, 'fantastic': 1, 'help': 1, 'give': 1, ...",1
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4,"{'all': 1, 'standarad': 1, 'another': 1, 'when': ...",1
"Baby Tracker&reg; - Daily Childcare Journal, ...",I love this journal and our nanny uses it ...,4,"{'all': 2, 'nannys': 1, 'just': 1, 'food': 1, ...",1


In [19]:
# Randomly split the reviews into a training and a test set. 0.8 is the split
# fraction passed to random_split and seed=1 fixes the random seed.
train_data, test_data = products.random_split(.8, seed=1)
# Report the size of each split.
print len(train_data)
print len(test_data)

31273
7800


In [20]:
# Fit a logistic classifier that predicts 'sentiment' from the 'word_count'
# dictionary column; validation_set=None is passed explicitly.
# NOTE(review): the recorded model summary reports "TERMINATED: Iteration
# limit reached" after 10 solver iterations — confirm that is acceptable.
sentiment_model = graphlab.logistic_classifier.create(train_data,
                                                      target = 'sentiment',
                                                      features=['word_count'],
                                                      validation_set=None)

In [21]:
# Display the summary of the trained model.
sentiment_model

Class                          : LogisticClassifier

Schema
------
Number of coefficients         : 48851
Number of examples             : 31273
Number of classes              : 2
Number of feature columns      : 1
Number of unpacked features    : 48850

Hyperparameters
---------------
L1 penalty                     : 0.0
L2 penalty                     : 0.01

Training Summary
----------------
Solver                         : lbfgs
Solver iterations              : 10
Solver status                  : TERMINATED: Iteration limit reached.
Training time (sec)            : 5.6731

Settings
--------
Log-likelihood                 : 1130.9967

Highest Positive Coefficients
-----------------------------
word_count[etcget]             : 15.6986
word_count[knobskeeping]       : 15.6986
word_count[themif]             : 15.6986
word_count[directpumping]      : 13.7981
word_count[compound]           : 12.7946

Lowest Negative Coefficients
----------------------------
word_count[infantsyoung]       

In [22]:
# Keep the learned coefficients as a table and list its column names.
weights = sentiment_model.coefficients
weights.column_names()

['name', 'index', 'class', 'value', 'stderr']

In [25]:
num_positive_weights = (weights['value'] >= 0).sum()
num_negative_weights = (weights['value'] <= 0).sum()

print "Number of positive weights: %s " % num_positive_weights
print "Number of negative weights: %s " % num_negative_weights

Number of positive weights: 34638 
Number of negative weights: 14213 


In [23]:
# Preview the coefficient table.
weights

name,index,class,value,stderr
(intercept),,1,0.602905491295,
word_count,recommend,1,0.40183878634,
word_count,moist,1,1.08692666394,
word_count,osocozy,1,-0.204006614657,
word_count,keps,1,3.64666943857,
word_count,leak,1,-0.564606202684,
word_count,holder,1,-0.178745296589,
word_count,was,1,-0.0423603471228,
word_count,now,1,0.0648218060935,
word_count,wipe,1,0.341599408901,


In [26]:
# Take three test rows (indices 10, 11 and 12) as a small sample for
# checking predictions by hand.
sample_test_data = test_data[10:13]
print sample_test_data['rating']
sample_test_data

[5, 2, 1]


name,review,rating,word_count,sentiment
Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in ...,5,"{'and': 2, 'all': 1, 'love': 1, 'purchased': ...",1
Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The decals ...,2,"{'and': 1, 'would': 2, 'almost': 1, 'decals' ...",-1
New Style Trailing Cherry Blossom Tree Decal ...,Was so excited to get this product for my baby ...,1,"{'all': 1, 'money': 1, 'into': 1, 'back': 1, ...",-1


In [27]:
# Full text of the first sample review.
sample_test_data[0]['review']

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [28]:
# Full text of the second sample review.
sample_test_data[1]['review']

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

In [29]:
# Raw scores for the sample rows, requested from predict with
# output_type='margin'. These feed class_predictions and calculate_proba.
scores = sentiment_model.predict(sample_test_data, output_type='margin')
print scores

[5.538331457380659, -2.3017362662295096, -8.588488531933644]


In [31]:
def class_predictions(scores):
    """Convert raw scores into class labels.

    Args:
        scores: iterable of numeric scores.

    Returns:
        list with +1 for every score strictly greater than 0 and -1
        otherwise (a score of exactly 0 maps to -1), in input order.
    """
    return [1 if margin > 0 else -1 for margin in scores]

In [32]:
# Reference class predictions from the model itself, for comparison with
# the output of class_predictions(scores).
print "Class predictions according to GraphLab Create:" 
print sentiment_model.predict(sample_test_data)

Class predictions according to GraphLab Create:
[1, -1, -1]


**Checkpoint**: Make sure your class predictions match the ones obtained from GraphLab Create.

### Probability predictions

Recall from the lectures that we can also calculate the probability predictions from the scores using:
$$
P(y_i = +1 | \mathbf{x}_i,\mathbf{w}) = \frac{1}{1 + \exp(-\mathbf{w}^T h(\mathbf{x}_i))}.
$$

Using the variable **scores** calculated previously, write code to calculate the probability that a sentiment is positive using the above formula. For each row, the probability should be a number in the range **[0, 1]**.

In [33]:
def calculate_proba(scores):
    """Turn raw scores into probabilities of the positive class.

    Applies the sigmoid 1 / (1 + exp(-score)) to each score.

    Args:
        scores: iterable of numeric scores.

    Returns:
        list of floats in [0, 1], one per score, in input order.
    """
    proba_preds = []
    for score in scores:
        try:
            proba_pred = 1.0 / (1.0 + math.exp(-score))
        except OverflowError:
            # math.exp(-score) overflows a float for very negative scores
            # (roughly score < -709); the sigmoid's limit there is 0.
            proba_pred = 0.0
        proba_preds.append(proba_pred)
    return proba_preds

# Apply the sigmoid to the sample scores computed earlier.
calculate_proba(scores)

[0.9960823246612241, 0.09097926624535008, 0.00018620268896356185]

In [34]:
print "Class predictions according to GraphLab Create:" 
print sentiment_model.predict(sample_test_data, output_type='probability')

Class predictions according to GraphLab Create:
[0.9960823246612241, 0.09097926624535009, 0.00018620268896356177]


In [35]:
# Store the model's predicted probability for every test row in a new
# 'proba_pred' column.
test_data['proba_pred'] = sentiment_model.predict(test_data, output_type='probability')
test_data

name,review,rating,word_count,sentiment
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4,"{'all': 1, 'standarad': 1, 'another': 1, 'when': ...",1
"Baby Tracker&reg; - Daily Childcare Journal, ...",I love this journal and our nanny uses it ...,4,"{'all': 2, 'nannys': 1, 'just': 1, 'food': 1, ...",1
Nature's Lullabies First Year Sticker Calendar ...,"I love this little calender, you can keep ...",5,"{'and': 1, 'babys': 1, 'love': 1, 'like': 1, ...",1
Nature's Lullabies Second Year Sticker Calendar ...,"I had a hard time finding a second year calendar, ...",5,"{'and': 3, 'all': 1, 'later': 1, 'reference': ...",1
"Lamaze Peekaboo, I Love You ...","One of baby's first and favorite books, and i ...",4,"{'and': 2, 'because': 1, 'just': 1, 'less': 1, ...",1
"Lamaze Peekaboo, I Love You ...",My son loved this book as an infant. It was ...,5,"{'infant': 1, 'being': 1, 'all': 1, 'course': 1, ...",1
"Lamaze Peekaboo, I Love You ...",Our baby loves this book & has loved it for a ...,5,"{'and': 1, 'own': 1, 'it': 3, 'our': 1, 'f ...",1
"SoftPlay Giggle Jiggle Funbook, Happy Bear ...",This bear is absolutely adorable and I would ...,2,"{'and': 3, 'rating': 1, 'have': 1, 'just': 1, ...",-1
SoftPlay Peek-A-Boo Where's Elmo A Childr ...,I bought two for recent baby showers! The book ...,5,"{'and': 2, 'beautiful': 1, 'love': 1, 'elmo': 1, ...",1
Baby's First Year Undated Wall Calendar with ...,I searched high and low for a first year cale ...,5,"{'remembering': 1, 'and': 4, 'year': 1, 'am': 1, ...",1

proba_pred
0.996125873482
0.999999989606
0.994535219239
0.999848652093
0.985958214816
0.999999614686
0.995612760788
0.946737997688
0.997988922916
0.997853687133


In [36]:
# The 20 test rows with the highest predicted probability, shown by product name.
test_data['name','proba_pred'].topk('proba_pred', k=20).print_rows(20)

+-------------------------------+----------------+
|              name             |   proba_pred   |
+-------------------------------+----------------+
|  BABYBJORN Potty Chair - Red  |      1.0       |
| Fisher-Price Rainforest Me... |      1.0       |
| Lilly Gold Sit 'n' Stroll ... |      1.0       |
| Fisher-Price Rainforest Me... |      1.0       |
| Itzbeen Pocket Nanny Baby ... |      1.0       |
| Crown Crafts The Original ... |      1.0       |
| Shermag Glider Rocker Comb... |      1.0       |
| Cloud b Sound Machine Soot... |      1.0       |
| Itzbeen Pocket Nanny Baby ... |      1.0       |
| Summer Infant Complete Nur... |      1.0       |
| Britax Decathlon Convertib... |      1.0       |
| Prince Lionheart bebePOD P... |      1.0       |
| Bumkins Waterproof Sleeved... |      1.0       |
| Two Tone U Zip Backpack in... |      1.0       |
| Joovy Caboose Ultralight S... | 0.999999999999 |
| Stork Craft Beatrice Combo... | 0.999999999999 |
| Roundabout Convertible Car...

In [37]:
# The 20 test rows with the lowest predicted probability (reverse=True makes
# topk return the smallest values instead of the largest).
test_data['name','proba_pred'].topk('proba_pred', k=20, reverse=True).print_rows(20)

+-------------------------------+-------------------+
|              name             |     proba_pred    |
+-------------------------------+-------------------+
| Playtex Diaper Genie Essen... | 8.57691984957e-16 |
| Fisher-Price Ocean Wonders... | 1.47686115403e-14 |
| Evenflo Take Me Too Premie... | 1.99778604388e-14 |
| Cosco Alpha Omega Elite Co... | 1.47572908303e-11 |
| Philips AVENT Newborn Star... | 1.51647379822e-11 |
| Built NY Double Thirsty To... | 5.95310825245e-10 |
|   Sunshine Kids Travel - Bag  | 6.93077866479e-10 |
| Soothing Dreams Monistor w... | 1.16127079591e-09 |
| Playtex Diaper Genie - Fir... | 1.25536543003e-09 |
| Todays Mom Cozy Comfort Pr... | 1.44679783116e-09 |
| Summer Infant Sure And Sec... | 2.50683129482e-09 |
| Safety 1st Deluxe 4-in-1 B... | 2.66837136483e-09 |
| North States Industries Su... | 2.88688779229e-09 |
| Safety 1st Deluxe 4-in-1 B... | 6.27214695019e-09 |
|    Fisher-Price Royal Potty   | 9.28793278329e-09 |
| The First Years - Crib CD 

In [38]:
def get_classification_accuracy(model, data, true_labels):
    """Fraction of rows in `data` that `model` classifies correctly.

    Args:
        model: object with a predict(data) method returning one predicted
            label per row.
        data: the rows to classify; must support len().
        true_labels: the correct labels, comparable element-wise with the
            predictions via ==.

    Returns:
        float: number of matching predictions divided by len(data).

    Raises:
        ZeroDivisionError: if `data` is empty.
    """
    predictions = model.predict(data)

    # The element-wise comparison yields 1 for a match and 0 for a miss,
    # so the sum is the number of correctly classified rows.
    num_correct = sum(predictions == true_labels)

    # Convert explicitly so the result is a true ratio even when
    # `from __future__ import division` is not in effect (plain Python 2
    # integer division would truncate the quotient to 0 or 1).
    return float(num_correct) / len(data)

In [39]:
# Accuracy of the full-vocabulary model on the test set.
get_classification_accuracy(sentiment_model, test_data, test_data['sentiment'])

0.9078205128205128

In [40]:
# Fixed vocabulary of 20 words; the word counts are trimmed down to these
# keys to build the features of the smaller model.
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [41]:
# For each review, keep only the word_count entries whose key appears in
# significant_words (exclude=False keeps the listed keys rather than dropping them).
# Uses the SArray dict_trim_by_keys method; the result is stored in a new
# 'word_count_subset' column on both the training and the test set.

train_data['word_count_subset'] = train_data['word_count'].dict_trim_by_keys(significant_words, exclude=False)
test_data['word_count_subset'] = test_data['word_count'].dict_trim_by_keys(significant_words, exclude=False)

In [42]:
# Text of the first training review, used to sanity-check the trimming.
train_data[0]['review']

'it came early and was not disappointed. i love planet wise bags and now my wipe holder. it keps my osocozy wipes moist and does not leak. highly recommend it.'

In [43]:
# Full word counts of the first training review.
print train_data[0]['word_count']

{'and': 3, 'love': 1, 'it': 3, 'highly': 1, 'osocozy': 1, 'bags': 1, 'holder': 1, 'leak': 1, 'moist': 1, 'does': 1, 'recommend': 1, 'was': 1, 'wipes': 1, 'early': 1, 'not': 2, 'now': 1, 'disappointed': 1, 'wipe': 1, 'keps': 1, 'wise': 1, 'i': 1, 'planet': 1, 'my': 2, 'came': 1}


In [44]:

# Trimmed word counts of the first training review: only keys from
# significant_words remain.
print train_data[0]['word_count_subset']

{'love': 1, 'disappointed': 1}


In [45]:
# Train the same kind of classifier, this time using only the trimmed word
# counts ('word_count_subset') as features, then display its summary.
simple_model = graphlab.logistic_classifier.create(train_data,
                                                   target = 'sentiment',
                                                   features=['word_count_subset'],
                                                   validation_set=None)
simple_model

Class                          : LogisticClassifier

Schema
------
Number of coefficients         : 21
Number of examples             : 31273
Number of classes              : 2
Number of feature columns      : 1
Number of unpacked features    : 20

Hyperparameters
---------------
L1 penalty                     : 0.0
L2 penalty                     : 0.01

Training Summary
----------------
Solver                         : newton
Solver iterations              : 6
Solver status                  : SUCCESS: Optimal solution found.
Training time (sec)            : 0.5861

Settings
--------
Log-likelihood                 : 11391.195

Highest Positive Coefficients
-----------------------------
word_count_subset[loves]       : 1.6355
word_count_subset[love]        : 1.4774
word_count_subset[perfect]     : 1.4251
word_count_subset[easy]        : 1.1648
(intercept)                    : 1.1391

Lowest Negative Coefficients
----------------------------
word_count_subset[return]      : -2.0785
word_

In [46]:
# Accuracy of the 20-word model on the test set.
get_classification_accuracy(simple_model, test_data, test_data['sentiment'])

0.8535897435897436

In [47]:
# Preview the coefficient table of the 20-word model.
simple_model.coefficients

name,index,class,value,stderr
(intercept),,1,1.13911230453,0.0244773381974
word_count_subset,disappointed,1,-2.0446718677,0.103530711219
word_count_subset,love,1,1.47741597064,0.0646799851689
word_count_subset,little,1,0.493565008591,0.0401590179376
word_count_subset,loves,1,1.63553989235,0.0867845277101
word_count_subset,product,1,-0.268076763177,0.0298715870545
word_count_subset,well,1,0.387840978148,0.0403144904283
word_count_subset,great,1,0.834495259316,0.0384853770981
word_count_subset,easy,1,1.164782154,0.0545977880135
word_count_subset,work,1,-0.635323722559,0.0437818156538


In [48]:
# Print all 21 coefficients (20 words plus the intercept), largest value first.
simple_model.coefficients.sort('value', ascending=False).print_rows(num_rows=21)

+-------------------+--------------+-------+-----------------+-----------------+
|        name       |    index     | class |      value      |      stderr     |
+-------------------+--------------+-------+-----------------+-----------------+
| word_count_subset |    loves     |   1   |  1.63553989235  | 0.0867845277101 |
| word_count_subset |     love     |   1   |  1.47741597064  | 0.0646799851689 |
| word_count_subset |   perfect    |   1   |  1.42506229654  | 0.0999344395467 |
| word_count_subset |     easy     |   1   |   1.164782154   | 0.0545977880135 |
|    (intercept)    |     None     |   1   |  1.13911230453  | 0.0244773381974 |
| word_count_subset |    great     |   1   |  0.834495259316 | 0.0384853770981 |
| word_count_subset |    little    |   1   |  0.493565008591 | 0.0401590179376 |
| word_count_subset |     well     |   1   |  0.387840978148 | 0.0403144904283 |
| word_count_subset |     able     |   1   |  0.196761187436 | 0.0628976886185 |
| word_count_subset |     ca

In [49]:
# Collect the words whose weight in the 20-word model is strictly positive.
# The name filter keeps only 'word_count_subset' rows, which excludes the
# intercept.
simple_weights = simple_model.coefficients
positive_significant_words = simple_weights[(simple_weights['value'] > 0) & (simple_weights['name'] == "word_count_subset")]['index']
print len(positive_significant_words)
print positive_significant_words

10
['love', 'little', 'loves', 'well', 'great', 'easy', 'able', 'perfect', 'old', 'car']


In [50]:
# Look up the weights that the full-vocabulary model assigns to those same words.
weights.filter_by(positive_significant_words, 'index')

name,index,class,value,stderr
word_count,love,1,0.885352282875,
word_count,little,1,0.363536495549,
word_count,loves,1,0.854092819124,
word_count,well,1,0.381467081571,
word_count,great,1,0.661312555676,
word_count,easy,1,0.723353826409,
word_count,able,1,0.129645288035,
word_count,perfect,1,0.987987336315,
word_count,old,1,0.0665138791519,
word_count,car,1,0.124593397139,


In [51]:
# Training-set accuracy of the full-vocabulary model.
get_classification_accuracy(sentiment_model, train_data, train_data['sentiment'])

0.9924215777187989

In [52]:
# Training-set accuracy of the 20-word model.
get_classification_accuracy(simple_model, train_data, train_data['sentiment'])

0.8469606369711892

In [53]:
# Test-set accuracy of the full-vocabulary model, to compare with its
# training-set accuracy.
get_classification_accuracy(sentiment_model, test_data, test_data['sentiment'])

0.9078205128205128

In [54]:
# Test-set accuracy of the 20-word model, to compare with its training-set
# accuracy.
get_classification_accuracy(simple_model, test_data, test_data['sentiment'])

0.8535897435897436

In [55]:
# Class balance of the training set: number of +1 and -1 labels.
num_positive  = (train_data['sentiment'] == +1).sum()
num_negative = (train_data['sentiment'] == -1).sum()
print num_positive
print num_negative

25585
5688


In [56]:
# Class balance of the test set: number of +1 and -1 labels.
print (test_data['sentiment'] == +1).sum()
print (test_data['sentiment'] == -1).sum()

6413
1387


In [57]:
# Share of +1 labels in the test set, i.e. the accuracy obtained by always
# predicting +1. The un-truncated ratio relies on the
# `from __future__ import division` in the import cell.
print (test_data['sentiment'] == +1).sum()/len(test_data['sentiment'])

0.822179487179
