In [None]:
from __future__ import division
import graphlab as gl
import math
import string
import numpy

Dataset consisting of baby product reviews on Amazon.com

In [4]:
products = gl.SFrame('amazon_baby.gl/')

This non-commercial license of GraphLab Create for academic use is assigned to sandeepreddy.vorugnati.2016@anderson.ucla.edu and will expire on August 01, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\sandeep\AppData\Local\Temp\graphlab_server_1470181823.log.0


Preview of dataset

In [9]:
products

name,review,rating
Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3.0
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5.0
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5.0
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5.0
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0


Function to remove punctuation

In [5]:
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation) 

review_without_puctuation = products['review'].apply(remove_punctuation)
products['word_count'] = gl.text_analytics.count_words(review_without_puctuation)

Ignoring all reviews with rating == 3 (neutral statement)

In [6]:
products = products[products['rating'] != 3]

Used +1 for the positive class (rating >3) label and -1 for the negative class label (rating <3)

In [7]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

Dividing the data into training and test data in the proportion of 80:20

In [8]:
train_data, test_data = products.random_split(.8)
print len(train_data)
print len(test_data)

133416
33336


Training data with logistic regression

In [9]:
sentiment_model = gl.logistic_classifier.create(train_data,
                                                      target = 'sentiment',
                                                      features=['word_count'],
                                                      validation_set=None)

In [12]:
sentiment_model

Class                          : LogisticClassifier

Schema
------
Number of coefficients         : 121713
Number of examples             : 133416
Number of classes              : 2
Number of feature columns      : 1
Number of unpacked features    : 121712

Hyperparameters
---------------
L1 penalty                     : 0.0
L2 penalty                     : 0.01

Training Summary
----------------
Solver                         : lbfgs
Solver iterations              : 6
Solver status                  : TERMINATED: Terminated due to numerical difficulties.
Training time (sec)            : 7.1487

Settings
--------
Log-likelihood                 : inf

Highest Positive Coefficients
-----------------------------
word_count[mobileupdate]       : 41.9847
word_count[placeid]            : 41.7354
word_count[labelbox]           : 41.151
word_count[httpwwwamazoncomreviewrhgg6qp7tdnhbrefcmcrprcmtieutf8asinb00318cla0nodeid] : 40.0454
word_count[knobskeeping]       : 36.2091

Lowest Negative Coeffi

In [10]:
weights = sentiment_model.coefficients
weights.column_names()

['name', 'index', 'class', 'value', 'stderr']

In [11]:
num_positive_weights = sum(weights['value']>=0)
num_negative_weights = sum(weights['value']<0)

print "Number of positive weights: %s " % num_positive_weights
print "Number of negative weights: %s " % num_negative_weights
print "Total weights: %s " % (num_negative_weights + num_positive_weights)

Number of positive weights: 68419 
Number of negative weights: 53294 
Total weights: 121713 


In [None]:
sample_test_data = test_data[10:13]
print sample_test_data['rating']
sample_test_data

In [13]:
scores = sentiment_model.predict(sample_test_data, output_type='margin')
print scores

[6.734619727059891, -5.734130996760666, -14.66846040446922]


In [14]:
y = scores.apply(lambda scores : +1 if scores > 0 else -1)
y

dtype: int
Rows: 3
[1L, -1L, -1L]

In [15]:
print "Class predictions according to GraphLab Create:" 
print sentiment_model.predict(sample_test_data)

Class predictions according to GraphLab Create:
[1L, -1L, -1L]


In [16]:
P = 1 / (1 + numpy.exp(-scores))
print P
P[0]

[  9.98812385e-01   3.22326818e-03   4.26155800e-07]


0.9988123848377205

In [17]:
mP = sentiment_model.predict(sample_test_data, output_type='probability')
print mP
mP[0]

[0.9988123848377205, 0.0032232681817993686, 4.261557996652428e-07]


0.9988123848377205

In [18]:
print "Class predictions according to GraphLab Create:" 
print sentiment_model.predict(sample_test_data, output_type='probability')
print P == sentiment_model.predict(sample_test_data, output_type='probability')

Class predictions according to GraphLab Create:
[0.9988123848377205, 0.0032232681817993686, 4.261557996652428e-07]
[ True False False]


In [19]:
# scores = sentiment_model.predict(test_data, output_type='margin')
test_data['probability'] = sentiment_model.predict(test_data, output_type='probability')
test_data['margin'] = sentiment_model.predict(test_data, output_type='margin')

In [21]:
top20 = test_data.topk('probability', 20)
bottom20 = test_data.topk('probability', 20, True)

In [22]:
top20['name']

dtype: str
Rows: 20
['Britax Decathlon Convertible Car Seat, Tiffany', 'Ameda Purely Yours Breast Pump - Carry All', 'Traveling Toddler Car Seat Travel Accessory', 'Shermag Glider Rocker Combo, Pecan with Oatmeal', 'Cloud b Sound Machine Soother, Sleep Sheep', 'JP Lizzy Chocolate Ice Classic Tote Set', 'Fisher-Price Rainforest Melodies and Lights Deluxe Gym', "Lilly Gold Sit 'n' Stroll 5 in 1 Car Seat and Stroller Combination, Tuxedo Black (sunshade is not included in the offering)", 'Fisher-Price Deluxe Jumperoo', 'North States Supergate Pressure Mount Clear Choice Wood Gate', 'Munchkin Mozart Magic Cube', 'Britax Marathon Convertible Car Seat, Granite', 'Wizard Convertible Car Seat with LATCH in Midnight Print', 'Capri Stroller - Red Tech', 'Peg Perego Primo Viaggio Car Seat / Infant Carrier with LATCH Base - Black Sable', 'HALO SleepSack Micro-Fleece Wearable Blanket, Soft Pink, Small', 'Leachco Snoogle Total Body Pillow', 'Summer Infant Complete Nursery Care Kit', 'Safety 1st Tot-L

In [23]:
bottom20['name']

dtype: str
Rows: 20
['Jolly Jumper Arctic Sneak A Peek Infant Car Seat Cover Black', "Levana Safe N'See Digital Video Baby Monitor with Talk-to-Baby Intercom and Lullaby Control (LV-TW501)", 'Snuza Portable Baby Movement Monitor', 'Fisher-Price Ocean Wonders Aquarium Bouncer', 'VTech Communications Safe &amp; Sounds Full Color Video and Audio Monitor', 'Safety 1st High-Def Digital Monitor', 'Chicco Cortina KeyFit 30 Travel System in Adventure', 'Prince Lionheart Warmies Wipes Warmer', 'Valco Baby Tri-mode Twin Stroller EX- Hot Chocolate', 'Adiri BPA Free Natural Nurser Ultimate Bottle Stage 1 White, Slow Flow (0-3 months)', 'Munchkin Nursery Projector and Sound System, White', 'The First Years True Choice P400 Premium Digital Monitor, 2 Parent Unit', 'Nuby Natural Touch Silicone Travel Infa Feeder, Colors May Vary, 3 Ounce', 'Peg-Perego Tatamia High Chair, White Latte', 'Fisher-Price Royal Potty', 'Safety 1st Exchangeable Tip 3 in 1 Thermometer', 'Safety 1st Lift Lock and Swing Gate', 

In [25]:
def get_classification_accuracy(model, data, true_labels):
    # First get the predictions
    ## YOUR CODE HERE
    scores = model.predict(data, output_type='margin')
    y = scores.apply(lambda scores : +1 if scores > 0 else -1)
    
    # Compute the number of correctly classified examples
    ## YOUR CODE HERE
    correct = y==true_labels

    # Then compute accuracy by dividing num_correct by total number of examples
    ## YOUR CODE HERE
    accuracy = sum(correct) / len(correct)
    
    return accuracy

In [26]:
a = get_classification_accuracy(sentiment_model, test_data, test_data['sentiment'])
print (a)

0.914536837053


In [27]:
get_classification_accuracy(sentiment_model, train_data, train_data['sentiment'])

0.979440247046831

In [28]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [29]:
train_data['word_count_subset'] = train_data['word_count'].dict_trim_by_keys(significant_words, exclude=False)
test_data['word_count_subset'] = test_data['word_count'].dict_trim_by_keys(significant_words, exclude=False)

In [33]:
simple_model = gl.logistic_classifier.create(train_data,
                                                   target = 'sentiment',
                                                   features=['word_count_subset'],
                                                   validation_set=None)
simple_model

Class                          : LogisticClassifier

Schema
------
Number of coefficients         : 21
Number of examples             : 133416
Number of classes              : 2
Number of feature columns      : 1
Number of unpacked features    : 20

Hyperparameters
---------------
L1 penalty                     : 0.0
L2 penalty                     : 0.01

Training Summary
----------------
Solver                         : newton
Solver iterations              : 6
Solver status                  : SUCCESS: Optimal solution found.
Training time (sec)            : 1.1736

Settings
--------
Log-likelihood                 : 44323.7254

Highest Positive Coefficients
-----------------------------
word_count_subset[loves]       : 1.6773
word_count_subset[perfect]     : 1.5145
word_count_subset[love]        : 1.3654
(intercept)                    : 1.2995
word_count_subset[easy]        : 1.1937

Lowest Negative Coefficients
----------------------------
word_count_subset[disappointed] : -2.3551
wo

In [34]:
get_classification_accuracy(simple_model, test_data, test_data['sentiment'])

0.8693004559635229

In [35]:
simple_model.coefficients.sort('value', ascending=False).print_rows(num_rows=21)

+-------------------+--------------+-------+-----------------+-----------------+
|        name       |    index     | class |      value      |      stderr     |
+-------------------+--------------+-------+-----------------+-----------------+
| word_count_subset |    loves     |   1   |  1.67727145556  | 0.0482328275384 |
| word_count_subset |   perfect    |   1   |  1.51448626703  |  0.049861952294 |
| word_count_subset |     love     |   1   |  1.36543549368  | 0.0303546295109 |
|    (intercept)    |     None     |   1   |   1.2995449552  | 0.0120888541331 |
| word_count_subset |     easy     |   1   |  1.19366189833  |  0.029288869202 |
| word_count_subset |    great     |   1   |  0.94469126948  | 0.0209509926591 |
| word_count_subset |    little    |   1   |  0.520628636025 | 0.0214691475665 |
| word_count_subset |     well     |   1   |  0.504256746398 |  0.021381300631 |
| word_count_subset |     able     |   1   |  0.191438302295 | 0.0337581955697 |
| word_count_subset |     ol

In [49]:
isPos = simple_model.coefficients[simple_model.coefficients['name'] != '(intercept)']['value'] >= 0
pos = sum(isPos)
a = simple_model.coefficients[simple_model.coefficients['name'] != '(intercept)']
positive_significant_words =  a[a['value'] >= 0]['index']
print (pos,positive_significant_words)

(10L, dtype: str
Rows: ?
['love', 'well', 'loves', 'little', 'easy', 'great', 'able', 'perfect', 'old', 'car', ... ])


In [50]:
for c in simple_model.coefficients:
# for c in sentiment_model.coefficients:
    if (c['value'] >=0 and c['index'] != None):
#         print c['index']
        for s in sentiment_model.coefficients:
            if( c['index'] == s['index']):
                print (s['index'], " = ",s['value'] >=0)
#                 print (c)
#                 print (s)

('love', ' = ', True)
('well', ' = ', True)
('loves', ' = ', True)
('little', ' = ', True)
('easy', ' = ', True)
('great', ' = ', True)
('able', ' = ', True)
('perfect', ' = ', True)
('old', ' = ', True)
('car', ' = ', True)


In [52]:
print(get_classification_accuracy(sentiment_model, train_data, train_data['sentiment']))
print(get_classification_accuracy(simple_model, train_data, train_data['sentiment']))
print(get_classification_accuracy(sentiment_model,test_data, test_data['sentiment']))
print(get_classification_accuracy(simple_model,test_data, test_data['sentiment']))

0.979440247047
0.866815074654
0.914536837053
0.869300455964
