# Homwork 1

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read data
products = pd.read_csv('amazon_baby.csv')

### 1.Text cleaning

In [4]:
# remove punctuation in the text 'review'
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)

In [5]:
products = products.fillna({'review': ''})# fill 'NA' in the reviews of products with empty string

In [6]:
products['review_clean'] = products['review'].apply(remove_punctuation)

### 2.Extract sentiments

In [7]:
# rating = 3: netural; rating < 3: -1; rating > 3: +1; we ignore all reviews with rating = 3
products = products[products['rating'] != 3]

In [8]:
# assign sentiment to different ratings
products['sentiment'] = products['rating'].apply(lambda rating: -1 if rating < 3 else +1)

### 3.Split into train/test data sets

In [9]:
# 80-20 split
# import sklearn.cross_validation as cv
# train_data, test_data = cv.train_test_split(products, test_size = 0.8, random_state = 1)

In [10]:
# Here we use the train, test data sets provided by the course
train_data_index = pd.read_json('./module-2-assignment-train-idx.json')
test_data_index = pd.read_json('./module-2-assignment-test-idx.json')

In [11]:
train_data = products.iloc[train_data_index[0]]
test_data = products.iloc[test_data_index[0]]

### 4.Text vectorization 

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b') # Use this token pattern to keep words

# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping(that's why we use 'transform' here, 
# because we use the fit_transform above to set the rule we use to vectorize the text)
test_matrix = vectorizer.transform(test_data['review_clean'])

### 5. Logistic regression sentiment model

In [13]:
from sklearn.linear_model import LogisticRegression
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data['sentiment'])
print [sentiment_model.coef_, sentiment_model.intercept_]

[array([[ -1.23918127e+00,   1.52240089e-04,   2.64407453e-02, ...,
          1.18173952e-02,   3.07546180e-03,  -6.25038476e-05]]), array([ 1.37441272])]


In [14]:
# Calculate the number of positive coefficients
sum([1 if elm >= 0 else 0 for elm in sentiment_model.coef_[0]])

85930

In [15]:
# Extract three data points as a sample
sample_test_data = test_data[10:13]
print sample_test_data

                                                 name  \
59                          Our Baby Girl Memory Book   
71  Wall Decor Removable Decal Sticker - Colorful ...   
91  New Style Trailing Cherry Blossom Tree Decal R...   

                                               review  rating  \
59  Absolutely love it and all of the Scripture in...       5   
71  Would not purchase again or recommend. The dec...       2   
91  Was so excited to get this product for my baby...       1   

                                         review_clean  sentiment  
59  Absolutely love it and all of the Scripture in...          1  
71  Would not purchase again or recommend The deca...         -1  
91  Was so excited to get this product for my baby...         -1  


In [16]:
# Predict the sentiment
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print scores

[  5.59940322  -3.15235192 -10.41361467]


In [17]:
# Convert scores to lables
print [1 if score > 0 else -1 for score in scores]

[1, -1, -1]


In [18]:
# Check the result with the predicting model
sentiment_model.predict(sample_test_matrix)

array([ 1, -1, -1])

In [19]:
# Calculate the probability of prediction
prob = [1.0/(1+np.exp(-1*score)) for score in scores]
print prob

[0.99631356885539779, 0.040998706100064924, 3.0020061269346876e-05]


In [20]:
# Check the result with the predicting model
sentiment_model.predict_proba(sample_test_matrix)
# so the third one has the lowest probability of being classified as a positive review

array([[  3.68643114e-03,   9.96313569e-01],
       [  9.59001294e-01,   4.09987061e-02],
       [  9.99969980e-01,   3.00200613e-05]])

In [21]:
# Predict the sentiment of test data 
test_data['prob'] = [elm[1] for elm in sentiment_model.predict_proba(test_matrix)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [22]:
test_data.sort_values('prob', ascending = False)[:20] # top 20 most positive reviews

Unnamed: 0,name,review,rating,review_clean,sentiment,prob
168081,Buttons Cloth Diaper Cover - One Size - 8 Colo...,"We are big Best Bottoms fans here, but I wante...",4,We are big Best Bottoms fans here but I wanted...,1,1.0
97325,Freemie Hands-Free Concealable Breast Pump Col...,I absolutely love this product. I work as a C...,5,I absolutely love this product I work as a Cu...,1,1.0
168697,Graco FastAction Fold Jogger Click Connect Str...,Graco's FastAction Jogging Stroller definitely...,5,Gracos FastAction Jogging Stroller definitely ...,1,1.0
87017,Baby Einstein Around The World Discovery Center,I am so HAPPY I brought this item for my 7 mon...,5,I am so HAPPY I brought this item for my 7 mon...,1,1.0
180646,Mamas &amp; Papas 2014 Urbo2 Stroller - Black,After much research I purchased an Urbo2. It's...,4,After much research I purchased an Urbo2 Its e...,1,1.0
137034,Graco Pack 'n Play Element Playard - Flint,My husband and I assembled this Pack n' Play l...,4,My husband and I assembled this Pack n Play la...,1,1.0
80155,"Simple Wishes Hands-Free Breastpump Bra, Pink,...","I just tried this hands free breastpump bra, a...",5,I just tried this hands free breastpump bra an...,1,1.0
140816,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall (38in) and thin...,5,I bought this seat for my tall 38in and thin 2...,1,1.0
147949,"Baby Jogger City Mini GT Single Stroller, Shad...","Amazing, Love, Love, Love it !!! All 5 STARS a...",5,Amazing Love Love Love it All 5 STARS all the...,1,1.0
133651,"Britax 2012 B-Agile Stroller, Red",[I got this stroller for my daughter prior to ...,4,I got this stroller for my daughter prior to t...,1,1.0


In [23]:
test_data.sort_values('prob', ascending = True)[:20] # top 20 most negative reviews

Unnamed: 0,name,review,rating,review_clean,sentiment,prob
16042,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with FisherPrice prod...,-1,9.321135e-16
120209,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1,1.823359e-15
77072,Safety 1st Exchangeable Tip 3 in 1 Thermometer,I thought it sounded great to have different t...,1,I thought it sounded great to have different t...,-1,8.666163e-14
48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1,1.396e-13
155287,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,This is my second video monitoring system the ...,-1,1.901778e-13
94560,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note we never installed batteries in these uni...,-1,4.242228e-13
53207,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1,3.482029e-11
81332,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1,4.035321e-11
10677,Philips AVENT Newborn Starter Set,"It's 3am in the morning and needless to say, t...",1,Its 3am in the morning and needless to say thi...,-1,1.007328e-10
113995,Motorola Digital Video Baby Monitor with Room ...,DO NOT BUY THIS BABY MONITOR!I purchased this ...,1,DO NOT BUY THIS BABY MONITORI purchased this m...,-1,1.030393e-10


In [24]:
# Calculate the accuracy
def get_classification_accuracy(model, data, true_labels):
    # 1. get the prediction
    predictions = model.predict(data)
    
    # 2. compute the number of correctly classified examples
    num_correct = sum(predictions == true_labels)
    
    # 3. compute the accuracy
    accuracy = float(num_correct)/len(true_labels)
    
    return accuracy

In [25]:
get_classification_accuracy(sentiment_model, test_matrix, test_data['sentiment'])

0.9322054235661147

### 6. Train another classifier with fewer words

In [26]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [27]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [29]:
# Build a simple model with a subset of words counted
simple_model = LogisticRegression()

In [30]:
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [31]:
simple_model_coef_table = pd.DataFrame({'word':significant_words,
                                         'coefficient':simple_model.coef_.flatten()})

In [32]:
simple_model_coef_table

Unnamed: 0,coefficient,word
0,1.36369,love
1,0.944,great
2,1.192538,easy
3,0.085513,old
4,0.520186,little
5,1.509812,perfect
6,1.673074,loves
7,0.50376,well
8,0.190909,able
9,0.058855,car


In [42]:
# Calculate the number of positive coefficients
sum([1 if elm >= 0 else 0 for elm in simple_model.coef_[0]])

10

### 7. Compare models

In [35]:
# Accuracy on train data
train_acc_1 = get_classification_accuracy(sentiment_model, train_matrix, train_data['sentiment'])
train_acc_2 = get_classification_accuracy(simple_model, train_matrix_word_subset, train_data['sentiment'])
print train_acc_1
print train_acc_2

0.968039815315
0.866822570007


In [36]:
# Accuracy on test data
test_acc_1 = get_classification_accuracy(sentiment_model, test_matrix, test_data['sentiment'])
test_acc_2 = get_classification_accuracy(simple_model, test_matrix_word_subset, test_data['sentiment'])
print test_acc_1
print test_acc_2

0.932205423566
0.869360451164


In [37]:
# Majority class classifier
print 'Accuracy of train data with Majority class classifier'
print (train_data['sentiment'] == +1).sum()
print (train_data['sentiment'] == -1).sum()

Accuracy of train data with Majority class classifier
112164
21252


In [39]:
print (train_data['sentiment'] == +1).sum()/float(len(train_data['sentiment']))

0.840708760568


In [40]:
print 'Accuracy of test data with Majority class classifier'
print (test_data['sentiment'] == +1).sum()
print (test_data['sentiment'] == -1).sum()

Accuracy of test data with Majority class classifier
28095
5241


In [41]:
print (test_data['sentiment'] == +1).sum()/float(len(test_data['sentiment']))

0.842782577394
