In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the review dataset from a CSV in the working directory. The displayed
# frame below shows an 'Unnamed: 0' column, i.e. the file's first column has
# no header and is read as ordinary data rather than as the index.
products = pd.read_csv('amazon_baby_subset.csv')

In [3]:
# Preview the first rows to confirm the columns that were loaded.
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [4]:
# First ten rows, selected by position.
products.iloc[:10]

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1
5,Our Baby Girl Memory Book,"Beautiful book, I love it to record cherished ...",5,1
6,Hunnt&reg; Falling Flowers and Birds Kids Nurs...,"Try this out for a spring project !Easy ,fun a...",5,1
7,Blessed By Pope Benedict XVI Divine Mercy Full...,very nice Divine Mercy Pendant of Jesus now on...,5,1
8,Cloth Diaper Pins Stainless Steel Traditional ...,We bought the pins as my 6 year old Autistic s...,4,1
9,Cloth Diaper Pins Stainless Steel Traditional ...,It has been many years since we needed diaper ...,5,1


In [5]:
# Half the number of rows -- presumably a reference point for judging class
# balance against the positive-review count computed in the next cell.
len(products)/2

26536.0

In [6]:
# Number of rows whose sentiment label equals 1.
np.count_nonzero(products['sentiment'] == 1)

26579

In [7]:
import json
# Read the feature vocabulary from JSON. The context manager closes the file
# handle as soon as parsing finishes; `imp_w` stays bound (to the closed
# handle) so the name remains available exactly as before.
with open('important_words.json','r') as imp_w:
    imp_words = json.load(imp_w)

In [8]:
import string
def remove_punctuation(review):
    """Return `review` lower-cased with every `string.punctuation` character deleted.

    Characters are removed, not replaced by spaces, so tokens joined only by
    punctuation are merged (e.g. 'one-year' becomes 'oneyear').
    """
    # Mapping each punctuation character to None makes translate() drop it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return review.lower().translate(deletion_table)
    
    

In [9]:
# Replace missing review text with the empty string so the string-based
# cleaning applied to this column later does not receive NaN values.
products = products.fillna({'review':''})

In [10]:
# Store the lower-cased, punctuation-free version of each review in a new column.
products['clean_review'] = products['review'].apply(remove_punctuation)

In [11]:
# Raw text of the review labelled 2, for comparison with its cleaned form.
products['review'][2]

'My daughter had her 1st baby over a year ago. She did receive and fill up a First Year Calendar. When her son was nearing his first birthday she was looking for a Second Year Calendar to record his milestones. Thanks to Amazon I was able to get this for her and she LOVES it. Tender sweet art work - helpful stickers - unique pages to fill. A nice keepsake. A wonderful gift for a one-year old!'

In [12]:
# Cleaned text of the same review: lower-cased with punctuation removed.
products['clean_review'][2]

'my daughter had her 1st baby over a year ago she did receive and fill up a first year calendar when her son was nearing his first birthday she was looking for a second year calendar to record his milestones thanks to amazon i was able to get this for her and she loves it tender sweet art work  helpful stickers  unique pages to fill a nice keepsake a wonderful gift for a oneyear old'

In [13]:
import sklearn

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Count vectorizer restricted to the words in imp_words; the token pattern
# r'\b\w+\b' also matches single-character tokens.
vectorizer = CountVectorizer(token_pattern= r'\b\w+\b',vocabulary=imp_words)

In [16]:
# Count matrix of the cleaned reviews over the fixed vocabulary.
# NOTE(review): products_matrix is not referenced again in the visible cells;
# the model below is trained on the per-word DataFrame columns instead.
products_matrix = vectorizer.fit_transform(products['clean_review'])

In [17]:
# Display the matrix summary (shape, dtype, number of stored elements).
products_matrix

<53072x193 sparse matrix of type '<class 'numpy.int64'>'
	with 682501 stored elements in Compressed Sparse Row format>

In [18]:
# Add one count column per important word. Each review is split into
# whitespace-separated tokens once, up front, instead of being re-split for
# every word in the vocabulary.
# NOTE: a word that equals an existing column name overwrites that column.
tokenized_reviews = products['clean_review'].str.split()
for word in imp_words:
    products[word] = tokenized_reviews.apply(lambda tokens: tokens.count(word))

In [19]:
# Column count after adding the word-count columns (second entry of the shape).
products.head(1).shape

(1, 198)

In [20]:
# Number of reviews in which the 'perfect' count is non-zero.
np.count_nonzero(products['perfect'])

3309

In [21]:
# Same quantity via an explicit > 0 comparison; both cells report 3309.
np.count_nonzero(products['perfect']>0)

3309

In [22]:
def toarray(dataframe,feature,label):
    """Convert selected DataFrame columns into a NumPy feature matrix and label array.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data. A 'constant' column filled with 1 is written into this
        frame in place (the intercept term), as the original implementation did.
    feature : sequence of str
        Column names to use as features, in order.
    label : str
        Name of the column holding the class labels.

    Returns
    -------
    (feature_matrix, class_labels)
        feature_matrix has one row per DataFrame row and columns
        ['constant'] + feature; class_labels is the label column as an array.
    """
    dataframe['constant'] = 1
    # list(...) lets callers pass a tuple or Index as well as a list.
    features = ['constant'] + list(feature)
    # as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
    feature_matrix = dataframe[features].to_numpy()
    class_labels = dataframe[label].to_numpy()

    return feature_matrix,class_labels

In [24]:
# NOTE(review): in document order feature_matrix is first assigned in the
# later training cell (the toarray(...) call), so on a fresh top-to-bottom run
# this cell raises NameError. The same check is repeated after training.
feature_matrix.shape

(53072, 194)

In [30]:
def sigmoid(feature_matrix,co_eff):
    """Return the logistic function applied to feature_matrix . co_eff.

    The result has one value per row of feature_matrix: 1 / (1 + exp(-score)),
    where score is the dot product of that row with co_eff.
    """
    score = np.dot(feature_matrix, co_eff)
    return 1/(1+np.exp(-score))

In [50]:

# Sanity check for sigmoid(): compare its output on a two-row example with
# values computed by writing the dot products and logistic formula out by hand.
dummy_feature_matrix = np.array([[1.,2.,3.], [1.,-1.,-1]])
dummy_coefficients = np.array([1., 3., -1.])

# Reference values, expanded term by term so they do not depend on np.dot.
correct_scores      = np.array( [ 1.*1. + 2.*3. + 3.*(-1.),          1.*1. + (-1.)*3. + (-1.)*(-1.) ] )
correct_predictions = np.array( [ 1./(1+np.exp(-correct_scores[0])), 1./(1+np.exp(-correct_scores[1])) ] )

print('The following outputs must match ')
print('------------------------------------------------')
print('correct_predictions           =', correct_predictions)
print('output of predict_probability =', sigmoid(dummy_feature_matrix, dummy_coefficients))

The following outputs must match 
------------------------------------------------
correct_predictions           = [0.98201379 0.26894142]
output of predict_probability = [0.98201379 0.26894142]


In [27]:
def feature_derivative(feature,error):
    """Return the dot product of `feature` with `error`.

    In the training loop `feature` is one column of the feature matrix and
    `error` is (indicator - prediction), giving that coefficient's gradient.
    """
    return np.dot(feature, error)

In [28]:
def compute_log_likelihood(feature_matrix,sentiment,co_eff):
    """Return the log likelihood of the observed labels under the logistic model.

    Parameters
    ----------
    feature_matrix : array of shape (n_rows, n_features)
    sentiment : array of length n_rows; entries equal to +1 are the positive class
    co_eff : array of length n_features

    Returns
    -------
    float
        sum over rows of (indicator - 1) * score - log(1 + exp(-score)),
        where score = feature_matrix . co_eff and indicator = (sentiment == +1).
    """
    indicator = (sentiment == +1)
    score = np.dot(feature_matrix,co_eff)
    # logaddexp(0, -score) == log(1 + exp(-score)) but does not overflow:
    # the naive form turns into inf (and the likelihood into -inf) once
    # exp(-score) exceeds the float range for very negative scores.
    log_likelihood = np.sum((indicator - 1)*score - np.logaddexp(0, -score))
    return log_likelihood
                        
    

In [49]:
# Sanity check for compute_log_likelihood(): rebuild the value by hand on a
# two-row example (one label of each class) and print both for comparison.
dummy_feature_matrix = np.array([[1.,2.,3.], [1.,-1.,-1]])
dummy_coefficients = np.array([1., 3., -1.])
dummy_sentiment = np.array([-1, 1])

# Each reference quantity is expanded term by term, independent of np.dot.
correct_indicators  = np.array( [ -1==+1,                                       1==+1 ] )
correct_scores      = np.array( [ 1.*1. + 2.*3. + 3.*(-1.),                     1.*1. + (-1.)*3. + (-1.)*(-1.) ] )
correct_first_term  = np.array( [ (correct_indicators[0]-1)*correct_scores[0],  (correct_indicators[1]-1)*correct_scores[1] ] )
correct_second_term = np.array( [ np.log(1. + np.exp(-correct_scores[0])),      np.log(1. + np.exp(-correct_scores[1])) ] )

correct_ll          =      sum( [ correct_first_term[0]-correct_second_term[0], correct_first_term[1]-correct_second_term[1] ] ) 

print('The following outputs must match ')
print('------------------------------------------------')
print('correct_log_likelihood           =', correct_ll)
print('output of compute_log_likelihood =', compute_log_likelihood(dummy_feature_matrix, dummy_sentiment, dummy_coefficients))

The following outputs must match 
------------------------------------------------
correct_log_likelihood           = -5.331411615436032
output of compute_log_likelihood = -5.331411615436032


In [46]:
def logistic_regression(feature_matrix,initial_coeffs,sentiment,learning_rate,max_iter):
    """Fit logistic-regression coefficients by fixed-step gradient ascent.

    Parameters
    ----------
    feature_matrix : ndarray of shape (n_rows, n_features)
    initial_coeffs : array-like of length n_features; copied, not modified
    sentiment : array of length n_rows; entries equal to +1 are the positive class
    learning_rate : float step size applied to the gradient
    max_iter : int number of ascent steps

    Returns
    -------
    ndarray
        Coefficients after max_iter updates.

    The log likelihood is printed on a thinning schedule: every iteration up
    to 15, then every 10th up to 100, every 100th up to 1000, and so on.
    """
    co_effs = np.array(initial_coeffs)
    # The labels never change, so the boolean target is built once up front.
    indicator = (sentiment==+1)
    for itr in range(max_iter):

        prediction = sigmoid(feature_matrix,co_effs)
        error = indicator - prediction

        # All partial derivatives in one matrix-vector product: entry j is
        # feature column j dotted with the error, which is what the former
        # per-coefficient loop computed one column at a time.
        derivative = feature_derivative(feature_matrix.T,error)

        co_effs = co_effs + learning_rate * derivative

        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, co_effs)
            # Pad the iteration number to the digit count of max_iter.
            print('iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp))
    return co_effs

In [47]:
# Assemble the training arrays, start from all-zero coefficients, and run
# 301 gradient-ascent steps with step size 1e-7.
feature_matrix, class_labels = toarray(products, imp_words, 'sentiment')
initial_coefficients = np.zeros(feature_matrix.shape[1])
co_efficients = logistic_regression(
    feature_matrix=feature_matrix,
    initial_coeffs=initial_coefficients,
    sentiment=class_labels,
    learning_rate=1e-7,
    max_iter=301,
)

iteration   0: log likelihood of observed labels = -36779.70627863
iteration   1: log likelihood of observed labels = -36772.71312191
iteration   2: log likelihood of observed labels = -36765.72767668
iteration   3: log likelihood of observed labels = -36758.74992322
iteration   4: log likelihood of observed labels = -36751.77984195
iteration   5: log likelihood of observed labels = -36744.81741345
iteration   6: log likelihood of observed labels = -36737.86261842
iteration   7: log likelihood of observed labels = -36730.91543769
iteration   8: log likelihood of observed labels = -36723.97585224
iteration   9: log likelihood of observed labels = -36717.04384317
iteration  10: log likelihood of observed labels = -36710.11939171
iteration  11: log likelihood of observed labels = -36703.20247923
iteration  12: log likelihood of observed labels = -36696.29308721
iteration  13: log likelihood of observed labels = -36689.39119727
iteration  14: log likelihood of observed labels = -36682.4967

In [51]:
def compute_score(feature_matrix,co_efficients):
    """Return a boolean array: True where feature_matrix . co_efficients > 0.

    A strictly positive score is a positive-class prediction; a score of
    exactly zero yields False.
    """
    return np.dot(feature_matrix, co_efficients) > 0

In [53]:
# Boolean prediction for every row (True where the linear score is positive).
score = compute_score(feature_matrix,co_efficients)

In [54]:
# Number of reviews predicted positive.
np.count_nonzero(score)

25237

In [55]:
# Fraction of reviews predicted positive. The recorded output (0.4755...) is
# the positive count divided by the row count, so the division is restored.
np.count_nonzero(score)/len(products)

0.475523816701839

In [56]:
# Total number of reviews. The previous cell text (`score == products[]`) was
# a syntax error; len(products) reproduces the recorded output 53072.
len(products)

53072

In [57]:
# (rows, columns) of the training matrix; the columns are the 'constant'
# intercept column followed by the word-count features.
feature_matrix.shape

(53072, 194)

In [59]:
# Boolean ground truth: True where the label is +1, comparable with `score`.
indicator = (class_labels==+1)

In [60]:
# Number of rows where the prediction agrees with the label.
np.count_nonzero(indicator == score)

40772

In [61]:
# Accuracy: share of rows predicted correctly. This is measured on the same
# rows the coefficients were fitted on, not on held-out data.
np.count_nonzero(indicator == score)/len(products)

0.768239372927344

In [None]:
# Row labels for the coefficient tables: 'constant' first, then the important
# words, matching the column order that toarray() builds.
words = ['constant']+imp_words

In [69]:

# Ten largest coefficients, labelled by word (column 0 holds the coefficient).
pd.DataFrame(co_efficients, index=words).sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0
love,0.084119
great,0.083112
easy,0.073504
loves,0.048082
little,0.045369
perfect,0.034438
well,0.027659
nice,0.020781
old,0.019748
fits,0.019133


In [71]:

# Ten smallest (most negative) coefficients, labelled by word.
pd.DataFrame(co_efficients, index=words).sort_values(by=0).head(10)

Unnamed: 0,0
would,-0.052591
product,-0.042707
money,-0.040286
work,-0.033182
even,-0.033105
disappointed,-0.030485
get,-0.029491
back,-0.02822
waste,-0.027197
return,-0.026807
