#Implementing logistic regression from scratch

In [1]:
import pandas as pd
import numpy as np

In [2]:
products = pd.read_csv('/Users/April/Downloads/amazon_baby_subset.csv')

In [3]:
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [4]:
products[:10]

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1
5,Our Baby Girl Memory Book,"Beautiful book, I love it to record cherished ...",5,1
6,Hunnt&reg; Falling Flowers and Birds Kids Nurs...,"Try this out for a spring project !Easy ,fun a...",5,1
7,Blessed By Pope Benedict XVI Divine Mercy Full...,very nice Divine Mercy Pendant of Jesus now on...,5,1
8,Cloth Diaper Pins Stainless Steel Traditional ...,We bought the pins as my 6 year old Autistic s...,4,1
9,Cloth Diaper Pins Stainless Steel Traditional ...,It has been many years since we needed diaper ...,5,1


In [5]:
products = products[products['rating'] != 3]

In [6]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [7]:
products[:10]

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1
5,Our Baby Girl Memory Book,"Beautiful book, I love it to record cherished ...",5,1
6,Hunnt&reg; Falling Flowers and Birds Kids Nurs...,"Try this out for a spring project !Easy ,fun a...",5,1
7,Blessed By Pope Benedict XVI Divine Mercy Full...,very nice Divine Mercy Pendant of Jesus now on...,5,1
8,Cloth Diaper Pins Stainless Steel Traditional ...,We bought the pins as my 6 year old Autistic s...,4,1
9,Cloth Diaper Pins Stainless Steel Traditional ...,It has been many years since we needed diaper ...,5,1


#Apply text cleaning on the review data

In [8]:
import json
with open('/Users/April/Desktop/datasci_course_materials-master/assignment1/important words.json', 'r') as f: # Reads the list of most frequent words
    important_words = json.load(f)
important_words = [str(s) for s in important_words]

In [9]:
print important_words

['baby', 'one', 'great', 'love', 'use', 'would', 'like', 'easy', 'little', 'seat', 'old', 'well', 'get', 'also', 'really', 'son', 'time', 'bought', 'product', 'good', 'daughter', 'much', 'loves', 'stroller', 'put', 'months', 'car', 'still', 'back', 'used', 'recommend', 'first', 'even', 'perfect', 'nice', 'bag', 'two', 'using', 'got', 'fit', 'around', 'diaper', 'enough', 'month', 'price', 'go', 'could', 'soft', 'since', 'buy', 'room', 'works', 'made', 'child', 'keep', 'size', 'small', 'need', 'year', 'big', 'make', 'take', 'easily', 'think', 'crib', 'clean', 'way', 'quality', 'thing', 'better', 'without', 'set', 'new', 'every', 'cute', 'best', 'bottles', 'work', 'purchased', 'right', 'lot', 'side', 'happy', 'comfortable', 'toy', 'able', 'kids', 'bit', 'night', 'long', 'fits', 'see', 'us', 'another', 'play', 'day', 'money', 'monitor', 'tried', 'thought', 'never', 'item', 'hard', 'plastic', 'however', 'disappointed', 'reviews', 'something', 'going', 'pump', 'bottle', 'cup', 'waste', 'retu

Let us perform 2 simple data transformations:

Remove punctuation
Compute word counts (only for important_words)
We start with the first item as follows:

If your tool supports it, fill n/a values in the review column with empty strings. The n/a values indicate empty reviews. For instance, Pandas's the fillna() method lets you replace all N/A's in the review columns as follows:

In [10]:
products = products.fillna({'review':''})  # fill in N/A's in the review column

In [11]:
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation) 

Apply the remove_punctuation function on every element of the review column and assign the result to the new column review_clean.

In [12]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [13]:
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

First create a column called contains_perfect which is set to 1 if the count of the word perfect (stored in column perfect is >= 1.
Sum the number of 1s in the column contains_perfect.

In [14]:
products['contains_perfect'] = products['perfect'].apply(lambda pf: 1 if pf >=1 else 0)

In [15]:
sum(products['contains_perfect'] == 1)

2955

#Convert data frame to multi-dimensional array

In [16]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant'] + features
    features_frame = dataframe[features]
    feature_matrix = features_frame.as_matrix()
    label_sarray = dataframe[label]
    label_array = label_sarray.as_matrix()
    return(feature_matrix, label_array) #Why do we need transpose it to matrix?

In [17]:
feature_matrix, label_array = get_numpy_data(products, important_words, 'sentiment')

In [18]:
feature_matrix.shape

(53072, 194)

Estimating conditional probability with link function

P(yi=+1|xi,w)=11+exp(−wTh(xi))

where the feature vector h(x_i) represents the word counts of important_words in the review x_i. Write a function named predict_probability that implements the link function.

Take two parameters: feature_matrix and coefficients.
First compute the dot product of feature_matrix and coefficients.
Then compute the link function P(y = +1 | x,w).
Return the predictions given by the link function.
Your code should be analogous to the following Python function:

In [19]:
'''
produces probablistic estimate for P(y_i = +1 | x_i, w).
estimate ranges between 0 and 1.
'''
def predict_probability(feature_matrix, coefficients):
    # Take dot product of feature_matrix and coefficients  
    # YOUR CODE HERE
    score = np.dot(feature_matrix, coefficients)
    
    # Compute P(y_i = +1 | x_i, w) using the link function
    # YOUR CODE HERE
    predictions = 1/(1+np.exp(-score))
    
    # return predictions
    return predictions

In [20]:
def feature_derivative(errors, feature):     
    # Compute the dot product of errors and feature
    derivative = np.dot(errors, feature)
        # Return the derivative
    return derivative

In [21]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator-1)*scores - np.log(1. + np.exp(-scores)))
    return lp

#Taking gradient steps

In [22]:
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    for itr in xrange(max_iter):
        # Predict P(y_i = +1|x_1,w) using your predict_probability() function
        # YOUR CODE HERE
        predictions = predict_probability(feature_matrix, coefficients)

        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)

        # Compute the errors as indicator - predictions
        errors = indicator - predictions

        for j in xrange(len(coefficients)): # loop over each coefficient
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j]
            # compute the derivative for coefficients[j]. Save it in a variable called derivative
            # YOUR CODE HERE
            derivative = np.dot(errors, feature_matrix[:,j])

            # add the step size times the derivative to the current coefficient
            # YOUR CODE HERE
            coefficients[j] = step_size * derivative

        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print 'iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp)
    return coefficients

Now, let us run the logistic regression solver with the parameters below:


In [23]:
feature_matrix = feature_matrix 
sentiment = label_array
initial_coefficients = np.zeros(194)
step_size = 1e-7
max_iter = 301

In [24]:
variable_coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36780.92075691
iteration   2: log likelihood of observed labels = -36780.92075238
iteration   3: log likelihood of observed labels = -36780.92075240
iteration   4: log likelihood of observed labels = -36780.92075240
iteration   5: log likelihood of observed labels = -36780.92075240
iteration   6: log likelihood of observed labels = -36780.92075240
iteration   7: log likelihood of observed labels = -36780.92075240
iteration   8: log likelihood of observed labels = -36780.92075240
iteration   9: log likelihood of observed labels = -36780.92075240
iteration  10: log likelihood of observed labels = -36780.92075240
iteration  11: log likelihood of observed labels = -36780.92075240
iteration  12: log likelihood of observed labels = -36780.92075240
iteration  13: log likelihood of observed labels = -36780.92075240
iteration  14: log likelihood of observed labels = -36780.9207

#Predicting sentiments

In [25]:
scores_new = np.dot(feature_matrix, variable_coefficients)

In [26]:
predicted_sentiment = np.array([+1 if s > 0 else -1 for s in scores_new])

In [27]:
sum(predicted_sentiment == +1)

21403

#Measuring accuracy

In [30]:
float(sum(predicted_sentiment == sentiment))/len(sentiment)

0.7411441061199879

#Which words contribute most to positive & negative sentiments

In [31]:
coefficients = list(variable_coefficients[1:]) # exclude intercept
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)

In [40]:
word_coefficient_tuples[:10]

[('great', 0.0002301109574954872),
 ('love', 0.00022919083832962888),
 ('easy', 0.00022415562487307215),
 ('little', 0.00015544207691545753),
 ('loves', 0.00015417337068725665),
 ('well', 0.00010086492750064694),
 ('perfect', 0.00010084183986166354),
 ('old', 6.7367412681327717e-05),
 ('nice', 6.1407869651496219e-05),
 ('soft', 5.9646857678954846e-05)]

In [44]:
word_coefficient_tuples_negative  = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=False)

In [45]:
word_coefficient_tuples_negative[:10]

[('would', -0.00021171537956537022),
 ('product', -0.00015748674652922082),
 ('money', -0.00013753480269165246),
 ('work', -0.00012016286066171858),
 ('get', -0.00011269149624664065),
 ('even', -0.00011109959075874408),
 ('back', -0.00010483521247037156),
 ('disappointed', -9.9820964146925034e-05),
 ('monitor', -9.3995419225179226e-05),
 ('return', -9.2913592987661845e-05)]