# Implementing logistic regression from scratch

In [2]:
import numpy as np
import pandas as pd

In [3]:
products = pd.read_csv('amazon_baby_subset.csv')

* One column of this dataset is sentiment, corresponding to the class label with +1 indicating a review with positive sentiment and -1 for negative sentiment.

* The name column indicates the name of the product.

In [4]:
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


Count positive and negative review

In [5]:
products['sentiment'].value_counts()

 1    26579
-1    26493
Name: sentiment, dtype: int64

In [6]:
products['name'].head()

0    Stop Pacifier Sucking without tears with Thumb...
1      Nature's Lullabies Second Year Sticker Calendar
2      Nature's Lullabies Second Year Sticker Calendar
3                          Lamaze Peekaboo, I Love You
4    SoftPlay Peek-A-Boo Where's Elmo A Children's ...
Name: name, dtype: object

In [7]:
import json

In [65]:
with open('important_words.json') as w:
    important_words = json.load(w)

In [66]:
important_words

['baby',
 'one',
 'great',
 'love',
 'use',
 'would',
 'like',
 'easy',
 'little',
 'seat',
 'old',
 'well',
 'get',
 'also',
 'really',
 'son',
 'time',
 'bought',
 'product',
 'good',
 'daughter',
 'much',
 'loves',
 'stroller',
 'put',
 'months',
 'car',
 'still',
 'back',
 'used',
 'recommend',
 'first',
 'even',
 'perfect',
 'nice',
 'bag',
 'two',
 'using',
 'got',
 'fit',
 'around',
 'diaper',
 'enough',
 'month',
 'price',
 'go',
 'could',
 'soft',
 'since',
 'buy',
 'room',
 'works',
 'made',
 'child',
 'keep',
 'size',
 'small',
 'need',
 'year',
 'big',
 'make',
 'take',
 'easily',
 'think',
 'crib',
 'clean',
 'way',
 'quality',
 'thing',
 'better',
 'without',
 'set',
 'new',
 'every',
 'cute',
 'best',
 'bottles',
 'work',
 'purchased',
 'right',
 'lot',
 'side',
 'happy',
 'comfortable',
 'toy',
 'able',
 'kids',
 'bit',
 'night',
 'long',
 'fits',
 'see',
 'us',
 'another',
 'play',
 'day',
 'money',
 'monitor',
 'tried',
 'thought',
 'never',
 'item',
 'hard',
 'plasti

In [68]:
len(important_words)

193

In [12]:
# important_words = pd.read_json('important_words.json')

In [13]:
# important_words

In [14]:
products = products.fillna({'review':''})

In [15]:
products['review']

0        All of my kids have cried non-stop when I trie...
1        We wanted to get something to keep track of ou...
2        My daughter had her 1st baby over a year ago. ...
3        One of baby's first and favorite books, and it...
4        Very cute interactive book! My son loves this ...
                               ...                        
53067    My infant goes to a really crappy daycare, and...
53068    Pretty please open and inspect these stickers ...
53069    Great 5-Star Product but An Obvious knock-off ...
53070    When I received the item my initial thought wa...
53071    I got this case in the mail today, it came on ...
Name: review, Length: 53072, dtype: object

In [16]:
import string

In [17]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
# functoin to remove puntuation from text
def remove_punctuation(text):
    import string
    return text.translate(str.maketrans('','',string.punctuation))

In [19]:
text = 'hey! I am Rituraj. How are you??'

In [20]:
remove_punctuation(text)

'hey I am Rituraj How are you'

In [21]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [22]:
products['review_clean']

0        All of my kids have cried nonstop when I tried...
1        We wanted to get something to keep track of ou...
2        My daughter had her 1st baby over a year ago S...
3        One of babys first and favorite books and it i...
4        Very cute interactive book My son loves this b...
                               ...                        
53067    My infant goes to a really crappy daycare and ...
53068    Pretty please open and inspect these stickers ...
53069    Great 5Star Product but An Obvious knockoff of...
53070    When I received the item my initial thought wa...
53071    I got this case in the mail today it came on t...
Name: review_clean, Length: 53072, dtype: object

In [69]:
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

In [70]:
products.head()

Unnamed: 0,name,review,rating,sentiment,review_clean,baby,one,great,love,use,...,completely,wish,buying,babies,won,tub,almost,either,contain_perfect,constant
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,All of my kids have cried nonstop when I tried...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0.0,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1,We wanted to get something to keep track of ou...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1,My daughter had her 1st baby over a year ago S...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1,One of babys first and favorite books and it i...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.0,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1,Very cute interactive book My son loves this b...,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0.0,1


In [71]:
products.loc[products['perfect']>= 1, 'contain_perfect'] = 1

In [72]:
products.loc[products['perfect']<1, 'contain_perfect'] = 0

In [73]:
products['contain_perfect'].sum()

2955.0

In [75]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    feature_frame = ['constant']+features
    feature_matrix = dataframe[feature_frame].to_numpy()
    label_sarray = dataframe[label]
    label_array = label_sarray.to_numpy()
    return (feature_matrix, label_array)

In [76]:
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')

In [77]:
feature_matrix

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [78]:
sentiment

array([ 1,  1,  1, ..., -1, -1, -1])

In [79]:
print(feature_matrix.shape)

(53072, 194)


In [80]:
def predict_probability(feature_matrix, coefficients):
    score = np.dot(feature_matrix, coefficients)
    
    prediction = 1/(1+np.exp(-score))
    return prediction

In [81]:
def feature_derivative(errors, feature):     
    # Compute the dot product of errors and feature
    derivative = np.dot(np.transpose(errors), feature)
        # Return the derivative
    return derivative

In [95]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    # scores.shape (53072L, 1L)
    # indicator.shape (53072L,)
    lp = np.sum(((np.array([indicator]))-1)*scores - np.log(1 + np.exp(-scores)))
    return lp

In [83]:
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    for itr in range(max_iter):
        # Predict P(y_i = +1|x_1,w) using your predict_probability() function
        # YOUR CODE HERE
        predictions = predict_probability(feature_matrix, coefficients)

        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)

        # Compute the errors as indicator - predictions
        errors = indicator - predictions

        for j in range(len(coefficients)): # loop over each coefficient
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j]
            # compute the derivative for coefficients[j]. Save it in a variable called derivative
            # YOUR CODE HERE
            derivative =  feature_derivative(errors, feature_matrix[:,j])

            # add the step size times the derivative to the current coefficient
            # YOUR CODE HERE
            coefficients[j] += step_size*derivative 

        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
        or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print('iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

14. Now, let us run the logistic regression solver with the parameters below:

feature_matrix = feature_matrix extracted in #9
sentiment = sentiment extracted in #9

initial_coefficients = a 194-dimensional vector filled with zeros

step_size = 1e-7

max_iter = 301

Save the returned coefficients to variable coefficients.

In [84]:
feature_matrix.shape

(53072, 194)

In [85]:
initial_coefficients = np.zeros(194)
step_size = 1e-7
max_iter = 301


In [86]:
score = np.dot(feature_matrix, initial_coefficients)

In [87]:
score.shape

(53072,)

In [97]:
compute_log_likelihood(feature_matrix, sentiment, initial_coefficients)

-36786.70716667741

In [100]:
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed labels = -36700.5020

In [102]:
coefficients.shape

(194,)

In [94]:
predictions = predict_probability(feature_matrix, coefficients)
NumPositive = (predictions > 0.5).sum()
print NumPositive

score = np.dot(feature_matrix, coefficients) # N * 1
print (score > 0).sum()


array([1, 1, 1, ..., 0, 0, 0])

In [103]:
predictions = predict_probability(feature_matrix, coefficients)

In [108]:
num_positive = (predictions>0.5).sum()
print(num_positive)

25126


In [127]:
sentiment

array([ 1,  1,  1, ..., -1, -1, -1])

In [120]:
predicted = (predictions>0.5)*1

In [121]:
predicted

array([1, 0, 1, ..., 0, 1, 0])

In [134]:
correctly_predicted = ((sentiment>=1) == predicted).sum()

In [135]:
correctly_predicted

39903

In [136]:
accuracy = correctly_predicted/len(sentiment)

In [137]:
print(accuracy)

0.7518653904130238


In [159]:
print('correctly predicted : {} \nTotal numbers: {} \naccuracy: {}'.format(correctly_predicted, len(sentiment),\
                                                                                  accuracy))

correctly predicted : 39903 
Total numbers: 53072 
accuracy: 0.7518653904130238
