In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import math
%matplotlib inline

In [2]:
# Load the review subset from CSV; the trailing expression displays the column labels.
products=pd.read_csv('data_set/amazon_baby_subset.csv')
products.columns

Index(['name', 'review', 'rating', 'sentiment'], dtype='object')

In [3]:
# Preview the first 10 rows of the loaded frame.
products.head(10)

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1
5,Our Baby Girl Memory Book,"Beautiful book, I love it to record cherished ...",5,1
6,Hunnt&reg; Falling Flowers and Birds Kids Nurs...,"Try this out for a spring project !Easy ,fun a...",5,1
7,Blessed By Pope Benedict XVI Divine Mercy Full...,very nice Divine Mercy Pendant of Jesus now on...,5,1
8,Cloth Diaper Pins Stainless Steel Traditional ...,We bought the pins as my 6 year old Autistic s...,4,1
9,Cloth Diaper Pins Stainless Steel Traditional ...,It has been many years since we needed diaper ...,5,1


In [4]:
# Number of rows for each distinct value of the 'sentiment' column.
products.groupby('sentiment').size()

sentiment
-1    26493
 1    26579
dtype: int64

## Apply text cleaning on raw data

In [5]:
# Read the JSON file and keep column 0 of the parsed frame as a plain Python list.
important_words=pd.read_json('data_set/important_words.json')[0].tolist()
len(important_words)

193

1.
<br>
Let us perform two simple data transformations:

- Remove punctuation.
- Compute word counts (only for the words in `important_words`).

In [6]:
# Replace missing values in the 'review' column with empty strings
# (other columns are left untouched).
products=products.fillna({'review':''})

In [7]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` without ASCII punctuation characters.
    """
    # The third argument of maketrans lists the characters to delete.
    deletion_table=str.maketrans('','',string.punctuation)
    return text.translate(deletion_table)

In [8]:
# Strip punctuation from every review and store the result in a new column.
products['review_clean']=products['review'].map(remove_punctuation)

In [9]:
# Tokenise each cleaned review once. Splitting inside the loop would redo the
# same work for every entry of important_words.
review_tokens=products['review_clean'].apply(lambda review:review.split())

# One column per important word, holding how many times that exact token
# occurs in the review.
for word in important_words:
    products[word]=review_tokens.apply(lambda tokens:tokens.count(word))

In [10]:
# Column labels after the per-word count columns were added.
products.columns

Index(['name', 'review', 'rating', 'sentiment', 'review_clean', 'baby', 'one',
       'great', 'love', 'use',
       ...
       'seems', 'picture', 'completely', 'wish', 'buying', 'babies', 'won',
       'tub', 'almost', 'either'],
      dtype='object', length=198)

### Quiz Question. How many reviews contain the word perfect?

In [11]:
# Count the rows whose 'perfect' word count is non-zero.
int((products['perfect']!=0).sum())

2955

In [12]:
def get_numpy_data(data_set,features,output):
    """Extract a feature matrix and an output array from a DataFrame.

    Side effect: a column named ``'constant'`` filled with 1 is written onto
    ``data_set`` itself (not onto a copy).

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame holding the feature and output columns.
    features : list of str
        Column labels to use as features; ``'constant'`` is prepended.
    output : str
        Label of the column holding the target values.

    Returns
    -------
    tuple of numpy.ndarray
        ``(feature_matrix, output_matrix)``; the first column of
        ``feature_matrix`` is the constant (intercept) column.
    """
    # Intercept term, added directly to the caller's frame.
    data_set['constant']=1
    columns_with_intercept=['constant']+features
    return np.array(data_set[columns_with_intercept]),np.array(data_set[output])

In [13]:
# Build the model inputs: one column per important word (plus the intercept
# column added by get_numpy_data) and the sentiment labels.
feature_matrix,sentiment=get_numpy_data(
    data_set=products,
    features=important_words,
    output='sentiment',
)

### Quiz Question: How many features are there in the feature_matrix?

In [14]:
# (number of rows, number of columns) of the feature matrix.
feature_matrix.shape

(53072, 194)

## Estimating conditional probability with the link function

In [15]:
def predict_probability(feature_matrix,coefficients):
    """Estimate P(y = +1 | x, w) for every row with the logistic link function.

    Parameters
    ----------
    feature_matrix : array-like
        Feature values; in this notebook a 2-D array with one row per review.
    coefficients : array-like
        Model weights, one per column of ``feature_matrix``.

    Returns
    -------
    numpy.ndarray
        ``1 / (1 + exp(-score))`` for each score in
        ``np.dot(feature_matrix, coefficients)``.

    Notes
    -----
    Evaluating ``exp(-score)`` directly overflows for very negative scores
    (``math.exp`` raises ``OverflowError``). The computation below only ever
    exponentiates a non-positive number, so it is safe for any finite score.
    """
    score=np.asarray(np.dot(feature_matrix,coefficients),dtype=float)
    # exp(-|s|) is always in (0, 1], so it can underflow to 0 but never overflow.
    decay=np.exp(-np.abs(score))
    # Two algebraically equal forms of the sigmoid, chosen by the sign of s:
    #   s >= 0: 1 / (1 + exp(-s))
    #   s <  0: exp(s) / (1 + exp(s))
    predictions=np.where(score>=0,1.0/(1.0+decay),decay/(1.0+decay))
    return predictions

In [16]:
# Display the feature matrix shape again.
feature_matrix.shape

(53072, 194)

## Partial derivative with respect to the coefficients

In [17]:
def feature_derivative(errors,feature):
    """Partial derivative of the log likelihood with respect to one coefficient.

    Parameters
    ----------
    errors : numpy.ndarray
        Per-row difference between the +1 indicator and the predicted
        probability.
    feature : numpy.ndarray
        The column of the feature matrix belonging to that coefficient.

    Returns
    -------
    The dot product of ``errors`` and ``feature``.
    """
    return np.dot(errors,feature)

## Note :
 In the main lecture, our focus was on the likelihood. In the advanced optional video, however, we introduced a transformation of this likelihood---called the log-likelihood---that simplifies the derivation of the gradient and is more numerically stable. Due to its numerical stability, we will use the log-likelihood instead of the likelihood to assess the algorithm.

In [18]:
def compute_log_likelihood(feature_matrix,sentiment,coefficients):
    """Log likelihood of the observed labels under the logistic model.

    Computes ``sum_i [(1[y_i = +1] - 1) * s_i - log(1 + exp(-s_i))]`` where
    ``s = np.dot(feature_matrix, coefficients)``.

    Parameters
    ----------
    feature_matrix : array-like
        Feature values, one row per observation.
    sentiment : array-like
        Labels; entries equal to +1 are treated as the positive class.
    coefficients : array-like
        Model weights, one per column of ``feature_matrix``.

    Returns
    -------
    The scalar log likelihood.
    """
    # Coerce first: comparing a plain list with +1 would give a single False
    # instead of an element-wise mask.
    indicator=(np.asarray(sentiment)==+1)
    scores=np.dot(feature_matrix,coefficients)
    # log(1 + exp(-s)) written as logaddexp(0, -s). The naive form overflows to
    # inf for very negative scores and makes the whole sum -inf.
    lp=np.sum((indicator-1)*scores - np.logaddexp(0.,-scores))
    return lp

In [19]:
def logistic_regression(feature_matrix,sentiment,initial_coefficients,step_size,max_iter):
    """Fit logistic-regression coefficients by gradient ascent.

    Each iteration predicts P(y = +1) with the current coefficients, forms the
    per-row errors (indicator minus prediction) and moves every coefficient by
    ``step_size`` times its partial derivative. The log likelihood is printed
    on a thinning schedule so progress can be checked.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        2-D array, one row per observation and one column per coefficient.
    sentiment : array-like
        Labels; entries equal to +1 are the positive class.
    initial_coefficients : array-like
        Starting weights. They are copied, so the caller's object is not
        modified.
    step_size : float
        Gradient-ascent step size.
    max_iter : int
        Number of iterations to run.

    Returns
    -------
    numpy.ndarray
        The coefficients after ``max_iter`` iterations.
    """
    # Force a floating-point copy. With an integer dtype the in-place updates
    # below would be truncated to 0 and the coefficients would never move.
    coefficients=np.array(initial_coefficients,dtype=float)
    ## Compute indicator i.e. y_i=+1 (labels are fixed, so do it once)
    indicator=(np.asarray(sentiment)==+1)
    for itr in range(max_iter):
        predictions=predict_probability(feature_matrix,coefficients)
        errors=indicator - predictions
        # errors is fixed for the whole sweep, so every coefficient is updated
        # from the same predictions.
        for j in range(len(coefficients)):
            derivative=feature_derivative(errors,feature_matrix[:,j])
            coefficients[j]+=step_size*derivative

        #Checking whether log likelihood is increasing
        if itr<=15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) or  (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp=compute_log_likelihood(feature_matrix,sentiment,coefficients)
            print('Iteration %d: log likelihood of observed labels = %.8f'%(itr,lp))

    return coefficients

In [20]:
# One coefficient per column of feature_matrix, so the vector stays the right
# length if the word list changes.
initial_coefficients=np.zeros(feature_matrix.shape[1])
step_size=1e-7
max_iter=301

### Quiz question: As each iteration of gradient ascent passes, does the log likelihood increase or decrease?

In [21]:
# Train the model; the log likelihood is printed as the iterations proceed.
coefficients=logistic_regression(
    feature_matrix=feature_matrix,
    sentiment=sentiment,
    initial_coefficients=initial_coefficients,
    step_size=step_size,
    max_iter=max_iter,
)

Iteration 0: log likelihood of observed labels = -36780.91768478
Iteration 1: log likelihood of observed labels = -36775.13434712
Iteration 2: log likelihood of observed labels = -36769.35713564
Iteration 3: log likelihood of observed labels = -36763.58603240
Iteration 4: log likelihood of observed labels = -36757.82101962
Iteration 5: log likelihood of observed labels = -36752.06207964
Iteration 6: log likelihood of observed labels = -36746.30919497
Iteration 7: log likelihood of observed labels = -36740.56234821
Iteration 8: log likelihood of observed labels = -36734.82152213
Iteration 9: log likelihood of observed labels = -36729.08669961
Iteration 10: log likelihood of observed labels = -36723.35786366
Iteration 11: log likelihood of observed labels = -36717.63499744
Iteration 12: log likelihood of observed labels = -36711.91808422
Iteration 13: log likelihood of observed labels = -36706.20710739
Iteration 14: log likelihood of observed labels = -36700.50205049
Iteration 15: log li

In [22]:
# Raw score (features · coefficients) for every row.
scores=feature_matrix.dot(coefficients)

In [23]:
# Class prediction: +1 for a strictly positive score, -1 otherwise.
predict_result=[+1 if score>0 else -1 for score in scores]


### Quiz question: How many reviews were predicted to have positive sentiment?

In [24]:
# Number of predictions equal to +1.
count=sum(1 for p in predict_result if p == 1)
count

25126

## Measuring Accuracy

In [28]:
# Convert the predictions to a NumPy array so they support element-wise arithmetic.
predict_result=np.array(predict_result)

In [47]:
def accuracy_measure(predictions,outcomes):
    """Fraction of predictions that equal the corresponding outcome.

    Parameters
    ----------
    predictions : array-like
        Predicted labels (lists are accepted as well as arrays).
    outcomes : array-like
        True labels, aligned with ``predictions``.

    Returns
    -------
    float
        ``(total - misclassified) / total``, where ``total`` is the length of
        the first axis of ``predictions``.

    Raises
    ------
    ZeroDivisionError
        If ``predictions`` is empty.
    """
    # Coerce so that plain Python lists work; list - list is a TypeError.
    predictions=np.asarray(predictions)
    outcomes=np.asarray(outcomes)
    total_points=predictions.shape[0]
    # A non-zero difference means prediction and outcome disagree.
    n_misclassified=np.count_nonzero(predictions - outcomes)
    accuracy = (total_points - n_misclassified)/total_points
    return accuracy

### Quiz question: What is the accuracy of the model on predictions made above? (round to 2 digits of accuracy)

In [48]:
# Accuracy of the thresholded predictions against the true sentiment labels.
accuracy_measure(predictions=predict_result,outcomes=sentiment)

0.7518653904130238

## Which words contribute most to positive and negative sentiments

In [52]:
# Skip index 0 (the intercept) so the remaining weights line up with
# important_words. The slice gets its own name: rebinding `coefficients`
# would strip one more leading entry on every re-run of this cell and
# silently misalign words and weights.
word_coefficients=list(coefficients[1:])

# (word, weight) pairs ordered from most positive to most negative weight.
word_coefficient_tuples=sorted(
    zip(important_words,word_coefficients),
    key=lambda pair:pair[1],
    reverse=True,
)

### Quiz question: Which word is not present in the top 10 "most positive" words?

In [55]:
# The list is sorted in descending order, so these are the ten largest coefficients.
word_coefficient_tuples[0:10]

[('great', 0.0665460841704577),
 ('love', 0.06589076292212326),
 ('easy', 0.06479458680257838),
 ('little', 0.04543562630842138),
 ('loves', 0.04497640139490604),
 ('well', 0.030135001092107067),
 ('perfect', 0.02973993710496846),
 ('old', 0.02007754103477538),
 ('nice', 0.018408707995268992),
 ('daughter', 0.017703199905701694)]

### Quiz question: Which word is not present in the top 10 "most negative" words?

In [57]:
# Tail of the descending-sorted list: the ten smallest (most negative) coefficients.
word_coefficient_tuples[-10:]

[('monitor', -0.02448210054589172),
 ('return', -0.026592778462247283),
 ('back', -0.027742697230661334),
 ('get', -0.028711552980192585),
 ('disappointed', -0.028978976142317068),
 ('even', -0.030051249236035808),
 ('work', -0.03306951529475272),
 ('money', -0.03898203728648711),
 ('product', -0.04151103339210889),
 ('would', -0.05386014844520313)]

0.015565696580423508