# Homework 2

Logistic regression for sentiment classification of product reviews, implemented from scratch.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the review data set (expects amazon_baby_subset.csv in the working directory).
products = pd.read_csv('./amazon_baby_subset.csv')
# NOTE: important_words.json is not read with pandas here; it is loaded with the
# standard json module in a later cell.

In [3]:
# Peek at the first ten product names as a sanity check of the loaded data.
products['name'].head(10)

0    Stop Pacifier Sucking without tears with Thumb...
1      Nature's Lullabies Second Year Sticker Calendar
2      Nature's Lullabies Second Year Sticker Calendar
3                          Lamaze Peekaboo, I Love You
4    SoftPlay Peek-A-Boo Where's Elmo A Children's ...
5                            Our Baby Girl Memory Book
6    Hunnt&reg; Falling Flowers and Birds Kids Nurs...
7    Blessed By Pope Benedict XVI Divine Mercy Full...
8    Cloth Diaper Pins Stainless Steel Traditional ...
9    Cloth Diaper Pins Stainless Steel Traditional ...
Name: name, dtype: object

In [4]:
# Load the list of most frequent words from its JSON file.
import json
with open('important_words.json', 'r') as words_file:
    raw_words = json.load(words_file)
# Coerce every entry to a plain str; the words are used as column names later on.
important_words = list(map(str, raw_words))

### 1. Convert text into numerical data

In [5]:
# Clean the raw data: replace missing (NaN) values in the 'review' column with
# an empty string so every review is a string.
products = products.fillna({'review': ''})

In [6]:
def remove_punctuation(text):
    """Return `text` with every ASCII punctuation character removed.

    The previous implementation, `text.translate(None, string.punctuation)`,
    only works on Python 2 byte strings: it raises TypeError for unicode text
    and for every str on Python 3. Filtering character by character gives the
    same result on byte strings and also works in those other cases.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        `text` without any character listed in `string.punctuation`.
    """
    import string
    # A set makes the per-character membership test O(1).
    punctuation = frozenset(string.punctuation)
    return ''.join(ch for ch in text if ch not in punctuation)

In [7]:
# Add a 'review_clean' column: each review passed through remove_punctuation.
products['review_clean'] = products['review'].apply(remove_punctuation)

In [8]:
# Count words from scratch: for each important word, add a column named after
# the word that holds how many times it occurs as a whitespace-separated token
# in the cleaned review.
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s: s.split().count(word))

In [9]:
# Flag reviews that mention 'perfect': 1 when the word count is non-zero, 0 otherwise.
products['contains_perfect'] = (products['perfect'] != 0).astype(int)

In [10]:
sum(products['contains_perfect']) # the number of reviews that contain the word 'perfect'

2955

In [11]:
# Convert DataFrames to matrixes that I will use in the model
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant'] + features
    features_frame = dataframe[features]
    feature_matrix = features_frame.as_matrix()
    label_sarray = dataframe[label]
    label_array = label_sarray.as_matrix()
    return (feature_matrix, label_array)

In [12]:
# Build the feature matrix (constant column followed by one count column per
# important word) and the array of sentiment labels.
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')

In [72]:
feature_matrix.shape # (rows, columns): 193 word-count features plus the constant column

(53072, 194)

In [14]:
type(feature_matrix) # the feature matrix is a plain numpy.ndarray (2-D, see its shape)

numpy.ndarray

### 2. Construct the Logistic Regression model

In [15]:
def predict_probability(feature_matrix, coefficients):
    """Produce a probabilistic estimate of P(y_i = +1 | x_i, w) for each row.

    The score of every row of `feature_matrix` is its dot product with
    `coefficients`; the link function 1 / (1 + exp(-score)) maps that score
    to an estimate that ranges between 0 and 1.
    """
    # Linear score of each data point under the current coefficients.
    linear_scores = np.dot(feature_matrix, coefficients)

    # Apply the link function and hand the probabilities back directly.
    return 1.0 / (1 + np.exp(-1 * linear_scores))

In [16]:
# Check the function predict_probability on a tiny hand-computed example
dummy_feature_matrix = np.array([[1.,2.,3.], [1.,-1.,-1]])
dummy_coefficients = np.array([1.,
                               3.,
                               -1.])

# Scores and probabilities worked out by hand for the two dummy rows.
correct_scores      = np.array( [ 1.*1. + 2.*3. + 3.*(-1.),          1.*1. + (-1.)*3. + (-1.)*(-1.) ] )
correct_predictions = np.array( [ 1./(1+np.exp(-correct_scores[0])), 1./(1+np.exp(-correct_scores[1])) ] )

dummy_predictions = predict_probability(dummy_feature_matrix, dummy_coefficients)

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('The following outputs must match ')
print('------------------------------------------------')
print('correct_predictions           = %s' % correct_predictions)
print('output of predict_probability = %s' % dummy_predictions)

# Fail loudly on a mismatch instead of relying on a visual comparison of the
# two printed lines.
np.testing.assert_allclose(dummy_predictions, correct_predictions)

The following outputs must match 
------------------------------------------------
correct_predictions           = [ 0.98201379  0.26894142]
output of predict_probability = [ 0.98201379  0.26894142]


In [17]:
def feature_derivative(errors, feature):
    """Return the derivative term for a single feature.

    The value is the dot product of `errors` with `feature`, i.e. the sum
    over data points of error times feature value.
    """
    return np.dot(errors, feature)

In [18]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Return the log likelihood of the observed labels under the model.

    Computes sum_i [ (1[y_i = +1] - 1) * score_i - log(1 + exp(-score_i)) ]
    with score_i = feature_matrix[i] . coefficients.

    Parameters
    ----------
    feature_matrix : array-like
        One row per data point, one column per coefficient.
    sentiment : array-like
        Labels; entries equal to +1 count as positive, everything else as
        negative.
    coefficients : array-like
        Model coefficients, one per column of `feature_matrix`.

    Returns
    -------
    float
        The log likelihood.
    """
    # np.asarray makes the comparison element-wise even for a plain list of
    # labels (a list compared with +1 would yield a single False).
    indicator = (np.asarray(sentiment) == +1)
    scores = np.dot(feature_matrix, coefficients)
    # log(1 + exp(-s)) written as logaddexp(0, -s): the naive form overflows
    # to inf for large negative scores and turns the whole sum into -inf.
    log_terms = np.logaddexp(0., -scores)
    lp = np.sum((indicator - 1) * scores - log_terms)
    return lp

In [20]:
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    """Fit logistic-regression coefficients by gradient ascent.

    Every iteration computes the predicted probabilities with
    predict_probability(), forms errors = 1[y_i = +1] - prediction, and moves
    each coefficient by `step_size` times its feature_derivative(). The log
    likelihood is printed on a thinning schedule of iterations so progress
    can be monitored.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        2-D array, one row per data point and one column per coefficient.
    sentiment : array-like
        Labels; entries equal to +1 are the positive class.
    initial_coefficients : array-like
        Starting coefficients, one per column of `feature_matrix`. The input
        is copied, never modified.
    step_size : float
        Multiplier applied to each derivative.
    max_iter : int
        Number of iterations to run.

    Returns
    -------
    numpy.ndarray
        The coefficients after `max_iter` iterations (float array).
    """
    # Force a float copy: with integer initial coefficients the in-place
    # updates below would be truncated back to integers.
    coefficients = np.array(initial_coefficients, dtype=float)

    # Indicator value for (y_i = +1); it does not change between iterations.
    indicator = (np.asarray(sentiment) == +1)

    # range() instead of xrange() so the loop runs on Python 2 and Python 3.
    for itr in range(max_iter):
        # Predict P(y_i = +1|x_i,w) using the predict_probability() function
        predictions = predict_probability(feature_matrix, coefficients)

        # Errors are computed once per iteration, before any coefficient moves.
        errors = indicator - predictions

        for j in range(len(coefficients)): # loop over each coefficient
            # feature_matrix[:,j] is the feature column associated with coefficients[j]
            derivative = feature_derivative(errors, feature_matrix[:, j])

            # add the step size times the derivative to the current coefficient
            coefficients[j] += step_size * derivative

        # Checking whether log likelihood is increasing. Reported for every one
        # of the first iterations, then every 10th, 100th, 1000th and 10000th.
        if (itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0)
                or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0):
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            # The iteration number is padded to the digit count of max_iter.
            print('iteration %*d: log likelihood of observed labels = %.8f' %
                  (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [21]:
# Train the model starting from all-zero coefficients. The number of
# coefficients is taken from the feature matrix itself, so it stays correct
# if the set of feature columns changes.
coefficients = logistic_regression(
    feature_matrix,
    sentiment,
    initial_coefficients=np.zeros(feature_matrix.shape[1]),
    step_size=1e-7,
    max_iter=301,
)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed labels = -36700.5020

### 3. Predicting 

In [22]:
# Predicting sentiments
scores = np.dot(feature_matrix, coefficients)

In [30]:
# Classify sentiments with threshold value 0 in scores
class_predictions = pd.Series(scores).apply(lambda x: 1 if x > 0 else -1) # '.apply()' is only available to pd.DataFrame
type(class_predictions)

pandas.core.series.Series

In [38]:
# How many reviews are predicted to have positive sentiment? count the number of 1 & -1, the function 'table' in R
unique, counts = np.unique(class_predictions, return_counts = True)
print unique, counts

[-1  1] [27946 25126]


### 4. Measuring accuracy

In [49]:
# Inspect the types of the two objects that are compared with each other
# when the accuracy is computed.
print type(sentiment)
print type(class_predictions)

<type 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [52]:
# Accuracy = number of reviews whose predicted class equals the true
# sentiment, divided by the total number of reviews.
matches = (sentiment == class_predictions)
num_correct = matches.sum()
total_data = len(sentiment)
accuracy = num_correct / float(total_data)
print('Sentiments correctly classified: %d' % num_correct)
print('Total sentiments: %d' % total_data)
print('---------------------------------------------')
print('Accuracy: %.2f' % accuracy)

Sentiments correctly classified: 39903
Total sentiments: 53072
---------------------------------------------
Accuracy: 0.75


### 5. Which words contribute most to positive & negative sentiments

In [63]:
# Pair every important word with its learned coefficient. The first coefficient
# is the intercept and is skipped. The slice is stored under a new name instead
# of overwriting `coefficients`: overwriting would drop one more leading entry
# each time this cell is re-run and silently misalign words and weights.
word_coefficients = list(coefficients[1:]) # exclude intercept
word_coefficient_tuples = zip(important_words, word_coefficients)
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True) # sort all tuples in descending order

In [65]:
word_coefficient_tuples[:10] # the 10 'most positive' words: largest coefficients first

[('one', 0.066546084170457695),
 ('great', 0.065890762922123244),
 ('like', 0.064794586802578394),
 ('easy', 0.045435626308421372),
 ('much', 0.044976401394906038),
 ('old', 0.03013500109210707),
 ('even', 0.029739937104968459),
 ('seat', 0.020077541034775381),
 ('perfect', 0.018408707995268992),
 ('good', 0.01770319990570169)]

In [68]:
word_coefficient_tuples[-10:] # the 10 'most negative' words: smallest coefficients, most negative last

[('money', -0.02448210054589172),
 ('waste', -0.026592778462247283),
 ('still', -0.027742697230661327),
 ('well', -0.028711552980192581),
 ('however', -0.028978976142317068),
 ('first', -0.030051249236035804),
 ('bottles', -0.03306951529475273),
 ('day', -0.038982037286487116),
 ('bought', -0.041511033392108897),
 ('use', -0.053860148445203128)]