In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv("../course_3_data/amazon_baby_subset.csv")
df = df.fillna({'review':''}) 

# Removing Punctuations

In [3]:
def remove_punctuation(text):
    import string
    translation = text.maketrans("","",string.punctuation)

    return text.translate(translation) 

df['review_clean'] = df['review'].apply(remove_punctuation)

# Importing important words

In [4]:
important_words=pd.read_json("../course_3_data/important_words.json")
important_words=important_words[0]
important_words

0        baby
1         one
2       great
3        love
4         use
        ...  
188    babies
189       won
190       tub
191    almost
192    either
Name: 0, Length: 193, dtype: object

In [6]:
for word in important_words:
    df[word] = df['review_clean'].apply(lambda s : s.split().count(word))


In [7]:
df['contains_perfect']=df['perfect'].apply(lambda x:1 if x>=1 else 0)

In [8]:
df['contains_perfect'].sum()

2955

In [9]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = features.insert(0,'constant')
    features_frame = dataframe[features]
    feature_matrix = features_frame.to_numpy()
    label_sarray = dataframe[label]
    label_array = label_sarray.to_numpy()
    return(feature_matrix, label_array)

In [10]:
features=df.columns[5:-2]
label='sentiment'
feature_matrix ,label_array=get_numpy_data(df,features,label)


In [11]:
feature_matrix.shape

(53072, 193)

In [12]:
import math
def predict_probability(feature_matrix, coefficients):
    # Take dot product of feature_matrix and coefficients  
    # YOUR CODE HERE
    score = np.dot(feature_matrix,coefficients)
    
    # Compute P(y_i = +1 | x_i, w) using the link function
    # YOUR CODE HERE
    predictions=[]
    for i in score:
        predictions.append(1/(1+math.exp(-i)))
    
    # return predictions
    return predictions

In [13]:
def feature_derivative(errors, feature):     
    # Compute the dot product of errors and feature
    derivative = np.dot(errors,feature)
        # Return the derivative
    return derivative

In [14]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    lp = np.sum((indicator-1)*scores - np.log(1. + np.exp(-scores)))
    return lp

In [15]:
from math import sqrt,exp
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    
    for itr in range(max_iter):
        # Predict P(y_i = +1|x_1,w) using your predict_probability() function
        # YOUR CODE HERE
        predictions = predict_probability(feature_matrix,coefficients)

        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)

        # Compute the errors as indicator - predictions
        errors = indicator - predictions

        for j in range(len(coefficients)): # loop over each coefficient
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j]
            # compute the derivative for coefficients[j]. Save it in a variable called derivative
            # YOUR CODE HERE
            derivative = feature_derivative(errors,feature_matrix[:,j])

            # add the step size times the derivative to the current coefficient
            # YOUR CODE HERE
            coefficients[j]=coefficients[j]+step_size*derivative

        # Checking whether log likelihood is increasing
#         if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
#         or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
#             lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
#             print('iteration %*d: log likelihood of observed labels = %.8f' % \
#                 (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [16]:
initial_coefficients = np.zeros(feature_matrix.shape[1])
step_size = 1e-7
max_iter = 301
sentiment=label_array

In [17]:
coefficients=logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)

In [18]:
def predict_sentiment(feature_matrix,coefficients):
    predictions=[]
    scores=np.dot(feature_matrix,coefficients)
    for i in scores:
        if i>0:
            predictions.append(1)
        else:
            predictions.append(-1)
    return predictions
    

In [19]:
predictions=predict_sentiment(feature_matrix,coefficients)

In [20]:
len(predictions)

53072

In [21]:
def accuracy(df,predictions):
    true_predictions = df[df['sentiment'] == predictions]
    accuracy=len(true_predictions)/len(df)
    return accuracy

In [22]:
accuracy(df,predictions)

0.7516015978293639

In [23]:
coefficients = list(coefficients[1:]) # exclude intercept
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples_neg = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=False)

word_coefficient_tuples_neg[:11]

[('would', -0.0538786087871293),
 ('product', -0.04151769226927977),
 ('money', -0.03898507885996874),
 ('work', -0.033077842770631034),
 ('even', -0.030060980498469735),
 ('disappointed', -0.0289804152407098),
 ('get', -0.028726953103850834),
 ('back', -0.02775326994852898),
 ('return', -0.026594878369219397),
 ('monitor', -0.024489813382533455),
 ('waste', -0.024043447860526233)]

In [24]:
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)
word_coefficient_tuples[:11]

[('great', 0.06653933724512921),
 ('love', 0.06588699426244836),
 ('easy', 0.06478847856139208),
 ('little', 0.04542728408710588),
 ('loves', 0.0449751555808955),
 ('well', 0.030126660665315767),
 ('perfect', 0.029738104690617856),
 ('old', 0.020070748688667327),
 ('nice', 0.018405027621086652),
 ('daughter', 0.017699039676574584),
 ('soft', 0.017568191870928618)]