# Sentiment Analysis Using Logistic Regression

In [1]:
#Import the libraries
import nltk
from os import getcwd
# Directory "../tmp2/" resolved against the current working directory.
# Presumably this is where the twitter_samples corpus was downloaded - TODO confirm.
filePath = f"{getcwd()}/../tmp2/"
# Register the directory as an additional entry in NLTK's data path list.
nltk.data.path.append(filePath)

In [2]:
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples
from utils import process_tweet, build_freqs

In [3]:
#Get the data
# Read the tweet texts from the two twitter_samples corpus files, keeping the
# positive and the negative tweets in separate collections.
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [4]:
#Split for training and testing
# The first 4000 tweets of each class go to training, the remainder to testing.
pos_train, pos_test = positive_tweets[:4000], positive_tweets[4000:]
neg_train, neg_test = negative_tweets[:4000], negative_tweets[4000:]

# Positive tweets always come first; the label arrays rely on this ordering.
X_train = pos_train + neg_train
X_test = pos_test + neg_test

In [5]:
#Create labels for training and testing
# Column vectors of labels: 1 for every positive tweet followed by 0 for every
# negative tweet, matching the positive-then-negative order of X_train / X_test.
y_train = np.concatenate(
    (np.ones((len(pos_train), 1)), np.zeros((len(neg_train), 1))),
    axis=0,
)
y_test = np.concatenate(
    (np.ones((len(pos_test), 1)), np.zeros((len(neg_test), 1))),
    axis=0,
)

In [6]:
# Sanity check: each label array should have one row per tweet and one column.
print("Y train shape: ", y_train.shape)
print("Y test shape: ", y_test.shape)

Y train shape:  (8000, 1)
Y test shape:  (2000, 1)


In [7]:
#Create the frequency table
# build_freqs (imported from utils) receives the training tweets and their
# labels; extract_features later looks entries up with (word, label) keys.
freqs = build_freqs(X_train, y_train)
# Number of entries in the table (shown as the cell output).
len(freqs)

11346

In [8]:
#Process tweets
# Show the first training tweet next to the token list process_tweet returns for it.
print("Sample tweet: ", X_train[0])
print("Processed tweet: ", process_tweet(X_train[0]))

Sample tweet:  #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
Processed tweet:  ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


# Logistic Regression

In [9]:
#Define the sigmoid function
def sigmoid(z):
    """Apply the logistic function 1 / (1 + e^(-z)) to ``z``.

    Args:
        z: a scalar or a NumPy array; arrays are processed element-wise
           because the computation goes through ``np.exp``.

    Returns:
        The sigmoid of ``z``, with the same shape as ``z``.
    """
    denominator = 1 + np.exp(-z)
    return 1.0 / denominator

In [10]:
#Test the sigmoid function
# sigmoid(0) = 1 / (1 + e^0), which is exactly 0.5
print("Correct" if sigmoid(0) == 0.5 else "Incorrect")

Correct


In [11]:
#Function for gradient descent

def gradient_descent(x, y, theta, alpha, num_iters):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: feature matrix with one row per example (m rows).
        y: labels, one row per example, shaped so that ``h - y`` lines up
           with ``sigmoid(np.dot(x, theta))`` (an (m, 1) column in this notebook).
        theta: initial weights (an (n, 1) column in this notebook). The array
           passed in is NOT modified; a float copy is updated instead.
        alpha: learning rate.
        num_iters: number of gradient-descent updates to perform.

    Returns:
        (J, theta): ``J`` is the cost as a Python float, evaluated during the
        last iteration *before* that iteration's weight update (``nan`` when
        ``num_iters`` is 0, since no cost is ever computed). ``theta`` is the
        array of updated weights.
    """
    #Get the length of matrix
    m = len(x)

    # Work on a float copy so the caller's array is left untouched and an
    # integer-typed initial theta cannot break the update.
    theta = np.array(theta, dtype=float)

    # Defined up front so that num_iters == 0 does not leave J unbound.
    J = np.nan

    for _ in range(num_iters):

        #Get the sigmoid of z = x . theta
        h = sigmoid(np.dot(x, theta))

        #Calculate the cost function (average cross-entropy).
        # Note: if h saturates to exactly 0 or 1 the log terms become infinite.
        J = (-1.0 / m) * (np.dot(y.T, np.log(h)) + np.dot((1 - y).T, np.log(1 - h)))

        #Update the weights theta
        theta = theta - (alpha / m) * np.dot(x.T, (h - y))

    # J is a one-element array here; squeeze it to 0-d first because calling
    # float() on an array with ndim > 0 is deprecated in recent NumPy versions.
    J = float(np.squeeze(J))
    return J, theta

In [12]:
#Check the gradient descent
# Fixed seed so the synthetic data (and therefore the printed values) is reproducible
np.random.seed(1)

#Input and output
# 10 synthetic examples: a bias column of ones next to two random features in [0, 2000)
X_ = np.concatenate((np.ones((10, 1)), np.random.rand(10, 2) * 2000), axis=1)
# Random binary labels (a draw above 0.35 becomes 1.0, otherwise 0.0)
y_ = (np.random.rand(10, 1) > 0.35).astype(float)

#Apply the gradient descent
tmp_J, tmp_theta = gradient_descent(X_, y_, np.zeros((3, 1)), alpha=1e-8, num_iters=700)
print(f"The cost after training is {tmp_J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")

The cost after training is 0.67094970.
The resulting vector of weights is [4.1e-07, 0.00035658, 7.309e-05]


In [13]:
#Extract the features
def extract_features(tweet, freqs):
    """Build the 1 x 3 feature row for a single tweet.

    Args:
        tweet: the raw tweet text.
        freqs: mapping looked up with ``(word, 1.0)`` and ``(word, 0.0)`` keys;
               missing keys count as 0.

    Returns:
        A (1, 3) float array ``[bias, positive_total, negative_total]`` where
        the bias is 1 and the totals are the summed ``freqs`` entries of the
        tweet's processed words for label 1 and label 0 respectively.
    """
    # running totals of the frequency-table entries for each label
    positive_total = 0.0
    negative_total = 0.0

    # process_tweet tokenizes, stems, and removes stopwords
    for token in process_tweet(tweet):
        positive_total += freqs.get((token, 1.0), 0)
        negative_total += freqs.get((token, 0.0), 0)

    # bias term (always 1) first, followed by the two totals
    x = np.array([[1.0, positive_total, negative_total]])

    assert(x.shape == (1, 3))
    return x

In [14]:
# Check your function

# Feature row of the first training tweet.
tmp1 = extract_features(X_train[0], freqs)
print(tmp1)
# Made-up words: both totals stay at 0 in the recorded output, leaving only the bias term.
tmp2 = extract_features('blorb bleeeeb bloooob', freqs)
print(tmp2)

[[1.00e+00 3.02e+03 6.10e+01]]
[[1. 0. 0.]]


In [15]:
#Training the model

# build the design matrix 'X': one 3-element feature row per training tweet
X = np.zeros((len(X_train), 3))
for row_idx, tweet in enumerate(X_train):
    X[row_idx, :] = extract_features(tweet, freqs)

# training labels corresponding to X
Y = y_train

# Apply gradient descent, starting from all-zero weights
J, theta = gradient_descent(X, Y, np.zeros((3, 1)), alpha=1e-9, num_iters=1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.24216529.
The resulting vector of weights is [7e-08, 0.0005239, -0.00055517]


In [16]:
#Test the model
def predict_tweet(tweet, freqs, theta):
    """Score one tweet with the trained logistic-regression weights.

    Args:
        tweet: the raw tweet text.
        freqs: frequency table handed on to ``extract_features``.
        theta: weight vector (a (3, 1) array in this notebook).

    Returns:
        ``sigmoid(x . theta)`` for the tweet's feature row ``x``. With a
        (3, 1) ``theta`` this is a (1, 1) array, not a plain float.
    """
    features = extract_features(tweet, freqs)
    logit = np.dot(features, theta)
    return sigmoid(logit)

In [17]:
# Score a few hand-written examples; repeating 'great' shows how the score
# grows as the summed positive counts grow.
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    # predict_tweet returns a one-element array. Extract the Python float
    # explicitly: letting '%f' convert an array with ndim > 0 implicitly is
    # deprecated in recent NumPy versions.
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta).item()))

I am happy -> 0.518580
I am bad -> 0.494339
this movie should have been great. -> 0.515331
great -> 0.515464
great great -> 0.530898
great great great -> 0.546273
great great great great -> 0.561561


In [18]:
#Check the performance using test set
def test_logistic_regression(test_x, test_y, freqs, theta):
    """Compute the classification accuracy of the model on a labelled set.

    Args:
        test_x: iterable of raw tweets.
        test_y: labels for ``test_x`` (an (m, 1) array in this notebook).
        freqs: frequency table handed on to ``predict_tweet``.
        theta: trained weight vector.

    Returns:
        The fraction of tweets whose thresholded prediction equals the label.
    """
    # hard labels: 1.0 when the predicted probability exceeds 0.5, otherwise 0
    y_hat = [
        1.0 if predict_tweet(tweet, freqs, theta) > 0.5 else 0
        for tweet in test_x
    ]

    # y_hat is a list while test_y is an (m, 1) array; squeezing test_y to one
    # dimension lets '==' compare the two element by element
    matches = (y_hat == np.squeeze(test_y))
    accuracy = matches.sum() / len(test_y)

    return accuracy

In [19]:
# Evaluate the trained weights on the held-out test tweets.
tmp_accuracy = test_logistic_regression(X_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950


In [21]:
#Error analysis
# Print every test tweet whose thresholded prediction disagrees with its label.
print('Label Predicted Tweet')
for x,y in zip(X_test,y_test):
    # Both the label row and the prediction are one-element arrays. Pull out
    # plain Python floats so the comparison and the '%d' / '%f' formatting do
    # not rely on the deprecated implicit array-to-scalar conversion.
    label = y.item()
    y_hat = predict_tweet(x, freqs, theta).item()

    if label != float(y_hat > 0.5):
        # process the tweet once and reuse the tokens for both printouts
        tokens = process_tweet(x)
        print('THE TWEET IS:', x)
        print('THE PROCESSED TWEET IS:', tokens)
        # non-ASCII characters are dropped by the encode step; the result is printed as bytes
        print('%d\t%0.8f\t%s' % (label, y_hat, ' '.join(tokens).encode('ascii', 'ignore')))

Label Predicted Tweet
THE TWEET IS: @jaredNOTsubway @iluvmariah @Bravotv Then that truly is a LATERAL move! Now, we all know the Queen Bee is UPWARD BOUND : ) #MovingOnUp
THE PROCESSED TWEET IS: ['truli', 'later', 'move', 'know', 'queen', 'bee', 'upward', 'bound', 'movingonup']
1	0.49996890	b'truli later move know queen bee upward bound movingonup'
THE TWEET IS: @MarkBreech Not sure it would be good thing 4 my bottom daring 2 say 2 Miss B but Im gonna be so stubborn on mouth soaping ! #NotHavingit :p
THE PROCESSED TWEET IS: ['sure', 'would', 'good', 'thing', '4', 'bottom', 'dare', '2', 'say', '2', 'miss', 'b', 'im', 'gonna', 'stubborn', 'mouth', 'soap', 'nothavingit', ':p']
1	0.48622857	b'sure would good thing 4 bottom dare 2 say 2 miss b im gonna stubborn mouth soap nothavingit :p'
THE TWEET IS: I'm playing Brain Dots : ) #BrainDots
http://t.co/UGQzOx0huu
THE PROCESSED TWEET IS: ["i'm", 'play', 'brain', 'dot', 'braindot']
1	0.48370665	b"i'm play brain dot braindot"
THE TWEET IS: I'm p

In [23]:
#Predict for own tweet
my_tweet = "This is too good to be true."
# show the tokens that remain after preprocessing
print(process_tweet(my_tweet))
# raw model output for the tweet (a one-element array)
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
# anything above 0.5 is reported as positive
print("Positive Sentiment" if y_hat > 0.5 else "Negative Sentiment")

['good', 'true']
[[0.51334465]]
Positive Sentiment


In [24]:
"""As we can see from the sample tweet, logistic regression cannot understand slang, sarcasm, or the positions of the words.
It just considers all the words regardless of their order and makes its prediction from them."""

'As we can see from the sample tweet, logistic regression cannot understand slang, sarcasm, or the positions of the words.\nIt just considers all the words regardless of their order and makes its prediction from them.'