In [12]:
import nltk 
from os import getcwd

In [13]:
# Register the ../tmp2/ directory (relative to the current working directory)
# as an additional place for NLTK to look up its data files.
filePath = getcwd() + "/../tmp2/"
nltk.data.path.append(filePath)

In [14]:
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples 

from utils import process_tweet, build_freqs

In [15]:
# Read the raw tweet strings of both sentiment classes from the NLTK
# twitter_samples corpus (positive file first, then negative).
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings('positive_tweets.json'),
    twitter_samples.strings('negative_tweets.json'),
)

In [16]:
# The first 4000 positive tweets are used for training; the rest are held out.
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]

In [17]:
# Same split for the negative class: first 4000 for training, rest held out.
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]


In [18]:
# Concatenate the classes with the positive tweets first, then the negatives.
train_x, test_x = train_pos + train_neg, test_pos + test_neg


In [19]:
# Label column vectors: a 1 for every positive tweet followed by a 0 for every
# negative tweet, stacked along axis 0.
train_y = np.concatenate((np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))), axis=0)
test_y = np.concatenate((np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))), axis=0)
print(f"train_y.shape= {train_y.shape}")
print(f"test_y.shape= {test_y.shape}")

train_y.shape= (8000, 1)
test_y.shape= (2000, 1)


In [20]:
# Build the frequency lookup from the training tweets and their labels.
# NOTE(review): build_freqs comes from utils and is not shown here. Later cells
# query the result with freqs.get((word, label), 0), so it is presumably a
# dict keyed by (word, label) pairs holding counts — confirm against utils.
freqs = build_freqs(train_x, train_y)

In [21]:
# Show the first training tweet, raw and then after process_tweet.
print(train_x[0], process_tweet(train_x[0]), sep="\n")

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [22]:
def sigmoid(z):
    """Apply the logistic function 1 / (1 + e^(-z)).

    Args:
        z: a scalar or NumPy array; arrays are transformed element-wise.

    Returns:
        The sigmoid of ``z``, with the same shape as ``z``.
    """
    return 1 / (1 + np.exp(-z))

In [23]:
# Sanity check: the sigmoid of 0 must be exactly 0.5.
print("Succes!" if sigmoid(0) == 0.5 else "Oops")


Succes!


In [34]:
def gradientDescent(x,y, theta, alpha, num_iters):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: feature matrix with one row per training example.
        y: label column vector with one row per training example.
        theta: initial weight column vector (one row per column of ``x``).
        alpha: learning rate.
        num_iters: number of gradient-descent updates to perform.

    Returns:
        A ``(J, theta)`` tuple. ``J`` is the cross-entropy cost as a Python
        float, evaluated in the last iteration *before* that iteration's
        weight update. ``theta`` holds the updated weights. When
        ``num_iters`` is 0 (or negative) no update happens: ``theta`` is
        returned unchanged and ``J`` is ``nan``.
    """
    m = x.shape[0]
    # Defined up front so that a zero-iteration call returns cleanly instead
    # of failing with UnboundLocalError on the float conversion below.
    J = np.nan
    for _ in range(num_iters):
        h = sigmoid(np.dot(x, theta))
        # Cost for the current weights (computed before they are updated).
        J = -1./m * (np.dot(y.transpose(), np.log(h)) + np.dot((1-y).transpose(), np.log(1-h)))
        theta = theta - (alpha/m) * np.dot(x.transpose(), (h-y))
    # J is a (1, 1) array after the loop. Squeeze it to 0-d first because
    # float() on arrays with ndim > 0 is deprecated since NumPy 1.25.
    J = float(np.squeeze(J))
    return J, theta


In [35]:
def extract_features(tweet, freqs):
    """Turn a tweet into a (1, 3) feature row.

    Args:
        tweet: the raw tweet string; it is tokenised with ``process_tweet``.
        freqs: mapping queried as ``freqs.get((word, label), 0)``.

    Returns:
        A (1, 3) array ``[bias, positive_total, negative_total]`` where the
        bias is 1 and the totals are the summed ``freqs`` values of the
        tweet's tokens under label 1.0 and label 0.0 respectively.
    """
    # Column 0 is the bias term; columns 1 and 2 start at zero.
    features = np.array([[1.0, 0.0, 0.0]])
    for token in process_tweet(tweet):
        features[0, 1] += freqs.get((token, 1.0), 0)
        features[0, 2] += freqs.get((token, 0.0), 0)

    assert features.shape == (1, 3)
    return features

In [36]:
# Spot-check the feature extractor on the first training tweet; the printed
# row is [bias, positive total, negative total].
tmp = extract_features(train_x[0],freqs)
print(tmp)

[[1.00e+00 3.02e+03 6.10e+01]]


In [37]:
# Design matrix: one (bias, positive total, negative total) row per training tweet.
X = np.zeros((len(train_x),3))
for i in range(len(train_x)):
    X[i, :] = extract_features(train_x[i],freqs)

Y = train_y
# Train from all-zero weights with learning rate 1e-9 for 1500 iterations.
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
# .tolist() converts the weights to plain Python floats, so the list prints as
# [7e-08, ...] on every NumPy version; NumPy 2 would otherwise render each
# element as np.float64(...).
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta).tolist()]}")


The cost after training is 0.24216529.
The resulting vector of weights is [7e-08, 0.0005239, -0.00055517]


In [38]:
def predict_tweet(tweet, freqs, theta):
    """Score a single tweet with the logistic-regression model.

    Args:
        tweet: the raw tweet string.
        freqs: frequency mapping handed on to ``extract_features``.
        theta: weight vector multiplied against the extracted feature row.

    Returns:
        ``sigmoid(features . theta)`` as a NumPy array.
    """
    features = extract_features(tweet, freqs)
    return sigmoid(np.dot(features, theta))

In [39]:
# Try the trained model on a few hand-written examples.
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great great']:
    # predict_tweet returns a (1, 1) array here. Reduce it to a plain float
    # before '%f' formatting: implicit scalar conversion of arrays with
    # ndim > 0 is deprecated since NumPy 1.25.
    print('%s -> %f' % (tweet, float(np.squeeze(predict_tweet(tweet, freqs, theta)))))

I am happy -> 0.518580
I am bad -> 0.494339
this movie should have been great. -> 0.515331
great great -> 0.530898


In [40]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    """Measure classification accuracy on a set of tweets.

    Args:
        test_x: iterable of raw tweet strings.
        test_y: array of labels, squeezed before comparison.
        freqs: frequency mapping passed through to ``predict_tweet``.
        theta: trained weight vector.

    Returns:
        The fraction of tweets whose thresholded prediction (1 when the
        predicted probability exceeds 0.5, else 0) equals the label.
    """
    predictions = [
        1 if predict_tweet(tweet, freqs, theta) > 0.5 else 0
        for tweet in test_x
    ]
    matches = predictions == np.squeeze(test_y)
    return matches.sum() / len(test_x)

In [41]:
# Evaluate the trained weights on the held-out tweets and report the accuracy
# to four decimal places.
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950


In [42]:
# Error analysis: list every test tweet whose thresholded prediction disagrees
# with its label.
print('Label Predicted Tweet')
for x,y in zip(test_x,test_y):
    y_hat = predict_tweet(x, freqs, theta)
    # y is a one-element row of test_y and y_hat is a (1, 1) array. Reduce both
    # to plain floats: the implicit scalar conversion that '%d' / '%f' would
    # otherwise apply to arrays with ndim > 0 is deprecated since NumPy 1.25.
    label = float(np.squeeze(y))
    prob = float(np.squeeze(y_hat))

    if label != (prob > 0.5):
        tokens = process_tweet(x)  # processed once, reused by both prints
        print('THE TWEET IS:', x)
        print('THE PROCESSED TWEET IS:', tokens)
        print('%d\t%0.8f\t%s' % (label, prob, ' '.join(tokens).encode('ascii', 'ignore')))

Label Predicted Tweet
THE TWEET IS: @jaredNOTsubway @iluvmariah @Bravotv Then that truly is a LATERAL move! Now, we all know the Queen Bee is UPWARD BOUND : ) #MovingOnUp
THE PROCESSED TWEET IS: ['truli', 'later', 'move', 'know', 'queen', 'bee', 'upward', 'bound', 'movingonup']
1	0.49996890	b'truli later move know queen bee upward bound movingonup'
THE TWEET IS: @MarkBreech Not sure it would be good thing 4 my bottom daring 2 say 2 Miss B but Im gonna be so stubborn on mouth soaping ! #NotHavingit :p
THE PROCESSED TWEET IS: ['sure', 'would', 'good', 'thing', '4', 'bottom', 'dare', '2', 'say', '2', 'miss', 'b', 'im', 'gonna', 'stubborn', 'mouth', 'soap', 'nothavingit', ':p']
1	0.48622857	b'sure would good thing 4 bottom dare 2 say 2 miss b im gonna stubborn mouth soap nothavingit :p'
THE TWEET IS: I'm playing Brain Dots : ) #BrainDots
http://t.co/UGQzOx0huu
THE PROCESSED TWEET IS: ["i'm", 'play', 'brain', 'dot', 'braindot']
1	0.48370665	b"i'm play brain dot braindot"
THE TWEET IS: I'm p

In [53]:
# Classify a custom tweet: show its tokens, the predicted probability, and the
# resulting sentiment (positive when the probability exceeds 0.5).
my_tweet = 'It was so scary I could not breathe'
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
print('Positive sentiment' if y_hat > 0.5 else 'Negative sentiment')

['scari', 'could', 'breath']
[[0.49533206]]
Negative sentiment
