In [1]:
import warnings
warnings.filterwarnings("ignore")

import nltk
from os import getcwd
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

import pandas as pd
from nltk.corpus import twitter_samples 

### Downoald data to be available

In [2]:
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Defining two functions:
1. `process_tweet`: For preprocessing the tween and removing unnecessary parts.
2. `build_freq`: To get the list of all sentences and array of their corresponding sentiment labels and return a dict of this format: `{(word, label): count}`

In [3]:
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean


def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs

In [4]:
filePath = f"{getcwd()}/../tmp2/"
nltk.data.path.append(filePath)

In [5]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

### Splitting the data into train and test

In [6]:
# split the data into two pieces, one for training and one for testing (validation set) 
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg 
test_x = test_pos + test_neg

In [7]:
# combine positive and negative labels
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

# Print the shape train and test sets
print("train_y.shape = " + str(train_y.shape))
print("test_y.shape = " + str(test_y.shape))

train_y.shape = (8000, 1)
test_y.shape = (2000, 1)


### Building word frequencies

In [8]:
# create frequency dictionary
freqs = build_freqs(train_x, train_y)

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 11346


In [9]:
# test the function below
print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[0]))

This is an example of a positive tweet: 
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

This is an example of the processed version of the tweet: 
 ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


### Logistic Regression

In [10]:
def sigmoid(z): 
    '''
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    '''
    
    h = (1/(1 + np.exp(-z)))
    
    return h

In [11]:
# Testing your function 
if (sigmoid(0) == 0.5):
    print('SUCCESS!')
else:
    print('Oops!')

if (sigmoid(4.92) == 0.9927537604041685):
    print('CORRECT!')
else:
    print('Oops again!')

SUCCESS!
CORRECT!


### Gradient Descent

In [12]:
def gradient_descent(X, y, theta, lr, epochs):
    m = y.shape[0]
    print(X.shape, theta.shape, y.shape)
    for i in range(epochs):
        z = np.dot(X, theta)
        y_hat = sigmoid(z)
        cost = - (1/m) * np.sum(np.dot(y.T, np.log(y_hat) + np.dot((1 - y).T, np.log(1 - y_hat))))
        theta = theta - (lr/m) * np.dot(X.T, (y_hat - y))
    return theta, float(cost)

In [13]:
# Check the function
# Construct a synthetic test case using numpy PRNG functions
np.random.seed(1)
# X input is 10 x 3 with ones for the bias terms
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# Y Labels are 10 x 1
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

# Apply gradient descent
tmp_theta, tmp_cost = gradient_descent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
print(f"The cost after training is {tmp_cost:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")

(10, 3) (3, 1) (10, 1)
The cost after training is 2.29483927.
The resulting vector of weights is [4.1e-07, 0.00035658, 7.309e-05]


### Extract tweet freatures

In [14]:
def extract_features(tweet, freqs):
    all = process_tweet(tweet)
    
    x = np.zeros((1, 3))
    # bias
    x[0, 0] = 1
    
    # feature 1: positive summation
    x[0, 1] = sum([freqs.get((w, 1), 0) for w in all])
    
    # feature 2: negative summation
    x[0, 2] = sum([freqs.get((w, 0), 0) for w in all])
    
    assert(x.shape == (1, 3))
    
    return x
    

In [15]:
extract_features(train_x[0], freqs)

array([[1.00e+00, 3.02e+03, 6.10e+01]])

### Train the Model

In [16]:
# collect the features 'x' and stack them into a matrix 'X'
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)

# training labels corresponding to X
Y = train_y

# Apply gradient descent
theta, cost = gradient_descent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {cost:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

(8000, 3) (3, 1) (8000, 1)
The cost after training is 345.48699932.
The resulting vector of weights is [7e-08, 0.0005239, -0.00055517]


### Predict on Test 

In [17]:
def predict_tweet(tweet, freqs, theta):
    x = extract_features(tweet, freqs)
    
    z = np.dot(x, theta)
    y_hat = sigmoid(z)
    
    return int(y_hat > 0.5)

In [18]:
random_idx = np.random.randint(0, len(test_x), size=10)
for j in random_idx:
    print(test_x[j])
    pred = predict_tweet(test_x[j], freqs, theta)
    print(f"Predicted: {pred} | True: {test_y[j][0]}\n")

@Garfyville In Bath? :) Even we don’t have wheelie bins, nowhere to put them @paulmutton
Predicted: 1 | True: 1.0

I'm so frustrated with my planks. The times are super inconsistent, and I have no idea what I'm doing wrong. :(
Predicted: 0 | True: 0.0

@Glanny_ @soL_Lyah @_wattie I told him to suicide and kill both of you, but you ran away forever :(
Predicted: 0 | True: 0.0

why won't justin come to Scotland :(
Predicted: 0 | True: 0.0

I've run out of bread and I don't feel well enough to make any. :(
Predicted: 0 | True: 0.0

I miss those convo's so bad damn :(
Predicted: 0 | True: 0.0

@deano042 @RealKrisTravis WHAT :(
Predicted: 0 | True: 0.0

@anime_narutoINA thank you min :)
Predicted: 1 | True: 1.0

@namcew make use of the masquerade mask!!!! Lol #zorroreturms :-)
Predicted: 1 | True: 1.0

@KylieDeVille Happy Birthday!! Enjoy your vacation!:)
Predicted: 1 | True: 1.0



### Evaluation

In [19]:
X_test = np.zeros((len(test_x), 3))
for i in range(len(test_x)):
    X_test[i, :]= extract_features(test_x[i], freqs)

In [20]:
def model(X, theta):
    z = np.dot(X, theta)
    y_hat = sigmoid(z)
    
    return y_hat

In [21]:
y_pred_proba = model(X_test, theta)
y_pred = (y_pred_proba > 0.5)

print(f"Test Accuracy: {(y_pred == test_y).mean()}")

Test Accuracy: 0.995


### My Example

In [22]:
predict_tweet("@saeid I think today was good for me, beause I've earned 100$.", freqs, theta)

1