In [1]:
import nltk                                  # Python library for NLP
from nltk.corpus import twitter_samples      # sample Twitter dataset from NLTK
import numpy as np
import pandas as pd
import re                                  
import string   
import random
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings
from sklearn.linear_model import LogisticRegression

The data we will use for this project is the sample twitter dataset from NLTK

In [2]:
# select the lists of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [3]:
# example of positive and negative tweets
print('\033[92m' + "Positive Tweet:\n" + all_positive_tweets[random.randint(0,5000)])
print('\033[91m' + "Negative Tweet:\n" + all_negative_tweets[random.randint(0,5000)])

[92mPositive Tweet:
For most of you it is #GoodMorning but for me it is #GoodNight...
#sleeptight for me and #haveagoodday for you!  :)
[91mNegative Tweet:
@WforWoman 
A9.
It would be Ice Cream without Ice :((
#WSaleLove


In [4]:
# concatenate the lists, 1st part is the positive tweets followed by the negative
tweets = all_positive_tweets + all_negative_tweets

# making labels of the tweets
labels_arr = np.append(np.ones((len(all_positive_tweets))), np.zeros((len(all_negative_tweets))))
labels = np.squeeze(labels_arr).tolist()

## Preprocess Tweets

The tweets needs to be preprocessed before going further. For NLP, the preprocessing steps are comprised of the following tasks:

* Removing hyperlinks, hashtags and twitter markers
* Tokenizing the string
* Lowercasing
* Removing stop words and punctuation
* Stemming

In [5]:
# function to preprocess and tokenize a tweet
def preprocess_tweet(tweet):
    """
    Function to preprocess the tweets for sentiment analysis.
    Input: 
        tweet - A string of tweet
    Output: 
        tweet_clean - A list containing string stemmed words from the tweet
    """
    ## STEP 1 - Remove hyperlinks, Twitter marks and styles
    tweet_sub = re.sub(r'^RT[\s]+', '', tweet)                  # remove old style retweet text "RT"
    tweet_sub = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet_sub)  # remove hyperlinks
    tweet_sub = re.sub(r'#', '', tweet_sub)                     # remove hashtags, only removing the hash # sign
    tweet_sub = re.sub(r'\$\w*', '', tweet_sub)                 # remove stock market tickers like $GE
   
    ## STEP 2 - Tokenize the string and convert to lowercase and remove handles
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet_sub)
    
    ## STEP 3 - Remove stop words and punctuations
    ## STEP 4 - Stemming
    stopwords_english = stopwords.words('english') 
    stemmer = PorterStemmer() 
    
    tweet_clean = []
    
    for word in tweet_tokens:
        if (word not in stopwords_english and   # remove stopwords
            word not in string.punctuation):    # remove punctuation
            stem_word = stemmer.stem(word)      # stemming word
            tweet_clean.append(stem_word)
            
    return tweet_clean

In [6]:
# testing the function preprocess_tweet
tweet_num = random.randint(0,5000)
print('\033[92m' + "Tweet:\n" + tweets[tweet_num])
print('\033[94m' + "Preprocessed tweet:")
print(preprocess_tweet(tweets[tweet_num]))

print("\n")

tweet_num = random.randint(5000,10000)
print('\033[92m' + "Tweet:\n"+tweets[tweet_num])
print('\033[94m' + "Preprocessed tweet:")
print(preprocess_tweet(tweets[tweet_num]))

[92mTweet:
@KatCrisp1 Thanks for taking time to tweet this Kat :)
[94mPreprocessed tweet:
['thank', 'take', 'time', 'tweet', 'kat', ':)']


[92mTweet:
A sad new for the animal kingdom :( http://t.co/I7N9cinihz
[94mPreprocessed tweet:
['sad', 'new', 'anim', 'kingdom', ':(']


## Word frequency dictionary

Next we need to create a dictionary containing the word frequencies of all the words in the tokenized tweets and how much they occur is positive and negative tweets.

In [7]:
# function to build the freqency dictionary
def build_freq_dict(tweets, labels):
    """
    Function to build the frequency dictionary.
    Input:
        tweets - A list of all tweets
        labels - A list of labels (positive/negative) for the tweets
    Output:
        freq_dict - A dictionary with (word, label) as key and its frequency as the value
    """
    freq_dict = {}
    
    for tweet, y in zip(tweets, labels):
        for word in preprocess_tweet(tweet):
            key = (word,y)
            freq_dict[key] = freq_dict.get(key, 0) + 1
            
    return freq_dict

In [8]:
# testing the build_freq_dict function
fr = build_freq_dict(tweets, labels)
print(fr[('happi',1)])

211


### Table of word counts

We will build a vocabulary and then create a table which containes the word and its positive and negative frequencies for better vizualization.

In [9]:
# building the vocabulary
vocab = []
for tweet in tweets:
        for word in preprocess_tweet(tweet):
            if word not in vocab:
                vocab.append(word)

# building list representing table of word counts.
# each element consist of a sublist with this pattern: [<word>, <positive_count>, <negative_count>]
freq_table = []
for word in vocab:
    pos = 0
    neg = 0
    
    # retrieve number of positive counts
    if (word, 1) in fr:
        pos = fr[(word, 1)]  
    # retrieve number of negative counts
    elif (word, 0) in fr:
        neg = fr[(word, 0)]
    # append the word counts to the table
    freq_table.append([word, pos, neg])

In [10]:
freq_table[0:10]

[['followfriday', 25, 0],
 ['top', 32, 0],
 ['engag', 7, 0],
 ['member', 16, 0],
 ['commun', 33, 0],
 ['week', 83, 0],
 [':)', 3568, 0],
 ['hey', 76, 0],
 ['jame', 7, 0],
 ['odd', 2, 0]]

## Logistic Regression

In [11]:
# split the data into training and testing sets
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg 
test_x = test_pos + test_neg

In [12]:
# labels for training and testing datasets
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [13]:
# convert array to list
labels_train_y = np.squeeze(train_y).tolist()

In [14]:
# build frequency dictionary just from the training data set
freqs_dict = build_freq_dict(train_x, labels_train_y)

In [15]:
# function to extract features of a tweet using the frequency dictionary
def extract_features(tweet, freqs):
    '''
    Input: 
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    # tokenize the tweet
    words = preprocess_tweet(tweet)
    
    #initialize the feature vector and set the bias term to 1
    x = np.zeros((1, 3)) 
    x[0,0] = 1 
    
    for word in words:
        x[0,1] += freqs.get((word,1.0), 0)      # increment the word count for the positive label 1
        x[0,2] += freqs.get((word,0.0), 0)      # increment the word count for the negative label 0
        
    assert(x.shape == (1, 3))
    return x

In [16]:
# extract the features of train_x
train_x_features = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    train_x_features[i, :]= extract_features(train_x[i], freqs_dict)

In [17]:
# extract the features of test_x
test_x_features = np.zeros((len(test_x), 3))
for i in range(len(test_x)):
    test_x_features[i, :]= extract_features(test_x[i], freqs_dict)

In [18]:
train_x_features

array([[1.000e+00, 3.020e+03, 6.100e+01],
       [1.000e+00, 3.573e+03, 4.440e+02],
       [1.000e+00, 3.005e+03, 1.150e+02],
       ...,
       [1.000e+00, 1.440e+02, 7.830e+02],
       [1.000e+00, 2.050e+02, 3.890e+03],
       [1.000e+00, 1.890e+02, 3.974e+03]])

In [19]:
train_y_features = np.squeeze(train_y)
test_y_features = np.squeeze(test_y)

In [20]:
# Training the Logistic Regression Model
LogReg = LogisticRegression(random_state=0)
LogReg.fit(train_x_features,train_y_features)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
# predicting the labels of the test tweets
LogReg.predict(test_x_features)

array([1., 1., 1., ..., 0., 0., 0.])

In [22]:
# accuracy of prediction
LogReg.score(test_x_features, test_y_features)

0.9915

We got an accuracy of 99.15% from Logistic Regression. Pretty good!

## Predicting my own tweet using the Logistic Regression model

In [23]:
my_tweet = 'i love the sunshine outside today @gunman'
print(preprocess_tweet(my_tweet))

x = extract_features(my_tweet, freqs_dict)

y_predict = LogReg.predict(x)
print(y_predict)

if y_predict == 1:
    print('Positive sentiment')
else: 
    print('Negative sentiment')

['love', 'sunshin', 'outsid', 'today']
[1.]
Positive sentiment


## Naive Bayes

Following steps are required to implement a naive bayes classifier:

#### $F_{pos}$ and $F_{neg}$
- The positive and negative frequency of each word in the tweets using the frequency dictionary

#### Calculate V
- V is the number of unique words which appear in the frequency dictionary

#### Calculate $N_{pos}$ and $N_{neg}$
- Calculate the total number of positive words and total number of negative words

#### Calculate logprior
- Compute logprior using $log(D_{pos}) - log(D_{neg})$ where $D_{pos}$ is total number of positive tweets in the training dataset and $D_{neg}$ is total number of negative tweets in the training dataset

#### Calculate the conditional probabilities p_w_pos and p_w_neg
- Compute the conditional probabilities of each word in each category using the formula:
$$ P(W/{pos}) = \frac{F_{pos} + 1}{N_{pos} + V} $$
$$ P(W/{neg}) = \frac{F_{neg} + 1}{N_{neg} + V} $$

#### Calculate log likelihood
- Compute log likelihood using $log \left( \frac{P(W/{pos})}{P(W/{neg})} \right)\$

In [24]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets
        train_y: a list of labels correponding to the tweets
    Output:
        logprior: the log prior
        loglikelihood: the log likelihood of you Naive bayes equation
    '''
    loglikelihood = {}
    logprior = 0

    # calculate V
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    # calculate N_pos and N_neg
    N_pos = N_neg = 0
    for pair in freqs.keys():
        if pair[1] > 0:            # if the label is positive
            N_pos += freqs[pair]
        else:                      # else, the label is negative
            N_neg += freqs[pair]

    # number of total documents, positive documents and negative documents
    D = len(train_y)
    D_pos = np.sum(train_y)
    D_neg = D - D_pos

    # logprior
    logprior = np.log(D_pos/D_neg)

    # loglikelihood
    for word in vocab:
        # get the positive and negative frequency of the word
        f_pos = freqs.get((word,1),0)
        f_neg = freqs.get((word,0),0)

        # conditional probabilities
        p_w_pos = (f_pos + 1) / (N_pos + V)
        p_w_neg = (f_neg + 1) / (N_neg + V)

        # log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood

In [25]:
# training the naive bayes classifier using train_x
logprior, loglikelihood = train_naive_bayes(freqs_dict, train_x, np.squeeze(train_y))

In [26]:
print(logprior)
print(len(loglikelihood))

0.0
9089


We can predict with the Naive Bayes classifier using the logprior and loglikelihood with the below formula:
$$ p = logprior + \sum_i^N (loglikelihood_i)$$

In [27]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: prediction

    '''
    # process the tweet to get a list of words
    word_l = preprocess_tweet(tweet)

    # initialize probability to logprior
    p = logprior

    for word in word_l:
        if word in loglikelihood:
            p += loglikelihood[word]     # add the log likelihood of that word to the probability
    
    return p

If the prediction p from the above function is positive, the tweet is predicted to have positive sentiment.
Similarly, if the prediction p from the above function is negative, the tweet is predicted to have negative sentiment.

In [28]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: accuracy of prediction
    """
    accuracy = 0

    y_hats = []
    for tweet in test_x:
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:  # if the prediction is > 0
            y_hat_i = 1                                              # the predicted class is 1
        else:
            y_hat_i = 0                                              # otherwise the predicted class is 0

        y_hats.append(y_hat_i)

    # Error
    error = np.mean(np.abs(np.array([y_hats]) - test_y))

    # Accuracy
    accuracy = 1 - error
    
    return accuracy

In [29]:
# testing naive bayes classifier using test_x
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, np.squeeze(test_y), logprior, loglikelihood)))

Naive Bayes accuracy = 0.9940


We got an accuracy of 99.4% from Naive Bayes Classifier. Pretty good!

### Predicting my own tweet using the Naive Bayes Classifier

In [30]:
my_tweet = 'you are bad :('
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)
if p > 0:
    print('Positive sentiment')
else: 
    print('Negative sentiment')

-8.801622640492191
Negative sentiment
