In [1]:
import nltk
from os import getcwd

In [2]:
filePath = f"{getcwd()}/../tmp2/"
nltk.data.path.append(filePath)

In [3]:
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples 
import re
import string
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [4]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [9]:
# split the data into two pieces, one for training and one for testing (validation set) 
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

X_train = train_pos + train_neg 
X_test= test_pos + test_neg

y_train = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
y_test = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [10]:
def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopword_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    #tweet tokenizer
    tokenizer = TweetTokenizer(preserve_case=False,strip_handles=True,reduce_len=True)
    tweet_token = tokenizer.tokenize(tweet)
    
    tweet_clean = []
    
    for word in tweet_token:
        if word not in stopword_english and word not in string.punctuation:
            stem_words = stemmer.stem(word)
            tweet_clean.append(stem_words)
    return tweet_clean

In [16]:
def word_count(tweets,ys):
    ys_list = np.squeeze(ys).tolist()
    freq = {}
    
    for y,tweet in zip(ys_list,tweets):
        for word in process_tweet(tweet):
            pair = (word,y)
            if pair in freq:
                freq[pair] += 1
            else:
                freq[pair] = 1
    return freq

In [17]:
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
word_count(tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

In [18]:
freqs = word_count(X_train, y_train)

In [22]:
#Calculate no of unique word i.e V
for i in freqs.keys():
    print(i[0])
    break

followfriday


In [25]:
#Calculate no of unique word i.e V
unique_word = set([word[0] for word in freqs.keys()])
V = len(unique_word)
print(V)

9089


In [30]:
i

('followfriday', 1.0)

In [29]:
freqs[i]

23

In [48]:
#Calculate N_pos, N_neg
N_pos = N_neg = 0
for word_class_pair in freqs.keys():
    if word_class_pair[1] > 0:
        N_pos += freqs[word_class_pair]
    else:
        N_neg += freqs[word_class_pair]

In [49]:
print(N_pos,N_neg)

26845 27040


In [45]:
word_class_pair

('thursday', 0.0)

In [55]:
def lookup(freqs,word,label):
    '''
    Input:
        freqs: a dictionary with the frequency of each pair (or tuple)
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears.
    '''
    n=0
    pair = (word,label)
    if (pair in freqs):
        n = freqs[pair]
    return n

In [97]:
def naive_bayes_train(freqs,X_train,y_train):
    #Calculate no of unique word i.e V
    loglikelihood = {}
    vocab=unique_word = set([word[0] for word in freqs.keys()])
    V = len(unique_word)
    #print(V)
    #Calculate N_pos, N_neg
    N_pos = N_neg = 0
    for word_class_pair in freqs.keys():
        if word_class_pair[1] > 0:
            N_pos += freqs[word_class_pair]
        else:
            N_neg += freqs[word_class_pair]
    #print(N_pos,N_neg)
    #for each word in vocabulary get the +ve and -ve freq of word
    for word in vocab:
        pos_freq = lookup(freqs,word,1)
        neg_freq = lookup(freqs,word,0)
        #print(pos_freq,neg_freq)
        #Calculate each word prob +ve and -ve class
        p_w_pos = (pos_freq+1)/(N_pos+V)
        p_w_neg = (neg_freq+1)/(N_neg+V)
        #Calculate log likelihood i.e lambda of each word
        loglikelihood[word] = np.log(p_w_pos/p_w_neg)
    #Calculate log prior
    logprior = 0
    D = len(y_train)
    D_pos = len(list(filter(lambda x: x > 0, y_train)))
    D_neg = len(list(filter(lambda x: x <= 0, y_train)))
    logprior = np.log(D_pos/D_neg)
    
    return logprior,loglikelihood

In [98]:
len(list(filter(lambda x: x <= 0, y_train)))

4000

In [99]:
logprior,loglikelihood = naive_bayes_train(freqs, X_train, y_train)
print(logprior)
print(len(loglikelihood))

0.0
9089


In [100]:
def naive_bayes_predict(tweet,logprior,loglikelihood):
    word_1 = process_tweet(tweet)
    p = 0 
    p += logprior
    
    for word in word_1:
        if word in loglikelihood:
            p+= loglikelihood[word]
    return p

In [104]:
my_tweet = 'She smiled.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is 1.5740278623499175


In [105]:
def naive_bayes_test(X_test,y_test,logprior,loglikelihood):
    y_hat = []
    accuracy = 0
    for tweet in X_test:
        if naive_bayes_predict(tweet,logprior,loglikelihood)>0:
            y_hat_i = 1
        else:
            y_hat_i = 0
        y_hat.append(y_hat_i)
    error = np.mean(np.absolute(y_hat-y_test))
    accuracy = 1 - error
    return accuracy

In [106]:
print("Naive Bayes accuracy = %0.4f" %(naive_bayes_test(X_test, y_test, logprior, loglikelihood)))

Naive Bayes accuracy = 0.5000


In [108]:
loglikelihood

{'bo': 0.6985591249960175,
 'dudafti': 0.6985591249960175,
 'said': -0.6231967149863019,
 'paramed': -0.6877352361238731,
 'read': 0.5650277323714948,
 'johnyherbert': 0.6985591249960175,
 'alhamdulillah': 0.6985591249960175,
 'geekiest': 0.6985591249960175,
 'salmon': 0.6985591249960175,
 'khawateen': 0.6985591249960175,
 'hive': -0.6877352361238731,
 'within': 0.9217026763102273,
 'timezon': 0.6985591249960175,
 'topgear': -0.6877352361238731,
 'phonegraph': 0.6985591249960175,
 '452096': 0.6985591249960175,
 'monsoon': 0.6985591249960175,
 'rbi': -0.6877352361238731,
 'nintendo': -1.0932003442320375,
 'louisiana': -1.3808824166838185,
 'qualiti': 0.6985591249960175,
 'disappear': -0.6877352361238731,
 "y'all": 0.29309401688785314,
 'vein': 0.6985591249960175,
 'gensan': 0.6985591249960175,
 'ke': -0.4000531636720922,
 'steal': -0.6877352361238731,
 'outstand': 0.6985591249960175,
 'paus': -0.6877352361238731,
 'yesha': 0.6985591249960175,
 'woohoo': 1.3917063055559629,
 'success': 0