In [3]:
import pdb
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer
from os import getcwd

In [4]:
import re
from nltk.stem import PorterStemmer

In [5]:
def process_tweet(tweet):
    '''
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
    '''
    
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    #removing stock market tickers like $GE
    tweet = re.sub(r'\$\w*','',tweet)
    #removing old style retweet text
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    #removing numbers
    tweet = re.sub(r'[0-9]','', tweet)
    #removing hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*','', tweet)
    #removing hashtags
    tweet = re.sub(r'#','', tweet)
    #tokenizing tweets
    tokenizer = TweetTokenizer(preserve_case = False, strip_handles = True,
                              reduce_len = True)
    tweet_tokens = tokenizer.tokenize(tweet)
    
    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and
            word not in string.punctuation):
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)
    
    return tweets_clean

In [6]:
def lookup(freqs,word,label):
    '''
    Input:
        freqs: a dictionary with the frequency of each pair/tuple
        word: the word to look up
        label: the label corresponding to the word
    Output: 
        n: number of times the word with its corresponding label appears
    '''
    n = 0
    
    pair = (word,label)
    if(pair in freqs):
        n = freqs[pair]
    return n

In [7]:
nltk.download('stopwords')
nltk.download('twitter_samples')

[nltk_data] Downloading package stopwords to C:\Users\SREE
[nltk_data]     BHATTACHARYYA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to C:\Users\SREE
[nltk_data]     BHATTACHARYYA\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [8]:
#getting the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

#split the data into two pieces: training and validation
test_pos = all_positive_tweets[4000:]
test_neg = all_negative_tweets[4000:]
train_pos = all_positive_tweets[:4000]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

#avoids assumptions about the length of pos tweets
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))



In [9]:
def count_tweets(result, tweets, ys):
    '''
    Input:
        result: a dictionary that will be used to map each pair to its frequency
        tweets: a list of tweets
        ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    Output:
        result: a dictionary mapping each pair to its frequency
    '''
    for y, tweet in zip(ys,tweets):
        for word in process_tweet(tweet):
            pair = (word,y)
            if pair in result:
                result[pair]+=1
            else:
                result[pair] = 1
    return result
        

In [10]:
freqs = count_tweets({}, train_x, train_y)

In [11]:
print(type(train_y))

<class 'numpy.ndarray'>


In [12]:
def train_naive_bayes(freqs, train_x,train_y):
    '''
    Input: 
        freqs: dictionary from (word,label) to how often the word appears
        train_x: a list of tweets
        train_y: a list of labels corresponding to tweets
    Output:
        logprior: the log prior
        loglikelihood: the log likelihood of Naive Bayes
    '''
    
    loglikelihood = {}
    logprior = 0
    
    #calculating V: the no. of unique words in the vocabulary
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)
    
    N_pos = N_neg = 0
    for pair in freqs.keys():
        #if the label is positive
        if pair[1] == 1 :
            N_pos += freqs[pair]
        else:
            N_neg += freqs[pair]
    
    D = len(train_y)
    
    D_pos = np.sum(train_y)
    D_neg = D - D_pos
    
    logprior = np.log(D_pos) - np.log(D_neg)
    
    for word in vocab:
        
        #get the positive and negative frequency of the word
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)
        
        #calculating probability
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)
        
        loglikelihood[word] = np.log(p_w_pos/p_w_neg)
        
    return logprior, loglikelihood, vocab
    
    

In [13]:
logprior, loglikelihood, vocab = train_naive_bayes(freqs, train_x,train_y)
print(logprior)
print((loglikelihood))
print(len(vocab))

0.0
{'nvm': -1.0897078661245285, 'anymor': -1.495172974232693, 'sensiesha': -0.6842427580163642, 'ralli': 0.7020516031035264, 'frequentfly': 0.7020516031035264, 'bear': 0.008904422543581099, 'tomorrow': 0.17880345933897862, 'link': 0.34537665916479404, 'fallen': -1.3773899385763095, 'raincoat': 0.7020516031035264, 'cooper': -0.6842427580163642, 'mastership': 0.7020516031035264, "could'v": -0.6842427580163642, 'salam': -0.6842427580163642, 'arummzz': 1.1075167112116908, 'kita': -1.0897078661245285, 'cori': -0.6842427580163642, 'alright': 0.4143695306517454, 'needa': 0.7020516031035264, 'yot': -0.6842427580163642, 'injustic': 0.7020516031035264, 'lighter': 0.7020516031035264, 'crime': -0.6842427580163642, 'mne': 0.7020516031035264, "sod'": -0.6842427580163642, '☆': 1.3951987836634716, 'heswifi': 0.7020516031035264, 'madatt': -0.6842427580163642, 'outstand': 0.7020516031035264, 'tradit': -0.6842427580163642, 'scotland': 0.4143695306517454, 'bank': -1.495172974232693, 'sew': -0.68424275801

In [14]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Input:
        tweet: a string
        logprior: a number (logprior from training data)
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all loglikelihoods of each word in the tweet
            + the logprior
    '''
    
    #processing tweet
    word_l = process_tweet(tweet)
    
    #initializing probability to zero
    p = 0
    
    #add the logprior
    p += logprior
    
    for word in word_l:
        if word in loglikelihood:
            p += loglikelihood[word]
            
    return p
    

In [15]:
#experimenting with own tweet
my_tweet = "She smiled."
p = naive_bayes_predict(my_tweet,logprior, loglikelihood)
print(f"The expected output is {p}")

The expected output is 1.5775203404574265


In [16]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    '''
    Input: 
        text_x: a list of tweets
        test_y: corresponding labels
        logprior, loglikelihood
    Output:
        accuracy: (# of tweets correctly classified)/(total no. of tweets)
    '''
    
    accuracy = 0
    
    y_hats = []
    for tweet in test_x:
        #if prediction is greater than 0
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            y_hat_i = 1
        else:
            y_hat_i = 0
        
        y_hats.append(y_hat_i)
        
    error = (np.sum(np.absolute(y_hats - test_y)))/len(test_y)
    accuracy = 1-error
    
    return accuracy

        
        

In [17]:
print(f"Naive Bayes accuracy = {test_naive_bayes(test_x,test_y,logprior, loglikelihood)}")

Naive Bayes accuracy = 0.994


In [18]:
for tweet in ["I am happy", "I am bad", "this movie should have been great.", "great", "great great", "great great great", "great great great great"]:
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    print(f"{tweet} -> {p:.2f}")

I am happy -> 2.15
I am bad -> -1.21
this movie should have been great. -> 2.15
great -> 2.14
great great -> 4.28
great great great -> 6.42
great great great great -> 8.57


In [19]:
my_tweet = 'you are bad :('
naive_bayes_predict(my_tweet, logprior, loglikelihood)

-8.71913013176903

In [40]:
def get_ratio(freqs, word):
    '''
    Input:
        freqs: dictionary containing the words
        word: string to lookup
    Output:
        a dictionary with keys'''
    pos_neg_ratio = {'positive': 0, 'negative': 0, 'ratio': 0.0}
    pos_neg_ratio['positive'] = lookup(freqs, word, 1)
    pos_neg_ratio['negative'] = lookup(freqs,word,0)
    pos_neg_ratio['ratio'] = (pos_neg_ratio['positive']+1)/(pos_neg_ratio['negative']+1) 
    return pos_neg_ratio
    

In [41]:
get_ratio(freqs,'happi')

{'positive': 161, 'negative': 18, 'ratio': 8.526315789473685}

In [44]:
def get_words_by_threshold(freqs,label,threshold):
    '''
    Input: 
        freqs: dictionary of words
        label: 1 for positive, 0 for negative
        threshold: ratio that will be used as cutoff
    Output:
        word_set: dictionary containing words and relevant info
    '''
    
    word_list = {}
    
    for key in freqs.keys():
        word, lab= key
        
        pos_neg_ratio = get_ratio(freqs, word)['ratio']
        
        if label == 1 and pos_neg_ratio >= threshold:
            word_list[word] = get_ratio(freqs,word)
            continue
        if label == 0 and pos_neg_ratio <= threshold:
            word_list[word] = get_ratio(freqs,word)
        
    return word_list

In [45]:
# Test your function: find negative words at or below a threshold
get_words_by_threshold(freqs, label=0, threshold=0.05)

{':(': {'positive': 1, 'negative': 3663, 'ratio': 0.0005458515283842794},
 ':-(': {'positive': 0, 'negative': 378, 'ratio': 0.002638522427440633},
 'zayniscomingbackonjuli': {'positive': 0, 'negative': 19, 'ratio': 0.05},
 '>:(': {'positive': 0, 'negative': 43, 'ratio': 0.022727272727272728},
 'lost': {'positive': 0, 'negative': 19, 'ratio': 0.05},
 '♛': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996},
 '》': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996},
 'beli̇ev': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'wi̇ll': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'justi̇n': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'ｓｅｅ': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'ｍｅ': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776}}

In [46]:
# Test your function; find positive words at or above a threshold
get_words_by_threshold(freqs, label=1, threshold=10)

{'followfriday': {'positive': 23, 'negative': 0, 'ratio': 24.0},
 'commun': {'positive': 27, 'negative': 1, 'ratio': 14.0},
 ':)': {'positive': 2847, 'negative': 2, 'ratio': 949.3333333333334},
 'flipkartfashionfriday': {'positive': 16, 'negative': 0, 'ratio': 17.0},
 ':D': {'positive': 499, 'negative': 0, 'ratio': 500.0},
 ':p': {'positive': 104, 'negative': 1, 'ratio': 52.5},
 'influenc': {'positive': 16, 'negative': 0, 'ratio': 17.0},
 ':-)': {'positive': 543, 'negative': 0, 'ratio': 544.0},
 "here'": {'positive': 20, 'negative': 0, 'ratio': 21.0},
 'youth': {'positive': 14, 'negative': 0, 'ratio': 15.0},
 'bam': {'positive': 44, 'negative': 0, 'ratio': 45.0},
 'warsaw': {'positive': 44, 'negative': 0, 'ratio': 45.0},
 'shout': {'positive': 11, 'negative': 0, 'ratio': 12.0},
 ';)': {'positive': 22, 'negative': 0, 'ratio': 23.0},
 'stat': {'positive': 51, 'negative': 0, 'ratio': 52.0},
 'arriv': {'positive': 57, 'negative': 4, 'ratio': 11.6},
 'via': {'positive': 60, 'negative': 1, '

In [47]:
#error analysis
print('Truth Predicted Tweet')
for x, y in zip(test_x, test_y):
    y_hat = naive_bayes_predict(x, logprior, loglikelihood)
    if y != (np.sign(y_hat) > 0):
        print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0, ' '.join(
            process_tweet(x)).encode('ascii', 'ignore')))

Truth Predicted Tweet
1	0.00	b''
1	0.00	b'truli later move know queen bee upward bound movingonup'
1	0.00	b'new report talk burn calori cold work harder warm feel better weather :p'
1	0.00	b'harri niall harri born ik stupid wanna chang :D'
1	0.00	b''
1	0.00	b''
1	0.00	b'park get sunlight'
1	0.00	b'uff itna miss karhi thi ap :p'
0	1.00	b'hello info possibl interest jonatha close join beti :( great'
0	1.00	b'u prob fun david'
0	1.00	b'pat jay'
0	1.00	b'whatev stil l young >:-('


In [48]:
# Test with your own tweet - feel free to modify `my_tweet`
my_tweet = 'I am happy because I am learning :)'

p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)

9.585246395495865
