# Naive Bayes

In [2]:
import os
import pdb
import string
from os import getcwd

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer

from utils import process_tweet, lookup

nltk.download('stopwords')
nltk.download('twitter_samples')

[nltk_data] Downloading package stopwords to C:\Users\shaurya
[nltk_data]     shekhar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to C:\Users\shaurya
[nltk_data]     shekhar\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True


```
nltk.download('stopwords')
nltk.download('twitter_samples')
```

In [3]:
# Add ../tmp2/ (relative to the current working directory) to the list of
# directories NLTK searches for its data files.
filePath = f"{getcwd()}/../tmp2/"
nltk.data.path.append(filePath)

In [4]:

# Load the labelled tweet corpora shipped with NLTK's twitter_samples.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# The first 4000 tweets of each class are used for training; the rest are held out.
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# Positive tweets come first, then negative ones, in both splits.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Labels follow the same order: 1.0 for positive tweets, 0.0 for negative tweets.
train_y = np.concatenate((np.ones(len(train_pos)), np.zeros(len(train_neg))))
test_y = np.concatenate((np.ones(len(test_pos)), np.zeros(len(test_neg))))

# Step 1: Processing the Data

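`process_tweet()` turns a raw tweet into a list of cleaned tokens. As the example below shows, the retweet marker, @-mentions, the URL, punctuation and common stop words are dropped, the `#` is stripped from hashtags, and the remaining words are lower-cased and stemmed (emoticons such as `:)` are kept).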



In [6]:
# Sample tweet containing a retweet marker, mentions, hashtags, an emoticon and a URL.
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# Show the token list that process_tweet() produces for it.
print(process_tweet(custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morn']


In [8]:
def count_tweets(result, tweets, ys):
    """
    Count how often each processed word occurs under each label.

    Args:
        result: dictionary mapping (word, label) pairs to counts. It is updated
            in place, so counts already present are added to.
        tweets: iterable of raw tweet strings.
        ys: iterable of labels, one per tweet (paired with tweets via zip).

    Returns:
        The same `result` dictionary, after the update.
    """
    for label, text in zip(ys, tweets):
        for token in process_tweet(text):
            key = (token, label)
            # A pair seen for the first time starts from zero.
            result[key] = result.get(key, 0) + 1
    return result

In [9]:
# Sanity check for count_tweets on a tiny hand-labelled sample
# (1 = positive, 0 = negative). The call is the last expression in the cell,
# so the updated dictionary is displayed as the cell output.


result = {}
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_tweets(result, tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

# Step 2: Training the model using Naive Bayes

Naive bayes is an algorithm that could be used for sentiment analysis. It takes a short time to train and also has a short prediction time.

- The first part of training a naive bayes classifier is to identify the number of classes that you have.
- Then create a probability for each class.

In [10]:
# Build the (word, label) -> count dictionary from the whole training split.
freqs = count_tweets({}, train_x, train_y)

In [11]:
def train_naive_bayes(freqs, train_x, train_y):
    """
    Fit a two-class Naive Bayes model with add-one smoothing.

    Args:
        freqs: dictionary mapping (word, label) pairs to occurrence counts.
        train_x: training tweets (not read here; kept for the existing call signature).
        train_y: labels of the training tweets. Values > 0 are counted as
            positive documents, values <= 0 as negative documents.

    Returns:
        logprior: log(D_pos) - log(D_neg), where D_pos and D_neg are the
            numbers of positive and negative documents.
        loglikelihood: dictionary mapping every word in `freqs` to
            log(P(word | positive) / P(word | negative)).
    """
    # Vocabulary: the distinct words that appear in any (word, label) key.
    vocab = {key[0] for key in freqs}
    V = len(vocab)

    # Total number of word occurrences in each class.
    N_pos = N_neg = 0
    for key, count in freqs.items():
        if key[1] > 0:
            N_pos += count
        else:
            N_neg += count

    # Number of documents in each class.
    D_pos = sum(1 for label in train_y if label > 0)
    D_neg = sum(1 for label in train_y if label <= 0)
    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        # Add-one smoothing: the +1 and +V keep unseen (word, class) pairs
        # from producing a zero probability.
        p_w_pos = (lookup(freqs, word, 1) + 1) / (N_pos + V)
        p_w_neg = (lookup(freqs, word, 0) + 1) / (N_neg + V)
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood


In [12]:
# Train on the full training split, then report the log prior and the number
# of words that received a log-likelihood entry.
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9084


# Step 3: Testing naive bayes

Now that we have the `logprior` and `loglikelihood`, we can test the naive bayes model by making predictions on some tweets!

The `naive_bayes_predict` function makes predictions on tweets.
* The function takes in the `tweet`, `logprior`, `loglikelihood`.
* It returns a score rather than a probability: the log prior plus the summed log-likelihoods. A score above 0 means the tweet is predicted positive; otherwise it is predicted negative.
* For each tweet, it sums up loglikelihoods of each word in the tweet.
* Also adds the logprior to this sum to get the predicted sentiment of that tweet.

$$ p = logprior + \sum_i^N (loglikelihood_i)$$

#### Note
Note that we calculate the prior from the training data, and that the training data is evenly split between positive and negative labels (4000 positive and 4000 negative tweets). This means that the ratio of positive to negative is 1, and the logprior is 0.

In [13]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    """
    Score a tweet with the trained Naive Bayes model.

    Args:
        tweet: raw tweet string; it is tokenised with process_tweet().
        logprior: the log prior term added once to the score.
        loglikelihood: dictionary mapping words to their log-likelihood ratio.

    Returns:
        logprior plus the log-likelihood of every token that has an entry in
        `loglikelihood`. Tokens without an entry contribute nothing.
    """
    known_tokens = [token for token in process_tweet(tweet) if token in loglikelihood]

    score = 0 + logprior
    for token in known_tokens:
        score += loglikelihood[token]
    return score


In [14]:
# Score one hand-written tweet with the trained model and print the result.
my_tweet = 'She smiled.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is 1.5737795839220972


In [15]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):

    accuracy = 0  

    y_hats = []
    for tweet in test_x:
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            y_hat_i = 1
        else:
            y_hat_i = 0
        y_hats.append(y_hat_i)
        
    error = np.mean(np.absolute(y_hats-test_y))
    accuracy = 1-error
    
    return accuracy


In [23]:
# Report accuracy on the held-out test split, formatted to four decimals.
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy = 0.9940


In [21]:
# Score a handful of phrases and print each score to two decimals.
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    print(f'{tweet} -> {p:.2f}')

I am happy -> 2.15
I am bad -> -1.29
this movie should have been great. -> 2.14
great -> 2.14
great great -> 4.28
great great great -> 6.41
great great great great -> 8.55


In [22]:
# Score a clearly negative example; the score is displayed as the cell output.
my_tweet = 'you are bad :('
naive_bayes_predict(my_tweet, logprior, loglikelihood)

-8.802119197347832

# Step 4: Twitter API Usage


In [19]:
import tweepy

In [28]:
# Twitter API credentials are read from environment variables so that secrets
# never have to be typed into (or saved with) the notebook. A variable that is
# not set falls back to an empty string, the same value as the old placeholders.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")
access_key = os.environ.get("TWITTER_ACCESS_KEY", "")
access_secret = os.environ.get("TWITTER_ACCESS_SECRET", "")


def get_all_tweets(screen_name):
    """
    Download the text of a user's timeline, page by page.

    Authenticates with the module-level consumer_key / consumer_secret /
    access_key / access_secret, then calls api.user_timeline repeatedly
    (200 tweets per request) until a request returns no tweets.

    Args:
        screen_name: Twitter screen name whose timeline is fetched.

    Returns:
        List with the `.text` of every downloaded tweet, in download order.
        An empty list is returned when the first request yields no tweets.
    """
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    alltweets = []

    new_tweets = api.user_timeline(screen_name=screen_name, count=200)
    alltweets.extend(new_tweets)

    while len(new_tweets) > 0:
        # The last page was non-empty, so alltweets[-1] exists. Requesting ids
        # up to one below the last tweet downloaded avoids fetching it again.
        oldest = alltweets[-1].id - 1
        print(f"getting tweets before {oldest}")

        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        alltweets.extend(new_tweets)

        print(f"...{len(alltweets)} tweets downloaded so far")

    return [tweet.text for tweet in alltweets]

In [36]:
def IsHarmful(username, tweet_threshold=-8.5, mean_threshold=-2.5):
    """
    Score every tweet of a user and report whether the timeline looks harmful.

    Each tweet is scored with naive_bayes_predict using the module-level
    `logprior` and `loglikelihood`. The mean score is printed, followed by
    "Harmful" when it is below `mean_threshold` and "Neutral" otherwise.

    Args:
        username: screen name passed to get_all_tweets.
        tweet_threshold: a tweet scoring below this value is collected as "bad".
        mean_threshold: mean score below which the user is reported as harmful.

    Returns:
        List of the tweets whose individual score is below `tweet_threshold`.
        An empty list is returned when the user has no tweets.
    """
    tweets = get_all_tweets(username)
    if len(tweets) == 0:
        # No tweets means there is no mean to compute (it would divide by zero).
        print("No tweets found")
        return []

    p = 0
    bad = []
    for tweet in tweets:
        k = naive_bayes_predict(tweet, logprior, loglikelihood)
        p += k
        if k < tweet_threshold:
            bad.append(tweet)

    ans = p / len(tweets)
    print(ans)
    if ans < mean_threshold:
        print("Harmful")
    else:
        print("Neutral")
    return bad
# Run the check for one account and print only the first flagged tweet, if any.
example = IsHarmful("BarackObama")

for t in example[:1]:
    print(t)

getting tweets before 1333833279775141887
...400 tweets downloaded so far
getting tweets before 1271147930460831750
...600 tweets downloaded so far
getting tweets before 1151920811093237759
...800 tweets downloaded so far
getting tweets before 922584337815613440
...1000 tweets downloaded so far
getting tweets before 776532109498339327
...1200 tweets downloaded so far
getting tweets before 749683934548619263
...1400 tweets downloaded so far
getting tweets before 727165574325800960
...1600 tweets downloaded so far
getting tweets before 705495403827859455
...1800 tweets downloaded so far
getting tweets before 687100701273493503
...2000 tweets downloaded so far
getting tweets before 669583869755027455
...2200 tweets downloaded so far
getting tweets before 648913451654316031
...2400 tweets downloaded so far
getting tweets before 628967276759244799
...2600 tweets downloaded so far
getting tweets before 616316996917985279
...2800 tweets downloaded so far
getting tweets before 5992726259326320

# Step 5: Error Analysis (Analysing Scope for improvement)

In [21]:
# List every test tweet the classifier gets wrong: the true label, the
# predicted label, and the processed tokens with non-ASCII characters dropped.
print('Truth Predicted Tweet')
for x, y in zip(test_x, test_y):
    y_hat = naive_bayes_predict(x, logprior, loglikelihood)
    predicted = np.sign(y_hat) > 0
    if y != predicted:
        cleaned = ' '.join(process_tweet(x)).encode('ascii', 'ignore')
        print('%d\t%0.2f\t%s' % (y, predicted, cleaned))

Truth Predicted Tweet
1	0.00	b''
1	0.00	b'truli later move know queen bee upward bound movingonup'
1	0.00	b'new report talk burn calori cold work harder warm feel better weather :p'
1	0.00	b'harri niall 94 harri born ik stupid wanna chang :D'
1	0.00	b''
1	0.00	b''
1	0.00	b'park get sunlight'
1	0.00	b'uff itna miss karhi thi ap :p'
0	1.00	b'hello info possibl interest jonatha close join beti :( great'
0	1.00	b'u prob fun david'
0	1.00	b'pat jay'
0	1.00	b'whatev stil l young >:-('


# Step 6: Custom Text

In [22]:
# Try the model on your own text: edit the string and re-run the cell.
my_tweet = 'I am happy because I am learning :)'

# Score the tweet with the trained model and print the result.
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)

9.574768961173339
