In [None]:
%%capture
%pip install 

## Naive Bayes Classifier: A Probabilistic Approach to Classification

The Naive Bayes classifier is a simple yet powerful algorithm for **text classification tasks** like sentiment analysis. It works by making the **assumption of conditional independence** between features (words in a text). This assumption allows it to efficiently calculate the probability of a text belonging to a specific class (e.g., positive or negative sentiment) based on the individual probabilities of its constituent words.

**Here's a brief overview of the steps involved:**

**1. Training:**

* Analyze a training dataset containing labeled texts (e.g., positive and negative tweets).
* Calculate the **probability of each word** appearing in each class (positive and negative).
* Calculate the **prior probability** of each class (e.g., the proportion of positive and negative examples in the training data).

**2. Prediction:**

* For a new, unseen text:
    * Estimate the **probability of each word appearing** in the text given each class (positive or negative) using the probabilities calculated during training.
    * Combine these individual word probabilities using **Bayes' theorem** to get the **overall probability** of the text belonging to each class.
    * **Classify the text** to the class with the **highest probability**.

**Benefits:**

* **Simple to implement and understand:** Easy to explain and interpret compared to some complex models.
* **Efficient training:** Requires less training data compared to some other algorithms.
* **Effective for text classification:** Performs well on tasks like sentiment analysis and spam filtering.

**Limitations:**

* **Conditional independence assumption:** May not hold true in real-world data, potentially affecting accuracy.
* **Sensitivity to rare words:** Can struggle with words that rarely appear in the training data.

**Overall, the Naive Bayes classifier offers a robust and efficient approach to text classification tasks**, making it a popular choice for applications like sentiment analysis and spam filtering. It's a good starting point for beginners due to its simplicity and interpretability.


In [19]:
import string
import re

In [20]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

## Filtering the Tweet Text to remove unwanted noise in the tweet

In [1]:
def process_tweet(tweet):
    '''
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    '''

    # Create a custom stopwords list excluding "not"
    stopwords_english = list(stopwords.words('english'))
    # Remove "not" from the list
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            # Lemmatization instead of stemming
            lemmatizer = WordNetLemmatizer()
            lemma_word = lemmatizer.lemmatize(word)
            tweets_clean.append(lemma_word)

    return tweets_clean


In [2]:
def test_lookup(func):
    freqs = {('sad', 0): 4,
             ('happy', 1): 12,
             ('oppressed', 0): 7}
    word = 'happy'
    label = 1
    if func(freqs, word, label) == 12:
        return 'SUCCESS!!'

    return 'Failed Sanity Check!'


In [3]:
def lookup(freqs, word, label):
    '''
    Input:
        freqs: a dictionary with the frequency of each pair (or tuple)
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears.
    '''
    n = 0  # freqs.get((word, label), 0)

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

In [4]:
# Define test data
freqs = {('sad', 0): 4,
         ('happy', 1): 12,
         ('oppressed', 0): 7}

# Call test function
result = test_lookup(lookup)

# Print the result
print(result)

SUCCESS!!


In [None]:
{
    ('sad', 0): 4,
    ('happy', 1): 12,
    ('oppressed', 0): 7
}

### Importing Useful Libraries

In [5]:
import pdb
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer
from os import getcwd

In [6]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Using Sample Tweets from the Natural Language ToolKit(NLTK)

In [8]:
# get the sets of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [9]:
all_positive_tweets[:5]

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 '@97sides CONGRATS :)',
 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days']

In [10]:
all_negative_tweets[:5]

['hopeless for tmr :(',
 "Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(",
 '@Hegelbon That heart sliding into the waste basket. :(',
 '“@ketchBurning: I hate Japanese call him "bani" :( :(”\n\nMe too',
 'Dang starting next week I have "work" :(']

### Splitting the Training and Test Data Set in 7:3 ratio
* training_data = 0.7 * total_data
* test_data = 0.3 * total_data

In [11]:
from sklearn.model_selection import train_test_split

# Split positive tweets
train_pos, test_pos = train_test_split(all_positive_tweets, test_size=0.3, random_state=42)

# Split negative tweets
train_neg, test_neg = train_test_split(all_negative_tweets, test_size=0.3, random_state=42)



In [12]:
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [13]:
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

In [14]:
train_y

array([1., 1., 1., ..., 0., 0., 0.])

In [15]:
test_y

array([1., 1., 1., ..., 0., 0., 0.])

In [16]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [21]:
custom_tweet = "RT @Twitter @chapagain not nor Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# print cleaned tweet
print(process_tweet(custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morning']


In [18]:
def count_tweets(result, tweets, ys):
    '''
    Input:
        result: a dictionary that will be used to map each pair to its frequency
        tweets: a list of tweets
        ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    Output:
        result: a dictionary mapping each pair to its frequency
    '''
    for y, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            # define the key, which is the word and label tuple
            pair = (word,y)

            # if the key exists in the dictionary, increment the count
            if pair in result:
                result[pair] += 1

            # else, if the key is new, add it to the dictionary and set the count to 1
            else:
                result[pair] = 1

    return result

In [22]:
freqs = count_tweets({}, train_x, train_y)

In [23]:
import itertools


In [24]:
dict(itertools.islice(freqs.items(), 5))

{('aaahhh', 1.0): 1,
 ('see', 1.0): 118,
 ('...', 1.0): 215,
 (':)', 1.0): 2481,
 ('blogged', 1.0): 1}

In [25]:
len(freqs)

11441

In [26]:
from collections import OrderedDict

last_20_items = OrderedDict(list(freqs.items())[-20:])
print(last_20_items)

OrderedDict([(('landlord', 0.0), 1), (("mp's", 0.0), 1), (('apt', 0.0), 1), (('building', 0.0), 1), (('bldg', 0.0), 1), (('shouldve', 0.0), 1), (('muster', 0.0), 1), (('merchs', 0.0), 1), (('cancelling', 0.0), 1), (('needicecreamnow', 0.0), 1), (('livestream', 0.0), 1), (('vitamin', 0.0), 1), (('oil', 0.0), 1), (('healthier', 0.0), 1), (('stretch', 0.0), 1), (('himseek', 0.0), 1), (('kikmsn', 0.0), 1), (('kissme', 0.0), 1), (('akua', 0.0), 1), (('owns', 0.0), 1)])


# Defining the Naive Bayes Function to train the training data set

Naive bayes is an algorithm that could be used for sentiment analysis. It takes a short time to train and also has a short prediction time.

##### So how do we train a Naive Bayes classifier?
</br>
Given a freqs dictionary, `train_x` (a list of tweets) and a `train_y` (a list of labels for each tweet), implement a naive bayes classifier.

##### Calculate $V$
- You can then compute the number of unique words that appear in the `freqs` dictionary to get your $V$ (you can use the `set` function).

##### Calculate $freq_{pos}$ and $freq_{neg}$
- Using your `freqs` dictionary, you can compute the positive and negative frequency of each word $freq_{pos}$ and $freq_{neg}$.

##### Calculate $N_{pos}$ and $N_{neg}$
- Using `freqs` dictionary, you can also compute the total number of positive words and total number of negative words $N_{pos}$ and $N_{neg}$.

##### Calculate $D$, $D_{pos}$, $D_{neg}$
- Using the `train_y` input list of labels, calculate the number of documents (tweets) $D$, as well as the number of positive documents (tweets) $D_{pos}$ and number of negative documents (tweets) $D_{neg}$.
- Calculate the probability that a document (tweet) is positive $P(D_{pos})$, and the probability that a document (tweet) is negative $P(D_{neg})$

##### Calculate the logprior
- the logprior is $log(D_{pos}) - log(D_{neg})$

##### Calculate log likelihood
- Finally, you can iterate over each word in the vocabulary, use your `lookup` function to get the positive frequencies, $freq_{pos}$, and the negative frequencies, $freq_{neg}$, for that specific word.
- Compute the positive probability of each word $P(W_{pos})$, negative probability of each word $P(W_{neg})$ using equations 4 & 5.

$$ P(W_{pos}) = \frac{freq_{pos} + 1}{N_{pos} + V}\tag{4} $$
$$ P(W_{neg}) = \frac{freq_{neg} + 1}{N_{neg} + V}\tag{5} $$

**Note:** We'll use a dictionary to store the log likelihoods for each word.  The key is the word, the value is the log likelihood of that word).

- You can then compute the loglikelihood: $log \left( \frac{P(W_{pos})}{P(W_{neg})} \right)\tag{6}$.


In [27]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets
        train_y: a list of labels correponding to the tweets (0,1)
    Output:
        logprior: the log prior. (equation 3 above)
        loglikelihood: the log likelihood of you Naive bayes equation. (equation 6 above)
    '''
    loglikelihood = {}
    logprior = 0


    # calculate V, the number of unique words in the vocabulary
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    # calculate N_pos, N_neg, V_pos, V_neg
    N_pos=N_neg=V_pos=V_neg=0
    for pair in freqs.keys():
        # if the label is positive (greater than zero)
        if pair[1] > 0:
            # increment the count of unique positive words by 1
            V_pos += 1

            # Increment the number of positive words by the count for this (word, label) pair
            N_pos += freqs[pair]

        # else, the label is negative
        else:
            # increment the count of unique negative words by 1
            V_neg += 1

            # increment the number of negative words by the count for this (word,label) pair
            N_neg += freqs[pair]

    # Calculate D, the number of documents
    D = train_y.shape[0]

    # Calculate D_pos, the number of positive documents
    D_pos = train_y[train_y == 1].shape[0]

    # Calculate D_neg, the number of negative documents
    D_neg = train_y[train_y == 0].shape[0]

    # Calculate logprior
    logprior  = np.log(D_pos / D) - np.log(D_neg / D)

    # For each word in the vocabulary...
    for word in vocab:
        # get the positive and negative frequency of the word
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)

        # calculate the probability that each word is positive, and negative
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)


    return logprior, loglikelihood

In [28]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9239


## Naive Bayes Prediction

In [29]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the logliklihoods of each word in the tweet (if found in the dictionary) + logprior (a number)

    '''
    # process the tweet to get a list of words
    word_l = process_tweet(tweet)

    # initialize probability to zero
    p = 0

    # add the logprior
    p += logprior

    for word in word_l:

        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += loglikelihood[word]

    return p


# Negative Sentiment Prompted Tweet

In [30]:
my_tweet = 'Bad weather.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is -2.194295493603877


# Positive Sentiment Prompted Tweet

In [31]:
my_tweet = 'Sunny day.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is 1.6558807823524089


In [33]:
new_tweet = input("Post a Tweet!")
p = naive_bayes_predict(new_tweet, logprior, loglikelihood)
if p>0 :
  print("Positive Tweet")
else:
  print("Negative Tweet")

Post a Tweet!Karma is a bitch.
Negative Tweet


In [None]:
new_tweet = input("Post a Tweet!")
p = naive_bayes_predict(new_tweet, logprior, loglikelihood)
if p>0 :
  print("Positive Tweet")
else:
  print("Negative Tweet")


Post a Tweet!Trump start crime in usa
Negative Tweet


## Accuracy Measurement of Our Naive Bayes

In [None]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
    """
    accuracy = 0  # return this properly

    y_hats = []
    for tweet in test_x:
        # if the prediction is > 0
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            # the predicted class is 1
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)

    # error is the average of the absolute values of the differences between y_hats and test_y
    error = sum(abs(test_y-y_hats))/len(y_hats)

    # Accuracy is 1 minus the error
    accuracy = 1-error

    return accuracy


In [None]:
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy = 0.9900
