## Naive Bayes classifier for sentiment analysis

Naive Bayes is a probabilistic Bayesian model that makes its predictions from probabilities estimated on the training data. Although Naive Bayes is not extensively used in practice, its ideas and intuition are used throughout Natural Language Processing (NLP).

In this notebook, we will build a quick baseline sentiment analysis model (positive vs. negative) using Naive Bayes.

In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords, twitter_samples
import nltk
import re 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import string
from nltk.tokenize import TweetTokenizer
from os import getcwd

# Download the two NLTK resources this notebook reads: the sample tweets
# (twitter_samples) and the stopword lists (stopwords).
nltk.download('twitter_samples', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [2]:
# Register ../tmp2/ (resolved against the current working directory) as an
# additional entry in nltk.data.path.
# NOTE(review): the resulting path depends on where the kernel was started.
filePath = f"{getcwd()}/../tmp2/"
nltk.data.path.append(filePath)

In [3]:
# Load the positive and the negative sample tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# The first 4000 tweets of each class are used for training, the rest for testing
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Labels follow the tweet order: 1.0 for every positive tweet, then 0.0 for
# every negative tweet
train_y = np.concatenate([np.ones(len(train_pos)), np.zeros(len(train_neg))])
test_y = np.concatenate([np.ones(len(test_pos)), np.zeros(len(test_neg))])

In [4]:
#Helper functions

def process_tweet(tweet):
    """Clean a raw tweet and return its stemmed tokens.

    The text is first stripped of ``$``-prefixed words, a leading ``RT``
    marker, ``http(s)://`` links and ``#`` characters (the word after the
    ``#`` is kept). It is then tokenized with ``TweetTokenizer``
    (``preserve_case=False, strip_handles=True, reduce_len=True``); tokens
    that are English stopwords or punctuation are dropped and the remaining
    tokens are stemmed with ``PorterStemmer``.

    Args:
        tweet: The raw tweet text.

    Returns:
        A list of stemmed tokens, in the order they appear in the tweet.
    """
    stemmer = PorterStemmer()
    # A frozenset gives constant-time membership tests; the plain list would
    # be scanned from the start for every single token.
    stopwords_english = frozenset(stopwords.words('english'))

    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    tweet = re.sub(r'#', '', tweet)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)

    # string.punctuation is a str, so `word not in string.punctuation` is a
    # substring test; it is kept exactly as before so that the set of
    # retained tokens does not change.
    return [
        stemmer.stem(word)
        for word in tokenizer.tokenize(tweet)
        if word not in stopwords_english and word not in string.punctuation
    ]

def lookup(freqs, word, label):
    """Return the count stored for the ``(word, label)`` pair.

    Args:
        freqs: Dictionary mapping ``(word, label)`` tuples to counts.
        word: The word to look up.
        label: The sentiment label paired with the word.

    Returns:
        The stored count, or 0 when the pair is not a key of ``freqs``.
    """
    return freqs.get((word, label), 0)

def count_tweets(result, tweets, ys):
    """Tally how often each processed word occurs under each label.

    Args:
        result: Dictionary of ``(word, label)`` -> count to add to. It is
            updated in place and also returned.
        tweets: Iterable of raw tweet texts.
        ys: Iterable of labels, paired with ``tweets`` by position.

    Returns:
        ``result`` with one increment per word occurrence produced by
        ``process_tweet`` for every tweet.
    """
    for label, text in zip(ys, tweets):
        for token in process_tweet(text):
            key = (token, label)
            result[key] = result.get(key, 0) + 1
    return result

In [5]:
# Build the (word, label) -> count table from the training tweets, starting
# from an empty dictionary.
freqs = count_tweets({}, train_x, train_y)

def naive_bayes(freqs, train_x, train_y):
    """Estimate the log prior and per-word log likelihood ratios.

    Args:
        freqs: Dictionary mapping ``(word, label)`` tuples to counts.
        train_x: Training tweets; not read by this function.
        train_y: Training labels; their sum is taken as the number of
            positive examples.

    Returns:
        A tuple ``(logprior, loglikelihood)`` where ``logprior`` is
        ``log(D_pos) - log(D_neg)`` and ``loglikelihood`` maps every word in
        the vocabulary to ``log(P(word | pos) / P(word | neg))``, both
        probabilities using add-one smoothing.
    """
    # Vocabulary: every distinct word that appears as the first item of a key
    vocab = {key[0] for key in freqs}
    vocab_size = len(vocab)

    # Total word occurrences per class; a label greater than 0 means positive
    total_pos = 0
    total_neg = 0
    for key, count in freqs.items():
        if key[1] > 0:
            total_pos += count
        else:
            total_neg += count

    # Log prior from the number of positive and negative training examples
    n_examples = len(train_y)
    n_positive = np.sum(train_y)
    n_negative = n_examples - n_positive
    logprior = np.log(n_positive) - np.log(n_negative)

    # Smoothed likelihood ratio of every vocabulary word
    loglikelihood = {}
    for word in vocab:
        smoothed_pos = (lookup(freqs, word, 1) + 1) / (total_pos + vocab_size)
        smoothed_neg = (lookup(freqs, word, 0) + 1) / (total_neg + vocab_size)
        loglikelihood[word] = np.log(smoothed_pos/smoothed_neg)
    return logprior, loglikelihood

# Fit the model on the training data, then show the log prior and the number
# of words that received a log likelihood.
logprior, loglikelihood = naive_bayes(freqs, train_x, train_y)
print(logprior, len(loglikelihood))

0.0 9162


In [6]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    """Score a tweet as the log prior plus the log likelihoods of its words.

    Args:
        tweet: The raw tweet text.
        logprior: The log prior returned by ``naive_bayes``.
        loglikelihood: Dictionary mapping words to log likelihood ratios.

    Returns:
        ``logprior`` plus ``loglikelihood[word]`` for every processed word of
        the tweet that is a key of ``loglikelihood``; other words add nothing.
    """
    # Collect the terms in the order they are accumulated: prior first, then
    # one entry per known word.
    terms = [logprior]
    terms.extend(
        loglikelihood[token]
        for token in process_tweet(tweet)
        if token in loglikelihood
    )

    score = 0
    for term in terms:
        score += term
    return score

# Score a hand-written tweet with the fitted parameters. The score sums
# log(P(word | pos) / P(word | neg)) terms, so a value above 0 leans positive.
my_tweet = "I'm Happy today!!"
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(f"The expected output is {p}")

The expected output is 1.5105245506722396


In [7]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood, naive_bayes_predict = naive_bayes_predict):
    """Compute the accuracy of the classifier on a labelled test set.

    A tweet is predicted positive (1) when its score from
    ``naive_bayes_predict`` is greater than 0 and negative (0) otherwise.

    Args:
        test_x: Sequence of raw tweet texts.
        test_y: Labels (0 or 1) paired with ``test_x`` by position; any
            array shape is accepted and flattened.
        logprior: The log prior returned by ``naive_bayes``.
        loglikelihood: Dictionary mapping words to log likelihood ratios.
        naive_bayes_predict: Scoring function called as
            ``naive_bayes_predict(tweet, logprior, loglikelihood)``.

    Returns:
        The accuracy, computed as ``1 - mean(|prediction - label|)``.

    Raises:
        ValueError: If the test set is empty or the number of tweets does
            not match the number of labels.
    """
    # Flatten the labels so that an (n, 1) column cannot broadcast against
    # the (n,) prediction vector into an (n, n) matrix.
    labels = np.ravel(test_y)

    # Every prediction must be collected; leaving this list empty makes the
    # subtraction below fail or produce NaN.
    y_hats = np.array([
        1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
        for tweet in test_x
    ])

    if y_hats.size == 0:
        raise ValueError("cannot compute accuracy on an empty test set")
    if y_hats.shape != labels.shape:
        raise ValueError(
            f"got {y_hats.size} tweets but {labels.size} labels"
        )

    error = np.mean(np.absolute(y_hats - labels))
    accuracy = 1 - error
    return accuracy

def get_ratio(freqs, word):
    """Return the positive and negative counts of a word and their ratio.

    Args:
        freqs: Dictionary mapping ``(word, label)`` tuples to counts.
        word: The word to look up.

    Returns:
        A dictionary with keys ``'positive'`` and ``'negative'`` (the counts
        under labels 1 and 0) and ``'ratio'``, which is
        ``(positive + 1) / (negative + 1)``.
    """
    positive_count = lookup(freqs, word, 1)
    negative_count = lookup(freqs, word, 0)
    return {
        'positive': positive_count,
        'negative': negative_count,
        'ratio': (positive_count + 1)/(negative_count + 1),
    }

# Show the training counts and the smoothed positive/negative ratio for 'sad'.
get_ratio(freqs, 'sad')

{'positive': 5, 'negative': 100, 'ratio': 0.0594059405940594}

### Predict

In [8]:
def naive_bayes_classify(tweet, logprior, loglikelihood, threshold=0.5):
    """Classify a tweet and return the result as a readable string.

    Args:
        tweet: The raw tweet text.
        logprior: The log prior returned by ``naive_bayes``.
        loglikelihood: Dictionary mapping words to log likelihood ratios.
        threshold: Cut-off the summed log score must exceed for a positive
            prediction. It is compared against the log score itself, not
            against a probability.
            NOTE(review): ``test_naive_bayes`` uses 0 as its cut-off, so the
            0.5 default here is stricter; confirm that this is intended.

    Returns:
        ``"Predicted Sentiment: Positive"`` when the score is greater than
        ``threshold``, otherwise ``"Predicted Sentiment: Negative"``.
    """
    score = logprior
    for token in process_tweet(tweet):
        if token not in loglikelihood:
            # Words without a learned log likelihood do not affect the score
            continue
        score += loglikelihood[token]

    label = 'Positive' if score > threshold else 'Negative'
    return f"Predicted Sentiment: {label}"

In [9]:
# Classify a hand-written tweet using the default threshold.
my_tweet = 'Im happy as its my birthday'

p = naive_bayes_classify(my_tweet, logprior, loglikelihood)
print(p)

Predicted Sentiment: Positive


In [10]:
# Classify a second hand-written tweet using the default threshold.
my_tweet = 'Today is a sad day'

p = naive_bayes_classify(my_tweet, logprior, loglikelihood)
print(p)

Predicted Sentiment: Negative
