Naive Bayes is an algorithm that can be used for sentiment analysis. It takes very little time to train, and its basic assumption is that the words in a sentence are independent of each other.



In [None]:
# importing the libraries 
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer
import nltk.tokenize as tokenize
import re
import string
from collections import defaultdict

In [None]:
# Load the IMDB movie-review CSV into a DataFrame.
data = pd.read_csv("../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [None]:
# Preview the first few rows of the dataset.
data.head()

In [None]:
# We can see that the data consists of 50000 reviews, with an equal number of positive and negative reviews.
print(data.shape)
data.sentiment.value_counts()

In [None]:
# Count the missing values in each column.
data.isnull().sum()

The first thing we need to do is preprocess the reviews so that they become useful inputs to the model:
* Lowercase the reviews.
* Remove all stopwords (common words like "the", "a", etc.) and punctuation from the reviews.
* Stem the words. We don't want to count "dance", "danced" and "dancing" as different words; after stemming, all three become "danc", so they are counted as a single entry when building the vocabulary.



In [None]:
def process_review(review):
    """Turn a raw review string into a list of cleaned, stemmed tokens.

    The review is lowercased, split with ``wordpunct_tokenize``, stripped of
    English stopwords and punctuation-only tokens, and each remaining token
    is reduced to its Porter stem.

    Args:
        review: The review text as a string.

    Returns:
        A list of stemmed tokens, in their original order.
    """
    stemmer = PorterStemmer()
    # Sets give O(1) membership tests instead of scanning a list per token.
    stopwords_english = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    review_clean = []
    for word in tokenize.wordpunct_tokenize(review.lower()):
        if word in stopwords_english:  # remove stopwords
            continue
        # The tokenizer emits runs of punctuation as single tokens (e.g.
        # "...", "!!", "/>"). A substring test against string.punctuation
        # misses most of these, so check every character instead.
        if all(char in punctuation_chars for char in word):
            continue
        review_clean.append(stemmer.stem(word))  # stemming word

    return review_clean

Next we create a function that stores the count of each word for each class. The resulting frequency dictionary will be used later during training.

In [None]:
def count_reviews(reviews, sentiment):
    """Count how often each processed word occurs under each sentiment label.

    Args:
        reviews: Iterable of raw review strings.
        sentiment: Iterable of labels, aligned element-wise with ``reviews``.

    Returns:
        A ``defaultdict(int)`` mapping ``(word, label)`` pairs to their counts.
    """
    counts = defaultdict(int)
    for label, text in zip(sentiment, reviews):
        for token in process_review(text):
            # Missing keys start at 0, so a plain increment is enough.
            counts[(token, label)] += 1
    return counts

In [None]:
# Build the (word, sentiment) frequency table from the first 35000 reviews, which serve as the training split.
freqs = count_reviews(data.review[:35000], data.sentiment[:35000])

There are a couple of steps for training the naive Bayes classifier.
* We find the prior probability: in simple terms, if we picked a review at random from the dataset with no other information, what is the probability that its sentiment is positive versus negative? The probability of a review being positive is the number of positive reviews in the data divided by the total number of reviews.
$$\text{logprior} = \log \left( \frac{P(D_{pos})}{P(D_{neg})} \right) = \log \left( \frac{D_{pos}}{D_{neg}} \right)$$
* We find the likelihood of words, i.e.
$$\text{loglikelihood} = \log \left(\frac{P(W_{pos})}{P(W_{neg})} \right)$$

where $P(W_{pos})$ is the probability of the word given that the review is positive, computed with Laplace smoothing:
$$ P(W_{pos}) = \frac{freq_{pos} + 1}{N_{pos} + V} $$

In [None]:
def train_naive_bayes(freqs, train_x, train_y):
    """Estimate the log prior and per-word log likelihood ratios.

    Args:
        freqs: Mapping from ``(word, label)`` pairs to occurrence counts.
            Any label other than ``'positive'`` is counted as negative.
        train_x: Training reviews. Unused; kept so existing callers keep
            working.
        train_y: Array-like of ``'positive'`` / ``'negative'`` labels.

    Returns:
        A tuple ``(logprior, loglikelihood)`` where ``logprior`` is
        ``log(#positive / #negative)`` over ``train_y`` and ``loglikelihood``
        maps each vocabulary word to ``log(P(word|pos) / P(word|neg))`` with
        Laplace (add-one) smoothing.
    """
    vocab = {word for word, _ in freqs}
    V = len(vocab)

    # Total number of word occurrences observed in each class.
    N_pos = N_neg = 0
    for (_, label), count in freqs.items():
        if label == 'positive':
            N_pos += count
        else:
            N_neg += count

    # np.asarray lets plain lists work as well as pandas/numpy inputs.
    labels = np.asarray(train_y)
    logprior = np.log((labels == 'positive').sum() / (labels == 'negative').sum())

    loglikelihood = {}
    for word in vocab:
        # .get avoids inserting zero entries into a defaultdict and also
        # works when freqs is a plain dict.
        freq_pos = freqs.get((word, 'positive'), 0)
        freq_neg = freqs.get((word, 'negative'), 0)

        # Parentheses matter: the smoothed estimate is (freq + 1) / (N + V).
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood

In [None]:
# Train on the first 35000 reviews, then show the log prior and the number of words that received a log likelihood.
logprior, loglikelihood = train_naive_bayes(freqs,data.review[:35000], data.sentiment[:35000])
print(logprior)
print(len(loglikelihood))

Now we predict whether the sentiment is positive (p > 0) or negative (p < 0). Here p is the log of the ratio between the posterior probability of the review being positive given its words and the posterior probability of it being negative given its words. If the posterior probability of the review being positive is larger than that of it being negative, the ratio is greater than 1, and log(ratio) is greater than 0 (p > 0).

In [None]:
def naive_bayes_predict(review, logprior, loglikelihood):
    """Score a review as log prior plus the log likelihoods of its known words.

    Args:
        review: Raw review text.
        logprior: Log prior ratio of the positive to the negative class.
        loglikelihood: Mapping from word to its log likelihood ratio.

    Returns:
        The summed score; callers treat a value above 0 as positive.
    """
    tokens = process_review(review)

    score = 0 + logprior
    # Words without a log likelihood entry contribute nothing.
    for contribution in (loglikelihood[t] for t in tokens if t in loglikelihood):
        score = score + contribution

    return score

In [None]:
# Score one review from outside the first 35000 rows and compare the score with its recorded sentiment.
p = naive_bayes_predict(data.review[42090], logprior, loglikelihood)
print(data.review[42090])
print(p)
print(data.sentiment[42090])

In [None]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """Compute the classifier's accuracy on a labelled test set.

    Args:
        test_x: Iterable of raw review strings.
        test_y: Sequence whose first element holds the integer label codes.
            When it is the ``(codes, uniques)`` tuple returned by
            ``pd.factorize`` over exactly two labels including ``'positive'``,
            the code of ``'positive'`` is read from ``uniques``; otherwise
            1 is taken to mean positive and 0 negative.
        logprior: Log prior ratio passed on to ``naive_bayes_predict``.
        loglikelihood: Word log likelihoods passed on to
            ``naive_bayes_predict``.

    Returns:
        The fraction of reviews whose predicted code equals the true code.
    """
    codes = np.asarray(test_y[0])

    # factorize numbers labels by order of first appearance unless sort=True,
    # so 'positive' is not guaranteed to be code 1. Read the mapping from the
    # uniques when the caller supplied them.
    positive_code, negative_code = 1, 0
    if isinstance(test_y, tuple) and len(test_y) == 2:
        try:
            uniques = list(test_y[1])
        except TypeError:
            uniques = []
        if len(uniques) == 2 and 'positive' in uniques:
            positive_code = uniques.index('positive')
            negative_code = 1 - positive_code

    y_hats = np.array([
        positive_code
        if naive_bayes_predict(review, logprior, loglikelihood) > 0
        else negative_code
        for review in test_x
    ])

    return (codes == y_hats).sum() / len(codes)

In [None]:
# sort=True makes the integer codes follow sorted label order
# ('negative' -> 0, 'positive' -> 1) instead of order of first appearance,
# so code 1 always means positive.
Accuracy = test_naive_bayes(data.review[35000:50000], pd.factorize(data.sentiment[35000:50000], sort=True), logprior, loglikelihood)

We reach an accuracy of 82%, which is not bad at all for such a simple classifier.

In [None]:
# Show the accuracy measured on the held-out reviews.
print(Accuracy)