# Sentiment Classification using Naive Bayes with MLE and MAP Estimates

In [1]:
import os
import math
import numpy as np
import random
import matplotlib.pyplot as plt

In [2]:
class NaiveBayes:
    """Naive Bayes sentiment classifier with MLE and MAP token estimates.

    Documents are tokenised with ``str.split()``. The label ``'0'`` marks the
    negative class and ``'1'`` the positive class; integer labels ``0``/``1``
    are accepted as well and normalised to their string form.

    Workflow: ``fit`` -> ``compute_MLE`` or ``compute_MAP`` ->
    ``predict_with_MLE`` or ``predict_with_MAP``.
    """

    _NEGATIVE = '0'
    _POSITIVE = '1'

    def __init__(self):
        # Every distinct token seen by fit()
        self.vocab = set()

        # Word count given class
        self.positive_word_count = {}
        self.negative_word_count = {}

        # Class wise label (document) count
        self.neg_label_count = 0
        self.pos_label_count = 0

        # Class wise total token count
        self.total_pos_tokens = 0
        self.total_neg_tokens = 0

        # Class wise Max Likelihood Estimates of the priors P(+) and P(-)
        self.P_Mle = None
        self.N_Mle = None

        # MLE of P(w|+) & P(w|-), filled by compute_MLE()
        self.positive_tokens_mle = {}
        self.negative_tokens_mle = {}

        # MAP estimates of P(w|+) & P(w|-), filled by compute_MAP()
        self.positive_map_estimates = {}
        self.negative_map_estimates = {}

    @staticmethod
    def _normalise_label(label):
        """Return *label* as a stripped string so 1, '1' and ' 1 ' all match."""
        return str(label).strip()

    @staticmethod
    def _safe_log(probability):
        """Return log(probability), mapping an exact zero to -inf."""
        if probability == 0:
            return -math.inf
        return math.log(probability)

    def fit(self, train_data):
        """Accumulate vocabulary, token counts and class priors.

        Counts are added to the existing state, so repeated calls accumulate.

        Args:
            train_data: iterable of dicts with a ``'text'`` string and a
                ``'label'`` of ``'0'``/``0`` (negative) or ``'1'``/``1``
                (positive). Records with any other label contribute their
                tokens to the vocabulary only.

        Raises:
            ValueError: if no document with a recognised label has been seen.
        """
        for item in train_data:
            tokens = item['text'].split()
            label = self._normalise_label(item['label'])

            # Vocabulary grows regardless of the label
            self.vocab.update(tokens)

            if label == self._NEGATIVE:
                self.neg_label_count += 1
                self.total_neg_tokens += len(tokens)
                counts = self.negative_word_count
            elif label == self._POSITIVE:
                self.pos_label_count += 1
                self.total_pos_tokens += len(tokens)
                counts = self.positive_word_count
            else:
                continue

            for token in tokens:
                counts[token] = counts.get(token, 0) + 1

        total_docs = self.pos_label_count + self.neg_label_count
        if total_docs == 0:
            raise ValueError(
                "fit() needs at least one document labelled '0' or '1'"
            )

        # MLE of the class priors
        self.P_Mle = self.pos_label_count / total_docs
        self.N_Mle = self.neg_label_count / total_docs

    def _smoothed_estimates(self, counts, total_tokens, m):
        """Return {token: (count + m) / (total_tokens + m * |vocab|)}.

        With ``m == 0`` this is the plain relative frequency. When the
        denominator is zero (a class without tokens and no smoothing) every
        estimate is 0.0 instead of raising ZeroDivisionError.
        """
        denominator = total_tokens + m * len(self.vocab)
        if denominator == 0:
            return {token: 0.0 for token in self.vocab}
        return {
            token: (counts.get(token, 0) + m) / denominator
            for token in self.vocab
        }

    def compute_MLE(self):
        """Compute maximum-likelihood estimates of P(w|+) and P(w|-).

        Results are stored in ``positive_tokens_mle`` and
        ``negative_tokens_mle`` for every token in the vocabulary.
        """
        self.positive_tokens_mle = self._smoothed_estimates(
            self.positive_word_count, self.total_pos_tokens, 0
        )
        self.negative_tokens_mle = self._smoothed_estimates(
            self.negative_word_count, self.total_neg_tokens, 0
        )

    def compute_MAP(self, m):
        """Compute MAP estimates of P(w|+) and P(w|-).

        Each estimate is ``(count + m) / (class_tokens + m * |vocab|)`` and is
        stored in ``positive_map_estimates`` / ``negative_map_estimates``.

        Args:
            m: non-negative smoothing parameter.

        Raises:
            ValueError: if ``m`` is negative.
        """
        if m < 0:
            raise ValueError("smoothing parameter m must be non-negative")

        self.positive_map_estimates = self._smoothed_estimates(
            self.positive_word_count, self.total_pos_tokens, m
        )
        self.negative_map_estimates = self._smoothed_estimates(
            self.negative_word_count, self.total_neg_tokens, m
        )

    def _predict(self, records, positive_estimates, negative_estimates):
        """Score each record under both classes and return the label list.

        Tokens absent from both estimate tables are skipped. A zero
        probability contributes -inf to the class score. Ties go to '1'.
        Only the ``'text'`` key of each record is read.
        """
        predictions = []
        for rec in records:
            positive_score = self._safe_log(self.P_Mle)
            negative_score = self._safe_log(self.N_Mle)

            for token in rec['text'].split():
                known = (
                    token in positive_estimates
                    or token in negative_estimates
                )
                if not known:
                    continue

                positive_score += self._safe_log(
                    positive_estimates.get(token, 0)
                )
                negative_score += self._safe_log(
                    negative_estimates.get(token, 0)
                )

            if positive_score >= negative_score:
                predictions.append(self._POSITIVE)
            else:
                predictions.append(self._NEGATIVE)

        return predictions

    def predict_with_MLE(self, train_data):
        """Classify records using the tables built by ``compute_MLE``.

        Args:
            train_data: iterable of dicts with a ``'text'`` string; a
                ``'label'`` key is not required.

        Returns:
            List of ``'1'`` / ``'0'`` strings, one per record.
        """
        return self._predict(
            train_data, self.positive_tokens_mle, self.negative_tokens_mle
        )

    def predict_with_MAP(self, train_data):
        """Classify records using the tables built by ``compute_MAP``.

        Args:
            train_data: iterable of dicts with a ``'text'`` string; a
                ``'label'`` key is not required.

        Returns:
            List of ``'1'`` / ``'0'`` strings, one per record.
        """
        return self._predict(
            train_data,
            self.positive_map_estimates,
            self.negative_map_estimates,
        )

# Steps to run

### Test and train data should be in the following format:
### a list of dictionaries with `text` and `label` as keys, where `label` is the string "0" (negative) or "1" (positive)
### Ex: [{"text": "This city is beautiful", "label": "1"}]

In [None]:
# Create an untrained classifier
model = NaiveBayes()
# None is a placeholder: fit() iterates over train_data, so pass the training list here
model.fit(train_data=None)

## To Classify using Maximum Likelihood Estimates

In [None]:
# Build the per-token P(w|class) tables that predict_with_MLE() reads
model.compute_MLE()
# Pass the test data to be classified
predictions = model.predict_with_MLE(test)

### To Classify using MAP Estimates, using a Dirichlet prior with Smoothing Parameter: m

In [None]:
# m is the smoothing parameter used for the MAP estimates
model.compute_MAP(m)
# Classify the test data with the estimates computed above
predictions = model.predict_with_MAP(test)