# Text Classification using Naive Bayes

### Sentiment Analysis :
*Sentiment Analysis implemented using Bag of words model.*

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
import re
import math

In [2]:
# Reading data
file = pd.read_csv('sa.csv')
print(file.shape)
file.head(12)

(12, 2)


Unnamed: 0,sentiment,review
0,1,it was Good restaurant
1,0,they serve Bad food
2,1,food was tasty
3,1,service was good
4,1,it is a hygenic place
5,0,they provide bad service
6,0,small and dirty place
7,1,Prices are okay
8,0,Too costly
9,0,they have worst delivery


In [3]:
# Attributes
classItems = {} # no. of class items in a class
class_priorProb = {} # Prior Probability of each class
bow = {} # Bag of words
vocab = set() # Vocabulary of the model

In [4]:
# Classifying data into classes
def classify(file):
    classes = [0,1]
    data = {}
    for i in range(len(file)):
        row = file[i]
        c = row[0]
        if c not in data:
            data[c] = list()
        data[c].append(row[1])
    return data

In [5]:
# Cleaning and Spliting the string into words
def tokenize(text):
    text = re.sub("[^a-zA-Z]", " ", text, flags=re.IGNORECASE) # every char except alphabets gets replaced with single space 
    text = re.sub("(\s+)", " ", text) # multiple spaces are replaced by single spaces
    text = text.lower() # converting cleaned string to lowercase
    words = []
    for token_word in text.split(): # cleaned string is split into words
        words.append(token_word)
    return words

In [6]:
# Train the model
def fit(file):
    n = file.shape[0] 
    data = classify(file.values.tolist())

    for c, values in data.items():
        classItems[c] = len(values)
        class_priorProb[c] = (classItems[c] / n)
        bow[c] = defaultdict(lambda: 0)
            
        for text in values:
            counts = Counter(tokenize(text))
            for word, count in counts.items():
                if word not in vocab:
                    vocab.add(word)

                bow[c][word] += count

In [7]:
def laplace_smoothing(word, text_class):
      num = bow[text_class][word] + 1
      denom = classItems[text_class] + len(vocab)
      return num / denom

In [8]:
# Testing the model
def predict(X):
        result = []
        max_prob = []
        classes = [0,1]
        for text in X:
          class_scores = {c : class_priorProb[c] for c in classes}
          words = set(tokenize(text))
          for word in words:
              for c in classes:
                likelihood = laplace_smoothing(word, c)
                class_scores[c] += likelihood
                
          result.append(max(class_scores, key = class_scores.get))
          max_prob.append(max(class_scores.values()))

        return result, max_prob

### 0 : Bad sentiment , 1 : Good sentiment

In [9]:
# Running.../
fit(file)
test = ['this is the good food i have ever ate.',
        'bad place i ever found']
predict(test)

([1, 0], [0.8823529411764707, 0.7647058823529412])