In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
import nltk

In [2]:
# Load the preprocessed CSV (path is relative to the notebook's working
# directory); later cells read its 'review' and 'sentiment' columns.
data = pd.read_csv(filepath_or_buffer='../Data/preprocessed.csv')

In [3]:
# Separate the model inputs (review text) from the targets (sentiment).
reviews, labels = data['review'].values, data['sentiment'].values

# Replace each distinct sentiment value with an integer class id; the fitted
# encoder is kept so ids can be mapped back to the original values later.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

In [4]:
# Display the integer-encoded labels as a quick check of the encoding step.
encoded_labels

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [4]:
# Stratified train/test split (default test fraction) so both splits keep the
# original class balance. The seed is fixed so that "Restart & Run All"
# reproduces the same split, and therefore the same metrics, on every run.
train_text, test_text, train_label, test_label = train_test_split(
    reviews,
    encoded_labels,
    stratify=encoded_labels,
    random_state=42,
)

In [5]:
# Bag-of-words counts over the training texts, with the vocabulary capped at
# 3000 terms via max_features.
vec = CountVectorizer(max_features = 3000)
x = vec.fit_transform(train_text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases. The
# result is converted to a plain list of str so downstream code sees the same
# type either way.
if hasattr(vec, 'get_feature_names_out'):
    vocab = vec.get_feature_names_out().tolist()
else:
    vocab = vec.get_feature_names()

# Densify: the counting cell below indexes x as a regular 2-D array.
x = x.toarray()



In [6]:
# word_counts[label][term] = occurrences of `term` summed over every training
# document that carries `label`. Labels 0 and 1 always get an entry (even if
# empty); any other label found in train_label gets one too instead of
# raising KeyError.
word_counts = {l: defaultdict(lambda: 0) for l in range(2)}
for l in np.unique(train_label):
    # One vectorised column-sum per label replaces the Python loop over every
    # (document, term) cell of the matrix. np.asarray(...).ravel() flattens
    # the result to 1-D whether x is a dense array or a matrix type.
    label_totals = np.asarray(x[train_label == l].sum(axis=0)).ravel()
    # Columns of x are ordered like vocab, so zip pairs each term with its total.
    word_counts.setdefault(l, defaultdict(lambda: 0)).update(
        zip(vocab, label_totals.tolist())
    )

In [7]:
def laplace_smoothing(n_label_items, vocab, word_counts, word, text_label):
    """Return the add-one smoothed log-likelihood of `word` under `text_label`.

    Computes ``log((word_counts[text_label][word] + 1) /
    (n_label_items[text_label] + len(vocab)))``.

    Parameters
    ----------
    n_label_items : mapping of label -> count used in the denominator.
    vocab : sized collection of known terms; only its length is used.
    word_counts : mapping of label -> mapping of term -> occurrence count.
    word : the term to score.
    text_label : the label to score the term under.

    Returns
    -------
    float
        Natural log of the smoothed ratio.

    NOTE(review): elsewhere in this notebook `n_label_items` is produced by
    `fit`, which stores the number of *documents* per label, while the
    numerator is a *term occurrence* count. The usual multinomial Naive Bayes
    denominator is the total number of term occurrences in the label, so the
    ratio here may exceed 1 -- confirm this is intended.
    """
    smoothed_count = word_counts[text_label][word] + 1
    smoothed_total = n_label_items[text_label] + len(vocab)
    return math.log(smoothed_count / smoothed_total)

In [8]:
# Partition the examples by label (no counting happens here; callers can take
# len() of each group if they need sizes).
def group_by_label(x, y, labels):
    """Return ``{label: x[rows where y == label]}`` for every label in `labels`.

    Parameters
    ----------
    x : array of examples, indexable with the index tuple from ``np.where``.
    y : array of labels aligned with `x`.
    labels : iterable of label values to build groups for.

    Returns
    -------
    dict
        One entry per requested label, in the order given; a label that does
        not occur in `y` maps to an empty selection.
    """
    return {label: x[np.where(y == label)] for label in labels}

In [9]:
def fit(x, y, labels):
    """Count training examples per label and derive the log class priors.

    Parameters
    ----------
    x : sized collection of training examples; only ``len(x)`` is used.
    y : array-like of labels aligned with `x`.
    labels : iterable of label values to compute statistics for.

    Returns
    -------
    (n_label_items, log_label_priors) : tuple of dict
        ``n_label_items[l]`` is the number of entries of `y` equal to ``l``;
        ``log_label_priors[l]`` is ``log(n_label_items[l] / len(x))``. A label
        with no examples gets a log prior of ``-inf`` (prior probability 0).
        When `x` is empty every count is 0, so every requested label gets
        ``-inf``.
    """
    n = len(x)
    y = np.asarray(y)
    n_label_items = {}
    log_label_priors = {}
    for l in labels:
        # Count matches directly on the label array; there is no need to
        # materialise a per-label copy of the examples just to take its length.
        count = int(np.count_nonzero(y == l))
        n_label_items[l] = count
        # math.log(0) raises ValueError, so an unseen label is mapped to -inf
        # explicitly instead of aborting the whole fit.
        log_label_priors[l] = math.log(count / n) if count else float('-inf')
    return n_label_items, log_label_priors

In [10]:
# Shared tokenizer instance used by predict() to split each text into words.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

In [11]:
def predict(n_label_items, vocab, word_counts, log_label_priors, labels, x):
    """Predict a label for every text in `x`.

    For each text the score of a label starts at its log prior; the text is
    tokenised with the module-level `w_tokenizer`, tokens are de-duplicated
    (each distinct word contributes once, however often it repeats), words
    outside `vocab` are skipped, and the `laplace_smoothing` log-likelihood of
    every remaining word is added to each label's score.

    Parameters
    ----------
    n_label_items : mapping of label -> count, forwarded to `laplace_smoothing`.
    vocab : sized collection of known terms.
    word_counts : mapping of label -> mapping of term -> occurrence count.
    log_label_priors : mapping of label -> log prior.
    labels : sequence of candidate labels.
    x : iterable of texts to classify.

    Returns
    -------
    list
        The highest-scoring label for each text, in input order. On a tie the
        label that comes first in `labels` wins.
    """
    # Membership tests against a list scan the whole vocabulary for every
    # token; build a set once so each lookup is O(1). The original `vocab` is
    # still what gets passed on, since laplace_smoothing needs its length.
    known_words = set(vocab)

    result = []
    for text in x:
        label_scores = {l: log_label_priors[l] for l in labels}
        words = set(w_tokenizer.tokenize(text))
        for word in words:
            if word not in known_words:
                continue
            for l in labels:
                label_scores[l] += laplace_smoothing(
                    n_label_items, vocab, word_counts, word, l
                )
        result.append(max(label_scores, key=label_scores.get))
    return result

In [12]:
# Candidate class ids to score.
labels = [0, 1]

# Estimate per-label counts and log priors from the training split.
n_label_items, log_label_priors = fit(x=train_text, y=train_label, labels=labels)

# Classify the held-out texts and report plain accuracy.
pred = predict(
    n_label_items=n_label_items,
    vocab=vocab,
    word_counts=word_counts,
    log_label_priors=log_label_priors,
    labels=labels,
    x=test_text,
)
print("Accuracy of prediction on test set : ", accuracy_score(y_true=test_label, y_pred=pred))

Accuracy of prediction on test set :  0.83608
