In [1]:
import re
from tqdm import tqdm
from sklearn.utils import shuffle
import numpy as np
from tqdm import tqdm
import bz2
from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import nltk
import pandas as pd



One of the most import libraries for natural language processing is NLTK. It provides more than 50 corpora and lexical resources and interfaces to work with them, also it provides text processing libraries including tokenization, stemming, parsing, classification and etc.

The next step is to load a data set to work with.Amazon reviews dataset which contains 360000 text reviews and their tags is used. This dataset contains positive and negative reviews. The task is to classify reviews into either of classes. This task is known as sentiment analysis in NLP.

Function to read the dataset

In [2]:
def splitReviewsLabels(lines):
    reviews = []
    labels = []
    for review in tqdm(lines):
        rev = reviewToX(review)
        label = reviewToY(review)
        reviews.append(rev[:512])
        labels.append(label)
    return reviews, labels

Now changing to labels.0 for negative,1 for poisitive reviews.

In [3]:
def reviewToY(review):
    return 0 if review.split(' ')[0] == '__label__1' else 1 

In [4]:
def reviewToX(review):
    review = review.split(' ', 1)[1][:-1].lower()
    review = re.sub('\d','0',review)
    if 'www.' in review or 'http:' in review or 'https:' in review or '.com' in review:
        review = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", review)
    return review

In [5]:
#print confusion matrix
def conf_mat(preds, y):
    TN=0
    FP=0
    FN=0
    TP=0
    for x in range(len(preds)):
        if preds[x] == 0 and y[x] == 0:
            TN+=1
        elif preds[x] == 1 and y[x] == 1:
            TP+=1
        elif preds[x] == 0 and y[x] == 1:
            FN+=1
        elif preds[x] == 1 and y[x] == 0:
            FP+=1
    print("TN: ", TN, " | FP: ", FP)
    print("FN: ", FN, " | TP: ", TP)

In [6]:
train_file = bz2.BZ2File('./input/train.ft.txt.bz2')
test_file = bz2.BZ2File('./input/test.ft.txt.bz2')

In [7]:
train_lines = train_file.readlines()
test_lines = test_file.readlines()

In [8]:
train_lines = [x.decode('utf-8') for x in train_lines[:20000]]
test_lines = [x.decode('utf-8') for x in test_lines[:4000]]

Due to limited computing capacity,we are truncating the dataset into 20000 training set and 4000 test set i.e. in ration 8:2

In [9]:
# Load from the file
train_x, train_y = splitReviewsLabels(train_lines)
test_x,test_y = splitReviewsLabels(test_lines)

100%|███████████████████████████████| 20000/20000 [00:00<00:00, 62887.97it/s]
100%|█████████████████████████████████| 4000/4000 [00:00<00:00, 68962.86it/s]


In [10]:
#find data distribution
pos=0
neg=0
for val in train_y:
    if val == 1:
        pos+=1
    else:
        neg+=1
print("Pos: ", pos)
print("Neg: ", neg)

Pos:  10257
Neg:  9743


let's see how are the reviews.

In [11]:
print(train_x[6])
print(train_y[6])

buyer beware: this is a self-published book, and if you want to know why--read a few paragraphs! those 0 star reviews must have been written by ms. haddon's family and friends--or perhaps, by herself! i can't imagine anyone reading the whole thing--i spent an evening with the book and a friend and we were in hysterics reading bits and pieces of it to one another. it is most definitely bad enough to be entered into some kind of a "worst book" contest. i can't believe amazon even sells this kind of thing. may
0


We need to encode the labels into numerical values since the values of the labels are categorical.

In [12]:
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

In [13]:
print(np.unique(train_y))


[0 1]


The next step is to calculate the probability of each class. Doing so we will obtain priors in the Bayes formula. It can be easily done by counting number of occurrences of each class divided by the total number of reviews.

In [14]:
pi=np.array([sum(train_y==0)/len(train_y),sum(train_y==1)/len(train_y)])
pi

array([0.48715, 0.51285])

We know that we can not feed text as string to the model, so we need a way to represent text as numerical value to the model. One of the easiest encoding that we can use is the Bag of Words (BoW) encoding. It takes into account words and their frequency of occurrence in the sentence.

The CountVectorizer function from sklearn.feature_extraction.text is the function that encode text using the BoW method. It basically returns the matrix of token counts.

In [15]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(train_x)
# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x).todense()
xtest_count =  count_vect.transform(test_x).todense()

In [16]:
xtrain_count.shape

(20000, 41278)

The next step is to calculate the probability of occurrence of each word per class, i.e., likelihood. To do that we are going to create a dataframe that contain three columns: words,class1,and class2. The class1 column will be containing the likelihood for class1 (negative reviews) and class2 column is going to contain the likelihood for class2 (positive reviews).

However, one important technicality must be considered. The thing is that since some words might not occur in one of the classes, then the likelihood of those words will become zero. Having a zero likelihood for a word is very damaging to the model's performance because it causes the posterior to be zero which does not make sense. To elevate this problem a kind of smoothing which is called Laplace smoothing is used which is defined as follows.

$$ P(X_i \mid C_j) = \frac{count_{ij}+\alpha}{count_j+|V|+1} $$

In the above formula $ count_{ij} $ is the number of occurrences of word i in class j. $ count_j $ is total number of words in class j and |v| is the vocab size in class j.

In [17]:
wordFreq = pd.DataFrame(columns=['words','class1','class2'])
wordFreq['words'] = count_vect.get_feature_names()

x_train_class1 = xtrain_count[train_y==0]
x_train_class2 = xtrain_count[train_y==1]

count_class1 = np.sum(x_train_class1,axis=0)
count_class2 = np.sum(x_train_class2,axis=0)

vocab_size1 = len(np.where(count_class1==0)[1])
vocab_size2 = len(np.where(count_class2==0)[1])

alpha=10
count_class1 = np.array( (count_class1+alpha) /(np.sum(count_class1)+vocab_size1 +1))
count_class2 = np.array( (count_class2+alpha) /(np.sum(count_class2)+vocab_size2 +1))

wordFreq['class1'] = pd.Series(count_class1.ravel())
wordFreq['class2'] = pd.Series(count_class2.ravel())

Now it is time to iterate through all sentences for both train and validation data to calculate the accuracy on both sets.

In [18]:
train_preds = np.zeros(len(xtrain_count))
for i in range(len(xtrain_count)):
    idx = np.where(xtrain_count[i,:]!=0)[1]
    lh1 = wordFreq['class1'].iloc[idx].prod()
    lh2 = wordFreq['class2'].iloc[idx].prod()
    posterior1 = lh1*pi[0]
    posterior2 = lh2 * pi[1]

    if posterior1>posterior2:
        train_preds[i] = 0
    else:
        train_preds[i] = 1


matches = np.sum(train_y==train_preds)
print('Train accuracy is: '+str(matches/len(train_preds)))

Train accuracy is: 0.8814


In [19]:
# make confusion matrix
print(len(train_preds))
print(len(train_y))
conf_mat(train_preds, train_y)

20000
20000
TN:  8779  | FP:  964
FN:  1408  | TP:  8849


In [20]:
test_preds = np.zeros(len(xtest_count))
for i in range(len(xtest_count)):
    idx = np.where(xtest_count[i,:]!=0)[1]
    lh1 = wordFreq['class1'].iloc[idx].prod()
    lh2 = wordFreq['class2'].iloc[idx].prod()
    posterior1 = lh1*pi[0]
    posterior2 = lh2 * pi[1]

    if posterior1>posterior2:
        test_preds[i] = 0
    else:
        test_preds[i] = 1

    temp = 1

matches = np.sum(test_y==test_preds)
print('Validation accuracy is: '+str(matches/len(test_preds)))
conf_mat(test_preds, test_y)

Validation accuracy is: 0.8395
TN:  1652  | FP:  299
FN:  343  | TP:  1706



Validation accuracy is 0.8395, which is very promising given the simplicity of the model. Our objective was to learn naive bayes and implement it from scratch without library. we succeded in doing that.


However for improving performance there are more sophisticated methods like RNN,LSTM Transformers and so on