# Imports

In [1]:
import string
import numpy as np
import glob
import re
import pickle
import math
import operator
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Reading the files

In [2]:
train_negative_path = "Dataset/train/neg/"  # path to negative training reviews
train_positive_path = "Dataset/train/pos/" # path to positive training reviews
test_negative_path = "Dataset/test/neg/"  # path to negative test reviews
test_positive_path = "Dataset/test/pos/" # path to positive test reviews
trainx = [] # list containing train movie reviews
trainy = [] # list containing train labels (0 = negative, 1 = positive)
testx = [] # list containing test movie reviews
testy = [] # list containing test labels (0 = negative, 1 = positive)

# function for reading negative and positive reviews into list
def readFiles(path, clas, signal):
    """Append every ``*.txt`` review under *path* to the module-level lists.

    path   : directory that holds the review files
    clas   : "neg" (stored as label 0) or "pos" (stored as label 1)
    signal : "train" fills trainx/trainy; any other value fills testx/testy

    Raises ValueError for an unknown *clas*, before anything is appended, so
    the text and label lists can never get out of step.
    """
    if clas not in ("neg", "pos"):
        raise ValueError("clas must be 'neg' or 'pos', got %r" % (clas,))
    label = 1 if clas == "pos" else 0

    if signal == "train":
        texts, labels = trainx, trainy
    else:
        texts, labels = testx, testy

    # glob returns matches in arbitrary order; sorting makes the resulting
    # lists reproducible from one run (and one machine) to the next
    for review_path in sorted(glob.glob(path + "/" + "*.txt")):
        with open(review_path, encoding='utf-8') as f:
            texts.append(f.read())
        labels.append(label)

"""
instead of running these commands again and again I made pickle files for my ease which I load
below. However to check the correctness of this function, you can choose to run these calls by
uncommenting them
"""            
    
# readFiles(train_negative_path, "neg", "train")
# readFiles(train_positive_path, "pos", "train")
# readFiles(test_negative_path, "neg", "test")
# readFiles(test_positive_path, "pos", "test")

'\ninstead of running these commands again and again I made pickle files for my ease which I load\nbelow. However to check the correctness of this function, you can choose to run these calls by\nuncommenting them\n'

In [3]:
# loading pickle files created for train and test "un-processed" data

def _load_pickle(filename):
    """Return the object stored in the pickle file *filename*."""
    # NOTE: unpickling can execute arbitrary code; only load files you created yourself
    with open(filename, 'rb') as handle:
        return pickle.load(handle)


trainx = _load_pickle('trainx.pkl')
trainy = _load_pickle('trainy.pkl')
testx = _load_pickle('testx.pkl')
testy = _load_pickle('testy.pkl')

In [4]:
# reading stopwords (one word per line)
stop_words_path = "Dataset/stop_words.txt"  # path to stop words

# the context manager guarantees the file handle is closed once the words are read
with open(stop_words_path, 'r') as readingStopWords:
    # list containing stop words; only the line break is stripped from each line
    stop_words = [word.strip('\n') for word in readingStopWords]

# Preprocessing

In [5]:
# function for preprocessing
_HTML_TAG = re.compile('<.*?>')  # non-greedy, so each tag is matched on its own
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)  # deletes every punctuation char


def _cleanReview(text):
    """Return *text* lower-cased, without stop words, HTML tags or punctuation."""
    # lower casing
    text = text.lower()

    # removing stop words; only occurrences with a space on both sides are
    # matched, so a stop word at the very start/end of the text or next to
    # punctuation is kept
    for word in stop_words:
        text = text.replace(" " + word + " ", " ")

    # removing html characters
    # NOTE(review): a tag is replaced by nothing, so "good.<br />The" ends up as
    # the single token "goodthe" -- confirm this is intended
    text = _HTML_TAG.sub('', text)

    # removing punctuations in a single pass
    return text.translate(_STRIP_PUNCTUATION)


def preprocess(signal):
    """Clean every review of one data split in place.

    signal : "train" cleans the global trainx; any other value cleans the
             global testx.
    Returns nothing; each element of the chosen list is overwritten with its
    cleaned text.
    """
    docs = trainx if signal == "train" else testx
    for i, text in enumerate(docs):
        docs[i] = _cleanReview(text)

In [6]:
# now preprocessing: the training reviews first, then the test reviews
for split in ("train", "test"):
    preprocess(split)

In [7]:
# converting to numpy arrays and visualizing shapes

trainx, trainy = np.array(trainx), np.array(trainy)
testx, testy = np.array(testx), np.array(testy)

# one shape per line, with an empty line between the train and the test pair
print(trainx.shape, trainy.shape, sep='\n')
print('')
print(testx.shape, testy.shape, sep='\n')

(25000,)
(25000,)

(25000,)
(25000,)


# Bag of words and helper functions

## Creating bow

In [8]:
"""
this function creates bow and a dict containing bow words along with indexes
"""

def createBOW(docs=None):
    """Build the bag of words and a word -> column-index mapping.

    docs : iterable of review strings; defaults to the global trainx so that
           existing ``createBOW()`` calls keep working.

    Returns ``(bow, vocabulary)`` where ``bow`` is the set of distinct
    whitespace-separated tokens and ``vocabulary`` maps each token to its
    position in the sorted token list. Also prints the number of distinct
    tokens.
    """
    if docs is None:
        docs = trainx

    bow = set() # set to store bag of words
    for text in docs:
        bow.update(text.split())

    print("Total distinct words in bow: ", len(bow))

    # indexes follow sorted order so the mapping is the same on every run
    vocabulary = {word: loc for loc, word in enumerate(sorted(bow))}

    return bow, vocabulary

In [9]:
word_set, vocabulary = createBOW()
# from here on bow is the alphabetically sorted word list, so bow[i] is the
# word whose vocabulary index is i
bow = sorted(word_set)

Total distinct words in bow:  142614


## Counting occurrences

In [10]:
"""
This function counts the number of occurrences of "w" in bigdoc[c] where c belongs to one of the two
classes i.e. 1 and 0 in this case and w belongs to vocabulary (bag of words) generated in above cells
e.g. vocabulary (bow) belongs to ["example1", "example2", "example3"]
bigdoc[0] = ["movie bad", "worst was movie"]
bigdoc[1] = ["movie great", "awesome was movie"]
Hence we will count occurance of "example1" in bigdoc[0] and bigdoc[1] and so on
These occurances will be used in training algorithm of naive bayes
"""
def countOccurances(vocabulary, docs=None):
    """Build the sparse review-by-word count matrix.

    vocabulary : dict mapping a token to its column index
    docs       : iterable of review strings; defaults to the global trainx so
                 that existing ``countOccurances(vocabulary)`` calls keep working

    Returns a scipy CSR matrix of shape ``(len(docs), len(vocabulary))`` whose
    entry (i, j) is how often the token with index j occurs in review i.
    Tokens of one or two characters are not counted, and tokens missing from
    *vocabulary* are skipped.
    """
    if docs is None:
        docs = trainx

    row, column, value = [], [], []
    for index, review in enumerate(docs):
        for token, total in Counter(review.split()).items():
            if len(token) <= 2:
                continue
            position = vocabulary.get(token)
            # .get() yields None for an unknown token; comparing None with an
            # int would raise TypeError, so test for it explicitly
            if position is None:
                continue
            row.append(index)
            column.append(position)
            value.append(total)

    return csr_matrix((value, (row, column)), shape=(len(docs), len(vocabulary)))

In [11]:
# sparse (reviews x vocabulary) matrix of token counts
occurances = countOccurances(vocabulary)
print(occurances.shape)
# NOTE(review): toarray() materialises every cell of the matrix, zeros included.
# At the shape printed for this dataset (25000 x 142614) that is roughly
# 3.6 billion entries, so this needs a very large amount of RAM -- consider
# keeping the matrix sparse.
occurances = occurances.toarray()

(25000, 142614)


## Generating bigdoc

In [12]:
"""
This helper function creates bigdoc[c] where c belongs to classes in our dataset i.e. 1 and 0 in this case
Basically it will append "d" for "d" belonging to our reviews in class "c"
The catch is instead of the reviews it will contain feature vector of that review
This bigdoc will be used in training algorithm of naive bayes
"""
def createBigDoc(occurances, labels=None):
    """Group the per-review feature vectors by class.

    occurances : indexable collection whose i-th item is the feature vector
                 of review i
    labels     : class label of every review (0 = negative, anything else is
                 treated as positive); defaults to the global trainy so that
                 existing ``createBigDoc(occurances)`` calls keep working

    Returns a dict with key 0 holding the list of negative-review vectors and
    key 1 holding the list of positive-review vectors.
    """
    if labels is None:
        labels = trainy

    neg = []
    pos = []
    # split on the actual label rather than on the row position, so the result
    # stays correct for any dataset size and any ordering of the reviews
    for i, label in enumerate(labels):
        if label == 0:
            neg.append(occurances[i])
        else:
            pos.append(occurances[i])

    return {0: neg, 1: pos}

In [13]:
# generating bigdoc
# bigdoc[0] holds the feature vectors of the negative reviews, bigdoc[1] those of the positive ones

bigdoc = createBigDoc(occurances)

In [14]:
# sanity check: number of reviews per class, then the container type per class
for c in (0, 1):
    print(len(bigdoc[c]))
for c in (0, 1):
    print(type(bigdoc[c]))

12500
12500
<class 'list'>
<class 'list'>


In [15]:
# number of occurrences of each word w in bigdoc[0], i.e. the negative class
# takes around 2 minutes to execute
# axis=0 adds the per-review vectors together, leaving one total per vocabulary index

negSumList = np.sum(bigdoc[0], axis=0)

In [16]:
# number of occurrences of each word w in bigdoc[1], i.e. the positive class
# takes around 2 minutes to execute
# axis=0 adds the per-review vectors together, leaving one total per vocabulary index

posSumList = np.sum(bigdoc[1], axis=0)

In [17]:
# adding laplace (Add-1) smoothing: every per-word total is raised by one, so a
# word that never occurs in a class still has a non-zero count

negSumListSmoothing = negSumList + 1
posSumListSmoothing = posSumList + 1

In [18]:
# this will basically go into the denominator while calculating loglikelihood
# summing the smoothed counts gives (total word count of the class) + (vocabulary size)

negSumListSum = np.sum(negSumListSmoothing)
posSumListSum = np.sum(posSumListSmoothing)

# Part 1

## Training

In [19]:
"""
This function trains the naive bayes classifier
D : Total number of documents i.e. the reviews
C : Total number of classes. Two in this case i.e. positive (1) and negative (0)
returns log P(c), log P(w|c) and vocabulary of D
"""

def trainNaiveBayes(D, C):
    """Train the multinomial naive Bayes classifier.

    D : the training documents; only their number, len(D), is used here
    C : iterable of class labels (0 = negative, any other label is scored
        with the positive-class counts)

    Reads the module-level trainy, bow, negSumList, posSumList, negSumListSum
    and posSumListSum that the cells above prepare.

    Returns ``(logprior, loglikelihood, V)``:
      logprior[c]           = log10 P(c)
      loglikelihood[(w, c)] = log10 P(w | c) with add-1 smoothing
      V                     = the vocabulary (the global bow)

    Raises ValueError from math.log10 when a class in C has no training
    document, and ZeroDivisionError when D is empty while C is not.
    """
    logprior = {}
    loglikelihood = {}

    V = bow # vocabulary of D; bound up front so it is returned even when C is empty
    Ndoc = len(D) # number of documents in D (25000 in this particular dataset provided to us)

    for c in C:
        # P(c): share of training documents that carry label c
        Nc = sum(1 for label in trainy if label == c)
        logprior[c] = math.log10(Nc / Ndoc)

        # raw per-word counts of class c, plus the smoothed total of the class
        # (sum of all counts + vocabulary size) that forms the denominator
        if c == 0:
            wordCounts, totalSum = negSumList, negSumListSum
        else:
            wordCounts, totalSum = posSumList, posSumListSum

        for i, currentword in enumerate(V): # calculating P(w|c) terms
            # wordCounts holds raw counts, so the add-1 of laplace smoothing
            # for the numerator is applied right here
            loglikelihood[(currentword, c)] = math.log10((wordCounts[i] + 1) / totalSum)

    return logprior, loglikelihood, V

### Now training

In [20]:
D = trainx  # the training reviews
C = [0, 1]  # class labels: 0 = negative, 1 = positive
# logprior is keyed by class, loglikelihood by (word, class); V is the vocabulary
logprior, loglikelihood, V = trainNaiveBayes(D, C)

## Testing

In [21]:
"""
This function is for prediction
V : Vocabulary of D i.e. the train data
C : Total number of classes. Two in this case i.e. positive (1) and negative (0)
testdoc: the review whose class is to be predicted
rest are just parameters returned from train function above
returns best c
"""

def testNaiveBayes(testdoc, logprior, loglikelihood, C, V):
    summ = {}
    for c in C:
        summ[c] = logprior[c]
        for word in testdoc:
            # filtered out the words not in V in below cell so we dont need to check here
            summ[c] = summ[c] + loglikelihood[(word, c)]
    return max(summ.items(), key=operator.itemgetter(1))[0] # basically argmaxc sum[c]

### Now predicting and calculating accuracy

In [22]:
# just processing test data and removing words that are not in our vocabulary

mySet = set(V)  # a set gives constant-time membership checks

# one token list per test review; this does not contain words not in our vocabulary
processedTestx = [
    [word for word in review.split() if word in mySet]
    for review in testx
]

In [23]:
# now predicting and calculating accuracy

acc = 0 # number of correct predictions (turned into a ratio afterwards)
for i, testdoc in enumerate(processedTestx):
    if testNaiveBayes(testdoc, logprior, loglikelihood, C, V) == testy[i]:
        acc += 1

In [24]:
# NOTE: acc is overwritten with the ratio, so running this cell a second time
# without re-running the prediction cell divides it again
acc = acc / len(testy)
print("Accuracy on test set: ", acc * 100, "%")

Accuracy on test set:  93.136 %


# Part 2

In [25]:
# creating bag-of-words representation using count vectorizer
# the vocabulary is learned from the training reviews only; the test reviews
# are then encoded against that same vocabulary

vectorizer = CountVectorizer()
Xtrain = vectorizer.fit_transform(trainx)
Xtest = vectorizer.transform(testx)

In [26]:
# training
# alpha is the additive smoothing parameter according to the docs;
# alpha=1 corresponds to the add-1 (Laplace) smoothing used in Part 1

clf = MultinomialNB(alpha=1)
clf.fit(Xtrain, trainy)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [27]:
# testing: predict a class label for every row of Xtest

pred = clf.predict(Xtest)

In [28]:
# calculating accuracy (fraction of test labels predicted correctly, shown as a percentage)

print('Accuracy on test set: ', accuracy_score(testy, pred) * 100, '%')

Accuracy on test set:  93.016 %


In [29]:
# calculating confusion matrix
# rows are the true labels and columns the predicted labels (scikit-learn convention),
# both in the order 0 (negative), 1 (positive)

confusion_matrix(testy, pred)

array([[11871,   629],
       [ 1117, 11383]], dtype=int64)