In [None]:
def read_corpus(corpus_file, encoding='utf-8'):
    """Read a whitespace-tokenised corpus file into (text, label) pairs.

    Each line is split on whitespace. The second token is taken as the label
    and every token from the fourth onwards is re-joined with single spaces
    to form the document text (the first and third tokens are ignored).

    Args:
        corpus_file: Path of the corpus file to read.
        encoding: Text encoding used to open the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list of ``(text, label)`` tuples, where ``text`` is a single string
        (not a list of words) and ``label`` is the raw label token.
    """
    out = []
    with open(corpus_file, encoding=encoding) as f:
        for line in f:
            tokens = line.split()
            # Blank or truncated lines carry no label; skip them rather than
            # crash with an IndexError on tokens[1].
            if len(tokens) < 2:
                continue
            out.append((' '.join(tokens[3:]), tokens[1]))
    return out

In [None]:
# Load every (text, label) pair and preview the first one as a sanity check.
corpus = read_corpus('./Datasets/all_sentiment_shuffled.txt')
first_text, first_label = corpus[0]
print("Example:\n", " Text: ", first_text, "\n  Label: ", first_label)
print("Total number of documents =", len(corpus))

In [None]:
# Importing requisite libraries
import numpy as np
import re
import nltk
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
nltk.download("stopwords")
nltk.download('wordnet')

# Lemmatization vs Stemming

In [None]:
# Stemming is used instead of lemmatization in the rest of this notebook.
# The toy comparison below is the example behind that choice.
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

demo_words = ['loving', 'stepped', 'running', 'ran']

lemmatizer = WordNetLemmatizer()
# NOTE(review): pos='a' asks WordNet to treat every word as an adjective, yet
# the demo words are verb forms; pos='v' would presumably be the fairer
# comparison -- confirm before relying on the conclusion above.
lemmatized_demo = [lemmatizer.lemmatize(w, pos='a') for w in demo_words]
print("Output for Lemmatization : " ,lemmatized_demo)

ps = PorterStemmer()
print("Output for Stemming : ",[ps.stem(word) for word in demo_words])

In [None]:
# PREPROCESSING FUNCTIONS

# Strips punctuation (and any other non-letter character) from a document
def remove_punctuation(doc):
    """Tokenise a raw document string into lowercase alphabetic words.

    Every character outside a-z / A-Z (punctuation, digits, whitespace, ...)
    is replaced by a space, the text is lowercased, and the result is split
    on whitespace.

    Returns:
        A list of lowercase word strings.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', doc)
    return letters_only.lower().split()


def remove_stopwords(doc):
    """Drop English stopwords from a tokenised document.

    Stopwords are words such as 'if', 'he', 'she', 'the' that are listed in
    NLTK's English stopword corpus.

    Args:
        doc: Iterable of word strings.

    Returns:
        A list with the words of ``doc`` that are not stopwords, in their
        original order.
    """
    # Build the lookup set once per call; rebuilding it inside the
    # comprehension re-reads the stopword list for every single word.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in doc if word not in english_stopwords]

# NOTE: despite its name, this step applies Porter stemming, not lemmatization
def lemmatize(doc):
    """Reduce every word of a tokenised document to its Porter stem.

    Returns:
        A list holding the stem of each word in ``doc``, in the same order.
    """
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, doc))
    
def preprocess(doc):
    """Run the full preprocessing pipeline on one raw document string.

    The document is tokenised by ``remove_punctuation``, filtered by
    ``remove_stopwords`` and finally passed through ``lemmatize``.

    Raises:
        AssertionError: if ``doc`` is not a ``str`` (e.g. the whole corpus
            was passed instead of a single document).
    """
    assert isinstance(doc, str)   # a single document is expected, not the corpus
    return lemmatize(remove_stopwords(remove_punctuation(doc)))

In [None]:
# PREPROCESSING STEPS

# Token lists for every document, in corpus order
proc_doc = [preprocess(text) for text, _ in corpus]
# Boolean targets: True for a 'pos' label, False for anything else
y = [label == 'pos' for _, label in corpus]

In [None]:
# Support Functions and Variables

def list_str2list_word(list_str):
    """Flatten a sequence of iterables into one flat list of their items.

    Given tokenised documents (one list of words per document) this yields
    every word of every document, in order, duplicates included.
    """
    return [word for string in list_str for word in string]

# Implementing the Naive-Bayes Classifier

In [None]:
class Naive_Bayes:
    """Two-class (negative / positive) multinomial Naive-Bayes text classifier.

    Attributes:
        P_cj: Array of class priors indexed by class id
            (index 0 = negative, index 1 = positive).
        Pos_prob: Array of add-one smoothed probabilities P(word | positive),
            indexed through ``Vocab_Dict``.
        Neg_prob: Array of add-one smoothed probabilities P(word | negative),
            indexed through ``Vocab_Dict``.
        Vocab_Dict: Dictionary mapping every word seen during training to its
            index in ``Pos_prob`` / ``Neg_prob``.

    Methods:
        train_nb: Learns the vocabulary, the class priors and the per-class
            word probabilities from the training documents.
        classify_nb: Classifies tokenised documents with the learned
            probabilities and returns an array of 0/1 predictions.
    """

    def __init__(self):
        self.P_cj = []
        self.Pos_prob = []
        self.Neg_prob = []
        self.Vocab_Dict = {}

    def train_nb(self, training_documents):
        """Fit the model on a table of tokenised reviews.

        Args:
            training_documents: Mapping / DataFrame with a 'Proc_Review'
                column (one list of words per document) and a 'Nature'
                column (truthy for positive reviews, falsy for negative).

        Raises:
            ValueError: if there are no training documents.
        """
        # Iterate over the column values rather than indexing by label, so
        # the row index of the table does not have to be 0..n-1.
        reviews = list(training_documents['Proc_Review'])
        labels = [int(bool(flag)) for flag in training_documents['Nature']]
        if not reviews:
            raise ValueError("train_nb needs at least one training document")

        # Learn the vocabulary; indices follow first-appearance order, which
        # keeps the mapping deterministic between runs.
        vocab = {}
        for review in reviews:
            for word in review:
                if word not in vocab:
                    vocab[word] = len(vocab)
        self.Vocab_Dict = vocab
        V = len(vocab)

        # Class priors. minlength=2 keeps both entries present even when one
        # class never occurs in the training data.
        class_counts = np.bincount(labels, minlength=2)
        self.P_cj = class_counts / class_counts.sum()

        # Per-class word counts, accumulated in a single pass.
        pos_counts = np.zeros(V)
        neg_counts = np.zeros(V)
        for review, label in zip(reviews, labels):
            counts = pos_counts if label else neg_counts
            for word in review:
                counts[vocab[word]] += 1

        # Add-one (Laplace) smoothing so no vocabulary word has probability 0.
        self.Pos_prob = (pos_counts + 1) / (pos_counts.sum() + V)
        self.Neg_prob = (neg_counts + 1) / (neg_counts.sum() + V)

    def classify_nb(self, test_documents):
        """Predict the class of each tokenised document.

        Args:
            test_documents: Sized iterable of documents, each an iterable of
                words. Words that are not in the vocabulary are ignored.

        Returns:
            Integer array with one entry per document: 1 for positive,
            0 for negative. Ties are resolved as positive.

        Raises:
            ValueError: if the model has not been trained yet.
        """
        if len(self.P_cj) < 2:
            raise ValueError("classify_nb called before train_nb")

        y_pred = np.zeros(len(test_documents), dtype='int')

        # Work in log space: the raw probability products underflow quickly.
        # A prior of exactly 0 (class absent from training) legitimately
        # maps to -inf, so silence the divide warning for that case.
        with np.errstate(divide='ignore'):
            log_prior_pos = np.log(self.P_cj[1])
            log_prior_neg = np.log(self.P_cj[0])
        # Take the logs once instead of once per word occurrence.
        log_pos = np.log(self.Pos_prob)
        log_neg = np.log(self.Neg_prob)

        for n_iter, document in enumerate(test_documents):
            P = log_prior_pos
            N = log_prior_neg
            for word in document:
                idx = self.Vocab_Dict.get(word)
                if idx is not None:
                    P += log_pos[idx]
                    N += log_neg[idx]
            y_pred[n_iter] = 0 if P < N else 1
        return y_pred

# Train-Test Splitting and Predicting...

In [None]:
# pandas is needed for the training table below; import it here so this cell
# also runs on a fresh kernel.
import pandas as pd
from sklearn.model_selection import train_test_split

# Splitting the data into train and test documents (80% / 20%, fixed seed)
train_doc, test_doc, y_train, y_test = train_test_split(proc_doc, y, test_size = 0.20, random_state = 42)
# Collecting the training documents and their labels in one table
training_documents = pd.DataFrame({'Proc_Review': train_doc, 'Nature': y_train})
# Training the Naive Bayes model
model = Naive_Bayes()
model.train_nb(training_documents)
# Making y_test binary (0, 1) from Boolean values
y_test = np.multiply(y_test,1)
# Finding model predictions
y_pred = model.classify_nb(test_doc)

In [None]:
from sklearn.metrics import confusion_matrix

def accuracy(y_test,y_pred,no_catgories=2):
    """Print the confusion matrix and the accuracy of a set of predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        no_catgories: Number of leading diagonal entries of the confusion
            matrix that are counted as correct predictions.
    """
    cm = confusion_matrix(y_test, y_pred)
    print("The confusion matrix for the above classification problem :\n", cm)

    # Correct predictions sit on the diagonal of the confusion matrix
    correct = 0
    for k in range(no_catgories):
        correct = correct + cm[k, k]
    fraction_correct = correct/(len(y_test))
    print("The accuracy of the model is : "+str(fraction_correct*100)+'%\n')

In [None]:
# Report the confusion matrix and accuracy of the trained model on the test split
accuracy(y_test=y_test, y_pred=y_pred)