# Table of Contents
- [**1. Multinomial Naive Bayes Classifier**](#1.-Multinomial-Naive-Bayes-Classifier)
   - [**1.1 Preprocessing functions**](#1.1-Preprocessing-functions)
   - [**1.2 Fit Function**](#1.2-Fit-Function)
   - [**1.3 Prediction Functions**](#1.3-Prediction-Functions)
- [**2. Preprocessing Data**](#2.-Preprocessing-Data)
- [**3. Training Data**](#3.-Training-Data)
- [**4. Testing Data**](#4.-Testing-Data)
- [**5. Cross Check With Sklearn Classifier**](#5.-Cross-Check-With-Sklearn-Classifier)
- [**6. Accuracy Comparison Between Self Implemented And Sklearn Classifier**](#6.-Accuracy-Comparison-Between-Self-Implemented-And-Sklearn-Classifier)

# 1. Multinomial Naive Bayes Classifier

In [38]:
from os import listdir
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import operator

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1.1 Preprocessing functions

In [39]:
def train_test_split_dir(path):
    """This creates a dictionary which separates the directory files into 
    two dictionaries, namely, train & test directory"""
    
    class_directory = [file for file in listdir(path)] # Load directory folders
    data_directory = {}
    for dir_ in class_directory:
        data_directory[dir_] = [file for file in listdir(path + '/' + dir_)] # Load directory files
    data_directory_split_train = {}
    data_directory_split_test = {}
    for dir_ in data_directory: # Split into train & test dictionary
        train_dr,test_dr = model_selection.train_test_split(data_directory[dir_], shuffle=False)
        data_directory_split_train[dir_] = train_dr
        data_directory_split_test[dir_] = test_dr
    return data_directory_split_train,data_directory_split_test

In [40]:
def vocabulary(train_dir, path):
    """This function generates the vocabulary list from all the files. This function limits to the top
    2000 words"""
    
    vocabulary_dict = {}
    stop_words = stopwords.words('english') # Stopwords to exclude
    block_words = ['newsgroups:', 'xref', 'path', 'from:', 'subject:', 'sender', 'organisation', 'apr','gmt', 
               'last','better','never','every','even','two','good','used','first','need','going','must',
               'really','might','well','without','made','give','look','try','far','less','seem','new','make',
               'many','way','since','using','take','help','thanks','send','free','may','see','much','want','find',
               'would','one','like','get','use','also','could','say','us','go','please','said','set','got','sure',
               'come','lot','seems','able','anything','put', '--', '|>', '>>', '93', 'xref', 'cantaloupe.srv.cs.cmu.edu',
               '20', '16', "max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'", '21', '19', '10', '17', '24', 
               'reply-to:', 'thu', 'nntp-posting-host:', 're:','25''18'"i'd"'>i''22''fri,''23''>the','references:','xref:',
               'sender:','writes:','1993','organization:','message-id:','lines:','i\'m','distribution:','i\'ve','can\'t',
               '>in','>the','that\'s','>i','it','i\'d','...','>|>','---','keywords:','followup-to:','//']
    # Additional block words to be excluded
    for dir_ in train_dir: # Adding words to the dictionary
        for i in range(len(train_dir[dir_])):
            current_path = path + '/' + dir_ + '/' + train_dir[dir_][i]
            data_file = open(current_path,'r').read()
            for word in data_file.split():
                if len(word)!=1:
                    if (word.lower() not in stop_words) and (word.lower() not in block_words):
                        if vocabulary_dict.get(word.lower())!=None:
                            vocabulary_dict[word.lower()]+=1
                        else:
                            vocabulary_dict[word.lower()] = 1
    sorted_vocabulary = sorted(vocabulary_dict.items(), key=operator.itemgetter(1), reverse=True)
    vocabulary = []
    index = 0
    for word,freq in sorted_vocabulary:
        vocabulary.append(word)
        index+=1
        if index==2000:
            break
    return vocabulary

In [41]:
def structured_dataset(vocabulary,data_dir):
    """Return a 2D array(input) and a 1D array(ouput)"""
    
    x = pd.DataFrame(columns = vocabulary)
    y = []
    for dir_ in data_dir:
        for i in range(len(data_dir[dir_])):
            index = len(x)
            x.loc[index] = np.zeros(len(vocabulary))
            y.append(dir_)
            current_path = path + '/' + dir_ + '/' + data_dir[dir_][i]
            data_file = open(current_path,'r').read()
            for word in data_file.split():
                if word.lower() in vocabulary:
                    x.loc[index][word.lower()]+=1
    x=x.values
    return x,np.array(y)

# 1.2 Fit Function

In [42]:
def fit(X_train, Y_train):
    """This function creates dictionary which is trained on the dataset provided"""
    
    result = {}
    class_values = set(Y_train)
    result["total_data"] = len(Y_train)
    for current_class in class_values:
        result[current_class] = {}
        current_class_rows = (Y_train == current_class)
        X_train_current = X_train[current_class_rows]
        Y_train_current = Y_train[current_class_rows]
        num_features = X_train.shape[1]
        Sum = 0
        for j in range(1, num_features + 1):
            result[current_class][j] = (X_train_current[:, j - 1]).sum()
            Sum+=result[current_class][j]
        result[current_class]["total_count"] = Sum
    return result

# 1.3 Prediction Functions

In [43]:
def probability(dictionary, x, current_class):
    """This prints the bayes probability of each row with laplace corrections applied. This ouputs
    a logarithmic probability"""
    
    output = np.log(dictionary[current_class]["total_count"]) - np.log(dictionary["total_data"])
    num_features = len(dictionary[current_class].keys()) - 1;
    for j in range(1, num_features + 1):
        count_j_feature = dictionary[current_class][j] + 1
        count_current_class = dictionary[current_class]["total_count"] + num_features
        current_j_probablity = np.log(count_j_feature) - np.log(count_current_class)
        for freq in range(int(x[j-1])):
            output = output + current_j_probablity
    return output

In [44]:
def predictSinglePoint(dictionary, x):
    """This predicts the class for each row and stores the class with highest probability"""
    
    classes = dictionary.keys()
    best_p = -1000
    best_class = -1
    first_run = True
    for current_class in classes:
        if (current_class == "total_data"):
            continue
        p_current_class = probability(dictionary, x, current_class)
        if (first_run or p_current_class > best_p):
            best_p = p_current_class
            best_class = current_class
        first_run = False
    return best_class

In [45]:
def predict(dictionary, X_test):
    """This function predicts the value for the input dataset and returns an array"""
    
    y_pred = []
    for x in X_test:
        x_class = predictSinglePoint(dictionary, x) # Predict class for each data row
        y_pred.append(x_class)
    return y_pred

# 2. Preprocessing Data

In [46]:
# Transform datafiles from different folders to store in an array which can be passed to classifier
path='./20_newsgroups'
train_directory,test_directory=train_test_split_dir(path)
dataset_vocabulary=vocabulary(train_directory, path)

# 3. Training Data

In [47]:
x_train,y_train=structured_dataset(dataset_vocabulary, train_directory)
clf1=fit(x_train, y_train)

# 4. Testing Data

In [48]:
x_test,y_test=structured_dataset(dataset_vocabulary, test_directory)
y_pred=predict(clf1, x_test)

In [49]:
print(confusion_matrix(y_test, y_pred)) # Prints confusion matrix

[[184   2   1   0   0   0   1   3   1   2   1   0   4   5   3   6   2   3
    3  29]
 [  3 193  11   3   6   8   5   1   2   2   1   2   7   3   2   1   0   0
    0   0]
 [  0   8 206   7   0   7   5   1   3   0   3   0   9   0   1   0   0   0
    0   0]
 [  0   4  11 193   9   0  17   5   1   0   0   0  10   0   0   0   0   0
    0   0]
 [  0   1   7   6 215   0   4   6   2   1   0   0   2   2   4   0   0   0
    0   0]
 [  0  19  33   4   3 183   2   0   0   2   0   0   4   0   0   0   0   0
    0   0]
 [  0   3   0   9   3   3 213   5   1   2   0   1   5   2   2   0   0   0
    0   1]
 [  0   1   0   0   0   0   6 234   3   1   0   0   2   0   1   0   0   0
    2   0]
 [  0   0   0   0   0   0   3   5 239   0   0   0   2   0   0   0   0   0
    1   0]
 [  0   0   0   1   0   0   4   0   1 238   5   0   1   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   2   0   3   5 233   0   0   0   2   1   1   0
    2   1]
 [  0   6   2   0   3   2   1   1   4   1   2 202  13   2   1   0

In [50]:
print(classification_report(y_test, y_pred)) # Prints classification report for testing dataset

                          precision    recall  f1-score   support

             alt.atheism       0.62      0.74      0.67       250
           comp.graphics       0.73      0.77      0.75       250
 comp.os.ms-windows.misc       0.72      0.82      0.77       250
comp.sys.ibm.pc.hardware       0.84      0.77      0.81       250
   comp.sys.mac.hardware       0.87      0.86      0.87       250
          comp.windows.x       0.89      0.73      0.80       250
            misc.forsale       0.76      0.85      0.80       250
               rec.autos       0.82      0.94      0.88       250
         rec.motorcycles       0.87      0.96      0.91       250
      rec.sport.baseball       0.90      0.95      0.92       250
        rec.sport.hockey       0.93      0.93      0.93       250
               sci.crypt       0.94      0.81      0.87       250
         sci.electronics       0.74      0.80      0.77       250
                 sci.med       0.86      0.80      0.83       250
         

# 5. Cross Check With Sklearn Classifier

In [51]:
clf2 = MultinomialNB()
clf2.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [52]:
y_pred2 = clf2.predict(x_test)
print(confusion_matrix(y_test, y_pred2)) # Prints confusion matrix

[[184   2   1   0   0   0   1   3   2   2   1   0   4   5   3   6   1   2
    3  30]
 [  3 191  11   3   6   8   5   1   2   2   2   1   8   4   2   1   0   0
    0   0]
 [  0   8 207   7   0   7   8   1   2   0   1   0   8   0   1   0   0   0
    0   0]
 [  0   4  10 192   9   0  19   4   2   0   0   0  10   0   0   0   0   0
    0   0]
 [  0   1   7   6 213   0   6   6   2   1   0   0   3   1   4   0   0   0
    0   0]
 [  0  19  32   4   3 182   4   0   0   2   0   0   4   0   0   0   0   0
    0   0]
 [  0   2   0   9   3   3 218   5   1   1   0   1   2   2   2   0   0   0
    0   1]
 [  0   1   0   0   0   0   8 231   4   1   0   0   2   0   1   0   0   0
    2   0]
 [  0   0   0   0   0   0   3   5 239   0   0   0   2   0   0   0   0   0
    1   0]
 [  0   0   0   1   0   0   4   0   1 238   5   0   1   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   2   0   3   5 233   0   0   0   2   1   1   0
    2   1]
 [  0   7   2   0   4   2   3   1   5   0   1 199  14   2   1   0

In [53]:
print(classification_report(y_test, y_pred2)) # Prints classification report for testing dataset

                          precision    recall  f1-score   support

             alt.atheism       0.62      0.74      0.67       250
           comp.graphics       0.73      0.76      0.75       250
 comp.os.ms-windows.misc       0.72      0.83      0.77       250
comp.sys.ibm.pc.hardware       0.85      0.77      0.81       250
   comp.sys.mac.hardware       0.87      0.85      0.86       250
          comp.windows.x       0.89      0.73      0.80       250
            misc.forsale       0.71      0.87      0.79       250
               rec.autos       0.82      0.92      0.87       250
         rec.motorcycles       0.84      0.96      0.89       250
      rec.sport.baseball       0.90      0.95      0.93       250
        rec.sport.hockey       0.94      0.93      0.94       250
               sci.crypt       0.95      0.80      0.87       250
         sci.electronics       0.74      0.80      0.77       250
                 sci.med       0.86      0.79      0.82       250
         

# 6. Accuracy Comparison Between Self Implemented And Sklearn Classifier

In [54]:
print("Self Implemented Naive Bayes Accuracy: ", accuracy_score(y_test, y_pred))
print("Sklearn Naive Bayes Accuracy: ", accuracy_score(y_test, y_pred2))

Self Implemented Naive Bayes Accuracy:  0.8104
Sklearn Naive Bayes Accuracy:  0.8068
