In [31]:
# Data preparation
import numpy as np
import pandas as pd
import os

folder_path = r'E:\20_newsgroups'
folders = os.listdir(folder_path) #gives list of all the folders in this path

data = {}  #stores folder names as keys and documents as values i.e{folder1:[doc1,doc2..] , folder2:[doc1,doc2..]}
for folder in folders:
    data[folder] = []
    for doc in os.listdir(os.path.join(folder_path,folder)):
        opened_doc = open(os.path.join(folder_path,folder,doc),'r') #not exactly storing name of document but contents of doc
        data[folder].append(opened_doc.read())

from nltk.corpus import stopwords #importing stopwords from nltk
from string import punctuation #importing punctuation
stopword = stopwords.words('english')
stopword += punctuation # list of both nltk stopwords and punctuation

import re
def ispure(string): #this function checks if word contains any number or special character which is of no use
    check = re.compile('[@_.!#$%^&*()<>-?/\|}{~:1234567890]')
    if check.search(string) == None:
        return True
    else:
        return False

freq_map = {} # storing frequency of each word as values and word as keys
for i in range(len(data)):
    for doc in data[folders[i]]:
        for word in doc.split():
            if word.lower() not in stopword and len(word)>=5 and ispure(word):
                freq_map[word.lower()] = freq_map.get(word.lower(),0)+1
                
import operator
sorted_freq_map = sorted(freq_map.items(),key=operator.itemgetter(1),reverse=True) # reverse sorting words on the basis of frequency 
features = [key[0] for key in sorted_freq_map] #picking only words from tuple of words and freq returned from above sorted function
feature_list = features[0:2500] # picking top 2500 words

Y = []
for i in range(len(data)): #creating class labels
    for doc in data[folders[i]]:
        Y.append(folders[i])
Y = np.array(Y)

matrix = np.zeros(shape=(len(Y),len(feature_list))) #making a matrix of required size to be converted to X
df = pd.DataFrame(data=matrix,columns=feature_list) #converting np array to dataframe

i=0 #iterator to access each row of dataframe
for folder in data: # updating frequency of each word in each document represented by rows in dataframe
    for doc in data[folder]:
        for word in doc.split():
            if word.lower() in feature_list:
                df[word.lower()][i] += 1
        i+=1

In [32]:
# Naive bayes classifier implementation
class NBclassifier:
    def __init__(self):
        self.__result = {} #stores final dictionary with all the counts
        
    def fit(self,x,y):
        feature_names = x.columns #getting the feature names
        x = x.values #converting dataframe to np array
        self.__result['total_data'] = len(y) #storing total datapoints
        class_values = set(y) #getting unique classes
        for current_class in class_values: #for each class
            self.__result[current_class] = {} #another dictionary against each class
            curr_class = (y==current_class)
            x_curr_class = x[curr_class] 
            y_curr_class = y[curr_class]
            total_words = 0
            for i in range(len(feature_names)): #for each feature
                curr_sum = x_curr_class[:,i].sum() #sum of frequency of word for each class
                self.__result[current_class][feature_names[i]] = curr_sum
                total_words += curr_sum
            self.__result[current_class]['total_count'] = total_words
        return self.__result
    
    def predict(self,x):
        y_pred = []
        feature_names = x.columns
        x = x.values #converting dataframe to np array
        for row in x: #for each row
            x_class = self.__predict_single_class(row,feature_names)
            y_pred.append(x_class)
        return y_pred
    
    def __predict_single_class(self,row,feature_names):
        first_run = True 
        best_p = -np.inf
        best_class = -np.inf
        classes = self.__result.keys() 
        for curr_class in classes:
            if curr_class == 'total_data': #  as dictionary has one extra key storing total data points
                continue
            p_curr_class = self.__probability(row,curr_class,feature_names)
            if first_run or p_curr_class > best_p: #getting most probable class, document(row) belongs to
                best_p = p_curr_class
                best_class = curr_class
            first_run = False
        return best_class
    
    def __probability(self,row,curr_class,feature_names):
        output = np.log(self.__result[curr_class]['total_count']) - np.log(self.__result['total_data'])
        for i in range(len(feature_names)): #for each feature
            curr_count = self.__result[curr_class][feature_names[i]] + 1 #laplace correction
            total_count = self.__result[curr_class]['total_count'] + len(feature_names) #laplace correction
            curr_probability = np.log(curr_count) - np.log(total_count)
            for j in range(int(row[i])): # ignoring the word where frequency is 0 for any particular document
                output += curr_probability
        return output
    
    def score(self,x,y): #calculates mean accuracy
        y_pred = self.predict(x)
        count = 0
        for i in range(len(y)):
            if y_pred[i] == y[i]:
                count += 1
        return count/len(y)

In [33]:
# Testing and comparing with sklearn
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df,Y)

from sklearn.naive_bayes import MultinomialNB
clf=MultinomialNB()
clf.fit(x_train,y_train)
y_pred_sk = clf.predict(x_test)
sklearn_score = clf.score(x_test,y_test)
print('Sklearn score = ',sklearn_score)
print('-----------------------------------------------')

clf2 = NBclassifier()
clf2.fit(x_train,y_train)
y_pred_our = clf2.predict(x_test)
our_score = clf2.score(x_test,y_test)
print()
print('Our score = ',our_score)
print('------------------------------------------------')
from sklearn.metrics import classification_report
print('Sklearn classification report')
print()
print(classification_report(y_pred_sk,y_test))
print('------------------------------------------------')
print('Our classification report')
print()
print(classification_report(y_pred_our,y_test))

Sklearn score =  0.7038
-----------------------------------------------

Our score =  0.703
------------------------------------------------
Sklearn classification report

                          precision    recall  f1-score   support

             alt.atheism       0.72      0.59      0.65       314
           comp.graphics       0.59      0.61      0.60       241
 comp.os.ms-windows.misc       0.74      0.58      0.65       293
comp.sys.ibm.pc.hardware       0.65      0.53      0.58       279
   comp.sys.mac.hardware       0.61      0.65      0.63       227
          comp.windows.x       0.65      0.78      0.71       203
            misc.forsale       0.75      0.69      0.72       281
               rec.autos       0.72      0.68      0.70       260
         rec.motorcycles       0.75      0.67      0.71       263
      rec.sport.baseball       0.76      0.83      0.80       243
        rec.sport.hockey       0.87      0.88      0.87       229
               sci.crypt       0.86