In [38]:
# importing modules that are required in this project
import os
import string
from collections import Counter
from math import log

import nltk
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [39]:
# Root folder of the corpus: one sub-folder per category, one file per document.
# NOTE: this is a machine-specific location; edit DATA_DIR to run elsewhere.
DATA_DIR = "C:\\Users\\SONY\\Downloads\\20_newsgroups"

x = []  # raw text of every document
y = []  # category (sub-folder name) of the document at the same index in x

# sorted() makes the document order independent of the file system's
# listing order, so later cells see the same x/y on every run.
for category in sorted(os.listdir(DATA_DIR)):
    category_dir = os.path.join(DATA_DIR, category)

    # A stray file next to the category folders would make os.listdir fail.
    if not os.path.isdir(category_dir):
        continue

    for document in sorted(os.listdir(category_dir)):
        # No encoding is passed, so the platform default is used, as before.
        with open(os.path.join(category_dir, document), "r") as f:
            x.append(f.read())
            y.append(category)

In [40]:
# this function extracts the features (the vocabulary) from the dataset

def feature_extraction(data, num_features=2000):
    """Select the most frequent non-stop-word tokens of a corpus as features.

    Every document is tokenized with ``nltk.word_tokenize`` and lower-cased.
    English stop words and single punctuation characters are discarded and
    the remaining tokens are counted over the whole corpus.

    Parameters
    ----------
    data : iterable of str
        Raw document texts.
    num_features : int, optional
        Maximum number of tokens to return (default 2000).

    Returns
    -------
    list of str
        Tokens ordered by decreasing corpus frequency; ties are ordered by
        decreasing token string. The list is shorter than ``num_features``
        when the corpus contains fewer distinct tokens.

    Raises
    ------
    ValueError
        If ``num_features`` is negative.
    """
    if num_features < 0:
        raise ValueError("num_features must be non-negative")

    # A set gives O(1) membership tests; this check runs once per token.
    stop_words = set(stopwords.words('english')) | set(string.punctuation)

    counts = Counter()
    for row in data:
        for word in nltk.word_tokenize(row):
            word = word.lower()
            if word not in stop_words:
                counts[word] += 1

    # Sorting (count, token) pairs in reverse keeps the original tie-breaking.
    ranked = sorted(((count, word) for word, count in counts.items()), reverse=True)

    # Slicing never raises, unlike indexing a fixed 2000 entries of a short list.
    return [word for _, word in ranked[:num_features]]

In [41]:
# this is the self-implemented multinomial naive bayes fit function
# it returns a dictionary holding the row and word counts of every class

def fit(x, y, features):
    """Collect the counts needed by the self-implemented multinomial naive Bayes.

    Parameters
    ----------
    x : sequence of str
        Training documents.
    y : sequence
        Class label of each document, aligned with ``x``.
    features : iterable of str
        Tokens to count; every other token is ignored.

    Returns
    -------
    dict
        ``{"total_rows": len(y), <class>: {...}, ...}`` where each class entry
        maps ``"total_rows"`` to the number of documents of that class,
        ``"total_words"`` to the number of feature occurrences seen in them,
        and every feature to its occurrence count in that class.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` have different lengths.
    """
    if len(x) != len(y):
        raise ValueError("x and y must have the same length")

    # One pass over the labels instead of one y.count() scan per class.
    # Counter also keeps first-appearance order, so the class order in the
    # returned dictionary is deterministic (a set's order is not).
    class_sizes = Counter(y)

    # Set lookup replaces a linear scan of the feature list for every token.
    feature_set = set(features)

    dictionary = {"total_rows": len(y)}
    for target_class, size in class_sizes.items():
        class_stats = {"total_rows": size, "total_words": 0}
        for feature in features:
            class_stats[feature] = 0
        dictionary[target_class] = class_stats

    for text, current_class in zip(x, y):
        class_stats = dictionary[current_class]
        for word in nltk.word_tokenize(text):
            word = word.lower()
            if word in feature_set:
                class_stats[word] += 1
                class_stats["total_words"] += 1

    return dictionary

In [42]:
# this function finds the log-probability that the words belong to the target class
# it also applies Laplace correction (add-one smoothing)

def probability(words, target_class, dictionary):
    """Return the naive Bayes log-score of ``words`` for ``target_class``.

    The score is the log of the class prior plus, for every word that is a
    feature of the model, the log of its Laplace-smoothed likelihood
    ``(count + 1) / (total_words + number_of_features)``.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document; they are lower-cased before lookup.
    target_class : hashable
        A class key of ``dictionary``.
    dictionary : dict
        Count structure with a top-level ``"total_rows"`` entry and, per
        class, ``"total_rows"``, ``"total_words"`` and one count per feature.

    Returns
    -------
    float
        Log-score; larger means the class is more likely.
    """
    class_stats = dictionary[target_class]

    # These two keys are bookkeeping counters stored next to the features.
    # A document token with the same spelling must not be scored as a feature.
    reserved = ("total_rows", "total_words")

    prob = log(class_stats["total_rows"] / dictionary["total_rows"])

    # Loop-invariant: feature occurrences in the class plus the number of
    # features (all keys minus the two bookkeeping entries).
    den = class_stats["total_words"] + len(class_stats) - 2

    for word in words:
        word = word.lower()

        if word in reserved or word not in class_stats:
            continue

        prob = prob + log((class_stats[word] + 1) / den)

    return prob

In [43]:
# this function returns the predictions for the testing data

def predict(x, dictionary):
    """Predict a class for every document in ``x``.

    Each document is tokenized with ``nltk.word_tokenize`` and scored against
    every class key of ``dictionary`` (all keys except ``"total_rows"``) with
    ``probability``. The class with the highest score is chosen; on a tie the
    class that appears first in ``dictionary`` wins. If ``dictionary`` holds
    no class at all, the prediction is the empty string.

    Returns a list with one predicted class per document, in input order.
    """
    predictions = []

    for document in x:
        tokens = nltk.word_tokenize(document)

        winner = ""
        winner_score = None  # None until the first class has been scored

        for label in dictionary.keys():
            if label == "total_rows":
                continue

            score = probability(tokens, label, dictionary)

            # The first class is always accepted; later ones must be strictly better.
            if winner_score is None or score > winner_score:
                winner, winner_score = label, score

        predictions.append(winner)

    return predictions

In [44]:
# this function creates the 2D dataset for the sklearn implementation of multinomial naive bayes

def create_dataset(x, features):
    """Turn documents into a dense matrix of feature counts.

    Parameters
    ----------
    x : iterable of str
        Documents to convert.
    features : sequence of str
        Feature tokens; the position of a token is its column index.

    Returns
    -------
    list of list of int
        One row per document and one column per feature. Each cell holds how
        often the lower-cased feature token occurs in the document according
        to ``nltk.word_tokenize``.
    """
    # Token -> column lookup built once, replacing two linear scans per token
    # (`in features` and `features.index`). setdefault keeps the first
    # position of a repeated feature, exactly like list.index does.
    column_of = {}
    for position, feature in enumerate(features):
        column_of.setdefault(feature, position)

    width = len(features)
    dataset = []

    for text in x:
        row = [0] * width

        for word in nltk.word_tokenize(text):
            position = column_of.get(word.lower())
            if position is not None:
                row[position] += 1

        dataset.append(row)

    return dataset

In [45]:
# splitting the dataset into training and test parts
# random_state is fixed so that re-running the notebook reproduces the same
# split, and therefore the same classification reports below

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [46]:
# extracting features
# only the training documents (x_train) are passed in, so the test documents
# play no part in choosing the vocabulary

features=feature_extraction(x_train)

In [49]:
# Baseline: scikit-learn's MultinomialNB, trained and evaluated on the
# count matrices built from the selected features.

X_train = create_dataset(x_train, features)
X_test = create_dataset(x_test, features)

# fit() returns the estimator itself, so creation and training can be chained.
clf = MultinomialNB().fit(X_train, y_train)

Predictions = clf.predict(X_test)

print(classification_report(y_true=y_test, y_pred=Predictions))

                          precision    recall  f1-score   support

             alt.atheism       0.71      0.75      0.73       256
           comp.graphics       0.69      0.83      0.75       230
 comp.os.ms-windows.misc       0.62      0.04      0.08       233
comp.sys.ibm.pc.hardware       0.60      0.81      0.69       276
   comp.sys.mac.hardware       0.76      0.90      0.83       259
          comp.windows.x       0.72      0.72      0.72       246
            misc.forsale       0.78      0.89      0.83       263
               rec.autos       0.80      0.88      0.84       256
         rec.motorcycles       0.82      0.94      0.88       239
      rec.sport.baseball       0.86      0.92      0.89       222
        rec.sport.hockey       0.96      0.82      0.89       239
               sci.crypt       0.94      0.91      0.92       277
         sci.electronics       0.78      0.88      0.83       236
                 sci.med       0.95      0.84      0.89       250
         

In [50]:
# this is the self-implemented multinomial naive bayes:
# fit on the training data, predict the test data, and print the
# classification report so it can be compared with the sklearn one

dictionary=fit(x_train,y_train,features)

predictions=predict(x_test,dictionary)

print(classification_report(y_test,predictions))

                          precision    recall  f1-score   support

             alt.atheism       0.71      0.75      0.73       256
           comp.graphics       0.69      0.83      0.75       230
 comp.os.ms-windows.misc       0.62      0.04      0.08       233
comp.sys.ibm.pc.hardware       0.60      0.81      0.69       276
   comp.sys.mac.hardware       0.76      0.90      0.83       259
          comp.windows.x       0.72      0.72      0.72       246
            misc.forsale       0.78      0.89      0.83       263
               rec.autos       0.80      0.88      0.84       256
         rec.motorcycles       0.82      0.94      0.88       239
      rec.sport.baseball       0.86      0.92      0.89       222
        rec.sport.hockey       0.96      0.82      0.89       239
               sci.crypt       0.94      0.91      0.92       277
         sci.electronics       0.78      0.88      0.83       236
                 sci.med       0.95      0.84      0.89       250
         

In [None]:
# this function validates (cleans) a word
# this project does not use it because it increases the running time,
# although it increases the accuracy to 83 %

def validate_word(word):
    """Strip punctuation and digits from ``word``, rejecting noisy tokens.

    Parameters
    ----------
    word : str
        Token to clean.

    Returns
    -------
    str
        ``""`` when ``word`` contains more than two characters from
        ``string.punctuation`` or ``"1234567890"``; otherwise ``word`` with
        those characters removed (which may also be empty).
    """
    unwanted = set(string.punctuation + "1234567890")

    kept = [char for char in word if char not in unwanted]

    # The number of removed characters is the length difference.
    if len(word) - len(kept) > 2:
        return ""

    # The original returned the undefined name `new_wordz` here (NameError).
    return "".join(kept)