In [1]:
import numpy as np
from sklearn import datasets    # Importing dataset module from Sklearn
categories=['alt.atheism','comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware','comp.sys.mac.hardware']
traindata=datasets.load_files(r'C:\Users\KUNAL\Downloads\20news-bydate (1).tar\20news-bydate (1)\20news-bydate-train',categories=categories,encoding='ISO-8859-1')   #Training Dataset load by load_files method
test_data=datasets.load_files(r'C:\Users\KUNAL\Downloads\20news-bydate (1).tar\20news-bydate (1)\20news-bydate-test',categories=categories,encoding='ISO-8859-1') 
#Testing Dataset Load by load_files method
train=traindata.data

In [2]:
#This function cleans the data by removing all stopwords and punctuation from document
def clean_data(a):
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    import string
    stop_words = set(stopwords.words('english'))      #In stop_words variable storing all unique stopwords
    d=""
    final_sentence=[]
    for w in range(len(a)):
        c=word_tokenize(a[w])         #this will make a seprate string of each element in a document at given index
        for j in c:
            if j.lower() not in stop_words and j not in string.punctuation:
                d=d+" "+j
        final_sentence.append(d.lower())        #After removing stopword it will add the reamining text of that document to the list
        d=""
    return final_sentence

In [3]:
# This function creates a dictionary which stores total length of data, each class, no. of rows belong to that particular class, total count of words that belong to particular class and their corresponding word count. It develops for training the model.
def fit_function(x_train,count,y_train):
    result={}
    classes=set(y_train)     #classes variable store unique value of y
    result['total_val']=len(y_train)    # stores total length of testing data into a dictionary key i.e 'total_val'
    for current_class in classes:
        result[current_class]={}
        current_row=(y_train==current_class)     #choose only that rows of y_train data which is equal to current class
        current_x=x_train[current_row]           #choose only that x_train data which belongs to a particular class 
        current_y=y_train[current_row]
        result[current_class]['total_len_count']=len(current_y)     #Store total length of current_class into 'total_len_count' key of dictionary
        result[current_class]['total_sum_count']=current_x.sum()    #Store total count of all words that belong to a particular class into 'total_sum_count' key of dictionary 
        sum_wrds=current_x.sum(axis=0)
        for word,idx in count.vocabulary_.items():
            result[current_class][idx]=sum_wrds[0,idx]         #Store count of each word 
    return result
            
            

In [4]:
#This function calcluate probabaility of current_class from total testing data and calculate probability of each word to total no. of words
def probability(current_class,dictionary,x):
    output=np.log(dictionary[current_class]['total_len_count'])-np.log(dictionary['total_val'])      #Calculating probability of current classs to total testing testing data
    test=x.tocoo()           #convert scipy_matrix to list which store column no. and row no. and row count of particular row to txt variable
    for each in dictionary[current_class].keys():      #Iterate all word from dictionary except 'total_len_count' and 'total_sum_count' which store total length and total sum of counts of words of that particular class
        if each=='total_len_count' or each=='total_sum_count':
            continue
        if each in test.col:       #check wheather that word exist in testing document or not 
            count_current_class_rows=dictionary[current_class][each] +1     #Calculating count of a word with laplace correction
            count_current_class=dictionary[current_class]['total_sum_count']+len(test.col)         #Calculating count of whole word that belong to a particular class with laplace correction                
            current_xj_probability=np.log(count_current_class_rows)-np.log(count_current_class)    #calculate probability
            output=output+current_xj_probability
    return output
            
            

In [5]:
#This function call probability function for each class and check probability of each class which class has higher probability. After getting high probability that class becomes output
def prediction(dictionary,x):
    best_class=-1
    best_prob=-1000
    first=True
    for current_class in dictionary.keys():     #Pick all classes from dictionary 
        if current_class=='total_val':          
            continue
        prb=probability(current_class,dictionary,x)     #Calling Probability function which returns probability of particular class
        if prb>best_prob or first:       #Comparing probability
            best_prob=prb
            best_class=current_class
        first=False
    return best_class
            
    
    
    

In [6]:
# This function takes dictionary as a parameter which creates during fit function and x_test.It call prediction function for each document .   
def predct(dictionary,x_test):
    pred=[]
    x_data=x_test
    for x in x_test:
        y=prediction(dictionary,x)   #Call Prediction function for each document of testing data   
        pred.append(y)
    return pred

In [10]:
from sklearn.feature_extraction.text import CountVectorizer     #Importing CountVectorizer creates a matrix in which each unique word is represented by a column of the matrix, and each text sample from the document is a row in the matrix.
from sklearn.naive_bayes import MultinomialNB                   #Importing MultinomialNB for text classification
from sklearn.metrics import confusion_matrix, classification_report      
vect=CountVectorizer(max_features=4000)
main_data=clean_data(train)               #Calling Clean_data function for removing stopwords and punctuation from document.


x_train=vect.fit_transform(main_data)
dictionary=fit_function(x_train,vect,traindata.target)

clsfr=MultinomialNB()
clsfr.fit(x_train,traindata.target)
x_test=vect.transform(test_data.data)
y=clsfr.predict(x_test)
y_pred=predct(dictionary,x_test)


print(confusion_matrix(y,test_data.target))
print(classification_report(y,test_data.target))
print(confusion_matrix(y_pred,test_data.target))
print(classification_report(y_pred,test_data.target))




[[304   5   4   0   0]
 [ 10 354 192  42  31]
 [  0   0   1   1   1]
 [  2  13 162 310  56]
 [  3  17  35  39 297]]
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       313
           1       0.91      0.56      0.70       629
           2       0.00      0.33      0.01         3
           3       0.79      0.57      0.66       543
           4       0.77      0.76      0.77       391

    accuracy                           0.67      1879
   macro avg       0.69      0.64      0.62      1879
weighted avg       0.85      0.67      0.74      1879

[[307   5   8   1   2]
 [  7 347 178  30  17]
 [  0   0   1   0   1]
 [  1  10 140 300  26]
 [  4  27  67  61 339]]
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       323
           1       0.89      0.60      0.72       579
           2       0.00      0.50      0.01         2
           3       0.77      0.63      0.69       477
        