# Text Classification on 20newsgroup data

In [1]:
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# taking 20_newsgroup as my training data 

In [2]:
# importing dataset from my directory
from sklearn import datasets
train_data = datasets.load_files(r"C:\Users\shivamGarg\Documents\20_newsgroups")

In [3]:
# input_train_data will consist  all the document which i will used for classification
# output_train_data is the list of all classes corresponding to each document
input_train_data = train_data.data
output_train_data = train_data.target
print("Length of input_train_data :",len(input_train_data))


Length of input_train_data : 19997


# Data Cleaning

In [4]:
# importing libraries and modules
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords , wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import string 
from string import punctuation

# importing English stopwords 
stop=stopwords.words('English')
stop.append('path')
print(stop)
print()

# importing punctuations
punct=list(punctuation)
print(punct)

# lemmatizer is a process of converting a word to its base form according to the context
lemmatizer=WordNetLemmatizer()

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
#function to get part of speech understandable by lemmatizer
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else :
        return wordnet.NOUN
    
# function to clean data    
def data_cleaning(input_data):

    output=[]
    # traversing each document one by one 
    for i in range(len(input_data)):
        
        # currently my input document is of byte class so first converting it to string class
        input_data_in_string=str(input_data[i])
      
        # converting my string document into list of characters 
        input_data_letters_list=list(input_data_in_string)
        
        # this loop is to remove \n from the document 
        # if current character is n and its previous character is '\' in list means in string form it is '\n'
        for i in range(len(input_data_letters_list)):
            if i>0 and  input_data_letters_list[i]=='n' and input_data_letters_list[i-1]=="\\":
                input_data_letters_list[i]=' '
        # this loop is to remove punctuations and digits 
        for i in range(len(input_data_letters_list)):
            if input_data_letters_list[i] in punct:
                input_data_letters_list[i]=' '
            elif input_data_letters_list[i]>='0' and input_data_letters_list[i]<='9':
                input_data_letters_list[i]=' '
        # again forming the original string from the list of characters
        clean_string=''.join(input_data_letters_list)

        # tokenizing my string ,i.e.. converting it in list of words 
        current_word_list = word_tokenize(clean_string.lower())
        new_word_list=[]
        # looping over each word of document
        for w in current_word_list:
            # only add this word to document if its length is greater than 2 and it is not a stopword
            if len(w)>2 and w not in stop:
                pos_=pos_tag([w])
                clean_word = lemmatizer.lemmatize(w,get_simple_pos(pos_[0][1]))
                new_word_list.append(clean_word)
        
        output.append(new_word_list)
    return output     
            
    

In [6]:
# cleaning the training_data
cleaned_input_train_data = data_cleaning(input_train_data)
print("Length of cleaned_input_train_data :",len(cleaned_input_train_data))

Length of cleaned_input_train_data : 19997


In [7]:
# creating dictionary of words from my training_data_documents
dictionary={}
for document in cleaned_input_train_data:
    for word in document:
        if word in dictionary.keys():
            dictionary[word]=dictionary[word]+1
        elif len(word)>2 :
            dictionary[word]=1
print("Dictionary contains total ",len(dictionary.keys()),"keys" )
# rather than using all words as features ,using approximately most frequent 5k words
# creating new dictonary of words  , which is used as features
vocabulary={}
cut_off=80
for key in dictionary:
    if dictionary[key]>cut_off:
         vocabulary[key]=dictionary[key]
feature_list = list(vocabulary.keys())
print("Vocabulary contains total ",len(vocabulary.keys()),"keys" )

Dictionary contains total  111656 keys
Vocabulary contains total  5468 keys


In [8]:
# creating a numpy array of dimension( num_datapoints x num_features)
# num_datapoints equals to number of documents
# num_feature equal to number of keys in my vocabulary
def  create_dataset_from_string_data(input_data):
            num_datapoints=len(input_data)
            num_features=len(vocabulary.keys())
            new_input_data=np.zeros((num_datapoints,num_features))
            # creating my train_data from the documnets in the list X
            current_row=0
            for current_document in input_data:
                    for word in current_document:
                        if word in vocabulary.keys():
                            pos_y = feature_list.index(word)
                            new_input_data[current_row][pos_y] = new_input_data[current_row][pos_y]+1
                    current_row=current_row+1
            return new_input_data
        

In [9]:
# final data used as input 
input_training_data = create_dataset_from_string_data(cleaned_input_train_data)
print("Dimension of training data ",input_training_data.shape)
input_training_data

Dimension of training data  (19997, 5468)


array([[1., 2., 4., ..., 0., 0., 0.],
       [2., 3., 4., ..., 0., 0., 0.],
       [1., 1., 3., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 3., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

# implemening Naive bayes from scratch 

In [10]:
# fit function creates a dictionary input_dictionary  whose keys is classes(20 classes)
# fit function also conatin a key named "total_rows" which is equal to the  len of Y_train
# input_dictionary[current_class] is also a dictionary whose keys are features of training_data
# input_dictionary[current_class] conatins two more key one is "total_current_class_rows" whose value equal to number of rows 
# corrosponds  to that class another is "total_current_class_words" which gives  all words( sum along rows then along column) 
# for that class
def fit(input_training_data , output_training_data ):
        input_dictionary = {}
        class_values = set(output_training_data)
        input_dictionary["total_rows"] = len(output_training_data)
        
        # making dictionary for each class
        for current_class in class_values:
            input_dictionary[current_class] = {}
            current_class_rows = (output_training_data == current_class)
            input_training_current = input_training_data[current_class_rows]
            output_training_current = output_training_data[current_class_rows]
            input_dictionary[current_class]["total_current_class_rows"] = len(output_training_current)
            columns_sum = np.sum(input_training_current,axis=0)
            features_list = input_training_current.shape[1]
            input_dictionary[current_class]["total_current_class_words"]=0
            for j in range(features_list):
                input_dictionary[current_class][j] = columns_sum[j]
                input_dictionary[current_class]["total_current_class_words"] += columns_sum[j]
        
        # final dictionary
        return input_dictionary         

In [11]:
input_dictionary = fit(input_training_data , output_train_data)

In [12]:
# calculates the probability for a particular class
def probability(input_dictionary , current_row , current_class):
    
    # intializing the result with prob(y==curren_class)
    output = np.log(input_dictionary[current_class]["total_current_class_rows"]) - np.log(input_dictionary["total_rows"])
    total_features = len(current_row)
  
    for current_feature in range(total_features):
        count_current_class_words_with_current_feature = input_dictionary[current_class][current_feature]+1
        count_current_class_total_words = input_dictionary[current_class]["total_current_class_words"] +total_features
        # prob(current_feature/ y==current_class)
        current_probablity =  current_row[current_feature]*(np.log(count_current_class_words_with_current_feature) - np.log(count_current_class_total_words))
        output = output + current_probablity
   
    # probablity of current_class
    return output

In [13]:
# predict the class for a particular row
def predictSinglePoint(input_dictionary , current_row ):
    
    # list of possible classes 
    classes = input_dictionary.keys()
    best_prob=None
    best_class=None
    check=True
    for current_class in classes:
        if current_class=="total_rows":
             continue

        # finding probablity of each possible class 
        current_class_prob = probability(input_dictionary , current_row , current_class)
        
        # comparing with current probablity and updating if current class has more probablity then the previos one
        if(check or current_class_prob>best_prob):
            best_prob=current_class_prob
            best_class=current_class
        check=False
    # returning the best_class which highest probablity corrosponds the input
    return best_class    
        

In [14]:
# function to predict output 
def predict(input_dictionary ,input_test):
    output_test=[]
    for current_input in input_test:
        current_ans = predictSinglePoint(input_dictionary ,current_input)
        output_test.append(current_ans)
    return output_test    

In [15]:
# function to compare score 
def score(predicted_output,actual_output):
        # count to maintain number of correct predicted values
        count = 0
        for i in range(len(predicted_output)):
            if predicted_output[i] == actual_output[i]:
                count+=1
        # returning the score 
        return count/len(predicted_output)

# taking mini_newsgroup as my test data 

In [16]:
#loading testing data
test_data=datasets.load_files(r"C:\Users\shivamGarg\Documents\mini_newsgroups")


In [17]:
# input_test_data consist of all the documnets for testing
# output_test_data is the list of classes corrosponds to input_test_data documents
input_test_data = test_data.data
output_test_data = test_data.target
print("Length of input_test_data :",len(input_test_data))

Length of input_test_data : 2000


In [18]:
# cleaning the testing_data
cleaned_input_test_data = data_cleaning(input_test_data)
print("Length of cleaned_input_test_data :",len(cleaned_input_test_data))

Length of cleaned_input_test_data : 2000


In [19]:
# final data used for testing 
input_testing_data = create_dataset_from_string_data(cleaned_input_test_data)
print("Dimension of testing data ",input_testing_data.shape)
input_testing_data

Dimension of testing data  (2000, 5468)


array([[1., 2., 3., ..., 0., 0., 0.],
       [1., 1., 3., ..., 0., 0., 0.],
       [1., 2., 3., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 2., 4., ..., 0., 0., 0.]])

In [20]:
# predicting the output corrosponds to testing data
predicted_output = predict(input_dictionary,input_testing_data)

In [34]:
# actual classes name of output
class_names = train_data.target_names
print(class_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [36]:
predicted_output_class = []
output_test_data_class = []

for i in range(len(predicted_output)):
            predicted_output_class.append(class_names[predicted_output[i]])
            output_test_data_class.append(class_names[output_test_data[i]])

print("Score on test data from scratch : ",score(predicted_output_class,output_test_data_class)*100)
print()
print(classification_report(output_test_data_class , predicted_output_class ))
print(confusion_matrix( output_test_data_class ,predicted_output_class))

Score on test data from scratch :  91.9

                          precision    recall  f1-score   support

             alt.atheism       0.83      0.87      0.85       100
           comp.graphics       0.88      0.86      0.87       100
 comp.os.ms-windows.misc       0.91      0.90      0.90       100
comp.sys.ibm.pc.hardware       0.80      0.88      0.84       100
   comp.sys.mac.hardware       0.88      0.95      0.91       100
          comp.windows.x       0.97      0.85      0.90       100
            misc.forsale       0.91      0.97      0.94       100
               rec.autos       0.99      0.98      0.98       100
         rec.motorcycles       0.95      0.99      0.97       100
      rec.sport.baseball       0.99      0.98      0.98       100
        rec.sport.hockey       0.99      0.97      0.98       100
               sci.crypt       0.99      0.98      0.98       100
         sci.electronics       0.94      0.93      0.93       100
                 sci.med       0.9

In [37]:
# now testing my data with inbuilt Multinomial NB
# using training data to train inbuilt naive bayes algorithm
clf = MultinomialNB()
clf.fit(input_training_data , output_train_data)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [38]:
predicted_output_NB = clf.predict( input_testing_data )

In [39]:

predicted_output_class_NB = []

for i in range(len(predicted_output)):
            predicted_output_class_NB.append(class_names[predicted_output_NB[i]])

print("Score on test data from Inbuilt NB : ",score(predicted_output_class_NB,output_test_data_class)*100)
print()
print(classification_report(output_test_data_class , predicted_output_class_NB ))
print(confusion_matrix( output_test_data_class ,predicted_output_class_NB))
       

Score on test data from Inbuilt NB :  91.9

                          precision    recall  f1-score   support

             alt.atheism       0.83      0.87      0.85       100
           comp.graphics       0.88      0.86      0.87       100
 comp.os.ms-windows.misc       0.91      0.90      0.90       100
comp.sys.ibm.pc.hardware       0.80      0.88      0.84       100
   comp.sys.mac.hardware       0.88      0.95      0.91       100
          comp.windows.x       0.97      0.85      0.90       100
            misc.forsale       0.91      0.97      0.94       100
               rec.autos       0.99      0.98      0.98       100
         rec.motorcycles       0.95      0.99      0.97       100
      rec.sport.baseball       0.99      0.98      0.98       100
        rec.sport.hockey       0.99      0.97      0.98       100
               sci.crypt       0.99      0.98      0.98       100
         sci.electronics       0.94      0.93      0.93       100
                 sci.med       