<a href="https://colab.research.google.com/github/shrestha-bikash/Naive-Bayes-Text-classification/blob/main/Naive_Bayes_Tesxt_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import nltk
# Download the NLTK 'stopwords' package so the corpus reader below can load it.
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
# English stopword list; read as a module-level global by NaiveBayes.cleaning_data.
stpwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
class NaiveBayes:
    """Naive Bayes text classifier for the two labels 'cs' and 'non-cs'.

    The training data is a CSV with a 'Document' (text) column and a 'Class'
    column. ``fit`` estimates Laplace-smoothed per-word conditional
    probabilities for each class, ``predict`` labels the documents of a test
    DataFrame, and ``calculate_accuracy`` scores the last prediction run.
    """

    # Probability assigned to a test word that is absent from the training vocabulary.
    UNSEEN_WORD_PROB = 0.0001

    def __init__(self, csv_file):
        """Load the training set and cache the class frequencies.

        Args:
            csv_file: Anything ``pd.read_csv`` accepts. The file must have a
                'Class' column; ``fit`` additionally reads 'Document'.
        """
        self.df = pd.read_csv(csv_file)
        print(self.df.head(10))
        print(self.df.tail(10))

        # Per-class document counts and the total number of labelled documents;
        # predict() turns these into the class priors.
        self.class_count = self.df['Class'].value_counts()
        self.total_count = self.df['Class'].count()
        print(self.class_count)

    def cleaning_data(self, data):
        """Tokenise one document: strip punctuation, lowercase, drop stopwords.

        Relies on the module-level ``stpwords`` list being defined.

        Args:
            data: The raw document string.

        Returns:
            List of lowercase tokens in their original order.
        """
        without_punctuation = ''.join(
            char for char in data if char not in string.punctuation
        )
        # A set makes each stopword check O(1) instead of a scan of the list.
        stop_set = set(stpwords)
        lowered = (word.lower() for word in without_punctuation.split())
        return [word for word in lowered if word not in stop_set]

    def generate_keyword(self, data):
        """Return the unique words across all documents, in first-seen order.

        Args:
            data: Iterable of token lists.
        """
        # dict keys keep insertion order and give O(1) duplicate detection.
        return list(dict.fromkeys(word for item in data for word in item))

    def count_words(self, data, keywords):
        """Return, for each keyword, how many times it occurs in ``data``.

        Args:
            data: Token list of one document.
            keywords: Vocabulary; the result is aligned with it.
        """
        # Tally the document once instead of rescanning it for every keyword.
        tally = {}
        for token in data:
            tally[token] = tally.get(token, 0) + 1
        return [tally.get(word, 0) for word in keywords]

    def binary_count_words(self, data, keywords):
        """Return, for each keyword, 1 if it occurs in ``data`` and 0 otherwise."""
        present = set(data)
        return [1 if word in present else 0 for word in keywords]

    # calculate and store the conditional probability of each word
    def calculate_cond_prob(self, word_count, total_word_count, total_keywords):
        """Laplace-smoothed estimate of P(word | class).

        Adds 1 to the numerator and the vocabulary size to the denominator so
        that a word never seen in a class still gets a non-zero probability.
        """
        return (word_count + 1)/(total_word_count + total_keywords)

    def _column_totals(self, vectors):
        """Sum count vectors element-wise; all zeros when ``vectors`` is empty."""
        totals = [0] * self.len_keywords
        for vector in vectors:
            for position, value in enumerate(vector):
                totals[position] += value
        return totals

    def fit(self, is_binary):
        """Estimate per-class word probabilities from the training set.

        Sets ``self.keywords``, ``self.len_keywords``, ``self.cs_cond_prob`` and
        ``self.noncs_cond_prob`` (dicts mapping keyword -> smoothed probability).

        Args:
            is_binary: If true, each document contributes at most 1 per word
                (presence/absence); otherwise raw occurrence counts are used.
        """
        cleaned_doc = self.df['Document'].apply(self.cleaning_data)

        self.keywords = self.generate_keyword(cleaned_doc)
        self.len_keywords = len(self.keywords)
        print('keywords:', self.keywords)

        # Build only the vectorisation that was asked for; each row (document)
        # becomes a list aligned with self.keywords.
        if is_binary:
            vectorize_arr = cleaned_doc.apply(self.binary_count_words, args=(self.keywords,))
            print('Binary vectorize word count:\n', vectorize_arr[:10])
        else:
            vectorize_arr = cleaned_doc.apply(self.count_words, args=(self.keywords,))
            print('Integer value vectorize word count:\n', vectorize_arr[:10])

        new_df = pd.DataFrame({'count_vector': vectorize_arr, 'class': self.df['Class']})

        cs_only = new_df.loc[new_df['class'] == 'cs']
        noncs_only = new_df.loc[new_df['class'] == 'non-cs']

        # per-keyword totals for class cs, and the total number of counted words
        sum_count_cs = self._column_totals(cs_only['count_vector'])
        total_word_count_cs = sum(sum_count_cs)
        print('total word counts for class cs:', total_word_count_cs)

        # per-keyword totals for class non-cs, and the total number of counted words
        sum_count_noncs = self._column_totals(noncs_only['count_vector'])
        total_word_count_noncs = sum(sum_count_noncs)
        print('total word count for class non-cs:', total_word_count_noncs)

        # conditional probability of each keyword, stored separately per class
        self.cs_cond_prob = {}
        self.noncs_cond_prob = {}
        for index, item in enumerate(self.keywords):
            self.cs_cond_prob[item] = self.calculate_cond_prob(
                sum_count_cs[index], total_word_count_cs, self.len_keywords)
            self.noncs_cond_prob[item] = self.calculate_cond_prob(
                sum_count_noncs[index], total_word_count_noncs, self.len_keywords)

    def calculate_prior_prob(self, test_data, prob_dict):
        """Return the product of the word probabilities of one document.

        Despite the name, this is the class-conditional likelihood of the
        tokens, not the class prior. Words missing from ``prob_dict`` get
        ``UNSEEN_WORD_PROB``. The raw product can underflow to 0.0 for long
        documents, which is why ``predict`` compares log-scores instead.

        Args:
            test_data: Token list of one document.
            prob_dict: Mapping keyword -> conditional probability for a class.
        """
        prod_total = 1
        for item in test_data:
            prod_total = prod_total * prob_dict.get(item, self.UNSEEN_WORD_PROB)
        return prod_total

    def _log_likelihood(self, tokens, prob_dict):
        """Sum of log word probabilities; the underflow-free form of the product."""
        return sum(np.log(prob_dict.get(token, self.UNSEEN_WORD_PROB)) for token in tokens)

    def predict(self, test_df):
        """Classify every document of ``test_df`` and print predicted vs. actual.

        Stores the inputs in ``self.test_data`` / ``self.test_label`` and the
        predicted labels (a list of 'cs' / 'non-cs') in ``self.prediction``.

        Args:
            test_df: DataFrame with 'Document' and 'Class' columns; any index.
        """
        self.test_data = test_df['Document']
        self.test_label = test_df['Class']

        # log of the class priors P(cs) and P(non-cs)
        log_prior_cs = np.log(self.class_count['cs']/self.total_count)
        log_prior_noncs = np.log(self.class_count['non-cs']/self.total_count)

        # store predictions
        self.prediction = []

        print('\n\n *** Prediction *** \n')
        for test in self.test_data:
            test_cleaned = self.cleaning_data(test)

            # Posterior scores are compared in log space: multiplying many
            # small probabilities underflows to 0.0 for both classes and would
            # force every long document into the 'non-cs' fallback.
            # NOTE(review): repeated words count every time here, even after
            # fit(is_binary=True) -- confirm that is intended.
            cs_score = log_prior_cs + self._log_likelihood(test_cleaned, self.cs_cond_prob)
            noncs_score = log_prior_noncs + self._log_likelihood(test_cleaned, self.noncs_cond_prob)

            if cs_score > noncs_score:
                self.prediction.append('cs')
            else:
                self.prediction.append('non-cs')

        # Pair by position so a test frame with a non-default index still works.
        for predicted, actual in zip(self.prediction, self.test_label):
            print('Predicted class:', predicted, ', Actual Class:', actual)

    def calculate_accuracy(self):
        """Print and return the accuracy (in percent) of the last ``predict`` run.

        ``self.accuracy`` is set to the number of correctly predicted documents.
        """
        # Compare as plain arrays so the match is positional.
        matches = np.asarray(self.prediction) == np.asarray(self.test_label)
        self.accuracy = int(np.count_nonzero(matches))
        percent = (self.accuracy/len(self.test_label))*100
        print("Accuracy: %3.2f%%" % percent)
        return percent

## Preview of Training Set

In [None]:
# Load the training CSV; the constructor prints the first and last 10 rows and the per-class counts.
NB = NaiveBayes('Dataset.csv')

   Id                                           Document Class
0   1  Search a sorted array by repeatedly dividing t...    cs
1   2  data structure is a particular way of organizi...    cs
2   3  linked list is represented by a pointer to the...    cs
3   4  The left and right subtree each must also be a...    cs
4   5  Machine Learning is the field of study that gi...    cs
5   6  Linear Regression is a machine learning algori...    cs
6   7  Regression models a target prediction value ba...    cs
7   8  Like arrays, Linked List is a linear data stru...    cs
8   9  Extra memory space for a pointer is required w...    cs
9  10  Gradient Descent is an optimization algorithm ...    cs
    Id                                           Document   Class
25  26  If you are unable to make it to a safe shelter...  non-cs
26  27  business administration covers the breadth of ...  non-cs
27  28  Pranayama arouses the internal energy of a per...  non-cs
29  30  Commerce Bank offers personal and b

## Preview of Test Set

In [None]:
# Hand-written test set: each record pairs a document with its expected label.
labelled_examples = [
    ('Artificial intelligence is emerging rapidly in the field of science', 'cs'),
    ('Management of products using computer is important in business', 'non-cs'),
    ('Dealing with passwords is about as pleasant as cleaning gutters or filing taxes', 'non-cs'),
    ('Understand the structure of singly linked list and doubly linked list', 'cs'),
    ('A handful of companies are working to turn household trash into low-emissions fuels for planes, trains and trucks', 'non-cs'),
    ('Technology stocks surged, helping the Nasdaq Composite rebound a day after sliding into correction territory', 'non-cs'),
    ('Binary Naive Bayes classifiers for detecting spam emails', 'cs'),
]
test_doc = [text for text, _label in labelled_examples]
test_class = [label for _text, label in labelled_examples]

# Same column names as the training CSV so NaiveBayes.predict can read it.
test_df = pd.DataFrame({'Document': test_doc, 'Class': test_class})

test_df.head(7)

Unnamed: 0,Document,Class
0,Artificial intelligence is emerging rapidly in...,cs
1,Management of products using computer is impor...,non-cs
2,Dealing with passwords is about as pleasant as...,non-cs
3,Understand the structure of singly linked list...,cs
4,A handful of companies are working to turn hou...,non-cs
5,"Technology stocks surged, helping the Nasdaq C...",non-cs
6,Binary Naive Bayes classifiers for detecting s...,cs


## Case I with Binary value count vector

In [None]:
# Case I: train on binary (present/absent) word vectors, then classify the test set.
NB.fit(is_binary=True)
NB.predict(test_df)

# calculate_accuracy() prints the percentage of test labels predicted correctly.
print('\nTest set Accuracy')
NB.calculate_accuracy()

Binary vectorize word count:
 0    [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
1    [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...
2    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
6    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
7    [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
8    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
9    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
Name: Document, dtype: object
total word counts for class cs: 164
total word count for class non-cs: 141


 *** Prediction *** 

Predicted class: cs , Actual Class: cs
Predicted class: non-cs , Actual Class: non-cs
Predicted class: non-cs , Actual Class: non-cs
Predicted class: cs , Actual Class: cs
Predicted class: non-cs , Actual Class: non-cs
Predicted class: cs , Actual Class: non-cs
Predicted class: cs , Actual 

## Case II with Integer value count vector

In [None]:
# Case II: re-train the same object on integer word-count vectors, then classify the test set.
NB.fit(is_binary=False)
NB.predict(test_df)

# calculate_accuracy() prints the percentage of test labels predicted correctly.
print('\nTest set Accuracy')
NB.calculate_accuracy()

Integer value vectorize word count:
 0    [2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
1    [0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, ...
2    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
6    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
7    [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
8    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
9    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
Name: Document, dtype: object
total word counts for class cs: 172
total word count for class non-cs: 145


 *** Prediction *** 

Predicted class: cs , Actual Class: cs
Predicted class: non-cs , Actual Class: non-cs
Predicted class: non-cs , Actual Class: non-cs
Predicted class: cs , Actual Class: cs
Predicted class: non-cs , Actual Class: non-cs
Predicted class: cs , Actual Class: non-cs
Predicted class: cs , 

### The accuracy is 85.71% (6 of 7 test documents correct) because the algorithm misclassifies one document: the sixth document in the test set belongs to the "non-cs" class, but the algorithm classifies it as "cs".

In [None]:
! jupyter nbconvert --to html Project3_Naive_Bayes.ipynb

[NbConvertApp] Converting notebook Project3_Naive_Bayes.ipynb to html
[NbConvertApp] Writing 323308 bytes to Project3_Naive_Bayes.html
