In [6]:
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split

In [7]:
os.chdir(r'C:\Users\sonaw\OneDrive\Desktop\CS229\Practice\Machine-Learning-Algorithm-Implementation\Naive Bayes\dataset2')
X = np.load('X.npy')
y = np.load('Y.npy')
dic = np.load('dictionary.npy')

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)

In [9]:
class NaiveBayes:
        
    def fit(self, X, y):
        '''
        This function / method fits parameters of Naive Bayes / Event Model algorithm to the data.
        
        parameters:
        X: Feature matrix of shape (m, n)
            m = Number of training examples
            n = Size of the vocabulary
        y: Target / label vector of shape (m, 1)
        
        output:
        This method calculates the parameters of Naive Bayes / Event Model algorithm and save them as
        instance attributes of the class.
        phi_y1: List of probabilities (P(xi=1|y=1))
        phi_y0: List of probabilities (P(xi=1|y=0))
        phi_y: Parameter phi_y (P(y=1))
        '''
        self.y = y
        self.X = X
        self.m = X.shape[0]
        self.n = X.shape[1]
        self.phi_y = np.sum(self.y) / self.m
        self.phi_y1 = np.empty(shape = (self.n, 1))
        self.phi_y0 = np.empty(shape = (self.n, 1))
        for j in range(self.n):
            count_1 = 0
            count_2 = 0
            for i, k in zip(self.X[:,j], self.y):
                if i == 1 and k == 1:
                    count_1 = count_1 + 1
                if i == 1 and k == 0:
                    count_2 = count_2 + 1
            self.phi_y1[j][0] = count_1 / np.sum(self.y)
            self.phi_y0[j][0] = count_2 / np.sum(np.where(self.y==0, 1, 0))
            
    def predict(self, X_test):
        '''
        This function / method predicts classes for examples in feature matrix X_test.
        
        parameters:
        X_test: Feature matrix of shape (n_samples, vocabulary)
            n_samples = Number of samples
            vocabulary = Length of vocabulary / dictionary
            
        output:
        y_pred: Prediction vector of shape (n_samples, 1)
        '''
        y_pred = np.empty(shape = (X_test.shape[0], 1))
        for i in range(X_test.shape[0]):
            y_pred[i][0] = self.predict_example(X_test[i, :])
        return y_pred
    
    def predict_example(self, X_example):
        '''
        This function / method predicts class of a single example.
        
        parameters:
        X_example: Feature matrix containing single example of shape (vocabulary, 1)
        
        output:
        class of example: 0 / 1 depending on whether P(y=1|x) or P(y=0|x) is greater
        '''
        # First lets calculate P(x|y=1) and P(x|y=0)
        px_y1 = 1
        px_y0 = 1
        for i in range(X_example.shape[0]):
            px_y1 = px_y1 * ((self.phi_y1[i][0]) ** X_example[i]) * ((1- self.phi_y1[i][0]) ** (1 - X_example[i]))
            px_y0 = px_y0 * ((self.phi_y0[i][0]) ** X_example[i]) * ((1- self.phi_y0[i][0]) ** (1 - X_example[i]))
            
        # Calculate P(y=1|x) and P(y=0|x) using Bayes rule
        py1_x = ((px_y1 * (self.phi_y)) + 1) / (((px_y1 * (self.phi_y)) + (px_y0 * (1 - self.phi_y))) + 2)
        py0_x = ((px_y0 * (1 - self.phi_y)) + 1) / (((px_y1 * (self.phi_y)) + (px_y0 * (1 - self.phi_y))) + 2)
        
        if py1_x > py0_x:
            return 1
        else:
            return 0
        
def accuracy_calc(y, y_pred):
    '''
    This function calculates accuracy of the classification.
        
    parameters:
    y: Target / label vector of shape (m, 1)
    y_pred: Predicted vector of shape (m, 1)
            
    Outputs:
    accuracy: accuracy of the classification in percentage.
    '''
    length = y.shape[0]
    y = y.reshape((length, 1))
    y_pred = y_pred.reshape((length, 1))
    accuracy = np.round(np.sum(np.where(y == y_pred, 1, 0)) / len(y), 2) * 100
    return accuracy

In [10]:
clf1 = NaiveBayes()
clf1.fit(X_train, y_train)
y_test_pred = clf1.predict(X_test)
test_acc = accuracy_calc(y_test, y_test_pred)
print('Test accuracy: {}'.format(test_acc))

Test accuracy: 75.0


In [11]:
from sklearn.naive_bayes import BernoulliNB
clf2 = BernoulliNB()
clf2.fit(X_train, y_train)
y_test_pred = clf2.predict(X_test)
acc_sklearn = accuracy_calc(y_test, y_test_pred)
print('Test accuracy using Sci-kit learn library (Bernouli NB) is: {}'.format(acc_sklearn))
average_accuracy = np.round(clf2.score(X_train, y_train), 2) * 100
print('Average test accuracy using Sci-kit learn library (Bernouli NB) is: {}'.format(average_accuracy))

Test accuracy using Sci-kit learn library (Bernouli NB) is: 94.0
Average test accuracy using Sci-kit learn library (Bernouli NB) is: 94.0


In [12]:
from sklearn.naive_bayes import MultinomialNB
clf3 = MultinomialNB()
clf3.fit(X_train, y_train)
y_test_pred = clf3.predict(X_test)
acc_sklearn = accuracy_calc(y_test, y_test_pred)
print('Test accuracy using Sci-kit learn library (Multinomial NB) is: {}'.format(acc_sklearn))
average_accuracy = np.round(clf3.score(X_train, y_train), 2) * 100
print('Average test accuracy using Sci-kit learn library (Multinomial NB) is: {}'.format(average_accuracy))

Test accuracy using Sci-kit learn library (Multinomial NB) is: 99.0
Average test accuracy using Sci-kit learn library (Multinomial NB) is: 99.0
