<b>Naive-Bayes-Classification (NBC) - Scratch</b> <br>
<i>Implementing naive Bayes classification using only NumPy, step-by-step. </i>

<b>requirements</b>

In [None]:
# example:- pip install numpy

<b>imports</b>

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn import datasets
from collections import Counter

<b>(1) DATA PRE-PROCESSING</b>

In [4]:
# Load the raw male/female dataset from its CSV file
DATASET_PATH = '../../../datasets/male_female.csv'
dataset = pd.read_csv(DATASET_PATH)

# Alternative source: a synthetic sklearn dataset instead of the CSV
#X,y = datasets.make_classification(n_samples=1000, n_features=10, n_classes=4, n_informative=4, n_clusters_per_class=4, random_state=123)

# Features (X) are the first three columns, class-labels (y) the last column
feature_columns = slice(None, 3)
X = dataset.iloc[:, feature_columns]
y = dataset.iloc[:, -1]


# Data-Splitting 
# sklearn Dataset X & y
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1225)

# Data Pre-Processing
# Normalize (if needed)


In [None]:
# Data Visualization
# ## DataSet Analysis - Training Set
# print("\nDataSet Analysis - Training Set : ")
# print(X_train.shape) # (120,4) = (rows,cols) = (datapoints,Features), 4-Features  [x1 x2 x3 x4]
# print(X_train[0])    # First-Row with 4-Features like x_train[1] = [4.6  3.6  1.0  0.2]
# print(y_train.shape) # (120) = (rows) = (datapoints), 1D Row or col Vector 
# print(y_train[0])    # First-element, shows the class-label (y_train) of the Features X_train[0]
# print(y_train)       # 1D Vector with only 1 Row
# print()

# ## DataSet Analysis - Test Set
# print("DataSet Analysis - Test Set : ")
# print(X_test.shape) # (30,4) = (rows,cols) = (datapoints,Features), 4-Features  [x1 x2 x3 x4]
# print(X_test[0])    # First-Row with 4-Features like x_test[1] = [4.6  3.6  1.0  0.2]
# print(y_test.shape) # (30) = (rows) = (datapoints), 1D Row or col Vector 
# print(y_test[0])    # First-element, shows the class-label (y_test) of the Features x_test[1]
# print(y_test)      # 1D Vector with only 1 Row
# print()

# ## DataSet Analysis - Plotting 
# plt.figure()
# plt.scatter(X[:,0], X[:,1], c=y, cmap=cmap, edgecolors='k', s=20) # Plotting first 2-Features out of 4
# plt.show()



<b>(2) ML ALGORITHM - SCRATCH</b>

In [5]:
class NaiveBayes:
    """
    Gaussian Naive-Bayes classifier implemented with NumPy only.

    fit() learns, for every class, the per-feature mean and variance and the
    class prior. predict() scores each sample against every class in log-space
    and returns the label with the highest log-posterior.

    Class attributes (override on the class or on an instance):
        verbose      : (bool) when True (default) one diagnostic line with the
                       per-feature Gaussian densities is printed for every
                       (sample, class) pair during prediction.
        min_variance : (float) lower bound applied to every learned variance so
                       that constant features do not cause a division by zero.
    """

    verbose = True
    min_variance = 1e-9

    # (2.1) Fit Model
    def fit(self, X, y) -> None:
        """
        Calculates Mean, Variance and Prior for each class from the training data.
        The learned values are used by predict() to label new datapoints.

        Parameters:
            X : (array-like, 2D) Independent-Variable (Features), rows = datapoints
            y : (array-like, 1D) Dependent-Variable (Class-labels), one per row of X

        Returns:
            None
        """

        # Work on plain arrays so NumPy arrays, lists and DataFrames behave alike
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)

        # Rows-Cols of Features
        n_data, n_features = X.shape

        # Class-labels (sorted unique values of y)
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # Initialize Mean, Variance, Priors (one row per class, one column per feature)
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros((n_classes), dtype=np.float64)

        # Calculate Mean, Variance, Prior
        for idx, c in enumerate(self._classes):

            # Datapoints (rows of Features) corresponding to class c
            X_c = X[y == c]

            # mean of each feature/col
            self._mean[idx, :] = X_c.mean(axis=0)

            # Sample variance (ddof=1, Bessel correction). It is undefined for a
            # single datapoint, so that row stays 0 and is floored below.
            if X_c.shape[0] > 1:
                self._var[idx, :] = X_c.var(axis=0, ddof=1)

            # prior = fraction of the training rows that belong to class c
            self._priors[idx] = X_c.shape[0] / n_data

        # Floor the variances: a zero variance would make the Gaussian degenerate
        self._var = np.maximum(self._var, self.min_variance)

    # (2.2) Making-Predictions
    def predict(self, X) -> np.ndarray:
        """
        Predicts the class-label of every datapoint (row) in X.

        Parameters:
            X : (array-like, 2D) Independent-Variable (Features), uppercase X = Multiple Datapoints

        Returns:
            y_pred as a numpy array, one predicted label per row of X
        """

        # Convert first: iterating a DataFrame directly yields column names, not rows
        samples = np.asarray(X)

        # Predict every individual datapoint (x) of X
        return np.array([self._predict(x) for x in samples])

    # (2.3) Making-Single-Prediction
    def _predict(self, x):
        """
        Applies Bayes-Theorem in log-space for each class-label and picks the best one.

        Parameters:
            x : (np.array) Independent-Variable (Features), lowercase x = single Datapoint

        Returns:
            The class label (an element of self._classes) with the highest log-posterior
        """

        log_posteriors = []

        for c_idx, c in enumerate(self._classes):

            # Diagnostic output of the raw per-feature densities
            if self.verbose:
                print(f"cls : {c} | Feature: {x} | gauss : {self._pdf(c_idx, x)}")

            # log P(c)
            log_prior = np.log(self._priors[c_idx])

            # log P(x | c): sum of the per-feature log-densities. The log-density is
            # computed directly because log(pdf) turns into -inf once the density
            # underflows to 0, which would make all classes indistinguishable.
            log_likelihood = np.sum(self._log_pdf(c_idx, x))

            # log-posterior (up to a constant): sum in log-space instead of a product
            log_posteriors.append(log_likelihood + log_prior)

        # Estimate class from the maximum of the log-posteriors
        return self._classes[np.argmax(log_posteriors)]

    # (2.4) Log of the Probability-Density-Function
    def _log_pdf(self, class_index, x) -> np.ndarray:
        """
        Calculates the natural log of the Gaussian density of each feature of x.

        Parameters:
            class_index : (int) index of the class in self._classes
            x : (np.array) Independent-Variable (Features), single Datapoint

        Returns:
            np.array with one log-density per feature
        """

        mean = self._mean[class_index]
        var = self._var[class_index]

        return -0.5 * np.log(2 * np.pi * var) - (x - mean) ** 2 / (2 * var)

    # (2.5) Probability-Density-Function (pdf)
    def _pdf(self, class_index, x) -> np.ndarray:
        """
        Calculates the Gaussian (Normal) density of each feature of x.

        Parameters:
            class_index : (int) index of the class in self._classes
            x : (np.array) Independent-Variable (Features), single Datapoint

        Returns:
            np.array with one density value per feature
        """

        # Means & Variances of class[class_index]
        mean = self._mean[class_index]  # e.g- y1_mean = x1_mean, x2_mean, x3_mean
        var = self._var[class_index]    # e.g- y1_var  = x1_var,  x2_var,  x3_var

        # Gaussian-Distribution
        return (1 / np.sqrt(2 * np.pi * var)) * np.exp(-(x - mean) ** 2 / (2 * var))


<b>(3) MODEL TRAINING</b>

In [6]:
# Train on the complete dataset (no hold-out split is made in this cell)
X_train, y_train = X, y

# One hand-written datapoint to classify; kept 2D (1 row, 3 features)
X_test = np.array([[6, 130, 8]])

# Build the classifier, then learn mean, variance and prior of every class
nb = NaiveBayes()
nb.fit(X_train, y_train)


<b>(4) PREDICTION</b>

In [7]:
import time

# Time only the prediction call. perf_counter() is the clock intended for
# measuring short elapsed intervals; time.time() is a wall clock that may be
# coarse or adjusted while the measurement runs.
start = time.perf_counter()

# Predicting on Test-Set using the trained NaiveBayes model (by the learned 'mean' 'var '& 'priors')
predictions = nb.predict(X_test)

end = time.perf_counter()

print(f"\nPredicted Class-Label : {predictions}")
print(f"time elapsed : {(end-start)*1000} ms\n")


cls : female | Feature: [  6 130   8] | gauss : [0.22345873 0.0167893  0.2866907 ]
cls : male | Feature: [  6 130   8] | gauss : [1.57888318e+00 5.98674302e-06 1.31122104e-03]

Predicted Class-Label : ['female']
time elapsed : 2.999544143676758 ms



<b>(5) EVALUATION-VISUALIZATION</b>

In [None]:
#Metrics 
def accuracy(y_true, y_pred):
    """
    Fraction of predictions that equal the true labels.

    Parameters:
        y_true : (array-like) true class-labels
        y_pred : (array-like) predicted class-labels, same shape as y_true

    Returns:
        Accuracy as a float between 0 and 1

    Raises:
        ValueError if y_true and y_pred do not have the same shape
    """
    # Convert first: comparing two plain lists with == gives a single bool,
    # not an element-wise result.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Different shapes would broadcast and silently give a meaningless score
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )

    return float(np.mean(y_true == y_pred))
#print(f"accuracy : {accuracy(y_test,predictions)}")

# Accuracy
# accuracy = np.sum(predictions == y_test) / len(y_test)
# print(f"accuracy : {accuracy*100} %")

<b>CONCLUSION</b>
- The model performs well