# Gaussian Naive Bayes

In [15]:
import numpy as np

class GaussianNB:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated from the training data.
    Prediction picks the class with the largest log-posterior, computed
    entirely in log space so that very small densities do not underflow.

    :param var_smoothing: fraction of the largest feature variance that is
        added to every per-class variance, so that a feature which is constant
        within a class does not produce a zero variance (division by zero)
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """ fit the training data; the likelihood is a per-feature Gaussian
        :param X: shape(n_samples, n_features)
        :param y: shape(n_samples,)
        :raises ValueError: if X is not 2-D, y is not 1-D, their lengths
            differ, or there are no samples
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be 1-D with one label per row of X")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit on an empty training set")

        # Not used by this closed-form estimator; kept so that existing code
        # reading the attribute keeps working.
        self.n_iters = 1000

        self.classes = np.unique(y)  # sorted unique class labels
        n_classes = len(self.classes)

        # Variance floor: scaled by the largest overall feature variance so it
        # is negligible for well-behaved data but keeps every variance > 0.
        feature_var = X.var(axis=0)
        max_var = feature_var.max() if feature_var.size else 0.0
        epsilon = self.var_smoothing * max_var
        if epsilon <= 0:
            # Every feature is constant: fall back to the raw smoothing value.
            epsilon = self.var_smoothing

        self.priors = np.zeros(n_classes, dtype=np.float64)
        self.mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self.var = np.zeros((n_classes, n_features), dtype=np.float64)

        # Estimate the prior, mean and (smoothed) variance of each class
        for idx, c in enumerate(self.classes):
            X_c = X[y == c]
            self.mean[idx] = X_c.mean(axis=0)
            self.var[idx] = X_c.var(axis=0) + epsilon
            self.priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        """ take a matrix X and output the predicted label based on the model trained
        :param X: shape(n_sample, n_features)
        :return: array of predicted class labels, shape(n_sample,)
        :raises ValueError: if X is not 2-D or its feature count does not
            match the training data
        """
        X = np.asarray(X, dtype=np.float64)
        if X.ndim != 2 or X.shape[1] != self.mean.shape[1]:
            raise ValueError(
                "X must have shape (n_samples, %d)" % self.mean.shape[1]
            )
        scores = self._joint_log_likelihood(X)
        return self.classes[np.argmax(scores, axis=1)]

    def _joint_log_likelihood(self, X):
        """
        compute log P(class) + sum over features of log N(x | mean, var).
        :param X: float array, shape(n_sample, n_features)
        :return: array of unnormalised log-posteriors, shape(n_sample, n_classes)
        """
        scores = np.empty((X.shape[0], len(self.classes)), dtype=np.float64)
        for idx in range(len(self.classes)):
            var = self.var[idx]
            # log of the Gaussian density written out directly; taking
            # np.log(pdf) instead underflows to -inf for distant points.
            log_pdf = -0.5 * (
                np.log(2.0 * np.pi * var) + (X - self.mean[idx]) ** 2 / var
            )
            scores[:, idx] = np.log(self.priors[idx]) + log_pdf.sum(axis=1)
        return scores

    def _predict(self, x):
        """
        compute the log-posterior for each class and return the class with the highest value.
        :param x: a single input sample, shape(n_features,)
        :return: predicted class label
        """
        row = np.asarray(x, dtype=np.float64).reshape(1, -1)
        scores = self._joint_log_likelihood(row)[0]
        return self.classes[np.argmax(scores)]

    def _gaussian_pdf(self, c_idx, x):
        """
        Compute the Gaussian probability density of every feature of x under one class.
        :param c_idx: The index of the class
        :param x: A single input sample
        :return: Probability density for each feature
        """
        mean = self.mean[c_idx]
        var = self.var[c_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

    def accuracy(self, y_pred, y_test):
        """
        testing the accuracy of the results
        :param y_pred: the returned class predicted
        :param y_test: the true label from test set
        :return: the fraction of predictions equal to the true label (0.0 to 1.0)
        :raises ValueError: if the inputs differ in shape or are empty
        """
        y_pred = np.asarray(y_pred)
        y_test = np.asarray(y_test)
        if y_pred.shape != y_test.shape:
            raise ValueError("y_pred and y_test must have the same shape")
        if y_test.size == 0:
            raise ValueError("cannot compute accuracy of an empty label set")
        return np.mean(y_pred == y_test)




## Example Usage

In [17]:
from sklearn import datasets
from sklearn.model_selection import train_test_split


# Build a synthetic 2-class dataset (1000 samples, 10 features) and hold out
# 20% of it for testing; both steps are seeded so the run is reproducible.
X, y = datasets.make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Train a Gaussian Naive Bayes classifier
clf = GaussianNB()
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy (fraction of test labels predicted correctly)
accuracy = clf.accuracy(y_pred, y_test)
# Bare expression so the notebook cell displays the value
accuracy


0.965