# Multinomial Naive Bayes From Scratch

In [1]:
import numpy as np

In [20]:
class MutinomialNaiveBayes:
    """
    Multinomial Naive Bayes classifier with additive (Laplace) smoothing.

    Attributes set by ``fit``:

    - ``classes``: sorted unique class labels found in ``y``
    - ``priors``: P(C_k), the fraction of training samples in each class
    - ``likelihood_numerators``: per-class feature totals plus ``alpha``,
      shape (n_classes, n_features)
    - ``likelihood_denominators``: per-class grand totals plus
      ``alpha * n_features``, shape (n_classes,)
    """

    def __init__(self, alpha=1.0):
        """
        Multinomial Naive Bayes classifier.
        :param alpha: Smoothing parameter to avoid 0 probability (Laplace smoothing), must be >= 0
        :raises ValueError: if alpha is negative
        """
        if alpha < 0:
            # A negative alpha can make the smoothed counts negative, which has no valid logarithm.
            raise ValueError("alpha must be non-negative, got {!r}".format(alpha))
        self.alpha = alpha

    @staticmethod
    def _as_feature_matrix(X):
        """
        Normalise the input into something that supports row masking and ``@``.
        :param X: array-like or SciPy sparse matrix
        :return: CSR sparse matrix (if X is sparse) or a float64 ndarray
        """
        # Duck-typed sparse detection avoids importing scipy in this module.
        if hasattr(X, "tocsr"):
            return X.tocsr()
        return np.asarray(X, dtype=np.float64)

    def fit(self, X, y):
        """
        Fit the Multinomial Naive Bayes model.
        :param X: Training data (count or frequency features), dense or sparse, shape (n_samples, n_features)
        :param y: Target values (class labels), shape (n_samples,)
        """
        X = self._as_feature_matrix(X)
        y = np.asarray(y)  # so that `y == c` is an element-wise mask even when y is a list
        n_samples, n_features = X.shape
        self.classes = np.unique(y)  # Get unique class labels
        n_classes = len(self.classes)

        # Initialize priors, likelihood numerator (feature count for each class), and denominator (total count per class)
        self.priors = np.zeros(n_classes, dtype=np.float64)
        self.likelihood_numerators = np.zeros((n_classes, n_features), dtype=np.float64)
        self.likelihood_denominators = np.zeros(n_classes, dtype=np.float64)

        # Calculate priors and likelihoods for each class
        for idx, c in enumerate(self.classes):
            X_c = X[y == c]
            # Sparse matrices return a (1, n_features) np.matrix here; flatten to a plain 1-D array.
            feature_counts = np.asarray(X_c.sum(axis=0), dtype=np.float64).ravel()
            self.priors[idx] = X_c.shape[0] / float(n_samples)  # Prior: P(C_k)
            self.likelihood_numerators[idx, :] = feature_counts + self.alpha  # Count of each feature in the class + alpha
            self.likelihood_denominators[idx] = feature_counts.sum() + self.alpha * n_features  # Total count of all features in the class

    def _joint_log_likelihood(self, X):
        """
        Compute log P(C_k) + sum_j x_j * log P(feature_j | C_k) for every sample and class.
        :param X: feature matrix as returned by ``_as_feature_matrix``, shape (n_samples, n_features)
        :return: ndarray of unnormalised log posteriors, shape (n_samples, n_classes)
        """
        with np.errstate(divide="ignore", invalid="ignore"):
            probs = self.likelihood_numerators / self.likelihood_denominators[:, np.newaxis]
        # With alpha == 0 a probability can be 0 (or 0/0). log(0) = -inf would turn
        # `0 * -inf` into nan for absent features, so floor at the smallest positive
        # normal float. fmax also replaces nan with the floor.
        log_likelihood = np.log(np.fmax(probs, np.finfo(np.float64).tiny))
        log_prior = np.log(self.priors)
        # One matrix product replaces the per-sample, per-class Python loop.
        return np.asarray(X @ log_likelihood.T) + log_prior

    def predict(self, X):
        """
        predict class label for input samples.
        :param X: input data (count or frequency features), dense or sparse, shape (n_samples, n_features)
        :return: predicted class labels, shape (n_samples,)
        :raises ValueError: if X is not two-dimensional
        """
        X = self._as_feature_matrix(X)
        if len(X.shape) == 1 and X.shape[0] == 0:
            # An empty sequence of samples yields an empty prediction array.
            return self.classes[:0]
        if len(X.shape) != 2:
            raise ValueError(
                "X must be 2-dimensional (n_samples, n_features), got shape {}".format(X.shape)
            )
        scores = self._joint_log_likelihood(X)
        return self.classes[np.argmax(scores, axis=1)]

    def _predict(self, x):
        """
        compute the posterior probability for each class and return the class with the highest probability.
        :param x: a single input sample (count or frequency features)
        :return: predicted class label
        """
        if hasattr(x, "tocsr"):
            row = x.tocsr()
        else:
            row = np.asarray(x, dtype=np.float64).reshape(1, -1)
        # return the class with the highest posterior probability
        return self.classes[np.argmax(self._joint_log_likelihood(row)[0])]

    def accuracy(self, y_pred, y_test):
        """
        testing the accuracy of the results
        :param y_pred: the returned class predicted
        :param y_test: the true label from test set
        :return: the fraction of correct predictions, between 0 and 1
        :raises ValueError: if y_pred and y_test have different shapes
        """
        y_pred = np.asarray(y_pred)
        y_test = np.asarray(y_test)
        if y_pred.shape != y_test.shape:
            raise ValueError(
                "y_pred and y_test must have the same shape, got {} and {}".format(
                    y_pred.shape, y_test.shape
                )
            )
        n_correct = int(np.count_nonzero(y_pred == y_test))
        # Plain int division keeps the ZeroDivisionError on empty input.
        return np.float64(n_correct / len(y_test))


## Example Usage

In [23]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Fetch the complete 20 Newsgroups corpus (both the train and test subsets)
newsgroups_data = fetch_20newsgroups(subset='all')

# Vectorise the raw documents into TF-IDF features; targets are the newsgroup ids
vectorizer = TfidfVectorizer()
X, y = vectorizer.fit_transform(newsgroups_data.data), newsgroups_data.target

# Hold out 20% of the documents for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the from-scratch Multinomial Naive Bayes model with Laplace smoothing
clf = MutinomialNaiveBayes(alpha=1.0)
clf.fit(X_train, y_train)

# Densify the held-out matrix, then classify every test document
X_test_dense = X_test.toarray()
y_pred = clf.predict(X_test_dense)

# Fraction of test documents whose predicted label matches the true one
accuracy = clf.accuracy(y_pred, y_test)
accuracy



0.8445623342175066