In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.naive_bayes import MultinomialNB as skMultinomialNB

In [2]:
class MultinomialNB():
    """Multinomial naive Bayes classifier for dense, count-like features.

    Parameters
    ----------
    alpha : float or array-like, default=1.0
        Additive smoothing term added to every per-class feature count
        before the counts are normalized. Must be non-negative.

    Attributes (set by ``fit``)
    ---------------------------
    classes_ : ndarray of shape (n_classes,)
        Sorted unique labels seen in ``y``.
    feature_count_ : ndarray of shape (n_classes, n_features)
        Sum of the feature values of all samples belonging to each class.
    feature_log_prob_ : ndarray of shape (n_classes, n_features)
        Log of the smoothed, row-normalized feature counts.
    class_count_ : ndarray of shape (n_classes,)
        Number of training samples per class.
    class_log_prior_ : ndarray of shape (n_classes,)
        Log of the fraction of training samples in each class.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def _encode(self, y):
        """One-hot encode the labels.

        Returns the sorted unique labels and a float indicator matrix of
        shape (n_samples, n_classes) whose entry (i, j) is 1 when sample i
        carries label ``classes[j]`` and 0 otherwise.
        """
        y = np.asarray(y)
        classes = np.unique(y)
        # Compare every label against every class in one broadcast step
        # instead of filling the matrix one class at a time.
        y_train = (y[:, np.newaxis] == classes).astype(float)
        return classes, y_train

    def fit(self, X, y):
        """Estimate class priors and per-class feature log-probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Dense feature matrix (sparse matrices must be densified first,
            e.g. with ``.toarray()``).
        y : array-like of shape (n_samples,)
            Class label of each sample.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``alpha`` is negative, ``X`` is not 2-D, ``y`` is not 1-D, or
            ``X`` and ``y`` disagree on the number of samples.
        """
        # A negative smoothing term can push counts to <= 0, which would turn
        # the logs below into NaN/-inf without any error being raised.
        if np.any(np.asarray(self.alpha) < 0):
            raise ValueError("alpha must be non-negative, got %r" % (self.alpha,))

        # Accept lists/tuples as well as ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(
                "X must be a dense 2-D array, got an array with %d dimension(s)"
                % X.ndim)
        if y.ndim != 1:
            raise ValueError(
                "y must be 1-D, got an array with %d dimension(s)" % y.ndim)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y have inconsistent numbers of samples: %d != %d"
                % (X.shape[0], y.shape[0]))

        self.classes_, y_train = self._encode(y)
        # (n_classes, n_samples) @ (n_samples, n_features): the indicator
        # matrix sums the feature rows of each class.
        self.feature_count_ = np.dot(y_train.T, X)
        smoothed_fc = self.feature_count_ + self.alpha
        self.feature_log_prob_ = (np.log(smoothed_fc) -
                                  np.log(smoothed_fc.sum(axis=1, keepdims=True)))
        self.class_count_ = y_train.sum(axis=0)
        self.class_log_prior_ = (np.log(self.class_count_) -
                                 np.log(self.class_count_.sum()))
        return self

    def _joint_log_likelihood(self, X):
        """Return the unnormalized log posterior of every class per sample.

        The result has shape (n_samples, n_classes): the feature values
        weighted by ``feature_log_prob_`` plus ``class_log_prior_``.
        """
        X = np.asarray(X)
        return np.dot(X, self.feature_log_prob_.T) + self.class_log_prior_

    def predict(self, X):
        """Return, for each row of ``X``, the label with the highest joint
        log-likelihood."""
        joint_log_likelihood = self._joint_log_likelihood(X)
        return self.classes_[np.argmax(joint_log_likelihood, axis=1)]

In [3]:
# Load the vectorized 20 newsgroups data as a (features, labels) pair.
X, y = fetch_20newsgroups_vectorized(return_X_y=True)
# Sparse input is not handled well by our implementation, so keep only the
# samples whose label is below 3 and convert that subset to a dense array.
subset = y < 3
X, y = X[subset].toarray(), y[subset]

# Fit our implementation and the scikit-learn reference on identical data.
clf1 = MultinomialNB().fit(X, y)
clf2 = skMultinomialNB().fit(X, y)

# The fitted parameters must agree with the reference.
assert all(
    np.allclose(getattr(clf1, name), getattr(clf2, name))
    for name in ("feature_log_prob_", "class_log_prior_")
)

# So must the per-class joint log-likelihoods on the training data ...
prob1 = clf1._joint_log_likelihood(X)
prob2 = clf2._joint_log_likelihood(X)
assert np.allclose(prob1, prob2)

# ... and the resulting predictions, exactly.
pred1 = clf1.predict(X)
pred2 = clf2.predict(X)
assert np.array_equal(pred1, pred2)