In [1]:
from sklearn.preprocessing import Binarizer
from sklearn.utils.validation import check_X_y, check_array
import pandas as pd
import numpy as np

class BernoulliNB:
    def __init__(self):
        self.prob_class = None
        self.prob_x_c = None
        self.prob_x_cp = None

    def fit(self, X, y):
        X = self.binarize(X)
        classes = np.unique(y)
        probs_class = []
        probs_x_c = []
        probs_x_cp = []

        for c in classes:
            prob_class, prob_x_c, prob_x_cp = self.populate_class_stats(X, y, c)
            probs_class.append(prob_class)
            probs_x_c.append(prob_x_c)
            probs_x_cp.append(prob_x_cp)

        self.prob_class = np.asarray(probs_class)
        self.prob_x_c = np.asarray(probs_x_c)
        self.prob_x_cp = np.asarray(probs_x_cp)

    def binarize(self, X):
        binarizer = Binarizer().fit(X)
        return binarizer.transform(X)

    def populate_class_stats(self, X, y, c):
        X, y = check_X_y(X, y, accept_sparse="csr")
        Xc = X[y == c, :]
        Xcp = X[y != c, :]

        # P(c)
        prob_class = Xc.shape[0] / X.shape[0]

        # number of sample where x is in class c
        Nxc = np.asarray(Xc.sum(axis=0))

        # number of sample where x not in class c
        Nxcp = np.asarray(Xcp.sum(axis=0))

        n_feat = Nxc.shape[1]

        # number of samples in class c
        Nsc = Xc.shape[0]

        # number of samples not in class c
        Nscp = Xcp.shape[0]

        # P(x | c) with lapace smoothing
        prob_x_c = (Nxc + 1) / (Nsc + 2)

        # P(x | c') with lapace smoothing
        prob_x_cp = (Nxcp + 1) / (Nscp  + 2)

        return prob_class, prob_x_c.reshape(n_feat), prob_x_cp.reshape(n_feat)

    def predict(self, X):
        X = check_array(X, accept_sparse="csr")
        X = self.binarize(X)
        one = np.ones(X.shape[1])
        l1 = np.log(self.prob_x_c/self.prob_x_cp).T
        l2 = np.log((1-self.prob_x_c)/(1-self.prob_x_cp)).T

        pred = np.log(self.prob_class/(1-self.prob_class)) + \
                + (X @ l1 + (one - X) @ l2)

        return np.argmax(pred, axis=1)

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectPercentile, chi2

# Read Data
df = pd.read_csv('reddit_train.csv')

# Pre- processing 
tfidf_vectorizer = TfidfVectorizer(binary=True, max_df=0.1,smooth_idf=False, stop_words=['english','http','www'],
                sublinear_tf=True)
# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(df['comments'])

# Target labels
le = LabelEncoder()
df['label'] = le.fit_transform(df['subreddits'])
y = df['label'].to_numpy()

# Feature reduction
select_best = SelectPercentile(chi2, percentile=23)
X = select_best.fit_transform(tfidf, y)

# Model initilize
clf = BernoulliNB()

# Cross validation
kf = KFold(n_splits=5, random_state=7, shuffle=True)  
acc = []
for train_index, valid_index in kf.split(X):
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    clf.fit(X_train, y_train)
    y_p = clf.predict(X_valid)
    acc.append( accuracy_score(y_valid, y_p))
accuracy = sum(acc)/float(len(acc))
print('Mean cross validation accuracy: {:.5f}'.format(accuracy))

Mean cross validation accuracy: 0.54601
