In [145]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
# Let pandas show up to 200 characters per cell before truncating text columns.
pd.set_option('display.max_colwidth', 200)

In [58]:
# Read spam.csv with explicit column names (the file's own first row is skipped),
# then keep only the label column `y` and the text column `x`.
df = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
    names=['y', 'x', 'u0', 'u1', 'u2'],
    skiprows=1,
).loc[:, ['y', 'x']]

In [59]:
# Hold out 30% of the rows for testing. The fixed random_state makes the split
# (and every number computed from it below) reproducible after a kernel restart.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

In [60]:
# Persist both splits next to the notebook (the index is written as the first column).
df_train.to_csv(path_or_buf='train.csv')
df_test.to_csv(path_or_buf='test.csv')

In [73]:
# Two-step pipeline: token counts -> multinomial Naive Bayes.
pipe1 = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])

In [75]:
# Fit on the training split, then display the pipeline's score on that same split.
pipe1.fit(df_train['x'], df_train['y']).score(df_train['x'], df_train['y'])

0.9935897435897436

In [64]:
# Number of distinct tokens learned by the fitted vectorizer step of pipe1.
# (The pipeline is named `pipe1`; a bare `pipe` is not defined in this notebook.)
len(pipe1.named_steps.vectorizer.vocabulary_)

7188

In [76]:
# Vectorizer-only pipeline, used to build count matrices for the hand-written model.
pipe2 = Pipeline(steps=[('vectorizer', CountVectorizer())])

In [245]:
# Take rows 102 and 103 (by position) of the training split as a tiny worked example.
df_sample = df_train.iloc[102:104]

# Dense token-count matrix for those rows, indexed like the rows they came from.
X = pd.DataFrame(pipe2.fit_transform(df_sample['x']).toarray(), index=df_sample.index)
# Binary target: 1 where the label is 'spam', 0 otherwise.
y = (df_sample['y'] == 'spam').astype(int)

In [318]:
def pr_ll(X, y):
    """Estimate class priors and per-class word probabilities from count data.

    Parameters
    ----------
    X : DataFrame or array-like, shape (n_samples, n_features)
        Non-negative word counts, one row per document.
    y : Series or array-like, shape (n_samples,)
        Class label of each row. Labels may be any sortable values
        (ints, strings, ...) and any number of classes is supported.
        Rows of X and entries of y are matched by position.

    Returns
    -------
    (pr, ll) : tuple of ndarrays
        pr has shape (1, n_classes): the fraction of samples in each class.
        ll has shape (n_classes, n_features): for each class, the share of
        that class's total word count taken by each word, with zero
        probabilities replaced by 0.0001.
        Classes are ordered by sorted label value in both arrays.

    Raises
    ------
    ValueError
        If X and y do not have the same number of rows.
    """
    counts = np.asarray(X, dtype=float)
    labels = np.asarray(y)
    if counts.shape[0] != labels.shape[0]:
        raise ValueError(
            "X and y must have the same number of rows, got %d and %d"
            % (counts.shape[0], labels.shape[0])
        )

    cs = np.unique(labels)  # sorted unique labels
    ll = np.zeros((len(cs), counts.shape[1]))
    # Index rows of `ll` by the class's position in `cs`, not by the label
    # value itself, so labels need not be the integers 0..k-1.
    for row, c in enumerate(cs):
        per_word_counts = counts[labels == c].sum(axis=0)
        total = per_word_counts.sum()
        if total > 0:
            per_word_prob = per_word_counts / total
        else:
            # A class whose documents contain no words at all: avoid 0/0 = NaN.
            per_word_prob = np.zeros_like(per_word_counts)
        # Crude smoothing: never let an unseen word have probability zero.
        per_word_prob[per_word_prob == 0] = 0.0001
        ll[row, :] = per_word_prob

    # One prior per class, however many classes there are.
    pr = np.array([[np.sum(labels == c) for c in cs]]) / labels.shape[0]
    return (pr, ll)
    
# Estimate priors and per-class word probabilities on the tiny example.
pr, ll = pr_ll(X, y)

# One-row slices (first example and its label), kept as DataFrame / Series.
X0 = X.iloc[0:1]
y0 = y.iloc[0:1]


def predict(X, pr, ll):
    """Return multinomial Naive Bayes class posteriors for each row of X.

    For every sample the unnormalized log-posterior of class c is
    ``log pr[c] + sum_i X[i] * log ll[c, i]``; the result is normalized per
    row so that the class probabilities sum to 1. All arithmetic is done in
    log space, so large vocabularies do not underflow.

    Parameters
    ----------
    X : DataFrame or array-like, shape (n_samples, n_features)
        Word counts, one row per document.
    pr : array-like, shape (1, n_classes) or (n_classes,)
        Class priors.
    ll : array-like, shape (n_classes, n_features)
        Per-class word probabilities.

    Returns
    -------
    ndarray, shape (n_samples, n_classes)
        Posterior probability of each class for each sample.
    """
    counts = np.atleast_2d(np.asarray(X, dtype=float))
    priors = np.asarray(pr, dtype=float).reshape(-1)
    # Floor the word probabilities so log() never sees an exact zero
    # (0 * -inf would poison the matrix product with NaN).
    word_probs = np.maximum(np.asarray(ll, dtype=float), 0.0000001)

    with np.errstate(divide='ignore'):  # a zero prior legitimately maps to -inf
        log_priors = np.log(priors)
    log_joint = log_priors[np.newaxis, :] + counts @ np.log(word_probs).T

    # Log-sum-exp normalization: subtract the row maximum before exponentiating
    # so the largest term is exp(0) = 1 and nothing underflows to 0/0.
    log_joint = log_joint - log_joint.max(axis=1, keepdims=True)
    joint = np.exp(log_joint)
    return joint / joint.sum(axis=1, keepdims=True)
# predict() returns one row per sample and one column per class, so the
# predicted class of each sample is the argmax along axis 1 (across classes).
np.argmax(predict(X, pr, ll), axis=1)

array([1, 0])

In [319]:
class NB:
    """Minimal Naive Bayes classifier built on the notebook's pr_ll / predict helpers.

    Attributes set by fit():
        pr       -- class priors as returned by pr_ll
        ll       -- per-class word probabilities as returned by pr_ll
        classes_ -- sorted unique labels seen in y; predictions are drawn from it
    """

    def __init__(self, **args):
        # Keyword arguments are accepted for call-site compatibility but ignored.
        pass

    def fit(self, X, y):
        """Estimate priors and word probabilities from X, y; returns self."""
        (pr, ll) = pr_ll(X, y)
        self.pr = pr
        self.ll = ll
        # Same ordering pr_ll uses (sorted unique labels), so column k of the
        # posterior matrix corresponds to classes_[k].
        self.classes_ = np.unique(np.asarray(y))
        return self

    def predict(self, X):
        """Return the most probable class label for each row of X."""
        posteriors = predict(X, self.pr, self.ll)
        # Rows are samples and columns are classes: pick the best class per
        # sample (axis=1), not the best sample per class (axis=0).
        return self.classes_[np.argmax(posteriors, axis=1)]

In [320]:
# Train the hand-written classifier on the tiny example and display its
# predictions for those same rows.
nb = NB()
nb.fit(X=X, y=y)
nb.predict(X=X)

array([1, 0])