In [1]:
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict

seed = 42
np.random.seed(seed)

In [2]:
df = pd.read_csv('data/spam.csv', encoding='ISO-8859-1')

df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df['v1'] = df.v1.map({'ham':0, 'spam': 1})

clean = lambda text: re.sub("[^a-zA-Z0-9\s]+", " ", text)
df['v2'] = df['v2'].apply(clean)
df['v2'] = df['v2'].apply(str.lower)

In [3]:
y = df['v1'].values
X = df['v2'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

$$\prod_{i=1}^n P(w_i | class) * P(class)$$

$$\log{P(class)} +  \sum_{i=1}^n \log P(w_i | class)$$

In [4]:
# For text classification:

# P(class | w_1, ..., w_n) = (P(w_1, ..., w_n | class) * P(class)) / P(w_1, ..., w_n)
# \prod_{i=1}^n P(w_i | class) * P(class) / P(w_1, ..., w_n)
# \log{P(class)} +  \sum_{i=1}^n \log P(w_i | class)

# We also apply laplace smoothing to prevent to take log of 0.

class MultinomialNB:
    
    def __init__(self):
        pass
    
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.class_probs = self.class_probabilities(X, y)
        grouped_data = self.group_data_by_class(X, y)
        self.vocabs = self.get_vocabs(grouped_data)
        self.total_vocab_len = sum([len(self.vocabs[c]) for c, _ in self.vocabs.items()])
        
    def predict(self, X):
        preds = []
        for sentence in X:
            probs = []
            for c in self.classes:
                # Current sentence probability is: 
                # logP(class) + and sum(logP(w_1, ..., w_n | class))
                log_class_prob = np.log(self.class_probs[c]) 
                log_word_probs = sum([np.log(self.laplace_smoothed_prob(c, word)) 
                                     for word in sentence.split()])
                sentence_prob = log_class_prob + log_word_probs
                probs.append(sentence_prob)
                
            preds.append(np.argmax(probs))
        
        return preds
    
    def class_probabilities(self, X, y):
        return { c: (y == c).sum() / len(y) for c in self.classes }
    
    def group_data_by_class(self, X, y):
        return { c: X[np.where(y == c)] for c in self.classes }
    
    def laplace_smoothed_prob(self, c, word):
        return (self.vocabs[c][word] + 1) / (len(self.vocabs[c]) + self.total_vocab_len)
    
    def get_vocabs(self, grouped_data):
        vocabs = { c: defaultdict(lambda : 0) for c, _ in grouped_data.items()}
        
        for c, data in grouped_data.items():
            for sentence in data:
                word_counts = Counter(sentence.split())
                for word, count in word_counts.items():
                    vocabs[c][word] += count
        return vocabs

In [5]:
mnb = MultinomialNB()

mnb.fit(X_train, y_train)
preds = mnb.predict(X_test)
print("Accuracy: ", (preds == y_test).mean())

Accuracy:  0.9623318385650225


In [6]:
# Sklearn way
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer().fit(X_train)
X_train_tr = vectorizer.transform(X_train)
X_test_tr = vectorizer.transform(X_test)

mnb = MultinomialNB()
mnb.fit(X_train_tr, y_train)

preds = mnb.predict(X_test_tr)
print("Accuracy: ", (preds == y_test).mean())

Accuracy:  0.9838565022421525
