In [13]:
import numpy as np
import scipy as sp
import statistics as st

from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score


In [3]:
# read the raw corpus into memory, one string per line of the file
with open("../data/SMSSpamCollection", "r", encoding="utf8") as corpus_file:
    lines = list(corpus_file)

In [4]:
# number of lines read from the data file
len(lines)

5574

In [5]:
# dedupe the original data
# dict.fromkeys keeps the first occurrence of every line in its original file
# order. A plain set() would hand the strings back in a hash-dependent order
# that can differ between interpreter sessions, making the row order (and
# anything computed from it) non-reproducible.
lines = list(dict.fromkeys(lines))
len(lines)

5171

In [6]:
# split data
# Every record is "<label>\t<message>". Cut on the FIRST tab only so that a
# message which itself contains a tab is kept whole, and split each line once
# instead of twice.
labels = []
text = []

for line in lines:
    label, tab, message = line.partition('\t')
    if not tab:
        # no separator at all: fail with the offending record instead of an
        # opaque IndexError
        raise ValueError(f"malformed record (no tab separator): {line!r}")
    labels.append(label)
    text.append(message)

In [7]:
# encode the string labels as integers in place: 'ham' -> 0, 'spam' -> 1
# (any other value is left exactly as it was)
label_codes = {'ham': 0, 'spam': 1}
labels[:] = [label_codes.get(value, value) for value in labels]

In [8]:
# utilities

def build_split(data, labels, indices):
    """Select the items of ``data`` and ``labels`` found at ``indices``.

    Returns a ``(selected_data, selected_labels)`` tuple of two new lists whose
    elements follow the order in which ``indices`` is iterated.
    """
    picked = list(indices)  # materialise once so both selections see the same positions
    subset_data = [data[pos] for pos in picked]
    subset_labels = [labels[pos] for pos in picked]
    return (subset_data, subset_labels)

def evaluate(real, pred):
    """Score predictions ``pred`` against the reference labels ``real``.

    Returns a 3-tuple ``(accuracy, precision, recall)`` produced by
    ``accuracy_score``, ``precision_score`` and ``recall_score`` respectively.
    """
    return (
        accuracy_score(real, pred),
        precision_score(real, pred),
        recall_score(real, pred),
    )

def list_uniques(l):
    """Print every distinct value of the list ``l`` with its number of occurrences.

    One ``value: count`` line is printed per distinct value; nothing is returned.
    """
    for distinct in set(l):
        print(f'{distinct}: {l.count(distinct)}')

In [9]:
# show how many examples carry each label value (0 = ham, 1 = spam after encoding)
list_uniques(labels)

0: 4518
1: 653


In [20]:
# train and evaluate 3 models for each split
SEED = 42  # fixes the KFold shuffle so the same fold indices are produced on every run
k = 5

# one (accuracy, precision, recall) tuple per fold, for each classifier and
# for the majority vote of the three
m_stats = []
c_stats = []
b_stats = []
v_stats = []

# without random_state the shuffle differs on every execution, so the averaged
# metrics could not be reproduced
kf = KFold(n_splits=k, shuffle=True, random_state=SEED)

for train_idx, test_idx in kf.split(text):
    x_train, y_train = build_split(text, labels, train_idx)
    x_test, y_test = build_split(text, labels, test_idx)

    # the vocabulary is learned from the training fold only; the test fold is
    # merely projected onto it
    vectorizer = CountVectorizer()
    mat_train = vectorizer.fit_transform(x_train)
    mat_test = vectorizer.transform(x_test)

    # build models
    mnb = MultinomialNB()
    mnb.fit(mat_train, y_train)

    cnb = ComplementNB()
    cnb.fit(mat_train, y_train)

    bnb = BernoulliNB()
    bnb.fit(mat_train, y_train)

    # evaluate
    mnb_pred = mnb.predict(mat_test)
    cnb_pred = cnb.predict(mat_test)
    bnb_pred = bnb.predict(mat_test)
    # majority vote: for every test document take the most common of the three predictions
    voted = [st.mode(votes) for votes in zip(mnb_pred, cnb_pred, bnb_pred)]

    # save evaluation stats
    m_stats.append(evaluate(y_test, mnb_pred))
    c_stats.append(evaluate(y_test, cnb_pred))
    b_stats.append(evaluate(y_test, bnb_pred))
    v_stats.append(evaluate(y_test, voted))

In [21]:
# average every metric over the folds: each *_eval is [accuracy, precision, recall]
m_eval, c_eval, b_eval, v_eval = (
    np.mean(fold_stats, axis=0)
    for fold_stats in (m_stats, c_stats, b_stats, v_stats)
)

report_rows = (
    ("Multinomial - a: ", m_eval),
    ("Complement  - a: ", c_eval),
    ("Bernoulli   - a: ", b_eval),
    ("Voted       - a: ", v_eval),
)
for heading, (acc, prec, rec) in report_rows:
    print(heading, acc, " | p: ", prec, " | r: ", rec)

Multinomial - a:  0.984916883917809  | p:  0.9724215996622874  | r:  0.9065574191554099
Complement  - a:  0.9771808744241677  | p:  0.8933965999482243  | r:  0.9295468753130361
Bernoulli   - a:  0.9729276109849654  | p:  0.9965517241379309  | r:  0.7881086284262668
Voted       - a:  0.9851103075154878  | p:  0.9724399146806025  | r:  0.9083431334411243
