In [1]:
import numpy as np
import scipy as sp

In [3]:
# open docs file and read its lines
with open("../data/SMSSpamCollection", "r", encoding="utf8") as fh:
    lines = fh.readlines()  

In [4]:
# split data
labels = []
text = []

for line in lines:
    labels.append(line.split('\t')[0])
    text.append(line.split('\t')[1])

In [5]:
for i in range(len(labels)):
    if labels[i] == 'ham':
        labels[i] = 0
    elif labels[i] == 'spam':
        labels[i] = 1

In [6]:
def list_uniques(l):
    unique_values = set(l)
    for value in unique_values:
        count = l.count(value)
        print(f'{value}: {count}')

In [7]:
list_uniques(labels)

0: 4827
1: 747


In [8]:
# text = [line.split() for line in text]

In [9]:
from sklearn.model_selection import train_test_split

train_txt, test_txt, train_labels, test_labels = train_test_split(text, labels, test_size=0.2)


In [10]:
print(len(train_txt))
print(len(test_txt))

list_uniques(train_labels)
list_uniques(test_labels)

4459
1115
0: 3865
1: 594
0: 962
1: 153


In [11]:
print(test_txt[0])

You have been selected to stay in 1 of 250 top British hotels - FOR NOTHING! Holiday Worth £350! To Claim, Call London 02072069400. Bx 526, SW73SS



In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB

vectorizer = CountVectorizer()

mat_train = vectorizer.fit_transform(train_txt)

In [13]:
mnb = MultinomialNB()
mnb.fit(mat_train, train_labels)

cnb = ComplementNB()
cnb.fit(mat_train, train_labels)

bnb = BernoulliNB()
bnb.fit(mat_train, train_labels)

In [32]:
mat_test = vectorizer.transform(test_txt)

mnb_pred = mnb.predict(mat_test)
cnb_pred = cnb.predict(mat_test)
bnb_pred = bnb.predict(mat_test)

In [33]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def evaluate(real, pred):
    a = accuracy_score(real, pred)
    p = precision_score(real, pred)
    r = recall_score(real, pred)
    
    print("accuracy: ", a, '\n', "precision: ", p, '\n', "recall: ", r)

In [34]:
evaluate(test_labels, mnb_pred)

accuracy:  0.9856502242152466 
 precision:  0.9507042253521126 
 recall:  0.9375


In [35]:
evaluate(test_labels, cnb_pred)

accuracy:  0.9802690582959641 
 precision:  0.8961038961038961 
 recall:  0.9583333333333334


In [36]:
evaluate(test_labels, bnb_pred)

accuracy:  0.9820627802690582 
 precision:  1.0 
 recall:  0.8611111111111112
