# Spam classifier with Naive Bayes (BoW & TF-IDF)


In [333]:
# imports
import csv
import string
import random
import math

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from itertools import islice, chain
from collections import defaultdict
from functools import reduce

# Preprocess

In [334]:
path = "data/SMSSpamCollection.csv"

ps = PorterStemmer()
sw = stopwords.words('english')


def preprocess(text):
    cleaned_text = text.replace('\W+', '').replace('\s+', '').strip()
    # lower
    cleaned_text = cleaned_text.lower()
    # tokenize
    tokenized_text = cleaned_text.split()
    # stopwords
    stopwords_removed = [token for token in tokenized_text if token not in sw]
    # stemming
    stemmed_text = [ps.stem(token) for token in stopwords_removed]
    punc_removed = [token.translate(str.maketrans('', '', string.punctuation)) for token in stemmed_text]
    return [token for token in punc_removed if token != '']


def read_sms_data(file_path):
    sms_data = list()
    with open(file_path, "r") as csvfile:
        csv_reader = csv.DictReader(csvfile, fieldnames=["label", "sms"], dialect="excel-tab")
        for row in csv_reader.reader:
            # replace label with 0 (ham) or 1 (spam)
            label = 1 if row[0].lower() == "spam" else 0
            sms = row[1]
            cleaned_sms = preprocess(sms)
            if len(cleaned_sms) != 0:
                sms_data.append(dict(label=label, sms=cleaned_sms))
    return sms_data

sms_data_list = read_sms_data(path)
print("SMS messages:", len(sms_data_list))

SMS messages: 5569


# Split sms list into train and test subsets

In [335]:
def test_and_train_split(data, test_size=0.2, train_size=0.8, shuffle=True):
    data_ = data
    if shuffle:
        data_ = random.sample(data, len(data))
    train_and_test = [
        list(
            islice(iter(data_), elm)) for elm in [
            round((len(data) * train_size)),
            round((len(data) * test_size))
        ]
    ]
    x_train = train_and_test[0]
    x_test = list(map(lambda x: x.get("sms"), train_and_test[1]))
    y_test_label = list(map(lambda x: x.get("label"), train_and_test[1]))
    return x_train, x_test, y_test_label

X_train, X_test, y_test = test_and_train_split(sms_data_list)

print("Size of training set:", len(X_train))
print("Size of testing set:", len(X_test))

Size of training set: 4455
Size of testing set: 1114


In [336]:
X_train_ham = list(map(lambda y: y.get("sms"), filter(lambda x: x.get("label") == 0, X_train)))
X_train_spam = list(map(lambda y: y.get("sms"), filter(lambda x: x.get("label") == 1, X_train)))

print("Total number of Ham messages in train:", len(X_train_ham))
print("Total number of Spam messages in train:", len(X_train_spam))

Total number of Ham messages in train: 3850
Total number of Spam messages in train: 605


In [337]:
def create_index(vocab):
    index_dict = dict()
    i = 0
    for word in vocab:
        index_dict[word] = i
        i += 1
    return index_dict


def counter(sentences, vocab):
    count_dict = dict()
    for word in vocab:
        count_dict[word] = 0
        for sent in sentences:
            if word in sent:
                count_dict[word] += 1
    return count_dict

# BoW

In [338]:
vocab_train_spam = set(list(chain.from_iterable(X_train_spam)))
vocab_train_ham = set(list(chain.from_iterable(X_train_ham)))

print("Size of ham vocab:", len(vocab_train_ham))
print("Size of spam vocab:", len(vocab_train_spam))

Size of ham vocab: 6266
Size of spam vocab: 2592


In [339]:
def bag_of_words(sentences, index_word, vocab):
    bow_list = list()
    for sent in sentences:
        count_dict = defaultdict(int)
        vec = [float(0)] * len(vocab)
        for word in sent:
            count_dict[word] += 1.0
        for word, count in count_dict.items():
            vec[index_word[word]] = count
        bow_list.append(vec)
    return bow_list

bow_index_spam = create_index(vocab_train_spam)
bow_index_ham = create_index(vocab_train_ham)

bow_vector_spam = bag_of_words(X_train_spam, bow_index_spam, vocab_train_spam)
bow_vector_ham = bag_of_words(X_train_ham, bow_index_ham, vocab_train_ham)

# TF-IDF

In [340]:
# Term Frequency
def compute_tf(sentence, word):
    N = len(sentence)
    occ = len([token for token in sentence if token == word])
    return occ / N


# Inverse Document Frequency
def compute_idf(word, count_dict, no_of_sentences):
    try:
        word_occ = count_dict[word] + 1
    except KeyError:
        word_occ = 1
    return math.log(no_of_sentences / word_occ)


# TF IDF combined
def compute_tfidf(sentence, vocab, count_dict, index_dict, no_of_sentences):
    tf_idf_vec = [float(0)] * len(vocab)
    for word in sentence:
        tf = compute_tf(sentence, word)
        idf = compute_idf(word, count_dict, no_of_sentences)

        value = tf * idf
        tf_idf_vec[index_dict[word]] = value
    return tf_idf_vec

# Create word counts
count_spam = counter(X_train_spam, vocab_train_spam)
count_ham = counter(X_train_ham, vocab_train_ham)

# Create index
tfidf_index_spam = create_index(vocab_train_spam)
tfidf_index_ham = create_index(vocab_train_ham)

In [341]:
def create_output_vector(sentences, vocab, count_dict, index_dict):
    vector_list = []
    for sentence in sentences:
        vec = compute_tfidf(sentence, vocab, count_dict, index_dict, len(sentences))
        vector_list.append(vec)
    return vector_list

vector_spam = create_output_vector(X_train_spam, vocab_train_spam, count_spam, tfidf_index_spam)
vector_ham = create_output_vector(X_train_ham, vocab_train_ham, count_ham, tfidf_index_ham)

In [342]:
sum_of_spam = sum(list(chain.from_iterable(vector_spam)))
sum_of_ham = sum(list(chain.from_iterable(vector_ham)))

In [343]:
p_spam = len(X_train_spam) / (len(X_train_spam) + (len(X_train_ham)))
p_ham = len(X_train_ham) / (len(X_train_ham) + (len(X_train_spam)))

In [344]:
def classifier(message, method="tfidf", verbose=False):
    p_w_spam, p_w_ham = 0.0, 0.0
    for word in message:

        if method.lower() == "tfidf":
            try:
                p_w_spam += math.log((sum(list(vector[tfidf_index_spam[word]] for vector in vector_spam)) + 1) /  sum_of_spam + 2)
            except KeyError:
                p_w_spam += math.log(1 / (len(vector_spam) + 2)) #len(vector_ham) + 2))

            try:
                p_w_ham += math.log(sum(list(vector[tfidf_index_ham[word]] for vector in vector_ham)) + 1 / sum_of_ham + 2)
            except KeyError:
                p_w_ham += math.log(1 / (len(vector_ham) + 2)) #len(vector_ham) + 2))
        else:
            try:
                p_w_spam += math.log((sum(list(vector[bow_index_spam[word]] for vector in bow_vector_spam)) + 1))
            except KeyError:
                p_w_spam += math.log(1 / (len(bow_vector_spam) + 2)) #len(bow_vector_ham) + 2))

            try:
                p_w_ham += math.log((sum(list(vector[bow_index_ham[word]] for vector in bow_vector_ham)) + 1))
            except KeyError:
                p_w_ham += math.log(1 / (len(bow_vector_ham) + 2)) #len(bow_vector_spam) + 2))

        p_w_spam +=  math.log(p_spam)
        p_w_spam += math.log(p_ham)

    if verbose:
        print("Model: {0}, spam score: {1}".format(method.upper(), p_w_spam))
        print("Model: {0}, ham score: {1}".format(method.upper(), p_w_ham))

    if p_w_spam >= p_w_ham:
        return 1
    return 0

In [345]:
def predict(test_data, method):
    result = dict()
    for i, message in enumerate(test_data):
        result[i] = classifier(message, method)
    return result

In [346]:
def print_matrix(matrix):
    labels = ["spam", "ham"]

    print("Confusion Matrix:")
    l = max(
        reduce(lambda n, x : len("%s"%x) if n < len("%s"%x) else n, [0] + list(chain(matrix))),
        reduce(lambda n, x : len("%s"%x) if n < len("%s"%x) else n, [0] + list(chain(labels)))
    )

    print("\t",eval("\"%%%is\"%%\"%s\""%(l, "")), end=" ")

    for column in labels:
        print(eval("\"%%%is\"%%\"%s\""%(l, column)), end=" ")
    print()

    i = -0
    for row in matrix:
        print("\t",eval("\t\"%%%is\"%%\"%s\""%(l, labels[i] if i >= 0 else "" )), end=" ")
        i += 1
        for column in row:
            print(eval("\"%%%is\"%%%s"%(l, column)), end=" ")
        print()

In [347]:
def metrics(labels, predictions):
    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0
    for i in range(len(labels)):
        true_pos += int(labels[i] == 1 and predictions[i] == 1)
        true_neg += int(labels[i] == 0 and predictions[i] == 0)
        false_pos += int(labels[i] == 0 and predictions[i] == 1)
        false_neg += int(labels[i] == 1 and predictions[i] == 0)

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    fall_out = false_pos / (false_pos + true_neg)
    mcc = (true_pos * true_neg - false_pos * false_neg) / math.sqrt((true_pos + false_pos) * (true_pos + false_neg) * (true_neg + false_pos) * (true_neg + false_neg))
    accuracy = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)

    print()
    print_matrix([[true_pos, false_neg], [false_pos, true_neg]])
    print("\nClassification report:")
    print()
    print("Precision:\t", round(precision, 2))
    print("Recall:\t\t", round(recall, 2))
    print("Fall-Out:\t", round(fall_out, 2))
    print("MCC:\t\t", round(mcc, 2))
    print("Accuracy:\t", round(accuracy, 2))

# Predict messages and print classification report

In [348]:
tfidf_prediction = predict(X_test, "tfidf")

In [349]:
print("Total test messages:", len(X_test))
metrics(y_test, tfidf_prediction)

Total test messages: 1114

Confusion Matrix:
	              spam      ham 
	     spam       94       63 
	      ham        0      957 

Classification report:

Precision:	 1.0
Recall:		 0.6
Fall-Out:	 0.0
MCC:		 0.75
Accuracy:	 0.94


In [350]:
bow_prediction = predict(X_test, "bow")

In [351]:
print("Total test messages:", len(X_test))
metrics(y_test, bow_prediction)

Total test messages: 1114

Confusion Matrix:
	                spam       ham 
	      spam       139        18 
	       ham         0       957 

Classification report:

Precision:	 1.0
Recall:		 0.89
Fall-Out:	 0.0
MCC:		 0.93
Accuracy:	 0.98
