#### Naive Bayes Spam Classification

In [1]:
# imports

import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import norm
from statistics import *

In [2]:
# read in the data
X = np.array(np.genfromtxt('spambase.data', delimiter=','))

# seed random number generator with zero
np.random.seed(0)

In [3]:
# create y matrix (last column) and convert from 1D to 2D matrix
y_mat = np.reshape(X[:, -1], (-1, 1))

# spam data
spam_x = X[(X[:, -1] == 1).nonzero()]
spam_y = np.reshape(spam_x[:, -1], (-1, 1))
spam_x = np.array(spam_x[:, :-1])

# non-spam data
non_spam_x = X[(X[:, -1] == 0).nonzero()]
non_spam_y = np.reshape(non_spam_x[:, -1], (-1, 1))
non_spam_x = np.array(non_spam_x[:, :-1])

# create x matrix - all rows/column excluding last one
x_mat = X[:, :-1]

In [4]:
# split data into training and testing data
train_x, test_x, train_y, test_y = train_test_split(x_mat, y_mat, test_size=0.33)
spam_x_train, spam_x_test, spam_y_train, spam_y_test = train_test_split(spam_x, spam_y, test_size=0.33)
non_spam_x_train, non_spam_x_test, non_spam_y_train, non_spam_y_test = train_test_split(non_spam_x, non_spam_y, test_size=0.33)

In [5]:
mean = np.mean(train_x, axis=0)
std = np.std(train_x, axis=0, ddof=1)

spam_prior = spam_x.shape[0] / x_mat.shape[0]
non_spam_prior = non_spam_x.shape[0] / x_mat.shape[0]

# standardize spam and non spam data
spam_x_train = np.divide(np.subtract(spam_x_train, mean), std)
non_spam_x_train = np.divide(np.subtract(non_spam_x_train, mean), std)

spam_x_train_mean = np.mean(spam_x_train, axis=0)
spam_x_train_std = spam_x_train.std(axis=0)
non_spam_x_train_mean = np.mean(non_spam_x_train, axis=0)
non_spam_x_train_std = non_spam_x_train.std(axis=0)

# standardize test data
s_test_x = np.divide(np.subtract(test_x, mean), std)

In [6]:
spam_norm = norm.pdf(s_test_x, spam_x_train_mean, spam_x_train_std)
spam_norm = np.add(spam_norm, np.finfo(float).eps)

non_spam_norm = norm.pdf(s_test_x, non_spam_x_train_mean, non_spam_x_train_std)
non_spam_norm = np.add(non_spam_norm, np.finfo(float).eps)

prob_spam = np.prod(spam_norm, axis=1) * spam_prior
prob_non_spam = np.prod(non_spam_norm, axis=1) * non_spam_prior

In [7]:
y_pred = [1 if prob_spam[i] > prob_non_spam[i] else 0 for i in range(prob_spam.shape[0])]

TP, TN, FP, FN = confusion_matrix(y_pred, test_y)   
print("accuracy: ", calc_accuracy(TP, TN, test_y.shape[0]) * 100, "%")
print("precision: ", calc_precision(TP, FP) * 100, "%")
print("recall: ", calc_recall(TP, FN) * 100, "%")
print("f_measure: ", calc_f_measure(TP, FP, FN) * 100, "%")

accuracy:  76.30019749835418 %
precision:  63.665594855305464 %
recall:  96.58536585365853 %
f_measure:  76.74418604651163 %
