# Spam Classifier

_Assignment for the University of Bath as part of MSc in Artificial Intelligence_ 

_Data Source: University of Bath department of Computer Science_


## Data
The training set consists of 1000 rows and 55 columns. Each row corresponds to one email message. The first column is the response variable and describes whether a message is spam `1` or ham `0`. The remaining 54 columns are features corresponding to 54 different keywords within the email message, including special characters (such as ":", "!", and "$"). A feature has the value `1` if the keyword appears in the message and `0` otherwise. The messages are therefore represented using a binary bag-of-words model.  There is also a 500 row set of test data which is functionally identical.

In [7]:
import numpy as np

training_spam = np.loadtxt(open("data/training_spam.csv"), delimiter=",").astype(int)
print("Shape of the spam training data set:", training_spam.shape)
print(training_spam)

Shape of the spam training data set: (1000, 55)
[[1 0 0 ... 0 0 0]
 [0 0 1 ... 1 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [1 1 1 ... 1 1 0]
 [1 0 0 ... 1 1 1]]


In [8]:
testing_spam = np.loadtxt(open("data/testing_spam.csv"), delimiter=",").astype(int)
print("Shape of the spam testing data set:", testing_spam.shape)
print(testing_spam)

Shape of the spam testing data set: (500, 55)
[[1 0 0 ... 1 1 1]
 [1 1 0 ... 1 1 1]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 0]]


## Naive Bayes Classifier

The following code implements a naive bayes classifier from scratch without using external libraries.

In [10]:
class SpamClassifier:
    def __init__(self, alpha):
        self.log_class_priors = 0
        self.theta = 0
        self.alpha = alpha
        
    def train(self, data):
        # Estimate log class priors:
        response_variable = data[:, 0]
        zero_count = 0
        one_count = 0
        for element in response_variable:
            if element == 0:
                zero_count += 1
            elif element == 1:
                one_count += 1
            else:
                raise ValueError("Non-binary response found")
        self.log_class_priors = np.log([zero_count / len(response_variable), one_count / len(response_variable)])
        
        # Estimate log conditional likelihoods:
        theta = np.zeros([2, len(data[0])-1])
        spam = np.array([])
        ham = np.array([])
        for row in data:
            if row[0] == 0:
                if len(ham) == 0:
                    ham = row[1:]
                else:
                    ham = np.vstack((ham, row[1:]))
            if row[0] == 1:
                if len(spam) == 0:
                    spam = row[1:]
                else:
                    spam = np.vstack((spam, row[1:]))
        ham_totals = ham.sum(axis=0)
        for i in range(len(theta[0])):
            theta[0, i] = np.log((ham_totals[i] + self.alpha) / (ham_totals.sum() + len(ham) * self.alpha))
        spam_totals = spam.sum(axis=0)
        for i in range(len(theta[1])):
            theta[1, i] = np.log((spam_totals[i] + self.alpha) / (spam_totals.sum() + len(spam) * self.alpha))
        self.theta = theta
        
        
    def predict(self, data):
        class_predictions = np.zeros([len(data)])
        for i in range(len(data)):
            ham_likelihood = self.log_class_priors[0]
            spam_likelihood = self.log_class_priors[1]
            for j in range(len(data[i])):
                ham_likelihood += data[i, j] * self.theta[0, j]
                spam_likelihood += data[i, j] * self.theta[1, j]
            if spam_likelihood > ham_likelihood:
                class_predictions[i] = 1
            else:
                class_predictions[i] = 0

        return class_predictions

        
def create_classifier(data, alpha):
    classifier = SpamClassifier(alpha)
    classifier.train(data)
    return classifier

## K-fold cross validation
The k-fold cross validation code below is used to determine the optimal value of the smoothing parameter alpha. 
To optimise alpha I used k = 10 and the training dataset.

In [11]:
def kfold_cross_validation(data, k=10, min_alpha=0.01, max_alpha=1, alpha_step=0.01):
    test_alpha = min_alpha
    best_acc = 0
    while test_alpha <= max_alpha:
        n = 0
        classifier = SpamClassifier(alpha=test_alpha)
        accuracy_list = np.empty(k)
        divided_data = np.split(data, k)
        while n < k:
            test_data = divided_data[n]
            test_data_labels = test_data[:, 0]
            test_data_minus_labels = test_data[:, 1:]
            training_data = np.vstack(divided_data[:n] + divided_data[n+1:])
            classifier.train(training_data)
            prediction = classifier.predict(test_data_minus_labels)
            accuracy = np.mean(np.equal(prediction, test_data_labels))
            accuracy_list[n] = accuracy
            n += 1
        average_accuracy = np.average(accuracy_list)
        if average_accuracy > best_acc:
            best_acc = average_accuracy
            best_alpha = test_alpha
        test_alpha += alpha_step
    return best_alpha, best_acc

In [12]:
best_alpha, best_acc = kfold_cross_validation(training_spam)
print(best_alpha, best_acc)

0.02 0.8899999999999999


## Classifier Accuracy on test data

In [13]:
classifier = create_classifier(training_spam, best_alpha)
test_data_labels = testing_spam[:, 0]
test_data_minus_labels = testing_spam[:, 1:]
prediction = classifier.predict(test_data_minus_labels)
accuracy = np.mean(np.equal(prediction, test_data_labels))
print(accuracy)

0.896
