In [9]:
import numpy as np
from sklearn.model_selection import train_test_split

## Step 1: Load Data

In [2]:
spambase = open('spambase.data','r')
data = []
for line in spambase:
    line = [float(element) for element in line.rstrip('\n').split(',')]
    data.append(np.asarray(line))

## Step 2: Split Data

In [4]:
features = 48
X = [data[i][:features] for i in range(len(data))]
y = [int(data[i][-1]) for i in range(len(data))]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Step 3: Likelihood Estimation

In [8]:
X_train_ham = [X_train[i] for i in range(len(X_train)) if y_train[i]==0]
X_train_spam = [X_train[i] for i in range(len(X_train)) if y_train[i]==1]

#Find the class specific likelihoods of each feature
likelihoods_ham = np.mean(X_train_ham, axis=0)/100.0
likelihoods_spam = np.mean(X_train_spam, axis=0)/100.0

## Step 4: Prior Estimation

In [11]:
num_ham = float(len(X_train_ham))
num_spam = float(len(X_train_spam))

prior_ham = num_ham / (num_ham + num_spam)
prior_spam = num_spam / (num_ham + num_spam)

log_prior_ham = np.log10(prior_ham)
log_prior_spam = np.log10(prior_spam)

In [32]:
def calculate_log_likelihoods_with_naive_bayes(feature_vector, Class):
    
    assert len(feature_vector) == features
    log_likelihood = 0.0 #using log-likelihood to avoid underflow
    if Class==0:
        for feature_index in range(len(feature_vector)):
            if feature_vector[feature_index] == 1: #feature present
                log_likelihood += np.log10(likelihoods_ham[feature_index]) 
            elif feature_vector[feature_index] == 0: #feature absent
                log_likelihood += np.log10(1.0 - likelihoods_spam[feature_index])
    elif Class==1:
        for feature_index in range(len(feature_vector)):
            if feature_vector[feature_index] == 1: #feature present
                log_likelihood += np.log10(likelihoods_spam[feature_index]) 
            elif feature_vector[feature_index] == 0: #feature absent
                log_likelihood += np.log10(1.0 - likelihoods_spam[feature_index])
    else:
        raise ValueError("Class takes integer values 0 or 1")
        
    return log_likelihood

In [33]:
def calculate_class_posteriors(feature_vector):
    
    log_likelihood_ham = calculate_log_likelihoods_with_naive_bayes(feature_vector, Class=0)
    log_likelihood_spam = calculate_log_likelihoods_with_naive_bayes(feature_vector, Class=1)
    
    log_posterior_ham = log_likelihood_ham + log_prior_ham
    log_posterior_spam = log_likelihood_spam + log_prior_spam
    
    return log_posterior_ham, log_posterior_spam

In [34]:
def classify(document_vector):
    
    feature_vector = [int(element > 0.0) for element in document_vector]
    log_posterior_ham, log_posterior_spam = calculate_class_posteriors(feature_vector)
    
    if log_posterior_ham > log_posterior_spam:
        return 0
    else:
        return 1

## Step 5: Prediction

In [35]:
predictions = []
for email in X_test:
    predictions.append(classify(email))

## Step 6: Evaluate Performance

In [36]:
def evaluate_performance(predictions, labels):
    
    correct_count = 0.0
    
    for i in range(len(predictions)):
        if predictions[i] == labels[i]:
            correct_count += 1.0
            
    accuracy = correct_count/len(predictions)
    
    return accuracy

In [37]:
accuracy_of_naive_bayes = evaluate_performance(predictions, y_test)
print(accuracy_of_naive_bayes * 100, "%")

89.3136403127715 %
