# Selecting Naive Bayes's Hyperparameters:
 I have considered here multiple combinations of:
  * **Alpha** is the constant used in laplace smoothing. 
  * **Number Fold** is the numbers of foldes that training data are split into. This is used in cross valdiation excercise. 

Plug a series of combination in `create_classifier` in order to measure the weighted overall accuracy on unseen data. I then select the hyperparameters that got the best accuracy to go into the final module that I used for prediction on test data.  
 

In [1]:
#_naivebayes|selecting_hyperparameters
import numpy as np
import itertools
import time 

start_time = time.process_time()

training_spam = np.loadtxt(open("data/training_spam.csv"), delimiter=",").astype(int)
testing_spam = np.loadtxt(open("data/testing_spam.csv"), delimiter=",").astype(int)

class SpamClassifier: 
    def __init__(self, training_data, alpha=1):
        self.alpha = alpha
        self.training_data = training_data
        
    def train(self):
        self.log_class_priors = SpamClassifier.estimate_log_class_priors(self.training_data)
        self.log_class_conditional_likelihoods = SpamClassifier.estimate_log_class_conditional_likelihoods(self.training_data, self.alpha)
        
    @staticmethod
    def estimate_log_class_priors(data):
        data_labels = data[:, 0]
        log_p_0 = np.log(np.where(data_labels==0)[0].shape[0]/len(data_labels))
        log_p_1 = np.log(np.where(data_labels==1)[0].shape[0]/len(data_labels))
        log_class_priors = np.array([log_p_0, log_p_1])
        return log_class_priors
    
    @staticmethod
    def estimate_log_class_conditional_likelihoods(data, alpha=1.0):
        n_c_0 =  np.count_nonzero(data[:,1:][data[:,0]==0])
        n_c_1 =  np.count_nonzero(data[:,1:][data[:,0]==1])
        k = data.shape[1]-1
        n_c_w = np.zeros(shape=(2,training_spam.shape[1]-1))
        for i in range(1,training_spam.shape[1]):
            n_c_w[0,i-1] = training_spam[training_spam[:,0]==0][training_spam[training_spam[:,0]==0][:,i]==1].shape[0]
            n_c_w[1,i-1] = training_spam[training_spam[:,0]==1][training_spam[training_spam[:,0]==1][:,i]==1].shape[0]
        theta = np.full_like(n_c_w, 0)
        theta[0,:] = np.log((n_c_w[0,:] + alpha) / (n_c_0 + k*alpha))
        theta[1,:] = np.log((n_c_w[1,:] + alpha) / (n_c_1 + k*alpha))
        return theta


    def predict(self, new_data):
        class_predictions=np.zeros(shape=(new_data.shape[0]))
        for i in range(new_data.shape[0]):
            c_hat_0 = self.log_class_priors[0]+ np.sum(np.dot(new_data[i], self.log_class_conditional_likelihoods[0]))
            c_hat_1 = self.log_class_priors[1]+ np.sum(np.dot(new_data[i], self.log_class_conditional_likelihoods[1]))
            if c_hat_0>c_hat_1: 
                class_predictions[i]=0
            else: 
                class_predictions[i]=1
        return class_predictions


def cross_validation(training_spam, item):
    max_accuracy = 0
    averaged_accuracy = 0
    alpha, k_fold=item
    x_training= training_spam[:,1:]
    y_training= training_spam[:,0]
    x = np.split(x_training, x_training.shape[0]/k_fold, axis=0)
    y = np.split(y_training, y_training.shape[0]/k_fold, axis=0)
    for i in range(len(x)):
        x_temp=np.concatenate(x[0:i]+x[i+1:])
        y_temp=np.concatenate(y[0:i]+y[i+1:])
        y_temp=y_temp.reshape(y_temp.shape[0],1)
        temp_data = np.concatenate((y_temp,x_temp), axis=1)
        classifier = SpamClassifier(temp_data,alpha)
        classifier.train()
        final_y_predict_temp = classifier.predict(x[i])
        accuracy = np.count_nonzero(final_y_predict_temp == y[i].reshape(final_y_predict_temp.shape))/final_y_predict_temp.shape[0]
        
        if accuracy > max_accuracy:
            max_accuracy = accuracy
            max_classifier= classifier
            max_fold = i
        averaged_accuracy += accuracy
    averaged_accuracy = averaged_accuracy/len(x)
    return max_classifier, max_fold, max_accuracy, averaged_accuracy

def create_classifier(training_spam, item):
    classifier = cross_validation(training_spam, item)
    return classifier

alpha_O = [0.1, 1, 1.5, 2, 2.5, 3]
k_fold = [10,20,25, 50, 100, 200, 500]
output_list = list(itertools.product(alpha_O, k_fold))
max_weighted_accuracy=0
for item in output_list:
    
    classifier, max_fold, max_accuracy_train, averaged_accuracy_train = create_classifier(training_spam, item)
    end_time = time.process_time()
    
    test_data = testing_spam[:, 1:]
    test_labels = testing_spam[:, 0]

    predictions = classifier.predict(test_data)
    accuracy = np.count_nonzero(predictions == test_labels)/test_labels.shape[0]
    
    
    
    weighted_accuracy = accuracy * test_data.shape[0] / (test_data.shape[0]+(training_spam.shape[0]/item[-1])) \
    + averaged_accuracy_train * (training_spam.shape[0]/item[-1]) / (test_data.shape[0]+(training_spam.shape[0]/item[-1]))
    

    if weighted_accuracy > max_weighted_accuracy:
            dic={}
            
            max_weighted_accuracy=weighted_accuracy
            max_accuracy = accuracy
            maxi_accuracy_train = max_accuracy_train
            averaged_accuracy_train = averaged_accuracy_train
        
            max_item = item
            
            max_time = int(end_time-start_time)
    elif weighted_accuracy == max_weighted_accuracy:
        dic[item]=[weighted_accuracy, accuracy, averaged_accuracy_train, max_accuracy_train]
        
         
    print("&&"*50)
    print(f"Weighted accuracy is {weighted_accuracy} when alpha and k-folds are {item} respectively.")
    print()
    print(f"Accuracy on test data is: {accuracy}, averaged accuracy on train data is: {averaged_accuracy_train}")
    print(f"This took", int(end_time-start_time), "seconds to solve.")
    print()
    print()
    print(f"**The best overall accuracy on test data is so far standing at: {max_weighted_accuracy} when alpha and k-folds are {max_item}")
    print(f"Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {dic}")
    print()
    print()
    print()





print()
print('^'*250)
print(f"When alpha and k-folds are {max_item} respectively, the expected overall weighted accuracy on unseen data is: {max_weighted_accuracy}")
print(f"This module is {max_accuracy} accurat on test data")
print(f"Whereas, averaged accuracy on train data is {averaged_accuracy_train}, and topped at {maxi_accuracy_train}")
print(f"This module takes {max_time} seconds to solve.")
print(f"Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {dic}")

# pass on the result to the final module below
alpha, k_fold = max_item

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8955000000000001 when alpha and k-folds are (0.1, 10) respectively.

Accuracy on test data is: 0.896, averaged accuracy on train data is: 0.8930000000000003
This took 1 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.8955000000000001 when alpha and k-folds are (0.1, 10)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8956363636363637 when alpha and k-folds are (0.1, 20) respectively.

Accuracy on test data is: 0.896, averaged accuracy on train data is: 0.8920000000000001
This took 2 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.8956363636363637 when alpha an

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9005000000000001 when alpha and k-folds are (1.5, 10) respectively.

Accuracy on test data is: 0.902, averaged accuracy on train data is: 0.8930000000000003
This took 8 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.9079402390438247 when alpha and k-folds are (0.1, 500)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9011818181818181 when alpha and k-folds are (1.5, 20) respectively.

Accuracy on test data is: 0.902, averaged accuracy on train data is: 0.8930000000000001
This took 9 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.9079402390438247 when alpha a

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9005000000000001 when alpha and k-folds are (2.5, 10) respectively.

Accuracy on test data is: 0.902, averaged accuracy on train data is: 0.8930000000000003
This took 15 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.9079402390438247 when alpha and k-folds are (0.1, 500)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.901090909090909 when alpha and k-folds are (2.5, 20) respectively.

Accuracy on test data is: 0.902, averaged accuracy on train data is: 0.892
This took 15 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.9079402390438247 when alpha and k-folds a

#  Final Module 

When plugging the hyper parameters associated with the best average weighted accuracy I got when running the above code, I got a **0.908** accuracy when run on test data. 

In [2]:
#_naivebayes
class SpamClassifier: 
    def __init__(self, training_data, alpha=1):
        self.alpha = alpha
        self.training_data = training_data
        
    def train(self):
        self.log_class_priors = SpamClassifier.estimate_log_class_priors(self.training_data)
        self.log_class_conditional_likelihoods = SpamClassifier.estimate_log_class_conditional_likelihoods(self.training_data, self.alpha)
        
    @staticmethod
    def estimate_log_class_priors(data):
        data_labels = data[:, 0]
        log_p_0 = np.log(np.where(data_labels==0)[0].shape[0]/len(data_labels))
        log_p_1 = np.log(np.where(data_labels==1)[0].shape[0]/len(data_labels))
        log_class_priors = np.array([log_p_0, log_p_1])
        return log_class_priors
    
    @staticmethod
    def estimate_log_class_conditional_likelihoods(data, alpha=1.0):
        n_c_0 =  np.count_nonzero(data[:,1:][data[:,0]==0])
        n_c_1 =  np.count_nonzero(data[:,1:][data[:,0]==1])
        k = data.shape[1]-1
        n_c_w = np.zeros(shape=(2,training_spam.shape[1]-1))
        for i in range(1,training_spam.shape[1]):
            n_c_w[0,i-1] = training_spam[training_spam[:,0]==0][training_spam[training_spam[:,0]==0][:,i]==1].shape[0]
            n_c_w[1,i-1] = training_spam[training_spam[:,0]==1][training_spam[training_spam[:,0]==1][:,i]==1].shape[0]
        theta = np.full_like(n_c_w, 0)
        theta[0,:] = np.log((n_c_w[0,:] + alpha) / (n_c_0 + k*alpha))
        theta[1,:] = np.log((n_c_w[1,:] + alpha) / (n_c_1 + k*alpha))
        return theta


    def predict(self, new_data):
        class_predictions=np.zeros(shape=(new_data.shape[0]))
        for i in range(new_data.shape[0]):
            c_hat_0 = self.log_class_priors[0]+ np.sum(np.dot(new_data[i], self.log_class_conditional_likelihoods[0]))
            c_hat_1 = self.log_class_priors[1]+ np.sum(np.dot(new_data[i], self.log_class_conditional_likelihoods[1]))
            if c_hat_0>c_hat_1: 
                class_predictions[i]=0
            else: 
                class_predictions[i]=1

        return class_predictions
    
def cross_validation(training_spam):
    
    x_training= training_spam[:,1:]
    y_training= training_spam[:,0]
    x = np.split(x_training, x_training.shape[0]/k_fold, axis=0)
    y = np.split(y_training, y_training.shape[0]/k_fold, axis=0)
    
    max_accuracy = 0
    averaged_accuracy = 0
    for i in range(len(x)):
        x_temp=np.concatenate(x[0:i]+x[i+1:])
        y_temp=np.concatenate(y[0:i]+y[i+1:])
        y_temp=y_temp.reshape(y_temp.shape[0],1)
        temp_data = np.concatenate((y_temp,x_temp), axis=1)
        classifier = SpamClassifier(temp_data,alpha)
        classifier.train()
        final_y_predict_temp = classifier.predict(x[i])
        accuracy = np.count_nonzero(final_y_predict_temp == y[i].reshape(final_y_predict_temp.shape))/final_y_predict_temp.shape[0]
        
        if accuracy > max_accuracy:
            max_accuracy = accuracy
            max_classifier= classifier
            max_fold = i
        
    return max_classifier

def create_classifier(training_spam):
    classifier = cross_validation(training_spam)
    return classifier



classifier = create_classifier(training_spam)

In [3]:
# run on test data
testing_spam = np.loadtxt(open("data/testing_spam.csv"), delimiter=",").astype(int)
test_data = testing_spam[:, 1:]
test_labels = testing_spam[:, 0]

predictions = classifier.predict(test_data)
accuracy = np.count_nonzero(predictions == test_labels)/test_labels.shape[0]
print(f"Accuracy on test data is: {accuracy}")

Accuracy on test data is: 0.908


In [5]:
def confusion_matrix(test_labels, predictions):
    tp = tn = fp = fn = 0
    for actual_value, predicted_value in zip(test_labels, predictions):
        if predicted_value == actual_value:
            if predicted_value == 1: 
                tp += 1
            else: 
                tn += 1
        else: 
            if predicted_value == 1:
                fp += 1
            else: 
                fn += 1
    confusion_matrix = np.array([[tn, fp], [fn, tp]])
    print('___Confusion Matrix and Statistics___'.center(80))
    print('*****************************************'.center(80))
    print()
    print('           Reference   ')
    print ('Prediction   0    1')
    print('  0       ', confusion_matrix[0, 0],'  ',  confusion_matrix[0, 1])
    print('  1       ', confusion_matrix[1, 0], '  ', confusion_matrix[1, 1])
    
    
    accuracy = (tn+tp) / test_labels.shape[0]
    print()
    precision_positive =  tp / (tp+fp)
    precision_negative = tn / (tn+fn)
    recall_positive =  tp/ (tp+fn)
    recall_negative = tn/ (tn+fp)
    
    f1_positive = 2*(precision_positive * recall_positive)/(precision_positive+recall_positive)
    f1_negative = 2*(precision_negative * recall_negative)/(precision_negative+recall_negative)
    print()
    print()
    print('         Precision   Recall  f1-score   Support  ')
    print ('      0 ', np.round(precision_negative,2),'      ', np.round(recall_negative,2),'  ', np.round(f1_negative,2), '     ', tn+fp )
    print ('      1 ', np.round(precision_positive,2),'      ', np.round(recall_positive,2), '  ', np.round(f1_positive,2),'      ', tp+fn )
    print()
    print('accuracy                    ', accuracy,'    ',test_labels.shape[0] )

confusion_matrix(test_labels, predictions)



                     ___Confusion Matrix and Statistics___                      
                   *****************************************                    

           Reference   
Prediction   0    1
  0        276    25
  1        21    178



         Precision   Recall  f1-score   Support  
      0  0.93        0.92    0.92       301
      1  0.88        0.89    0.89        199

accuracy                     0.908      500
