# Selecting Linear's Hyperparameters:
 I have considered here multiple combinations of:
  * **Threshold** is the threshold used in classifying the prediction into 0 and 1. 
  * **Number Fold** is the numbers of foldes that training data are split into. This is used in cross valdiation excercise. 

I plug a serie of combinations in `create_classifier` in order to measure the weighted overall accuracy on unseen data. I, then, select the hyperparameters that got the best accuracy to go into the final module that I used for prediction on test data.  
 

In [1]:
# Multivariate Linear classifiers with a hard threshold |selecting_hyperparameters
import numpy as np
import itertools
import time 

start_time = time.process_time()

training_spam = np.loadtxt(open("data/training_spam.csv"), delimiter=",").astype(int)
testing_spam = np.loadtxt(open("data/testing_spam.csv"), delimiter=",").astype(int)
test_data = testing_spam[:, 1:]
test_labels = testing_spam[:, 0]

class SpamClassifier:
    def __init__(self,threshold_classifier):
        self.threshold_classifier= threshold_classifier
        
    def train(self, training_data):
        n_samples, n_features = training_data[:,1:].shape
        x= training_data[:,1:]
        y= training_data[:,0]
        x=np.concatenate((np.ones((training_data.shape[0],1), dtype =int),x), axis=1)
        #normal equation
        self.weights= np.dot(np.dot(np.linalg.inv(np.dot(x.T, x)), x.T), y)

    
    def predict(self, new_data):
        new_data = np.concatenate((np.ones((new_data.shape[0],1), dtype =int),new_data), axis=1)
        linear_model = np.dot(self.weights.T, new_data.T)
        
        return np.where(linear_model <= self.threshold_classifier,0,1)
    

def cross_validation(training_spam, item):
    max_accuracy = 0
    averaged_accuracy = 0
    threshold, k_fold=item
    x_training= training_spam[:,1:]
    y_training= training_spam[:,0]
    x = np.split(x_training, x_training.shape[0]/k_fold, axis=0)
    y = np.split(y_training, y_training.shape[0]/k_fold, axis=0)
    for i in range(len(x)):
        x_temp=np.concatenate(x[0:i]+x[i+1:])
        y_temp=np.concatenate(y[0:i]+y[i+1:])
        y_temp=y_temp.reshape(y_temp.shape[0],1)
        temp_data = np.concatenate((y_temp,x_temp), axis=1)
        classifier = SpamClassifier(threshold)
        classifier.train(temp_data)
        final_y_predict_temp = classifier.predict(x[i])
        accuracy = np.count_nonzero(final_y_predict_temp == y[i].reshape(final_y_predict_temp.shape))/final_y_predict_temp.shape[0]
        
        if accuracy > max_accuracy:
            max_accuracy = accuracy
            max_classifier= classifier
        averaged_accuracy += accuracy
    averaged_accuracy = averaged_accuracy/len(x)
    return max_classifier, max_accuracy, averaged_accuracy

def create_classifier(training_spam, item):
    classifier = cross_validation(training_spam, item)
    return classifier

i = 0.4
threshold= [i+0.01*k for k in range(30)]
k_fold = [10,20,25, 50, 100, 200, 500]
output_list = list(itertools.product(threshold, k_fold))
max_weighted_accuracy=0
for item in output_list:
    classifier, max_accuracy_train, averaged_accuracy_train = create_classifier(training_spam, item)
    end_time = time.process_time()
    
    test_data = testing_spam[:, 1:]
    test_labels = testing_spam[:, 0]

    predictions = classifier.predict(test_data)
    accuracy = np.count_nonzero(predictions == test_labels)/test_labels.shape[0]
    
    
    
    weighted_accuracy = accuracy * test_data.shape[0] / (test_data.shape[0]+(training_spam.shape[0]/item[-1])) \
    + averaged_accuracy_train * (training_spam.shape[0]/item[-1]) / (test_data.shape[0]+(training_spam.shape[0]/item[-1]))
    

    if weighted_accuracy > max_weighted_accuracy:
            dic={}
            
            max_weighted_accuracy=weighted_accuracy
            max_accuracy = accuracy
            maxi_accuracy_train = max_accuracy_train
            averaged_accuracy_train = averaged_accuracy_train
        
            max_item = item
            
            max_time = int(end_time-start_time)
    elif weighted_accuracy == max_weighted_accuracy:
        dic[item]=[weighted_accuracy, accuracy, averaged_accuracy_train, max_accuracy_train]
        
         
    print("&&"*50)
    print(f"Weighted accuracy is {weighted_accuracy} when classification threshold and k-folds are {item} respectively.")
    print()
    print(f"Accuracy on test data is: {accuracy}, averaged accuracy on train data is: {averaged_accuracy_train}")
    print(f"This took", int(end_time-start_time), "seconds to solve.")
    print()
    print()
    print(f"**The best overall accuracy on test data is so far standing at: {max_weighted_accuracy} when classification threshold and k-folds are {max_item}")
    print(f"Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {dic}")
    print()
    print()
    print()





print()
print('^'*250)
print(f"When classification threshold and k-folds are {max_item} respectively, the expected overall weighted accuracy on unseen data is: {max_weighted_accuracy}")
print(f"This module is {max_accuracy} accurat on test data")
print(f"Whereas, averaged accuracy on train data is {averaged_accuracy_train}, and topped at {maxi_accuracy_train}")
print(f"This module takes {max_time} seconds to solve.")
print(f"Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {dic}")

# pass on the result to the final module below
threshold, k_fold = max_item

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.911 when classification threshold and k-folds are (0.4, 10) respectively.

Accuracy on test data is: 0.91, averaged accuracy on train data is: 0.9160000000000003
This took 1 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.911 when classification threshold and k-folds are (0.4, 10)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9105454545454545 when classification threshold and k-folds are (0.4, 20) respectively.

Accuracy on test data is: 0.91, averaged accuracy on train data is: 0.916
This took 2 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.911 when class

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9118333333333334 when classification threshold and k-folds are (0.42000000000000004, 10) respectively.

Accuracy on test data is: 0.91, averaged accuracy on train data is: 0.9210000000000004
This took 9 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.916076923076923 when classification threshold and k-folds are (0.41000000000000003, 50)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9071818181818182 when classification threshold and k-folds are (0.42000000000000004, 20) respectively.

Accuracy on test data is: 0.906, averaged accuracy on train data is: 0.919
This took 10 seconds to solve.


**The

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9125000000000001 when classification threshold and k-folds are (0.44, 10) respectively.

Accuracy on test data is: 0.91, averaged accuracy on train data is: 0.9250000000000004
This took 16 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.916076923076923 when classification threshold and k-folds are (0.41000000000000003, 50)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9113636363636364 when classification threshold and k-folds are (0.44, 20) respectively.

Accuracy on test data is: 0.91, averaged accuracy on train data is: 0.9250000000000003
This took 17 seconds to solve.


**The best overall acc

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9115000000000001 when classification threshold and k-folds are (0.46, 10) respectively.

Accuracy on test data is: 0.908, averaged accuracy on train data is: 0.9290000000000004
This took 24 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.916076923076923 when classification threshold and k-folds are (0.41000000000000003, 50)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9063636363636364 when classification threshold and k-folds are (0.46, 20) respectively.

Accuracy on test data is: 0.904, averaged accuracy on train data is: 0.9300000000000004
This took 25 seconds to solve.


**The best overall a

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9115000000000001 when classification threshold and k-folds are (0.48000000000000004, 10) respectively.

Accuracy on test data is: 0.908, averaged accuracy on train data is: 0.9290000000000005
This took 31 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.916076923076923 when classification threshold and k-folds are (0.41000000000000003, 50)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9099090909090909 when classification threshold and k-folds are (0.48000000000000004, 20) respectively.

Accuracy on test data is: 0.908, averaged accuracy on train data is: 0.9290000000000004
This took 32 seconds to

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9108333333333335 when classification threshold and k-folds are (0.5, 10) respectively.

Accuracy on test data is: 0.908, averaged accuracy on train data is: 0.9250000000000006
This took 39 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.916076923076923 when classification threshold and k-folds are (0.41000000000000003, 50)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9096363636363637 when classification threshold and k-folds are (0.5, 20) respectively.

Accuracy on test data is: 0.908, averaged accuracy on train data is: 0.9260000000000003
This took 39 seconds to solve.


**The best overall acc

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.9001666666666668 when classification threshold and k-folds are (0.52, 10) respectively.

Accuracy on test data is: 0.896, averaged accuracy on train data is: 0.9210000000000005
This took 46 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.916076923076923 when classification threshold and k-folds are (0.41000000000000003, 50)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8982727272727273 when classification threshold and k-folds are (0.52, 20) respectively.

Accuracy on test data is: 0.896, averaged accuracy on train data is: 0.9210000000000003
This took 47 seconds to solve.


**The best overall a

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8985000000000001 when classification threshold and k-folds are (0.54, 10) respectively.

Accuracy on test data is: 0.896, averaged accuracy on train data is: 0.9110000000000005
This took 54 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.916076923076923 when classification threshold and k-folds are (0.41000000000000003, 50)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8955454545454545 when classification threshold and k-folds are (0.54, 20) respectively.

Accuracy on test data is: 0.894, averaged accuracy on train data is: 0.911
This took 54 seconds to solve.


**The best overall accuracy on te

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8995000000000001 when classification threshold and k-folds are (0.56, 10) respectively.

Accuracy on test data is: 0.898, averaged accuracy on train data is: 0.9070000000000005
This took 61 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.916076923076923 when classification threshold and k-folds are (0.41000000000000003, 50)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8950909090909092 when classification threshold and k-folds are (0.56, 20) respectively.

Accuracy on test data is: 0.894, averaged accuracy on train data is: 0.9060000000000001
This took 62 seconds to solve.


**The best overall a

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8966666666666668 when classification threshold and k-folds are (0.5800000000000001, 10) respectively.

Accuracy on test data is: 0.896, averaged accuracy on train data is: 0.9000000000000006
This took 69 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.916076923076923 when classification threshold and k-folds are (0.41000000000000003, 50)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.899909090909091 when classification threshold and k-folds are (0.5800000000000001, 20) respectively.

Accuracy on test data is: 0.9, averaged accuracy on train data is: 0.899
This took 69 seconds to solve.


**The be

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8861666666666668 when classification threshold and k-folds are (0.6000000000000001, 10) respectively.

Accuracy on test data is: 0.884, averaged accuracy on train data is: 0.8970000000000005
This took 76 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.916076923076923 when classification threshold and k-folds are (0.41000000000000003, 50)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8848181818181817 when classification threshold and k-folds are (0.6000000000000001, 20) respectively.

Accuracy on test data is: 0.884, averaged accuracy on train data is: 0.893
This took 77 seconds to solve.


**The

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8878333333333334 when classification threshold and k-folds are (0.62, 10) respectively.

Accuracy on test data is: 0.888, averaged accuracy on train data is: 0.8870000000000005
This took 84 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.916076923076923 when classification threshold and k-folds are (0.41000000000000003, 50)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8861818181818182 when classification threshold and k-folds are (0.62, 20) respectively.

Accuracy on test data is: 0.886, averaged accuracy on train data is: 0.888
This took 85 seconds to solve.


**The best overall accuracy on te

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8858333333333334 when classification threshold and k-folds are (0.64, 10) respectively.

Accuracy on test data is: 0.886, averaged accuracy on train data is: 0.8850000000000002
This took 91 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.916076923076923 when classification threshold and k-folds are (0.41000000000000003, 50)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8857272727272727 when classification threshold and k-folds are (0.64, 20) respectively.

Accuracy on test data is: 0.886, averaged accuracy on train data is: 0.883
This took 92 seconds to solve.


**The best overall accuracy on te

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8738333333333335 when classification threshold and k-folds are (0.66, 10) respectively.

Accuracy on test data is: 0.874, averaged accuracy on train data is: 0.8730000000000001
This took 99 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.916076923076923 when classification threshold and k-folds are (0.41000000000000003, 50)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8738181818181818 when classification threshold and k-folds are (0.66, 20) respectively.

Accuracy on test data is: 0.874, averaged accuracy on train data is: 0.8719999999999998
This took 100 seconds to solve.


**The best overall 

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8703333333333333 when classification threshold and k-folds are (0.68, 10) respectively.

Accuracy on test data is: 0.87, averaged accuracy on train data is: 0.872
This took 106 seconds to solve.


**The best overall accuracy on test data is so far standing at: 0.916076923076923 when classification threshold and k-folds are (0.41000000000000003, 50)
Modules with similar accuracy (if any) have the following hyperparameters (hyperparameters:[weight accuracy, accuracy, max_accuracy_train, max_time]) : {}



&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
Weighted accuracy is 0.8701818181818182 when classification threshold and k-folds are (0.68, 20) respectively.

Accuracy on test data is: 0.87, averaged accuracy on train data is: 0.8719999999999999
This took 107 seconds to solve.


**The best overall accuracy on te

#  Final Module 

When plugging the hyper parameters associated with the best average weighted accuracy I got when running the above code, I got a **0.914** accuracy when run on test data. 

In [2]:
class SpamClassifier:
    def __init__(self,threshold_classifier):
        self.threshold_classifier= threshold_classifier
        
    def train(self, training_data):
        n_samples, n_features = training_data[:,1:].shape
        x= training_data[:,1:]
        y= training_data[:,0]
        x=np.concatenate((np.ones((training_data.shape[0],1), dtype =int),x), axis=1)
        #normal equation
        self.weights= np.dot(np.dot(np.linalg.inv(np.dot(x.T, x)), x.T), y)

    
    def predict(self, new_data):
        new_data = np.concatenate((np.ones((new_data.shape[0],1), dtype =int),new_data), axis=1)
        linear_model = np.dot(self.weights.T, new_data.T)
        return np.where(linear_model <= self.threshold_classifier,0,1)
    

def cross_validation(training_spam):
    max_accuracy = 0
    averaged_accuracy = 0
    x_training= training_spam[:,1:]
    y_training= training_spam[:,0]
    x = np.split(x_training, x_training.shape[0]/k_fold, axis=0)
    y = np.split(y_training, y_training.shape[0]/k_fold, axis=0)
    for i in range(len(x)):
        x_temp=np.concatenate(x[0:i]+x[i+1:])
        y_temp=np.concatenate(y[0:i]+y[i+1:])
        y_temp=y_temp.reshape(y_temp.shape[0],1)
        temp_data = np.concatenate((y_temp,x_temp), axis=1)
        classifier = SpamClassifier(threshold)
        classifier.train(temp_data)
        final_y_predict_temp = classifier.predict(x[i])
        accuracy = np.count_nonzero(final_y_predict_temp == y[i].reshape(final_y_predict_temp.shape))/final_y_predict_temp.shape[0]
        
        if accuracy > max_accuracy:
            max_accuracy = accuracy
            max_classifier= classifier
    return max_classifier

def create_classifier():
    classifier = cross_validation(training_spam)
    return classifier


classifier = create_classifier()

In [3]:
# run on test data
testing_spam = np.loadtxt(open("data/testing_spam.csv"), delimiter=",").astype(int)
test_data = testing_spam[:, 1:]
test_labels = testing_spam[:, 0]

predictions = classifier.predict(test_data)
accuracy = np.count_nonzero(predictions == test_labels)/test_labels.shape[0]
print(f"Accuracy on test data is: {accuracy}")

Accuracy on test data is: 0.916


In [71]:
def confusion_matrix(test_labels, predictions):
    tp = tn = fp = fn = 0
    for actual_value, predicted_value in zip(test_labels, predictions):
        if predicted_value == actual_value:
            if predicted_value == 1: 
                tp += 1
            else: 
                tn += 1
        else: 
            if predicted_value == 1:
                fp += 1
            else: 
                fn += 1
    confusion_matrix = np.array([[tn, fp], [fn, tp]])
    print('___Confusion Matrix and Statistics___'.center(80))
    print('*****************************************'.center(80))
    print()
    print('           Reference   ')
    print ('Prediction   0    1')
    print('  0       ', confusion_matrix[0, 0],'  ',  confusion_matrix[0, 1])
    print('  1       ', confusion_matrix[1, 0], '  ', confusion_matrix[1, 1])
    
    
    accuracy = (tn+tp) / test_labels.shape[0]
    print()
    precision_positive =  tp / (tp+fp)
    precision_negative = tn / (tn+fn)
    recall_positive =  tp/ (tp+fn)
    recall_negative = tn/ (tn+fp)
    
    f1_positive = 2*(precision_positive * recall_positive)/(precision_positive+recall_positive)
    f1_negative = 2*(precision_negative * recall_negative)/(precision_negative+recall_negative)
    print()
    print()
    print('         Precision   Recall  f1-score   Support  ')
    print ('      0 ', np.round(precision_negative,2),'      ', np.round(recall_negative,2),'  ', np.round(f1_negative,2), '     ', tn+fp )
    print ('      1 ', np.round(precision_positive,2),'      ', np.round(recall_positive,2), '  ', np.round(f1_positive,2),'      ', tp+fn )
    print()
    print('accuracy                    ', accuracy,'    ',test_labels.shape[0] )

confusion_matrix(test_labels, predictions)



                     ___Confusion Matrix and Statistics___                      
                   *****************************************                    

           Reference   
Prediction   0    1
  0        277    24
  1        18    181



         Precision   Recall  f1-score   Support  
      0  0.94        0.92    0.93       301
      1  0.88        0.91    0.9        199

accuracy                     0.916      500
