#### Logistic Regression Spam Classification

In [1]:
# Imports

import numpy as np
import math
from prepare_data import *
from statistics import *

In [2]:
def calc_sigmoid(x, theta):
    """Return the logistic (sigmoid) function of the linear score ``x @ theta``.

    The result has the shape of ``x @ theta`` and every entry is
    ``1 / (1 + exp(-score))``.
    """
    score = x @ theta
    return 1 / (1 + np.exp(-score))

def calc_cost(sigmoid, train_y, N):
    """Return the mean binary cross-entropy (log-loss) of the predictions.

    Computes ``-1/N * (y^T log(h) + (1 - y)^T log(1 - h))`` where ``h`` is
    ``sigmoid`` and ``y`` is ``train_y``.

    Parameters
    ----------
    sigmoid : predicted probabilities, same shape as ``train_y``.
    train_y : 0/1 labels.
    N : number of samples used to average the loss.

    Returns
    -------
    The result of the matrix products scaled by ``-1/N`` (a 1x1 array when
    both inputs are column vectors).
    """
    # Keep probabilities strictly inside (0, 1): log(0) is -inf and
    # 0 * -inf is nan, which would poison the convergence check.
    eps = np.finfo(float).eps
    probs = np.clip(np.asarray(sigmoid, dtype=float), eps, 1 - eps)

    positive_term = train_y.T @ np.log(probs)
    # The negative-class term is log(1 - h), not (1 - log(h)).
    negative_term = np.subtract(1, train_y).T @ np.log(np.subtract(1, probs))
    return -1/N * (positive_term + negative_term)

def logistic_regression(s_train_x, train_y, learning_rate=0.01,
                        max_iterations=1500, change_termination=2.0 ** -23):
    """Fit logistic-regression weights with batch gradient descent.

    Parameters
    ----------
    s_train_x : training feature matrix; one weight is learned per column.
        (Presumably already standardized by the caller -- TODO confirm.)
    train_y : training labels, subtracted element-wise from the sigmoid
        output (assumed to be an (N, 1) column of 0/1 values -- TODO confirm).
    learning_rate : step size of each gradient update (default 0.01).
    max_iterations : upper bound on the number of updates (default 1500).
    change_termination : stop once the absolute change in cost between two
        consecutive iterations drops below this value (default 2^-23).

    Returns
    -------
    Array of shape ``(s_train_x.shape[1], 1)`` holding the learned weights.
    """
    N = s_train_x.shape[0]

    # initialize the parameters of theta using random values in the range [-1, 1]
    thetas = np.random.uniform(-1, 1, (s_train_x.shape[1], 1))

    # Predictions for the current thetas; refreshed after every update so each
    # iteration evaluates the sigmoid only once.
    sigmoid = calc_sigmoid(s_train_x, thetas)

    # Start at +inf so the very first iteration can never look "converged".
    previous_cost = np.inf

    for _ in range(max_iterations):
        # update each parameter using batch gradient descent
        gradient = s_train_x.T @ np.subtract(train_y, sigmoid)
        thetas += learning_rate / N * gradient

        sigmoid = calc_sigmoid(s_train_x, thetas)
        current_cost = calc_cost(sigmoid, train_y, N)

        # terminate once the absolute change in the loss is below the threshold
        if np.abs(current_cost - previous_cost) < change_termination:
            break

        previous_cost = current_cost

    return thetas

In [3]:
# Load the train/test split and standardize the features with the project helpers.
train_x, test_x, train_y, test_y = split_data()
s_train_x, s_test_x = standardize_data(train_x, test_x, True)

# Fit on the training set, then score the test set.
thetas = logistic_regression(s_train_x, train_y)
sigmoid = calc_sigmoid(s_test_x, thetas)

# Threshold the probabilities at 0.5: below -> 0, otherwise -> 1.
sigmoid = np.where(sigmoid < 0.5, 0.0, 1.0)

TP, TN, FP, FN = confusion_matrix(sigmoid, test_y)

# Each metric is evaluated lazily, right before it is printed.
metrics = (
    ("accuracy: ", lambda: calc_accuracy(TP, TN, test_y.shape[0])),
    ("precision: ", lambda: calc_precision(TP, FP)),
    ("recall: ", lambda: calc_recall(TP, FN)),
    ("f_measure: ", lambda: calc_f_measure(TP, FP, FN)),
)
for label, compute in metrics:
    print(label, compute() * 100, "%")


accuracy:  88.25831702544032 %
precision:  84.13793103448276 %
recall:  84.72222222222221 %
f_measure:  84.42906574394463 %
