# Logistic Regression for Spam Classification

In [None]:
import pandas as pd
import numpy as np
import math
import sklearn.metrics as skm
import matplotlib.pyplot as plt

### 1: get data + set up model

In [None]:
# load the train / test splits from CSV files in the working directory
train = pd.read_csv("train-1.csv")
test = pd.read_csv("test-1.csv")

# separate the target column ('label') from the remaining input columns
X = train.drop(columns=['label'])
y = train['label']

X_test = test.drop(columns=['label'])
y_test = test['label']

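derivative of sigmoid: with $\sigma(z) = \dfrac{1}{1 + e^{-z}}$ we have $\sigma'(z) = \sigma(z)\,(1 - \sigma(z))$. Combined with the binary cross-entropy loss, the per-sample gradient with respect to the weights simplifies to $\nabla_w L = (\hat{y} - y)\,x$, where $\hat{y} = \sigma(w \cdot x)$ — this is the update used in the fit function below.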


### fit_logistic_regression: 
fits the model w/ func sigmoid and by updating weights w/ stochastic gradient descent <br>
inputs: epochs (# of epochs), learning_rate (learning rate/eta), X (input features), y (target feature)<br>
output/return: weights of all features, and a list with the mean cross-entropy loss of each epoch

In [None]:
# epsilon for calculating cross-entropy loss (keeps log() away from log(0))
EPSILON = 0.0000001


def _sigmoid(z):
    """Return 1 / (1 + e^-z) for a scalar z without overflowing for large |z|."""
    if z >= 0:
        return 1.0 / (1.0 + math.exp(-z))
    # for negative z use the algebraically equal form so exp() gets a negative argument
    exp_z = math.exp(z)
    return exp_z / (1.0 + exp_z)


def fit_logistic_regression(epochs, learning_rate, X, y):
    """Fit logistic-regression weights with per-sample (stochastic) gradient descent.

    Samples are visited in row order, and the weights are updated after every
    sample using the gradient (y_pred - y_true) * x of the binary
    cross-entropy loss. No intercept is added; the model is sigmoid(w . x).

    Args:
        epochs: number of passes over the data (converted with int()).
        learning_rate: step size (eta) applied to each per-sample gradient.
        X: DataFrame of numeric input features, one row per sample.
        y: binary targets (0/1), one per row of X. Rows are matched to X by
            position, so the index of a Series is irrelevant.

    Returns:
        (weights, losses): weights is a Series indexed by X's column names;
        losses holds the mean cross-entropy loss of each epoch.

    Raises:
        ZeroDivisionError: if X has no rows and at least one epoch is run.
    """
    # convert once: avoids building a Series per row and makes indexing positional
    features = X.to_numpy(dtype=float)
    targets = np.asarray(y, dtype=float)
    num_samples = features.shape[0]

    weights = np.zeros(features.shape[1])  # vector of 0s
    losses = []

    for epoch in range(int(epochs)):  # run set number of epochs
        epoch_loss = 0.0
        for i in range(num_samples):  # go through each sample in X
            x_values = features[i]
            y_true = targets[i]
            y_pred = _sigmoid(float(np.dot(weights, x_values)))
            # gradient of the cross-entropy loss w.r.t. the weights
            weights = weights - learning_rate * (y_pred - y_true) * x_values
            # binary cross-entropy loss bc it is a classification problem w/ binary outputs
            epoch_loss += -(y_true * math.log(y_pred + EPSILON)
                            + (1 - y_true) * math.log((1 - y_pred) + EPSILON))
        # loss for the final plot
        loss_point = epoch_loss / num_samples
        losses.append(loss_point)
        # print info about each epoch
        print("Epoch " + str(epoch) + ": cross-entroy loss = " + str(loss_point))

    # label the weights by feature so they align with X_test columns at predict time
    return pd.Series(weights, index=X.columns), losses

### predict_logistic_regression:
print_stats: prints accuracy, precision, recall, f1, and confusion mtx based on given y_test + predictions <br>
predict_logistic_regression: predicts X test classification based on given weights, compares it w/ the actual test classifications, and prints stats. <br>
inputs: weights (from fit function), X_test, y_test <br>
outputs: nothing (but prints stats)


In [None]:
def print_stats(y_test, y_preds):
    """Print accuracy, precision, recall, F1 and the confusion matrix.

    Args:
        y_test: true labels.
        y_preds: predicted labels, in the same order as y_test.
    """
    # (label, metric) pairs, printed in this order
    report = (
        ("Accuracy: ", skm.accuracy_score),
        ("Precision: ", skm.precision_score),
        ("Recall: ", skm.recall_score),
        ("F1: ", skm.f1_score),
        ("Confusion Matrix: ", skm.confusion_matrix),
    )
    for label, metric in report:
        print(label + str(metric(y_test, y_preds)))

In [47]:
def predict_logistic_regression (weights, X_test, y_test):
    """Classify every row of X_test with the given weights and print the stats.

    Args:
        weights: per-feature weights, as returned by the fit function.
        X_test: DataFrame of input features, one row per sample.
        y_test: true labels for the rows of X_test.

    Returns:
        None; the metrics are printed via print_stats.
    """
    def classify(row):
        # linear score of the sample, squashed by the sigmoid, then rounded to a class
        score = row.mul(weights).sum()
        probability = 1 / (1 + math.e ** (-score))
        return round(probability)

    y_preds = [classify(X_test.iloc[k]) for k in range(X_test.shape[0])]

    # print all stats
    print_stats(y_test, y_preds)

In [None]:
# train for 200 epochs with a learning rate of 0.1; keep the per-epoch losses as well
weights, losses = fit_logistic_regression(epochs=200, learning_rate=0.1, X=X, y=y)

In [48]:
# score the learned weights on the test split (prints the metrics)
predict_logistic_regression(weights=weights, X_test=X_test, y_test=y_test)

Accuracy: 0.9641255605381166
Precision: 0.8666666666666667
Recall: 0.8881987577639752
F1: 0.8773006134969326
Confusion Matrix: [[932  22]
 [ 18 143]]


### plot loss function


### compare w/ sklearn stats

In [46]:
from sklearn.linear_model import LogisticRegression

# baseline: scikit-learn's LogisticRegression with default settings, fit on the same training data
sk_model = LogisticRegression().fit(X, y)

# predict the test split and report the same metrics as for the hand-written model
y_pred_sk = sk_model.predict(X_test)
print_stats(y_test, y_pred_sk)

Accuracy: 0.9820627802690582
Precision: 0.993006993006993
Recall: 0.8819875776397516
F1: 0.9342105263157895
Confusion Matrix: [[953   1]
 [ 19 142]]
