# Logistic Regression for Spam Classification

In [None]:
import pandas as pd
import numpy as np
import math
import sklearn.metrics as skm
import plotly.express as px
import seaborn as sns 

## get data + set up model

In [None]:
# load the held-out and training splits from disk
test = pd.read_csv("test-1.csv")
train = pd.read_csv("train-1.csv")

# separate the target column ('label') from the feature columns
X = train.drop(columns=['label'])   # X_train
y = train['label']                  # y_train

X_test = test.drop(columns=['label'])
y_test = test['label']

## fit_logistic_regression: 
Fits the model using the sigmoid function, updating the weights with stochastic gradient descent (one update per training sample) <br>
&emsp;<b> inputs </b>: epochs (# of epochs), learning_rate (learning rate/eta), X (input features), y (target feature)<br>
&emsp;<b> output/return </b>: weights of all features

In [None]:
# tiny offset added inside log() so the cross-entropy loss never evaluates log(0)
EPSILON = 0.00000000000000000000000000001


def _sigmoid(z):
    """Return 1 / (1 + e^-z) for a scalar z without overflowing for large |z|."""
    if z >= 0:
        return 1.0 / (1.0 + math.exp(-z))
    exp_z = math.exp(z)  # z < 0 here, so exp(z) is in (0, 1) and cannot overflow
    return exp_z / (1.0 + exp_z)


def fit_logistic_regression(epochs, learning_rate, X, y):
    """Fit logistic-regression weights with per-sample stochastic gradient descent.

    Samples are visited in row order every epoch; after each sample the weights
    are moved against the gradient of the binary cross-entropy loss. The mean
    loss of every epoch is printed, and a loss-vs-epoch line plot is shown once
    training finishes.

    Args:
        epochs: number of passes over the training data (converted with ``int``).
        learning_rate: step size (eta) of each gradient update.
        X: DataFrame of numeric input features, one row per sample.
        y: target labels (0 or 1), one per row of ``X``. Rows are paired with
            ``X`` by position, so the index of ``y`` does not matter.

    Returns:
        pandas Series of learned weights, indexed by the columns of ``X``.

    Raises:
        ValueError: if ``X`` has no rows or ``X`` and ``y`` differ in length.
    """
    # Work on plain arrays: pairing rows by position avoids the label-based
    # lookup of ``y[i]`` (wrong or KeyError on a non-default index) and is far
    # faster than calling ``X.iloc[i]`` for every sample of every epoch.
    features = X.to_numpy(dtype=float)
    targets = np.asarray(y, dtype=float)
    num_samples = features.shape[0]

    if num_samples == 0:
        raise ValueError("X must contain at least one sample")
    if targets.shape[0] != num_samples:
        raise ValueError("X and y must contain the same number of samples")

    weights = np.zeros(features.shape[1])  # vector of 0s, one per feature
    losses = []

    for epoch in range(int(epochs)):  # run set number of epochs
        epoch_loss = 0.0
        for x_values, y_true in zip(features, targets):  # go through each sample in X
            y_pred = _sigmoid(float(np.dot(weights, x_values)))
            # gradient of the cross-entropy loss w.r.t. the weights: x * (y_pred - y_true)
            weights -= learning_rate * (y_pred - y_true) * x_values
            # binary cross-entropy of this sample, using the pre-update prediction
            epoch_loss -= (y_true * math.log(y_pred + EPSILON)
                           + (1 - y_true) * math.log((1 - y_pred) + EPSILON))
        # mean loss of the epoch, kept for the final plot
        loss_point = epoch_loss / num_samples
        losses.append(loss_point)
        # print info about each epoch
        print("Epoch " + str(epoch) + ": cross-entroy loss = " + str(loss_point))

    # loss plot (epochs are numbered from 1 on the x axis)
    df_plot = pd.DataFrame({'Epoch': np.arange(1, len(losses) + 1), 'Loss': losses})
    loss_plot = px.line(df_plot, x='Epoch', y='Loss', title='Loss over Epochs')
    loss_plot.show()

    # label each weight with its feature name so later multiplication aligns by column
    return pd.Series(weights, index=X.columns)

In [None]:
# train for 200 epochs with a learning rate of 0.1 and keep the learned weights
weights = fit_logistic_regression(epochs=200, learning_rate=0.1, X=X, y=y)

## predict_logistic_regression:
<b> print_stats </b>: prints accuracy, precision, recall, f1, and confusion mtx based on given y_test + predictions <br>
<b>predict_logistic_regression</b>: predicts X test classification based on given weights <br>
&emsp;<b>inputs</b>: weights (from fit function), X_test <br>
&emsp;<b>outputs</b>: list of predicted labels, one per row of X_test (stats are printed separately by print_stats)


In [None]:
def print_stats(y_test, y_preds):
    """Print accuracy, precision, recall, F1 and the confusion matrix.

    Args:
        y_test: true labels.
        y_preds: predicted labels, in the same order as ``y_test``.
    """
    # (label, scorer) pairs, evaluated and printed in this order
    reports = (
        ("Accuracy: ", skm.accuracy_score),
        ("Precision: ", skm.precision_score),
        ("Recall: ", skm.recall_score),
        ("F1: ", skm.f1_score),
        ("Confusion Matrix: ", skm.confusion_matrix),
    )
    for label, scorer in reports:
        print(label + str(scorer(y_test, y_preds)))

In [None]:
def predict_logistic_regression(weights, X_test):
    """Predict a 0/1 label for every row of ``X_test`` from fitted weights.

    Args:
        weights: one weight per feature. A pandas Series is matched to the
            columns of ``X_test`` by label; a list or array is matched by
            position.
        X_test: DataFrame of numeric input features, one row per sample.

    Returns:
        List of ints (0 or 1), one per row of ``X_test``. A sigmoid output of
        exactly 0.5 is classified as 1.
    """
    # linear part of the model for all rows at once: sum of feature * weight
    linear = X_test.mul(weights, axis="columns").sum(axis="columns").to_numpy(dtype=float)

    # sigmoid; exp() overflowing to inf for very negative inputs correctly yields 0.0
    with np.errstate(over="ignore"):
        probabilities = 1.0 / (1.0 + np.exp(-linear))

    # Threshold explicitly instead of using round(): round() rounds 0.5 to the
    # nearest even integer (0), and nudging 0.5 by a tiny epsilon does not help
    # because the sum is still exactly 0.5 in float64.
    return (probabilities >= 0.5).astype(int).tolist()

In [None]:
# run the fitted weights over the test features to get predicted labels
predictions = predict_logistic_regression(weights=weights, X_test=X_test)
# report accuracy, precision, recall, F1 and the confusion matrix
print_stats(y_test=y_test, y_preds=predictions)

### compare w/ sklearn stats

In [None]:
from sklearn.linear_model import LogisticRegression

# fit scikit-learn's logistic regression (default settings) on the same training split
sk_model = LogisticRegression().fit(X, y)
# score its test-set predictions with the same metrics for a side-by-side comparison
y_pred_sk = sk_model.predict(X_test)
print_stats(y_test, y_pred_sk)

## observations + analyses

#### 1. data set

There seem to be significantly more ham samples than spam samples, which likely biases the model. The model would likely be better at distinguishing ham from spam if it were trained on a more balanced data set. 

In [None]:
# The mean of the label column is the spam *fraction* (assumes labels are 0 = ham, 1 = spam);
# scale by 100 so the printed value matches the "percent" wording.
print("percent spam in training set: " + str(100 * y.mean()))
print("percent spam in test set: " + str(100 * y_test.mean()))

Additionally, there are many features in the data set (1364): likely, not all of them were important in the classification of ham/spam. 

#### 2. model + results

The model is likely overfitting, since test accuracy decreases as the number of training epochs increases: <br>
200 epochs: Accuracy: 0.9641255605381166 <br>
100 epochs: Accuracy: 0.9650224215246637<br>
50 epochs: Accuracy: 0.9713004484304932 <br>

The loss also barely decreases towards the end, as seen in the loss plot

Also, the model seems much more likely to misclassify a spam message as ham than to misclassify a ham message as spam. We see this in the confusion matrix: there are significantly more false negatives than false positives, even though there are fewer spam data points. <br>

In [None]:
# show the confusion matrix of the hand-written model's test predictions again
print(f"Confusion Matrix: {skm.confusion_matrix(y_test, predictions)}")

This disparity likely stems from the imbalanced data set (mentioned above)

#### 3. possible improvements

Implementing an early stop mechanism could improve the results by preventing extreme overfitting. Additionally, eliminating noise features would increase model performance.