In [2]:
import numpy as np
from pathlib import Path
import os
import sklearn
from sklearn import feature_extraction
from sklearn import naive_bayes
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# 0 Transform data

## 0.1 Transform data into BOW model

In [5]:
# Create a directory path for each enron subdirectory in project1_datasets
dir_train_paths = {
    "enron1_path": [Path("project1_datasets/enron1_train/ham"),Path("project1_datasets/enron1_train/spam")],
    "enron2_path": [Path("project1_datasets/enron2_train/ham"),Path("project1_datasets/enron2_train/spam")],
    "enron4_path": [Path("project1_datasets/enron4_train/ham"),Path("project1_datasets/enron4_train/spam")]
}
dir_test_paths = {
    "enron1_path": [Path("project1_datasets/enron1_test/ham"),Path("project1_datasets/enron1_test/spam")],
    "enron2_path": [Path("project1_datasets/enron2_test/ham"),Path("project1_datasets/enron2_test/spam")],
    "enron4_path": [Path("project1_datasets/enron4_test/ham"),Path("project1_datasets/enron4_test/spam")]
}
dir_path = Path("project1_datasets/enron1_train/ham")

In [6]:
# Function that reads files in a directory, and turns its content to a string that is appeneded to a list
def create_texts(path_list):
    ham = False
    texts = []
    y = []
    count_map = {
        path_list[0]: 0,
        path_list[1]: 0
    }
    
    for dir_path in path_list:
        if not ham:
            ham = True
        else:
            ham = False
        for filename in os.listdir(dir_path):
             if filename.endswith(".txt"):
                file_path = os.path.join(dir_path, filename)
                with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
                    count_map[dir_path] = count_map.get(dir_path, 0) + 1
                    texts.append(file.read())
                    if ham == True:
                        y.append(0)
                    else:
                        y.append(1)

    for k, v in count_map.items():
        print(f"In {k} there are {v} files")
    return texts, y

In [7]:
# Turn train txt files into array of strings, and get the correct classification of each email in the enron_train_y variables
enron1_train_texts, enron1_train_y = create_texts(dir_train_paths["enron1_path"])
enron2_train_texts, enron2_train_y = create_texts(dir_train_paths["enron2_path"])
enron4_train_texts, enron4_train_y = create_texts(dir_train_paths["enron4_path"])

In project1_datasets/enron1_train/ham there are 319 files
In project1_datasets/enron1_train/spam there are 131 files
In project1_datasets/enron2_train/ham there are 340 files
In project1_datasets/enron2_train/spam there are 123 files
In project1_datasets/enron4_train/ham there are 133 files
In project1_datasets/enron4_train/spam there are 402 files


In [8]:
# Turn test txt files into array of strings, and get the correct classification of each email in the enron_test_y variables
enron1_test_texts, enron1_test_y = create_texts(dir_test_paths["enron1_path"])
enron2_test_texts, enron2_test_y = create_texts(dir_test_paths["enron2_path"])
enron4_test_texts, enron4_test_y = create_texts(dir_test_paths["enron4_path"])

In project1_datasets/enron1_test/ham there are 307 files
In project1_datasets/enron1_test/spam there are 149 files
In project1_datasets/enron2_test/ham there are 348 files
In project1_datasets/enron2_test/spam there are 130 files
In project1_datasets/enron4_test/ham there are 152 files
In project1_datasets/enron4_test/spam there are 391 files


In [9]:
# Function that turns array of strings into BOW model
def create_BOW_model(texts, vocabulary=None, using_vocab=False):
    if using_vocab:
            vectorizer = feature_extraction.text.CountVectorizer(stop_words='english', strip_accents='ascii', vocabulary=vocabulary)
            X = vectorizer.fit_transform(texts)
            bow_model = X.toarray()
            return bow_model
    vectorizer = feature_extraction.text.CountVectorizer(stop_words='english', strip_accents='ascii', token_pattern=r'(?u)\b[A-Za-z]+\b')
    X = vectorizer.fit_transform(texts)
    bow_model = X.toarray()
    vocabulary = vectorizer.get_feature_names_out()
    return bow_model, vocabulary

In [10]:
# Turn train texts into BOW model
enron1_train_bow, enron1_train_vocab = create_BOW_model(enron1_train_texts)
enron2_train_bow, enron2_train_vocab = create_BOW_model(enron2_train_texts)
enron4_train_bow, enron4_train_vocab = create_BOW_model(enron4_train_texts)

In [11]:
# Turn test texts into BOW model
enron1_test_bow = create_BOW_model(enron1_test_texts, enron1_train_vocab, using_vocab=True)
enron2_test_bow = create_BOW_model(enron2_test_texts, enron2_train_vocab, using_vocab=True)
enron4_test_bow = create_BOW_model(enron4_test_texts, enron4_train_vocab, using_vocab=True)

In [12]:
print(len(enron1_test_bow[0]), len(enron1_train_bow[0]))
print(len(enron2_test_bow[0]), len(enron2_train_bow[0]))
print(len(enron4_test_bow[0]), len(enron4_train_bow[0]))

8654 8654
8925 8925
16334 16334


## 0.2 Transform data into Bernoulli Model

In [14]:
# Takes array and for every value greater than 0, replaces with a 1
enron1_train_bn = (enron1_train_bow > 0).astype(int)

In [15]:
# Checking to see if it is working as it should
for i in range(int(len(enron1_train_bn[0]) / 5)):
    if enron1_train_bow[0][i] > 0:
        print(enron1_train_bow[0][i])
        print(enron1_train_bn[0][i])
        print('\n')

1
1


1
1


1
1


2
1


4
1


1
1


1
1




In [16]:
enron2_train_bn = (enron2_train_bow > 0).astype(int)
enron4_train_bn = (enron4_train_bow > 0).astype(int)
enron1_test_bn = (enron1_test_bow > 0).astype(int)
enron2_test_bn = (enron2_test_bow > 0).astype(int)
enron4_test_bn = (enron4_test_bow > 0).astype(int)

In [17]:
# Function that shuffle elements in multiple arrays around in the exact same way
def shuffle(array1, array2, array3):
    rng = np.random.default_rng()
    state = rng.bit_generator.state
    rng.shuffle(array1)
    rng.bit_generator.state = state
    rng.shuffle(array2)
    rng.bit_generator.state = state
    rng.shuffle(array3)

In [18]:
# Shuffle around elements in train set, test set, and correct output
shuffle(enron1_train_bow, enron1_train_bn, enron1_train_y)
shuffle(enron2_train_bow, enron2_train_bn, enron2_train_y)
shuffle(enron4_train_bow, enron4_train_bn, enron4_train_y)
shuffle(enron1_test_bow, enron1_test_bn, enron1_test_y)
shuffle(enron2_test_bow, enron2_test_bn, enron2_test_y)
shuffle(enron4_test_bow, enron4_test_bn, enron4_test_y)

# 1. Multinomial Naive Bayes algorithm

## 1.1 Running the algorithm on enron1

In [21]:
# Initialize, train, and predict using enron1

# Initialize multinomial Naive Bayes model using sklearn, and set alpha=1.0 for laplace smoothing
enron1_multinomialNB = naive_bayes.MultinomialNB(alpha=1.0)

# Train model by passing training data and expected values for each email example
enron1_multinomialNB.fit(enron1_train_bow, enron1_train_y)

# Test model on test data
enron1_y_preds = enron1_multinomialNB.predict(enron1_test_bow)

# Get metrics using sklearn.metrics
MNB_acc_1 = accuracy_score(enron1_test_y, enron1_y_preds)
MNB_pre_1 = precision_score(enron1_test_y, enron1_y_preds, average=None)
MNB_rec_1 = recall_score(enron1_test_y, enron1_y_preds, average=None)
MNB_f1_1 = f1_score(enron1_test_y, enron1_y_preds, average=None)

print(f"The accuracy score is: {MNB_acc_1:.3f}")
print(f"The precision score for class 0 is: {MNB_pre_1[0]:.3f}, for class 1 is: {MNB_pre_1[1]:.3f}")
print(f"The recall score for class 0 is: {MNB_rec_1[0]:.3f}, for class 1 is: {MNB_rec_1[1]:.3f}")
print(f"The f1 score for class 0 is: {MNB_f1_1[0]:.3f}, for class 1 is: {MNB_f1_1[1]:.3f}")

The accuracy score is: 0.928
The precision score for class 0 is: 0.931, for class 1 is: 0.920
The recall score for class 0 is: 0.964, for class 1 is: 0.852
The f1 score for class 0 is: 0.947, for class 1 is: 0.885


## 1.2 Running the algorithm on enron2

In [23]:
# Initialize, train, and predict using enron2
enron2_multinomialNB = naive_bayes.MultinomialNB(alpha=1.0)
enron2_multinomialNB.fit(enron2_train_bow, enron2_train_y)
enron2_y_preds = enron2_multinomialNB.predict(enron2_test_bow)

MNB_acc_2 = accuracy_score(enron2_test_y, enron2_y_preds)
MNB_pre_2 = precision_score(enron2_test_y, enron2_y_preds, average=None)
MNB_rec_2 = recall_score(enron2_test_y, enron2_y_preds, average=None)
MNB_f1_2 = f1_score(enron2_test_y, enron2_y_preds, average=None)

print(f"The accuracy score is: {MNB_acc_2:.3f}")
print(f"The precision score for class 0 is: {MNB_pre_2[0]:.3f}, for class 1 is: {MNB_pre_2[1]:.3f}")
print(f"The recall score for class 0 is: {MNB_rec_2[0]:.3f}, for class 1 is: {MNB_rec_2[1]:.3f}")
print(f"The f1 score for class 0 is: {MNB_f1_2[0]:.3f}, for class 1 is: {MNB_f1_2[1]:.3f}")

The accuracy score is: 0.937
The precision score for class 0 is: 0.947, for class 1 is: 0.910
The recall score for class 0 is: 0.968, for class 1 is: 0.854
The f1 score for class 0 is: 0.957, for class 1 is: 0.881


## 1.3 Running the algorithm on enron4

In [25]:
# Initialize, train, and predict using enron4
enron4_multinomialNB = naive_bayes.MultinomialNB(alpha=1.0)
enron4_multinomialNB.fit(enron4_train_bow, enron4_train_y)
enron4_y_preds = enron4_multinomialNB.predict(enron4_test_bow)

MNB_acc_4 = accuracy_score(enron4_test_y, enron4_y_preds)
MNB_pre_4 = precision_score(enron4_test_y, enron4_y_preds, average=None)
MNB_rec_4 = recall_score(enron4_test_y, enron4_y_preds, average=None)
MNB_f1_4 = f1_score(enron4_test_y, enron4_y_preds, average=None)

print(f"The accuracy score is: {MNB_acc_4:.3f}")
print(f"The precision score for class 0 is: {MNB_pre_4[0]:.3f}, for class 1 is: {MNB_pre_4[1]:.3f}")
print(f"The recall score for class 0 is: {MNB_rec_4[0]:.3f}, for class 1 is: {MNB_rec_4[1]:.3f}")
print(f"The f1 score for class 0 is: {MNB_f1_4[0]:.3f}, for class 1 is: {MNB_f1_4[1]:.3f}")

The accuracy score is: 0.971
The precision score for class 0 is: 0.959, for class 1 is: 0.975
The recall score for class 0 is: 0.934, for class 1 is: 0.985
The f1 score for class 0 is: 0.947, for class 1 is: 0.980


# 2. Discrete Naive Bayes algorithm

## 2.1 Running the algorithm on enron1

In [28]:
enron1_discreteNB = naive_bayes.BernoulliNB(alpha=1.0)
enron1_discreteNB.fit(enron1_train_bn, enron1_train_y)
enron1_y_preds = enron1_discreteNB.predict(enron1_test_bn)

DNB_acc_1 = accuracy_score(enron1_test_y, enron1_y_preds)
DNB_pre_1 = precision_score(enron1_test_y, enron1_y_preds, average=None)
DNB_rec_1 = recall_score(enron1_test_y, enron1_y_preds, average=None)
DNB_f1_1 = f1_score(enron1_test_y, enron1_y_preds, average=None)

print(f"The accuracy score is: {DNB_acc_1:.3f}")
print(f"The precision score for class 0 is: {MNB_pre_1[0]:.3f}, for class 1 is: {DNB_pre_1[1]:.3f}")
print(f"The recall score for class 0 is: {DNB_rec_1[0]:.3f}, for class 1 is: {DNB_rec_1[1]:.3f}")
print(f"The f1 score for class 0 is: {DNB_f1_1[0]:.3f}, for class 1 is: {DNB_f1_1[1]:.3f}")

The accuracy score is: 0.730
The precision score for class 0 is: 0.931, for class 1 is: 0.906
The recall score for class 0 is: 0.990, for class 1 is: 0.195
The f1 score for class 0 is: 0.832, for class 1 is: 0.320


## 2.2 Running the algorithm on enron2

In [30]:
enron2_discreteNB = naive_bayes.BernoulliNB(alpha=1.0)
enron2_discreteNB.fit(enron2_train_bn, enron2_train_y)
enron2_y_preds = enron2_discreteNB.predict(enron2_test_bn)

DNB_acc_2 = accuracy_score(enron2_test_y, enron2_y_preds)
DNB_pre_2 = precision_score(enron2_test_y, enron2_y_preds, average=None)
DNB_rec_2 = recall_score(enron2_test_y, enron2_y_preds, average=None)
DNB_f1_2 = f1_score(enron2_test_y, enron2_y_preds, average=None)

print(f"The accuracy score is: {DNB_acc_2:.3f}")
print(f"The precision score for class 0 is: {DNB_pre_2[0]:.3f}, for class 1 is: {DNB_pre_2[1]:.3f}")
print(f"The recall score for class 0 is: {MNB_rec_2[0]:.3f}, for class 1 is: {DNB_rec_2[1]:.3f}")
print(f"The f1 score for class 0 is: {DNB_f1_2[0]:.3f}, for class 1 is: {DNB_f1_2[1]:.3f}")

The accuracy score is: 0.774
The precision score for class 0 is: 0.767, for class 1 is: 0.893
The recall score for class 0 is: 0.968, for class 1 is: 0.192
The f1 score for class 0 is: 0.865, for class 1 is: 0.316


## 2.3 Running the algorithm on enron4

In [32]:
enron4_discreteNB = naive_bayes.BernoulliNB(alpha=1.0)
enron4_discreteNB.fit(enron4_train_bn, enron4_train_y)
enron4_y_preds = enron4_discreteNB.predict(enron4_test_bn)

DNB_acc_4 = accuracy_score(enron4_test_y, enron4_y_preds)
DNB_pre_4 = precision_score(enron4_test_y, enron4_y_preds, average=None)
DNB_rec_4 = recall_score(enron4_test_y, enron4_y_preds, average=None)
DNB_f1_4 = f1_score(enron4_test_y, enron4_y_preds, average=None)

print(f"The accuracy score is: {DNB_acc_4:.3f}")
print(f"The precision score for class 0 is: {DNB_pre_4[0]:.3f}, for class 1 is: {DNB_pre_4[1]:.3f}")
print(f"The recall score for class 0 is: {DNB_rec_4[0]:.3f}, for class 1 is: {DNB_rec_4[1]:.3f}")
print(f"The f1 score for class 0 is: {DNB_f1_4[0]:.3f}, for class 1 is: {DNB_f1_4[1]:.3f}")

The accuracy score is: 0.917
The precision score for class 0 is: 1.000, for class 1 is: 0.897
The recall score for class 0 is: 0.704, for class 1 is: 1.000
The f1 score for class 0 is: 0.826, for class 1 is: 0.946


# 3. MCAP Logistic Regression algorithm with L2 Regularization

## 3.1 Implementation of the MCAP Logistic Regression with L2 Regularization

In [119]:
from scipy.special import expit

# randomly initialize weights
def initialize_weights(n_features):
    return np.random.randn(n_features, 1) * 0.01

def compute_cost(X, y, weights, lambda_reg):
    m = X.shape[0] # number of samples
    h = expit(X.dot(weights))
    epsilon = 1e-15 # added this because I was getting divide by zero errors when running this algorithm on enron4
    cost = (-1/m) * (y.T.dot(np.log(h)) + (1-y).T.dot(np.log(1-h + epsilon)))
    reg_cost = (lambda_reg / (2 * m)) * np.sum(np.square(weights[1:])) # skip bias term
    return cost + reg_cost

def compute_gradient(X, y, weights, lamda_reg):
    m = X.shape[0]
    h = expit(X.dot(weights))
    gradient = (1 / m) * X.T.dot(h - y)

    gradient[1:] += (lamda_reg / m) * weights[1:]
    return gradient

def gradient_descent(X, y, weights, learning_rate, lamda_reg, num_iterations):
    cost_history = []

    for i in range(num_iterations):
        weights -= learning_rate * compute_gradient(X, y, weights, lambda_reg)
        cost = compute_cost(X, y, weights, lamda_reg)
        cost_history.append(cost[0][0])

    return weights, cost_history

# threshold at 0.5 for classification
def predict(X, weights):
    return expit(X.dot(weights)) >= 0.5

def logistic_regression(X_train, y_train, learning_rate, lambda_reg, num_iterations):
    n_samples, n_features = X_train.shape
    weights = initialize_weights(n_features)
    weights, cost_history = gradient_descent(X_train, y_train, weights, learning_rate, lambda_reg, num_iterations)
    return weights, cost_history

In [36]:
from sklearn.model_selection import train_test_split
# Function to create train and test split

def train_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
    return X_train, X_test, y_train, y_test

## Running the algorithm on enron1 BOW

In [38]:
# Split enron1 BOW dataset

X = enron1_train_bow
y = enron1_train_y

X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test(X, y)

In [69]:
# Function to run logistic regression

def run_logistic_regression(X_train, y_train, learning_rate, lambda_reg, num_iterations, X_test, y_test):
    y_train_LR = np.array(y_train).reshape(-1, 1)

    weights, cost_history = logistic_regression(X_train, y_train_LR, learning_rate, lambda_reg, num_iterations)
    preds = predict(X_test, weights)
    print(f"Hyperparamters: learning_rate: {learning_rate}, lambda:{lambda_reg}, epochs:{num_iterations} \nAccuracy: {accuracy_score(y_test, preds):.2f}")

In [40]:
# Run Logistic regression with different lambda values

learning_rate = 0.01
num_iterations = 500

lambda_reg = 0.1
run_logistic_regression(X_train_bow, y_train_bow, learning_rate, lambda_reg, num_iterations, X_test_bow, y_test_bow)

print("\n")

lambda_reg = 0.05
run_logistic_regression(X_train_bow, y_train_bow, learning_rate, lambda_reg, num_iterations, X_test_bow, y_test_bow)

print("\n")

lambda_reg = 0.01
run_logistic_regression(X_train_bow, y_train_bow, learning_rate, lambda_reg, num_iterations, X_test_bow, y_test_bow)

Hyperparamters: learning_rate: 0.01, lambda:0.1, epochs:500 
Accuracy: 0.81


Hyperparamters: learning_rate: 0.01, lambda:0.05, epochs:500 
Accuracy: 0.83


Hyperparamters: learning_rate: 0.01, lambda:0.01, epochs:500 
Accuracy: 0.82


In [71]:
# Selecting lambda = 0.05

learning_rate = 0.01
num_iterations = 500
lambda_reg = 0.05
run_logistic_regression(enron1_train_bow, enron1_train_y, learning_rate, lambda_reg, num_iterations, enron1_test_bow, enron1_test_y)

Hyperparamters: learning_rate: 0.01, lambda:0.05, epochs:500 
Accuracy: 0.85


## Running the algorithm on enron1 BN

In [78]:
X = enron1_train_bn
y = enron1_train_y

X_train_bn, X_test_bn, y_train_bn, y_test_bn = train_test(X, y)

In [80]:
learning_rate = 0.01
num_iterations = 500

lambda_reg = 0.1
run_logistic_regression(X_train_bn, y_train_bn, learning_rate, lambda_reg, num_iterations, X_test_bn, y_test_bn)

print("\n")

lambda_reg = 0.05
run_logistic_regression(X_train_bn, y_train_bn, learning_rate, lambda_reg, num_iterations, X_test_bn, y_test_bn)

print("\n")

lambda_reg = 0.01
run_logistic_regression(X_train_bn, y_train_bn, learning_rate, lambda_reg, num_iterations, X_test_bn, y_test_bn)

Hyperparamters: learning_rate: 0.01, lambda:0.1, epochs:500 
Accuracy: 0.84


Hyperparamters: learning_rate: 0.01, lambda:0.05, epochs:500 
Accuracy: 0.87


Hyperparamters: learning_rate: 0.01, lambda:0.01, epochs:500 
Accuracy: 0.84


In [82]:
# Selecting lambda = 0.5
learning_rate = 0.01
num_iterations = 500
lambda_reg = 0.05
run_logistic_regression(enron1_train_bn, enron1_train_y, learning_rate, lambda_reg, num_iterations, enron1_test_bn, enron1_test_y)

Hyperparamters: learning_rate: 0.01, lambda:0.05, epochs:500 
Accuracy: 0.83


## Running the algorithm on enron2 BOW

In [96]:
X = enron2_train_bow
y = enron2_train_y

X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test(X, y)

In [98]:
# Run Logistic regression with different lambda values

learning_rate = 0.01
num_iterations = 500

lambda_reg = 0.1
run_logistic_regression(X_train_bow, y_train_bow, learning_rate, lambda_reg, num_iterations, X_test_bow, y_test_bow)

print("\n")

lambda_reg = 0.05
run_logistic_regression(X_train_bow, y_train_bow, learning_rate, lambda_reg, num_iterations, X_test_bow, y_test_bow)

print("\n")

lambda_reg = 0.01
run_logistic_regression(X_train_bow, y_train_bow, learning_rate, lambda_reg, num_iterations, X_test_bow, y_test_bow)

Hyperparamters: learning_rate: 0.01, lambda:0.1, epochs:500 
Accuracy: 0.91


Hyperparamters: learning_rate: 0.01, lambda:0.05, epochs:500 
Accuracy: 0.92


Hyperparamters: learning_rate: 0.01, lambda:0.01, epochs:500 
Accuracy: 0.91


In [100]:
# Selecting lambda = 0.05
learning_rate = 0.01
num_iterations = 500
lambda_reg = 0.05
run_logistic_regression(enron2_train_bow, enron2_train_y, learning_rate, lambda_reg, num_iterations, enron2_test_bow, enron2_test_y)

Hyperparamters: learning_rate: 0.01, lambda:0.05, epochs:500 
Accuracy: 0.85


## Running the algorithm on enron2 BN

In [104]:
X = enron2_train_bn
y = enron2_train_y

X_train_bn, X_test_bn, y_train_bn, y_test_bn = train_test(X, y)

In [106]:
learning_rate = 0.01
num_iterations = 500

lambda_reg = 0.1
run_logistic_regression(X_train_bn, y_train_bn, learning_rate, lambda_reg, num_iterations, X_test_bn, y_test_bn)

print("\n")

lambda_reg = 0.05
run_logistic_regression(X_train_bn, y_train_bn, learning_rate, lambda_reg, num_iterations, X_test_bn, y_test_bn)

print("\n")

lambda_reg = 0.01
run_logistic_regression(X_train_bn, y_train_bn, learning_rate, lambda_reg, num_iterations, X_test_bn, y_test_bn)

Hyperparamters: learning_rate: 0.01, lambda:0.1, epochs:500 
Accuracy: 0.81


Hyperparamters: learning_rate: 0.01, lambda:0.05, epochs:500 
Accuracy: 0.83


Hyperparamters: learning_rate: 0.01, lambda:0.01, epochs:500 
Accuracy: 0.81


In [108]:
# Selecting lambda = 0.5

learning_rate = 0.01
num_iterations = 500
lambda_reg = 0.05
run_logistic_regression(enron2_train_bn, enron2_train_y, learning_rate, lambda_reg, num_iterations, enron2_test_bn, enron2_test_y)

Hyperparamters: learning_rate: 0.01, lambda:0.05, epochs:500 
Accuracy: 0.83


## Running the algorithm on enron4 BOW

In [111]:
X = enron4_train_bow
y = enron4_train_y

X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test(X, y)

In [117]:
# Run Logistic regression with different lambda values

learning_rate = 0.01
num_iterations = 500

lambda_reg = 0.1
run_logistic_regression(X_train_bow, y_train_bow, learning_rate, lambda_reg, num_iterations, X_test_bow, y_test_bow)

print("\n")

lambda_reg = 0.05
run_logistic_regression(X_train_bow, y_train_bow, learning_rate, lambda_reg, num_iterations, X_test_bow, y_test_bow)

print("\n")

lambda_reg = 0.01
run_logistic_regression(X_train_bow, y_train_bow, learning_rate, lambda_reg, num_iterations, X_test_bow, y_test_bow)

Hyperparamters: learning_rate: 0.01, lambda:0.1, epochs:500 
Accuracy: 0.93


Hyperparamters: learning_rate: 0.01, lambda:0.05, epochs:500 
Accuracy: 0.92


Hyperparamters: learning_rate: 0.01, lambda:0.01, epochs:500 
Accuracy: 0.93


In [123]:
# Selecting lambda = 0.01

learning_rate = 0.01
num_iterations = 500
lambda_reg = 0.01
run_logistic_regression(enron4_train_bow, enron4_train_y, learning_rate, lambda_reg, num_iterations, enron4_test_bow, enron4_test_y)

Hyperparamters: learning_rate: 0.01, lambda:0.01, epochs:500 
Accuracy: 0.94


## Running the algorithm on enron4 BN

In [127]:
X = enron2_train_bn
y = enron2_train_y

X_train_bn, X_test_bn, y_train_bn, y_test_bn = train_test(X, y)

In [129]:
learning_rate = 0.01
num_iterations = 500

lambda_reg = 0.1
run_logistic_regression(X_train_bn, y_train_bn, learning_rate, lambda_reg, num_iterations, X_test_bn, y_test_bn)

print("\n")

lambda_reg = 0.05
run_logistic_regression(X_train_bn, y_train_bn, learning_rate, lambda_reg, num_iterations, X_test_bn, y_test_bn)

print("\n")

lambda_reg = 0.01
run_logistic_regression(X_train_bn, y_train_bn, learning_rate, lambda_reg, num_iterations, X_test_bn, y_test_bn)

Hyperparamters: learning_rate: 0.01, lambda:0.1, epochs:500 
Accuracy: 0.86


Hyperparamters: learning_rate: 0.01, lambda:0.05, epochs:500 
Accuracy: 0.86


Hyperparamters: learning_rate: 0.01, lambda:0.01, epochs:500 
Accuracy: 0.87


In [131]:
# Selecting lambda = 0.01

learning_rate = 0.01
num_iterations = 500
lambda_reg = 0.01
run_logistic_regression(enron4_train_bn, enron4_train_y, learning_rate, lambda_reg, num_iterations, enron4_test_bn, enron4_test_y)

Hyperparamters: learning_rate: 0.01, lambda:0.01, epochs:500 
Accuracy: 0.91


# 4. SGDClassifier algorithm with GridSearchCV

In [152]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

# Initialize SGDClassifier and GridSearchCV

sgd = SGDClassifier()

param_grid = {
    'loss': ['log_loss', 'hinge', 'modified_huber'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [1e-5, 1e-4, 1e-3],
    'learning_rate': ['optimal', 'adaptive'],
    'eta0': [0.1, 0.01, 0.001],
    'max_iter': [500],
    'tol': [1e-3]
}

grid_search = GridSearchCV(
    estimator=sgd,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs= -1
)

In [177]:
# Function that runs SGD on data in params
def run_SGD_algorithm(X_train, y_train, X_test, y_test):
    grid_search.fit(X_train, y_train)
    print(f"Best params found: {grid_search.best_params_}")
    print("\n")
    best_sgd = grid_search.best_estimator_
    y_pred = best_sgd.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy: {accuracy:.2f}")
    print(f"Classification report:\n{classification_report(y_test, y_pred)}")

## Running algorithm on enron1 BOW

In [175]:
run_SGD_algorithm(enron1_train_bow, enron1_train_y, enron1_test_bow, enron1_test_y)

Best params found: {'alpha': 0.0001, 'eta0': 0.001, 'learning_rate': 'adaptive', 'loss': 'modified_huber', 'max_iter': 500, 'penalty': 'l2', 'tol': 0.001}


Model accuracy: 0.96
Classification report:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       307
           1       0.90      0.97      0.94       149

    accuracy                           0.96       456
   macro avg       0.94      0.96      0.95       456
weighted avg       0.96      0.96      0.96       456



## Running algorithm on enron1 BN

In [180]:
run_SGD_algorithm(enron1_train_bn, enron1_train_y, enron1_test_bn, enron1_test_y)

Best params found: {'alpha': 1e-05, 'eta0': 0.1, 'learning_rate': 'adaptive', 'loss': 'log_loss', 'max_iter': 500, 'penalty': 'l2', 'tol': 0.001}


Model accuracy: 0.96
Classification report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       307
           1       0.92      0.96      0.94       149

    accuracy                           0.96       456
   macro avg       0.95      0.96      0.96       456
weighted avg       0.96      0.96      0.96       456



## Running algorithm on enron2 BOW

In [183]:
run_SGD_algorithm(enron2_train_bow, enron2_train_y, enron2_test_bow, enron2_test_y)

Best params found: {'alpha': 1e-05, 'eta0': 0.1, 'learning_rate': 'adaptive', 'loss': 'log_loss', 'max_iter': 500, 'penalty': 'l2', 'tol': 0.001}


Model accuracy: 0.94
Classification report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       348
           1       0.92      0.87      0.89       130

    accuracy                           0.94       478
   macro avg       0.94      0.92      0.93       478
weighted avg       0.94      0.94      0.94       478



## Running algorithm on enron2 BN

In [188]:
run_SGD_algorithm(enron2_train_bn, enron2_train_y, enron2_test_bn, enron2_test_y)

Best params found: {'alpha': 0.001, 'eta0': 0.001, 'learning_rate': 'optimal', 'loss': 'log_loss', 'max_iter': 500, 'penalty': 'elasticnet', 'tol': 0.001}


Model accuracy: 0.95
Classification report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       348
           1       0.91      0.89      0.90       130

    accuracy                           0.95       478
   macro avg       0.94      0.93      0.93       478
weighted avg       0.95      0.95      0.95       478



## Running algorithm on enron4 BOW

In [191]:
run_SGD_algorithm(enron4_train_bow, enron4_train_y, enron4_test_bow, enron4_test_y)

Best params found: {'alpha': 1e-05, 'eta0': 0.001, 'learning_rate': 'optimal', 'loss': 'hinge', 'max_iter': 500, 'penalty': 'l1', 'tol': 0.001}


Model accuracy: 0.97
Classification report:
              precision    recall  f1-score   support

           0       0.96      0.93      0.94       152
           1       0.97      0.98      0.98       391

    accuracy                           0.97       543
   macro avg       0.97      0.96      0.96       543
weighted avg       0.97      0.97      0.97       543



## Running algorithm on enron4 BN

In [194]:
run_SGD_algorithm(enron4_train_bn, enron4_train_y, enron4_test_bn, enron4_test_y)

Best params found: {'alpha': 1e-05, 'eta0': 0.01, 'learning_rate': 'optimal', 'loss': 'hinge', 'max_iter': 500, 'penalty': 'elasticnet', 'tol': 0.001}


Model accuracy: 0.97
Classification report:
              precision    recall  f1-score   support

           0       1.00      0.88      0.93       152
           1       0.95      1.00      0.98       391

    accuracy                           0.97       543
   macro avg       0.98      0.94      0.95       543
weighted avg       0.97      0.97      0.96       543

