In [1]:
# Import necessary libraries
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
SEED = 1207

In [3]:
import pandas as pd

TRAIN_DATASET = pd.read_csv("train.csv")
VALIDATION_DATASET = pd.read_csv("validation.csv")
TEST_DATASET = pd.read_csv("test.csv")
NUMBER_OF_JOBS = 1

# Benchmarking multiple models
MODELS = {
    "Logistic Regression": LogisticRegression(random_state = SEED),
    "Random Forest": RandomForestClassifier(random_state = SEED),
    "Support Vector Machine": SVC(random_state = SEED),
}

In [4]:
def fit_model(train_data, vectorizer, model):
    """Fits a model on train data"""
    X_train = vectorizer.fit_transform(train_data["text"])
    y_train = train_data["spam"]
    model.fit(X_train, y_train)
    return model, vectorizer

def score_model(model, vectorizer, data):
    """Scores a model on given data"""
    X = vectorizer.transform(data["text"])
    y_true = data["spam"]
    y_pred = model.predict(X)
    return accuracy_score(y_true, y_pred), classification_report(y_true, y_pred), confusion_matrix(y_true, y_pred)

def cross_validate_model(model, vectorizer, data):
    """Validates the model using cross-validation"""
    X = vectorizer.transform(data["text"])
    y = data["spam"]
    return cross_val_score(model, X, y, cv = 5, scoring = "recall").mean()

def score_and_evaluate_model(model, vectorizer, data, data_type):
    """Scores and evaluates model"""
    accuracy, report, confusion_mat = score_model(model, vectorizer, data)
    print(f"{data_type} Accuracy: {round(100 * accuracy, 2)} %")
    print(f"{data_type} Classification Report:\n{report}")
    print(f"{data_type} Confusion Matrix:\n{confusion_mat}")
    return accuracy

def fine_tune_model(model, vectorizer, train_data, validation_data):
    """Fine-tunes the model based on train and validation"""
    combined_data = pd.concat([train_data, validation_data], axis = 0)
    param_grid = {}
    if isinstance(model, LogisticRegression):
        param_grid = {"penalty": ["l1", "l2"], "C": [0.001, 0.01, 0.1, 1, 10, 100], "solver": ["liblinear", "saga"], "max_iter": [100, 200, 300]}
    elif isinstance(model, RandomForestClassifier):
        param_grid = {"n_estimators": [100, 200], "criterion": ["gini", "entropy"], "max_depth": [None, 10, 20], "min_samples_split": [2, 5], "min_samples_leaf": [1, 2], "max_features": ["auto", "sqrt"], "bootstrap": [True, False]}
    elif isinstance(model, SVC):
        param_grid = {"C": [0.1, 1, 10], "kernel": ["linear", "poly", "rbf", "sigmoid"], "degree": [2, 3, 4], "gamma": ["scale", "auto"], "shrinking": [True, False]}
    grid_search = GridSearchCV(model, param_grid, cv = 5, scoring = "recall", n_jobs = -1)
    X_combined = vectorizer.transform(combined_data["text"])
    y_combined = combined_data["spam"]
    grid_search.fit(X_combined, y_combined)
    return grid_search.best_estimator_

def train_and_benchmark_model(model, model_name, train_data, validation_data, test_data):
    """Trains and benchmarks model"""
    print(f"\nTraining and Evaluating {model_name}:\n")
    model, vectorizer = fit_model(train_data, TfidfVectorizer(), model)
    score_and_evaluate_model(model, vectorizer, train_data, "Train Data")
    print("=" * 60)
    score_and_evaluate_model(model, vectorizer, validation_data, "Validation Data")
    print("=" * 60)
    model = fine_tune_model(model, vectorizer, train_data, validation_data)
    return score_and_evaluate_model(model, vectorizer, test_data, "Test Data")


models = sorted(MODELS.items())


In [5]:
# Initialize the best model and accuracy
best_model, best_accuracy = None, 0

# Iterate over all models
for model_name, model in models:
    # Train and benchmark the model
    test_accuracy = train_and_benchmark_model(model, model_name, TRAIN_DATASET, VALIDATION_DATASET, TEST_DATASET)

    # Update the best model and accuracy if the current model is better
    if test_accuracy > best_accuracy:
        best_model, best_accuracy = model_name, test_accuracy

# Print the best model and its accuracy
print(f"\nBest Model: {best_model} with Test Accuracy: {round(100 * best_accuracy, 2)} %")



Training and Evaluating Logistic Regression:

Train Data Accuracy: 99.66 %
Train Data Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3328
           1       1.00      0.99      0.99      1075

    accuracy                           1.00      4403
   macro avg       1.00      0.99      1.00      4403
weighted avg       1.00      1.00      1.00      4403

Train Data Confusion Matrix:
[[3326    2]
 [  13 1062]]
Validation Data Accuracy: 98.36 %
Validation Data Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       412
           1       1.00      0.93      0.97       138

    accuracy                           0.98       550
   macro avg       0.99      0.97      0.98       550
weighted avg       0.98      0.98      0.98       550

Validation Data Confusion Matrix:
[[412   0]
 [  9 129]]
Test Data Accuracy: 98.91 %
Test Data Classificati

480 fits failed out of a total of 960.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
370 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\raini\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\raini\anaconda3\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\raini\anaconda3\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\raini\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParame

Test Data Accuracy: 98.19 %
Test Data Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       397
           1       0.99      0.94      0.97       154

    accuracy                           0.98       551
   macro avg       0.99      0.97      0.98       551
weighted avg       0.98      0.98      0.98       551

Test Data Confusion Matrix:
[[396   1]
 [  9 145]]

Training and Evaluating Support Vector Machine:

Train Data Accuracy: 100.0 %
Train Data Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3328
           1       1.00      1.00      1.00      1075

    accuracy                           1.00      4403
   macro avg       1.00      1.00      1.00      4403
weighted avg       1.00      1.00      1.00      4403

Train Data Confusion Matrix:
[[3328    0]
 [   0 1075]]
Validation Data Accuracy: 99.09 %
Validation Data Classification 

### Analysis


### In the pursuit of creating an effective spam classification model, I have strategically chosen to prioritize the metric of recall. This decision is rooted in my primary objective of maximizing the identification of spam emails. Recall, often referred to as sensitivity or the true positive rate, quantifies the proportion of actual positives that are correctly classified. In the context of spam detection, a high recall signifies that my model is proficient at correctly flagging a substantial majority of spam emails, thereby shielding users from undesired content.

### I consider this approach particularly vital in scenarios where the repercussions of overlooking a spam email (a false negative) outweigh those of erroneously marking a legitimate email as spam (a false positive). By optimizing for recall, I am fine-tuning my model to minimize the instances of spam emails that evade detection, thereby ensuring a more secure and clutter-free inbox experience for users.

### However, I am cognizant of the need to maintain a balance with precision to prevent an excessive number of legitimate emails from being inaccurately flagged as spam. This careful calibration between recall and precision is what I believe will drive the success of my spam classification model.