In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from preprocess_data import preprocess_data
import os
import seaborn as sns

In [3]:
working_dir = r"F:\EmailDetectionSpam\emails.csv"
file_name = "emails.csv"
data_dir = os.path.join(working_dir, file_name)
data = preprocess_data(data_dir)    # Preprocess the data



Loading data...
Cleaning texts...
Creating vocabulary...
Computing IDF values...
Converting texts to TF-IDF features...
Splitting into train/test sets...
Normalizing features...

Preprocessing complete!
Vocabulary size: 18534
Training set shape: (4583, 18534)
Testing set shape: (1145, 18534)


Load data in variable

In [4]:
X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']

Training logitics Regression

In [5]:
logistic_model = LogisticRegression(max_iter=1000, solver='lbfgs')
logistic_model.fit(X_train, y_train)


Training SVM

In [6]:
svm_model = LinearSVC(C=1.0, max_iter=10000)
svm_model.fit(X_train, y_train)

Predictions

In [25]:
y_pred_logistic = logistic_model.predict(X_test)
y_pred_svm = svm_model.predict(X_test)

Evaluate models

In [45]:
metrics = {
    "Accuracy": accuracy_score(y_test, y_pred_logistic),
    "Precision": precision_score(y_test, y_pred_logistic),
    "Recall": recall_score(y_test, y_pred_logistic),
    "F1 Score": f1_score(y_test, y_pred_logistic),
    "Confusion Matrix": confusion_matrix(y_test, y_pred_logistic)
}

metrics_svm = {
    "Accuracy": accuracy_score(y_test, y_pred_svm),
    "Precision": precision_score(y_test, y_pred_svm),
    "Recall": recall_score(y_test, y_pred_svm),
    "F1 Score": f1_score(y_test, y_pred_svm),
    "Confusion Matrix": confusion_matrix(y_test, y_pred_svm)
}

In [46]:
print("SVM")
for metric, value in metrics_svm.items():
    print(f"{metric}: {value}")

SVM
Accuracy: 0.97117903930131
Precision: 0.921311475409836
Recall: 0.9689655172413794
F1 Score: 0.9445378151260504
Confusion Matrix: [[831  24]
 [  9 281]]


In [42]:
print("Logistic Regression Metrics:\n")
for metric, value in metrics.items():
    print(f"{metric}: {value}")
    


Logistic Regression Metrics:

Accuracy: 0.97117903930131
Precision: 0.9776951672862454
Recall: 0.906896551724138
F1 Score: 0.9409660107334527
Confusion Matrix: [[849   6]
 [ 27 263]]


In [31]:
import os
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time
from preprocess_data import preprocess_data
from sklearn.ensemble import RandomForestClassifier





In [32]:
def train_rf(X_train, y_train, n_estimators=200, max_depth=None, min_samples_split=5, verbose=False):
    """Train an optimized Random Forest model for spam classification."""
    start_time = time.time()

    # Create a Random Forest classifier with optimized hyperparameters
    rf_clf = RandomForestClassifier(
        n_estimators=n_estimators,  # Number of trees (Increase for better accuracy)
        max_depth=max_depth,  # Limits depth of trees to prevent overfitting
        min_samples_split=min_samples_split,  # Minimum samples per split (Prevents overfitting)
        random_state=42,  # Reproducibility
        n_jobs=-1,  # Use all CPU cores for faster training
        verbose=1 if verbose else 0
    )

    # Train the model
    rf_clf.fit(X_train, y_train)

    print(f"\nRandom Forest Training completed in {time.time() - start_time:.2f} seconds")
    
    return rf_clf

In [33]:
def train_svm(X_train, y_train, max_iter=5000, C=1.0, alpha=0.00005,verbose=False):

    # Create and train SVM model
    start_time = time.time()
    
    # Linear SVM chosen for spam classification due to efficiency with high-dimensional, sparse text data
    # Provides good generalization while allowing interpretability of word importance weights
    svm_clf = SGDClassifier(loss='hinge',max_iter=max_iter, tol=1e-3, random_state=42,verbose=verbose)
    svm_clf.fit(X_train, y_train)
    
    # Calibrate model to output probabilities
    calibrated_model = CalibratedClassifierCV(svm_clf, cv=3)
    calibrated_model.fit(X_train, y_train)
    
    print(f"\nSVM Training completed in {time.time() - start_time:.2f} seconds")
    
    return calibrated_model

In [34]:

def evaluate_model(model, X_train, y_train, X_test, y_test, model_name='Model'):
        
        # Evaluate the model
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        
       
        
        metrics = {
            'train_accuracy': accuracy_score(y_train, y_train_pred),
            'test_accuracy': accuracy_score(y_test, y_test_pred),
            'train_precision': precision_score(y_train, y_train_pred),
            'recall': recall_score(y_test, y_test_pred),
            'f1_score': f1_score(y_test, y_test_pred),
            'confusion_matrix': confusion_matrix(y_test, y_test_pred)
        }
        print(f"\n{model_name} Evaluation Results:")
        print(f"Training Accuracy: {metrics['train_accuracy']:.4f}")
        print(f"Testing Accuracy: {metrics['test_accuracy']:.4f}")
        print(f"Confusion Matrix:\n{metrics['confusion_matrix']}")
        
        return metrics

In [None]:
def run_models():
    
    working_dir = r"F:\EmailDetectionSpam\emails.csv"
    file_name = "emails.csv"
    data_dir = os.path.join(working_dir, file_name)
    
    print("-" * 50)
    print("SPAM CLASSIFICATION USING SCIKIT-LEARN SVM")
    print("-" * 50)
    
    # Preprocess the data
    print("\nPreprocessing the data...")
    start_time = time.time()
    preprocessed_data = preprocess_data(data_dir)
    preprocess_time = time.time() - start_time
    print(f"Preprocessing completed in {preprocess_time:.2f} seconds")
    
    X_train = preprocessed_data['X_train']
    X_test = preprocessed_data['X_test']
    y_train = preprocessed_data['y_train']
    y_test = preprocessed_data['y_test']
    vocabulary = preprocessed_data['vocabulary']
    
    print(f"\nDataset Information:")
    print(f"Number of features (vocabulary size): {len(vocabulary)}")
    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Testing set: {X_test.shape[0]} samples")
    print(f"Spam ratio in training set: {np.mean(y_train):.2f}")
    
    # Train the SVM model
    print("\nTraining the SVM model...")
    svm_model = train_svm(X_train, y_train)

    # Train Random Forest model
    print("\nTraining the Random Forest model...")
    rf_model = train_rf(X_train, y_train)

    # Evaluate both models
    evaluate_model(svm_model, X_train, y_train, X_test, y_test, "SVM")
    evaluate_model(rf_model, X_train, y_train, X_test, y_test, "Random Forest")

   

 
    


In [36]:
if __name__ == "__main__":
    run_svm()

--------------------------------------------------
SPAM CLASSIFICATION USING SCIKIT-LEARN SVM
--------------------------------------------------

Preprocessing the data...
Loading data...
Cleaning texts...
Creating vocabulary...
Computing IDF values...
Converting texts to TF-IDF features...
Splitting into train/test sets...
Normalizing features...

Preprocessing complete!
Vocabulary size: 18534
Training set shape: (4583, 18534)
Testing set shape: (1145, 18534)
Preprocessing completed in 2.13 seconds

Dataset Information:
Number of features (vocabulary size): 18534
Training set: 4583 samples
Testing set: 1145 samples
Spam ratio in training set: 0.24

Training the SVM model...

SVM Training completed in 2.82 seconds

Evaluating the model...

Model Evaluation Results:
Training Accuracy: 0.9998
Testing Accuracy: 0.9493
Confusion Matrix:
[[842  13]
 [ 45 245]]

Evaluation Results:
Training Accuracy: 0.9998
Testing Accuracy: 0.9493
Confusion Matrix:
[[842  13]
 [ 45 245]]
