# Data Preparation:

In [51]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

In [52]:
df = pd.read_csv('emails.csv')

In [53]:
df.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [54]:
df.shape

(5172, 3002)

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


In [57]:
X = df.drop(columns=['Email No.', 'Prediction'])
y = df['Prediction']
X_numeric = X.select_dtypes(include=['float64', 'int64'])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)

# Model Training:

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=40)

In [59]:
logistic_reg = LogisticRegression(max_iter=1000)
decision_tree = DecisionTreeClassifier()
SVM = SVC(kernel='linear', random_state=42)
random_forest = RandomForestClassifier()
gradient_boost = GradientBoostingClassifier()

In [60]:
models = [logistic_reg, decision_tree, SVM, random_forest, gradient_boost]
for model in models:
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    print(f"{model.__class__.__name__} Accuracy: {accuracy:.4f}")

LogisticRegression Accuracy: 0.9710
DecisionTreeClassifier Accuracy: 0.9343
SVC Accuracy: 0.9488
RandomForestClassifier Accuracy: 0.9720
GradientBoostingClassifier Accuracy: 0.9681


# Model Evaluation:

In [61]:
#Chosen Model
chosen_model = random_forest #Achieved an accuracy of 97.20%
evaluate_model(chosen_model, X_test, y_test)

Accuracy: 0.9719806763285024, Precision: 0.9461279461279462, Recall: 0.95578231292517, F1-score: 0.9509306260575295


In [62]:
evaluate_models = [logistic_reg, decision_tree, SVM, gradient_boosting]
for model in evaluate_models:
    print(f"{model.__class__.__name__} Evaluation:")
    evaluate_model(model, X_test, y_test)

LogisticRegression Evaluation:
Accuracy: 0.9710144927536232, Precision: 0.9429530201342282, Recall: 0.95578231292517, F1-score: 0.9493243243243243
DecisionTreeClassifier Evaluation:
Accuracy: 0.9342995169082126, Precision: 0.8621794871794872, Recall: 0.9149659863945578, F1-score: 0.8877887788778878
SVC Evaluation:
Accuracy: 0.9487922705314009, Precision: 0.9112627986348123, Recall: 0.9081632653061225, F1-score: 0.909710391822828
GradientBoostingClassifier Evaluation:
Accuracy: 0.9681159420289855, Precision: 0.9423728813559322, Recall: 0.9455782312925171, F1-score: 0.9439728353140917


# Cross-validation

In [63]:
cv_scores = cross_val_score(chosen_model, X_scaled, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

Cross-validation scores: [0.96425121 0.96618357 0.95744681 0.97195358 0.94003868]
Mean CV accuracy: 0.9599747708350854


In [64]:
for model in models:
    model_name = model.__class__.__name__
    cv_scores = cross_val_score(model, X_scaled, y, cv=5)
    print(f"{model_name} - Cross-validation scores: {', '.join(map(str, cv_scores))}")
    print(f"{model_name} - Mean CV accuracy: {cv_scores.mean():.4f}")

LogisticRegression - Cross-validation scores: 0.9565217391304348, 0.9671497584541063, 0.9632495164410058, 0.960348162475822, 0.9497098646034816
LogisticRegression - Mean CV accuracy: 0.9594
DecisionTreeClassifier - Cross-validation scores: 0.923671497584541, 0.9265700483091788, 0.9226305609284333, 0.9294003868471954, 0.8849129593810445
DecisionTreeClassifier - Mean CV accuracy: 0.9174
SVC - Cross-validation scores: 0.9294685990338164, 0.9478260869565217, 0.9429400386847195, 0.937137330754352, 0.9148936170212766
SVC - Mean CV accuracy: 0.9345
RandomForestClassifier - Cross-validation scores: 0.9642512077294686, 0.9632850241545894, 0.9477756286266924, 0.9690522243713733, 0.9381044487427466
RandomForestClassifier - Mean CV accuracy: 0.9565
GradientBoostingClassifier - Cross-validation scores: 0.9536231884057971, 0.9632850241545894, 0.9555125725338491, 0.9671179883945842, 0.9390715667311412
GradientBoostingClassifier - Mean CV accuracy: 0.9557
