# Data Preparation:

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv('emails.csv')

In [3]:
df.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [4]:
df.shape

(5172, 3002)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


In [6]:
X = df.drop(columns=['Email No.', 'Prediction'])
y = df['Prediction']
X_numeric = X.select_dtypes(include=['float64', 'int64'])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)

# Model Training:

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=40)

In [8]:
logistic_reg = LogisticRegression(max_iter=1000)
decision_tree = DecisionTreeClassifier()
SVM = SVC(kernel='linear', random_state=42)
random_forest = RandomForestClassifier()
gradient_boost = GradientBoostingClassifier()

In [9]:
models = [logistic_reg, decision_tree, SVM, random_forest, gradient_boost]
for model in models:
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    print(f"{model.__class__.__name__} Accuracy: {accuracy:.4f}")

LogisticRegression Accuracy: 0.9710
DecisionTreeClassifier Accuracy: 0.9391
SVC Accuracy: 0.9488
RandomForestClassifier Accuracy: 0.9710
GradientBoostingClassifier Accuracy: 0.9652


# Model Evaluation:

In [10]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

In [11]:
#Chosen Model
chosen_model = random_forest #Achieved an accuracy of 97.20%
evaluate_model(chosen_model, X_test, y_test)

Accuracy: 0.9710
Precision: 0.9459
Recall: 0.9524
F1-score: 0.9492


In [12]:
evaluate_models = [logistic_reg, decision_tree, SVM, gradient_boost]
for model in evaluate_models:
    print(f"{model.__class__.__name__} Evaluation:")
    evaluate_model(model, X_test, y_test)

LogisticRegression Evaluation:
Accuracy: 0.9710
Precision: 0.9430
Recall: 0.9558
F1-score: 0.9493
DecisionTreeClassifier Evaluation:
Accuracy: 0.9391
Precision: 0.8690
Recall: 0.9252
F1-score: 0.8962
SVC Evaluation:
Accuracy: 0.9488
Precision: 0.9113
Recall: 0.9082
F1-score: 0.9097
GradientBoostingClassifier Evaluation:
Accuracy: 0.9652
Precision: 0.9358
Recall: 0.9422
F1-score: 0.9390


# Cross-validation

In [13]:
cv_scores = cross_val_score(chosen_model, X_scaled, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

Cross-validation scores: [0.96135266 0.96908213 0.95454545 0.9622824  0.94680851]
Mean CV accuracy: 0.9588142292490118


In [14]:
for model in models:
    model_name = model.__class__.__name__
    cv_scores = cross_val_score(model, X_scaled, y, cv=5)
    print(f"{model_name} - Cross-validation scores: {', '.join(map(str, cv_scores))}")
    print(f"{model_name} - Mean CV accuracy: {cv_scores.mean():.4f}")

LogisticRegression - Cross-validation scores: 0.9565217391304348, 0.9671497584541063, 0.9632495164410058, 0.960348162475822, 0.9497098646034816
LogisticRegression - Mean CV accuracy: 0.9594
DecisionTreeClassifier - Cross-validation scores: 0.9227053140096618, 0.9217391304347826, 0.9187620889748549, 0.9235976789168279, 0.8800773694390716
DecisionTreeClassifier - Mean CV accuracy: 0.9134
SVC - Cross-validation scores: 0.9294685990338164, 0.9478260869565217, 0.9429400386847195, 0.937137330754352, 0.9148936170212766
SVC - Mean CV accuracy: 0.9345
RandomForestClassifier - Cross-validation scores: 0.9642512077294686, 0.9652173913043478, 0.9487427466150871, 0.965183752417795, 0.9361702127659575
RandomForestClassifier - Mean CV accuracy: 0.9559
GradientBoostingClassifier - Cross-validation scores: 0.9536231884057971, 0.9632850241545894, 0.9555125725338491, 0.9671179883945842, 0.9400386847195358
GradientBoostingClassifier - Mean CV accuracy: 0.9559


# Hyperparameter Tuning

In [15]:
from sklearn.model_selection import RandomizedSearchCV

In [16]:
modelss = [
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    SVC(kernel='linear', random_state=42)
]

In [18]:
for model in modelss:
    param_grid = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [1, 5, 10, 20]
    }

In [20]:
classifier = RandomizedSearchCV(model, param_distributions=param_grid, cv=5)
classifier.fit(X_train, y_train)

In [22]:
print(f"Model: {model.__class__.__name__}")
print("Best parameters:", classifier.best_params_)
print("Highest accuracy:", classifier.best_score_)
print("\n")

Model: SVC
Best parameters: {'kernel': 'rbf', 'C': 20}
Highest accuracy: 0.9528684670159882


