In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ===========================
# 1️⃣ Load dataset
# ===========================
df = pd.read_csv("emails.csv")   # ensure it's in same folder

# ===========================
# 2️⃣ Feature selection
# ===========================
X = df.iloc[:, 1:-1]   # features (exclude first & last column)
Y = df.iloc[:, -1]     # target labels (Spam = 1, Not Spam = 0)

# ===========================
# 3️⃣ Data Cleaning
# ===========================
# Replace infinities & remove NaN
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(0)  # replace missing values with 0

# ===========================
# 4️⃣ Feature Scaling (important for KNN & SVM)
# ===========================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ===========================
# 5️⃣ Train-Test Split (80%-20%)
# ===========================
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, Y, test_size=0.2, random_state=42
)

# ===========================
# 6️⃣ KNN Classification
# ===========================
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, Y_train)
knn_pred = knn.predict(X_test)

# ===========================
# 7️⃣ SVM Classification
# ===========================
svm = SVC(kernel='linear')
svm.fit(X_train, Y_train)
svm_pred = svm.predict(X_test)

# ===========================
# 8️⃣ Performance Evaluation
# ===========================
print("\n=== KNN Performance ===")
print("Accuracy:", accuracy_score(Y_test, knn_pred))
print("Confusion Matrix:\n", confusion_matrix(Y_test, knn_pred))

print("\n=== SVM Performance ===")
print("Accuracy:", accuracy_score(Y_test, svm_pred))
print("Confusion Matrix:\n", confusion_matrix(Y_test, svm_pred))

print("\n=== Classification Report (SVM Recommended) ===")
print(classification_report(Y_test, svm_pred))



=== KNN Performance ===
Accuracy: 0.8338164251207729
Confusion Matrix:
 [[583 156]
 [ 16 280]]

=== SVM Performance ===
Accuracy: 0.9449275362318841
Confusion Matrix:
 [[705  34]
 [ 23 273]]

=== Classification Report (SVM Recommended) ===
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       739
           1       0.89      0.92      0.91       296

    accuracy                           0.94      1035
   macro avg       0.93      0.94      0.93      1035
weighted avg       0.95      0.94      0.95      1035

