In [4]:
# -------------------- Importing Required Libraries --------------------
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [5]:
# -------------------- Load Dataset --------------------

df = pd.read_csv(r"/content/emails.csv")   # <-- change path if needed

print("First 5 rows:\n", df.head())
print("\nDataset Information:")
print(df.info())
print("\nMissing values:\n", df.isnull().sum())


First 5 rows:
   Email No.  the  to  ect  and  for  of    a  you  hou  ...  connevey  jay  \
0   Email 1    0   0    1    0    0   0    2    0    0  ...         0    0   
1   Email 2    8  13   24    6    6   2  102    1   27  ...         0    0   
2   Email 3    0   0    1    0    0   0    8    0    0  ...         0    0   
3   Email 4    0   5   22    0    5   1   51    2   10  ...         0    0   
4   Email 5    7   6   17    1    5   2   57    0    9  ...         0    0   

   valued  lay  infrastructure  military  allowing  ff  dry  Prediction  
0       0    0               0         0         0   0    0           0  
1       0    0               0         0         0   1    0           0  
2       0    0               0         0         0   0    0           0  
3       0    0               0         0         0   0    0           0  
4       0    0               0         0         0   1    0           0  

[5 rows x 3002 columns]

Dataset Information:
<class 'pandas.core.frame

In [6]:
# -------------------- Define Features (X) and Target (Y) --------------------
# Assuming dataset columns: [Email_No., word_freq_1, ..., word_freq_n, spam]

X = df.iloc[:, 1:-1].values    # input features
Y = df.iloc[:, -1].values      # target label (1 = spam, 0 = not spam)

print("Shape of X:", X.shape)
print("Shape of Y:", Y.shape)


Shape of X: (5172, 3000)
Shape of Y: (5172,)


In [7]:
# -------------------- Split Data into Training and Testing Sets --------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

print("Training Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)


Training Data Shape: (3879, 3000)
Testing Data Shape: (1293, 3000)


In [8]:
# -------------------- Feature Scaling --------------------
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Feature scaling complete.")


Feature scaling complete.


In [9]:
# =====================================================================
# ðŸ§  Support Vector Machine (SVM) Model with Hyperparameter Tuning
# =====================================================================

svc = SVC()
svc_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

svc_grid = GridSearchCV(svc, svc_params, cv=5, scoring='accuracy')
svc_grid.fit(X_train, y_train)

svc_best = svc_grid.best_estimator_
svc_pred = svc_best.predict(X_test)

print("\n--- SVM Results ---")
print("Best Parameters:", svc_grid.best_params_)
print("Accuracy:", accuracy_score(y_test, svc_pred))
print("Classification Report:\n", classification_report(y_test, svc_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, svc_pred))



--- SVM Results ---
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy: 0.9505027068832174
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96       913
           1       0.90      0.94      0.92       380

    accuracy                           0.95      1293
   macro avg       0.94      0.95      0.94      1293
weighted avg       0.95      0.95      0.95      1293

Confusion Matrix:
 [[871  42]
 [ 22 358]]


In [10]:
# =====================================================================
# ðŸ§  K-Nearest Neighbors (KNN) Model with Hyperparameter Tuning
# =====================================================================

knn = KNeighborsClassifier()
knn_params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1 = Manhattan, 2 = Euclidean
}

knn_grid = GridSearchCV(knn, knn_params, cv=5, scoring='accuracy')
knn_grid.fit(X_train, y_train)

knn_best = knn_grid.best_estimator_
knn_pred = knn_best.predict(X_test)

print("\n--- KNN Results ---")
print("Best Parameters:", knn_grid.best_params_)
print("Accuracy:", accuracy_score(y_test, knn_pred))
print("Classification Report:\n", classification_report(y_test, knn_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, knn_pred))



--- KNN Results ---
Best Parameters: {'n_neighbors': 3, 'p': 2, 'weights': 'distance'}
Accuracy: 0.8739365815931941
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.84      0.90       913
           1       0.72      0.95      0.82       380

    accuracy                           0.87      1293
   macro avg       0.85      0.90      0.86      1293
weighted avg       0.90      0.87      0.88      1293

Confusion Matrix:
 [[770 143]
 [ 20 360]]


In [11]:
# =====================================================================
# ðŸ“Š Comparison Summary
# =====================================================================

print("\n================== MODEL COMPARISON ==================")
print(f"SVM Accuracy: {accuracy_score(y_test, svc_pred):.4f}")
print(f"KNN Accuracy: {accuracy_score(y_test, knn_pred):.4f}")

if accuracy_score(y_test, svc_pred) > accuracy_score(y_test, knn_pred):
    print("\nâœ… SVM performs better than KNN for this dataset.")
else:
    print("\nâœ… KNN performs better than SVM for this dataset.")



SVM Accuracy: 0.9505
KNN Accuracy: 0.8739

âœ… SVM performs better than KNN for this dataset.
