In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the e-mail dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("emails.csv")

# Sanity check: confirm the load, then show the frame's dimensions and first rows.
# The check mark is written as a \u escape so the source stays pure ASCII and the
# symbol cannot be corrupted if the file is saved or read with the wrong encoding.
print("\u2705 Dataset Loaded Successfully!")
print("Shape of dataset:", data.shape)
print("\nSample data:\n", data.head())


✅ Dataset Loaded Successfully!
Shape of dataset: (5172, 3002)

Sample data:
   Email No.  the  to  ect  and  for  of    a  you  hou  ...  connevey  jay  \
0   Email 1    0   0    1    0    0   0    2    0    0  ...         0    0   
1   Email 2    8  13   24    6    6   2  102    1   27  ...         0    0   
2   Email 3    0   0    1    0    0   0    8    0    0  ...         0    0   
3   Email 4    0   5   22    0    5   1   51    2   10  ...         0    0   
4   Email 5    7   6   17    1    5   2   57    0    9  ...         0    0   

   valued  lay  infrastructure  military  allowing  ff  dry  Prediction  
0       0    0               0         0         0   0    0           0  
1       0    0               0         0         0   1    0           0  
2       0    0               0         0         0   0    0           0  
3       0    0               0         0         0   0    0           0  
4       0    0               0         0         0   1    0           0  

[5 rows x 3002 columns]

In [3]:
# Discard the 'Email No.' identifier column when the frame has one;
# errors='ignore' makes this a no-op if the column is absent.
data = data.drop(columns=['Email No.'], errors='ignore')

# Label vector (y) is the 'Prediction' column; every other column is a feature (X).
y = data['Prediction']
X = data.drop(columns=['Prediction'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# -----------------------------
# K-Nearest Neighbors (k = 5)
# -----------------------------
# fit() returns the fitted estimator, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn_pred = knn.predict(X_test)

# -----------------------------
# Support Vector Machine (linear kernel)
# -----------------------------
svm = SVC(kernel='linear').fit(X_train, y_train)
svm_pred = svm.predict(X_test)



In [5]:
def report_performance(model_name, y_true, y_pred):
    """Print the evaluation summary for one classifier.

    Emits a header naming the model, followed by its accuracy (rounded to
    three decimals), the classification report and the confusion matrix.

    Parameters
    ----------
    model_name : str
        Short label inserted into the section header (e.g. "KNN").
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model, aligned with ``y_true``.
    """
    print(f"\n=== {model_name} Model Performance ===")
    print("Accuracy:", round(accuracy_score(y_true, y_pred), 3))
    print("Classification Report:\n", classification_report(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


# Evaluate both models against the same held-out labels, in the original order.
for model_name, predictions in (("KNN", knn_pred), ("SVM", svm_pred)):
    report_performance(model_name, y_test, predictions)


=== KNN Model Performance ===
Accuracy: 0.863
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.87      0.90       739
           1       0.73      0.84      0.78       296

    accuracy                           0.86      1035
   macro avg       0.83      0.86      0.84      1035
weighted avg       0.87      0.86      0.87      1035

Confusion Matrix:
 [[645  94]
 [ 48 248]]

=== SVM Model Performance ===
Accuracy: 0.959
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97       739
           1       0.92      0.94      0.93       296

    accuracy                           0.96      1035
   macro avg       0.95      0.95      0.95      1035
weighted avg       0.96      0.96      0.96      1035

Confusion Matrix:
 [[715  24]
 [ 18 278]]


In [6]:
# This program performs Email Spam Detection using two machine learning algorithms:
# K-Nearest Neighbors (KNN) and Support Vector Machine (SVM). 
# The main goal is to classify whether an incoming email is spam or not based on its features.

# First, the program loads the dataset 'emails.csv' using pandas. 
# This dataset contains various numeric features that represent word frequencies, 
# character frequencies, or other text-based metrics extracted from emails. 
# It also includes a target column named 'Prediction' which indicates 
# whether the email is spam (1) or not spam (0).

# After loading, unnecessary columns like 'Email No.' (if present) are dropped to clean the data. 
# The remaining data is then split into two parts: 
# features (X) which contain the independent variables, 
# and label (y) which contains the dependent variable (spam or not).

# The dataset is divided into training and testing subsets 
# using train_test_split(), with 80% data for training the model 
# and 20% for testing. This ensures that the model can be trained 
# and then evaluated on unseen data for fair accuracy measurement.

# The first algorithm used is K-Nearest Neighbors (KNN). 
# KNN works on the concept of similarity — when a new email is given, 
# it checks the 'k' closest data points (here k=5) 
# from the training dataset and predicts the label 
# based on the majority of those neighbors. 
# The KNN model is trained using knn.fit() and tested using knn.predict().

# The second algorithm is Support Vector Machine (SVM) with a linear kernel. 
# SVM finds the best hyperplane that separates spam and non-spam emails 
# in a multi-dimensional feature space. 
# It tries to maximize the margin between both classes for better classification. 
# The model is trained using svm.fit() and predictions are made using svm.predict().

# After training both models, the program evaluates their performance. 
# For both KNN and SVM, it calculates Accuracy, Classification Report, 
# and Confusion Matrix using sklearn.metrics. 
# The accuracy tells the overall correctness of the model, 
# while the classification report provides detailed metrics like precision, recall, and F1-score. 
# The confusion matrix helps visualize how many emails were correctly and incorrectly 
# classified as spam or not spam.

# Finally, both models’ results are printed and compared. 
# In this run SVM (accuracy 0.959) clearly outperforms KNN (accuracy 0.863). 
# SVM usually performs better than KNN for text-based datasets 
# because it handles high-dimensional data efficiently. 
# The code thus demonstrates a clear comparison of two classical 
# supervised learning algorithms for spam detection.
