In [None]:
# Classify the email using the binary classification method. Email Spam detection has two
# states: a) Normal State – Not Spam, b) Abnormal State – Spam. Use K-Nearest Neighbors and
# Support Vector Machine for classification. Analyze their performance.
# Dataset link: The emails.csv dataset on the Kaggle
# https://www.kaggle.com/datasets/balaka18/email-spam-classification-dataset-csv

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv("emails.csv")  # Replace with the actual path to the dataset
data


Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,Email 5168,2,2,2,3,0,0,32,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,35,27,11,2,6,5,151,4,3,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,0,0,1,1,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1


In [2]:
# 1. Data Preprocessing - Handle missing values if necessary
data.drop(['Email No.'],axis=1, inplace=True)
# 2. Feature Selection/Engineering - Select relevant features

In [3]:

# 3. Split the data into training and testing sets
X = data.drop("Prediction", axis=1)  # Features
y = data["Prediction"]  # Target variable
print("Features: ",X)
print("Target: ",y)

Features:        the  to  ect  and  for  of    a  you  hou  in  ...  enhancements  \
0       0   0    1    0    0   0    2    0    0   0  ...             0   
1       8  13   24    6    6   2  102    1   27  18  ...             0   
2       0   0    1    0    0   0    8    0    0   4  ...             0   
3       0   5   22    0    5   1   51    2   10   1  ...             0   
4       7   6   17    1    5   2   57    0    9   3  ...             0   
...   ...  ..  ...  ...  ...  ..  ...  ...  ...  ..  ...           ...   
5167    2   2    2    3    0   0   32    0    0   5  ...             0   
5168   35  27   11    2    6   5  151    4    3  23  ...             0   
5169    0   0    1    1    0   0   11    0    0   1  ...             0   
5170    2   7    1    0    2   1   28    2    0   8  ...             0   
5171   22  24    5    1    6   5  148    8    2  23  ...             0   

      connevey  jay  valued  lay  infrastructure  military  allowing  ff  dry  
0            0    0 

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
# 4. Model Building
# K-Nearest Neighbors
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Support Vector Machine
svm_model = SVC()
svm_model.fit(X_train, y_train)

In [6]:
# 5. Model Evaluation
# K-Nearest Neighbors
knn_predictions = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_predictions)
knn_report = classification_report(y_test, knn_predictions)

In [7]:
print(knn_predictions)

[0 0 1 ... 0 0 0]


In [8]:
# Print or visualize the evaluation results
print("K-Nearest Neighbors Accuracy:")
print(knn_accuracy)
print("K-Nearest Neighbors Classification Report:")
print(knn_report)

K-Nearest Neighbors Accuracy:
0.8608247422680413
K-Nearest Neighbors Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.87      0.90      1097
           1       0.73      0.83      0.78       455

    accuracy                           0.86      1552
   macro avg       0.83      0.85      0.84      1552
weighted avg       0.87      0.86      0.86      1552



In [9]:
# Support Vector Machine
svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_report = classification_report(y_test, svm_predictions)

In [10]:
print(svm_predictions)

[0 0 1 ... 0 0 0]


In [11]:
print("Support Vector Machine Accuracy:")
print(svm_accuracy)
print("Support Vector Machine Classification Report:")
print(svm_report)


Support Vector Machine Accuracy:
0.803479381443299
Support Vector Machine Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.99      0.88      1097
           1       0.92      0.36      0.52       455

    accuracy                           0.80      1552
   macro avg       0.85      0.67      0.70      1552
weighted avg       0.83      0.80      0.77      1552



In [None]:
# ################# Overall Anaysis #################
# K-Nearest Neighbors (KNN):

# Overall accuracy: 0.86
# The precision for class 0 (not spam) is 0.93, indicating that 93% of the instances classified as not spam are correct.
# The recall for class 0 is 0.87, indicating that 87% of the actual not spam instances are correctly classified.
# The F1-score for class 0 is 0.90, which is the harmonic mean of precision and recall.
# For class 1 (spam), the precision is 0.73, recall is 0.83, and F1-score is 0.78.
# The macro average F1-score is 0.84, and the weighted average F1-score is 0.86.
# Support Vector Machine (SVM):

# Overall accuracy: 0.80
# The precision for class 0 (not spam) is 0.79, indicating that 79% of the instances classified as not spam are correct.
# The recall for class 0 is 0.99, indicating that 99% of the actual not spam instances are correctly classified.
# The F1-score for class 0 is 0.88.
# For class 1 (spam), the precision is 0.92, recall is 0.36, and F1-score is 0.52.
# The macro average F1-score is 0.70, and the weighted average F1-score is 0.77.
# Here's an interpretation of the results:

# KNN has a higher accuracy, precision, recall, and F1-score for the "spam" class compared to SVM.
# SVM has a higher accuracy, precision, recall, and F1-score for the "not spam" class.
# Depending on the specific problem and requirements, you can choose between KNN and SVM.
# KNN appears to be better at classifying spam, while SVM is better at classifying not spam.
# Your choice should be based on the trade-offs you are willing to make between precision and recall for each class.