In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [2]:
# Load the e-mail dataset and preview its layout (leading column names only).
data = pd.read_csv("emails.csv")
columns = data.columns

print("Dataset Loaded Successfully")
print("Shape:", data.shape)

# List just the first ten headers rather than every column.
print("\nFirst 10 column names:")
print(columns[:10].tolist())

# The target is identified positionally as the final column.
print("\nLast column name (target):", columns[-1])


Dataset Loaded Successfully
Shape: (5172, 3002)

First 10 column names:
['Email No.', 'the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou']

Last column name (target): Prediction


In [3]:
# 3 Prepare Data - Clean and Separate Features & Target

# Make column names consistent: trim surrounding whitespace, then remove any
# embedded newline characters (regex=False -> plain literal replacement).
data.columns = data.columns.str.strip().str.replace('\n', '', regex=False)

# Features are every column except the ID column and the label.
# errors='ignore' lets the drop succeed even when a listed column is absent;
# a missing 'Prediction' column still fails loudly on the next line.
X = data.drop(columns=['Email No.', 'Prediction'], errors='ignore')
y = data['Prediction']

# Convert all remaining features to numeric. Values that cannot be parsed
# become NaN and are then replaced by 0, so count them first and report them
# instead of discarding that information silently.
X_numeric = X.apply(pd.to_numeric, errors='coerce')
n_coerced = int((X_numeric.isna() & X.notna()).sum().sum())
if n_coerced:
    print(f"Warning: {n_coerced} non-numeric feature value(s) were coerced to 0.")
X = X_numeric.fillna(0)

# This check runs after coercion, so no object-typed column is expected here;
# the warning above is what signals values that were not numeric originally.
print(" Data cleaned and prepared.")
print("Object columns (should be empty):", X.select_dtypes(include='object').columns)


 Data cleaned and prepared.
Object columns (should be empty): Index([], dtype='object')


In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# train_test_split is already imported in the notebook's first cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Data split successfully!")
print("Training size:", X_train.shape)
print("Testing size:", X_test.shape)


Data split successfully!
Training size: (4137, 3000)
Testing size: (1035, 3000)


In [5]:
# Baseline KNN with k=3, trained on the unscaled feature split.
# KNeighborsClassifier and the metric helpers come from the first import cell.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Accuracy is shown to three decimals, followed by the per-class report.
knn_accuracy = round(accuracy_score(y_test, y_pred_knn), 3)
print("KNN Accuracy:", knn_accuracy)

knn_report = classification_report(y_test, y_pred_knn)
print("\nKNN Classification Report:\n", knn_report)


KNN Accuracy: 0.869

KNN Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.88      0.91       739
           1       0.74      0.83      0.78       296

    accuracy                           0.87      1035
   macro avg       0.83      0.86      0.84      1035
weighted avg       0.88      0.87      0.87      1035



In [6]:
# Scaled KNN sweep: standardise the features, then evaluate several values of k.
# StandardScaler and confusion_matrix are imported in the notebook's first cell.

# The scaler is fitted on the training split only and then applied to the test
# split, so test-set statistics do not influence the transformation.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

ks = [1, 3, 5]

results = {}  # maps k -> accuracy on the test split
for k in ks:
    # NOTE: this rebinds `knn`, replacing the model fitted in the previous cell;
    # the predictions stored in `y_pred_knn` are not affected.
    knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    knn.fit(X_train_s, y_train)              # X_train_s must be scaled features
    y_pred = knn.predict(X_test_s)          # X_test_s must be scaled features

    acc = accuracy_score(y_test, y_pred)
    cm  = confusion_matrix(y_test, y_pred)
    # zero_division=0 reports 0 instead of warning when a class is never predicted.
    report = classification_report(y_test, y_pred, zero_division=0)

    results[k] = acc

    print(f"\nK = {k}:")
    print(f"  Accuracy = {acc:.4f}")
    print("  Confusion Matrix:")
    print(cm)
    print("  Classification Report:")
    print(report)

# Summarise the sweep so the collected accuracies are actually used.
# On ties, max() keeps the first k in `ks` that reached the top accuracy.
if results:
    best_k = max(results, key=results.get)
    print(f"\nBest K = {best_k} (accuracy = {results[best_k]:.4f})")


K = 1:
  Accuracy = 0.9005
  Confusion Matrix:
[[661  78]
 [ 25 271]]
  Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.89      0.93       739
           1       0.78      0.92      0.84       296

    accuracy                           0.90      1035
   macro avg       0.87      0.90      0.88      1035
weighted avg       0.91      0.90      0.90      1035


K = 3:
  Accuracy = 0.8657
  Confusion Matrix:
[[619 120]
 [ 19 277]]
  Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.84      0.90       739
           1       0.70      0.94      0.80       296

    accuracy                           0.87      1035
   macro avg       0.83      0.89      0.85      1035
weighted avg       0.89      0.87      0.87      1035


K = 5:
  Accuracy = 0.8454
  Confusion Matrix:
[[593 146]
 [ 14 282]]
  Classification Report:
              precision    recall  f1-score   support

   

In [8]:
# Linear-kernel SVM trained on the unscaled feature split.
svm = SVC(kernel='linear').fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# Accuracy is shown to three decimals, followed by the per-class report.
svm_accuracy = round(accuracy_score(y_test, y_pred_svm), 3)
print("SVM Accuracy:", svm_accuracy)

svm_report = classification_report(y_test, y_pred_svm)
print("\nSVM Classification Report:\n", svm_report)


SVM Accuracy: 0.959

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97       739
           1       0.92      0.94      0.93       296

    accuracy                           0.96      1035
   macro avg       0.95      0.95      0.95      1035
weighted avg       0.96      0.96      0.96      1035



In [9]:
# Compare the two models on the same held-out split.
# `y_pred_knn` holds the predictions of the k=3 KNN fitted on unscaled features;
# `y_pred_svm` holds the predictions of the linear SVM.
acc_knn = accuracy_score(y_test, y_pred_knn)
acc_svm = accuracy_score(y_test, y_pred_svm)

print("Model Comparison:")
print("KNN Accuracy:", round(acc_knn, 3))
print("SVM Accuracy:", round(acc_svm, 3))

# The verdict uses the unrounded accuracies. A tie is reported as a tie
# instead of falling through to the KNN branch.
if acc_svm > acc_knn:
    print("\nSVM performed better.")
elif acc_knn > acc_svm:
    print("\nKNN performed better.")
else:
    print("\nKNN and SVM performed equally well.")


Model Comparison:
KNN Accuracy: 0.869
SVM Accuracy: 0.959

SVM performed better.
