In [1]:
# -------------------------------------------------------------
# ðŸ“Œ Email Spam Detection using KNN and SVM
# -------------------------------------------------------------

# Step 1: Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# -------------------------------------------------------------
# Step 2: Load the dataset
# -------------------------------------------------------------
data = pd.read_csv("emails.csv")  # update path if needed
print("Dataset shape:", data.shape)
data.head()

# -------------------------------------------------------------
# Step 3: Data cleaning and preprocessing
# -------------------------------------------------------------
# Drop missing values
data = data.dropna(subset=['text'])

# Remove empty messages
data = data[data['text'].str.strip() != ""]

# Encode labels: spam â†’ 1, ham â†’ 0
label_encoder = LabelEncoder()
data['spam'] = label_encoder.fit_transform(data['spam'])

# -------------------------------------------------------------
# Step 4: Feature extraction using TF-IDF
# -------------------------------------------------------------
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X = vectorizer.fit_transform(data['text']).toarray()
y = data['spam']

# -------------------------------------------------------------
# Step 5: Split dataset
# -------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------------------------------------------
# Step 6: Model 1 - K-Nearest Neighbors
# -------------------------------------------------------------
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# -------------------------------------------------------------
# Step 7: Model 2 - Support Vector Machine
# -------------------------------------------------------------
svm = SVC(kernel='linear', C=1)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# -------------------------------------------------------------
# Step 8: Evaluate and compare performance
# -------------------------------------------------------------
acc_knn = accuracy_score(y_test, y_pred_knn)
acc_svm = accuracy_score(y_test, y_pred_svm)

print("\nðŸ“Š Model Performance:")
print(f"KNN Accuracy: {acc_knn * 100:.2f}%")
print(f"SVM Accuracy: {acc_svm * 100:.2f}%")

print("\nKNN Classification Report:\n", classification_report(y_test, y_pred_knn))
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))

# -------------------------------------------------------------
# Step 9: Confusion Matrix Visualization
# -------------------------------------------------------------
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

sns.heatmap(confusion_matrix(y_test, y_pred_knn), annot=True, fmt='d', cmap='Blues', ax=ax[0])
ax[0].set_title("KNN Confusion Matrix")
ax[0].set_xlabel("Predicted")
ax[0].set_ylabel("Actual")

sns.heatmap(confusion_matrix(y_test, y_pred_svm), annot=True, fmt='d', cmap='Greens', ax=ax[1])
ax[1].set_title("SVM Confusion Matrix")
ax[1].set_xlabel("Predicted")
ax[1].set_ylabel("Actual")

plt.tight_layout()
plt.show()

# -------------------------------------------------------------
# Step 10: Accuracy Comparison Plot
# -------------------------------------------------------------
models = ['KNN', 'SVM']
accuracy_scores = [acc_knn * 100, acc_svm * 100]

plt.figure(figsize=(6,4))
sns.barplot(x=models, y=accuracy_scores, palette='viridis')
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy (%)")
plt.ylim(0, 100)
plt.show()


Dataset shape: (5172, 3002)


AttributeError: Can only use .str accessor with string values!