In [3]:
# Importing the pandas library for data manipulation and analysis
import pandas as pd

# Load the dataset from 'emails.csv' (path is relative to the working directory)
# into a pandas DataFrame that the rest of the notebook works from
df = pd.read_csv(filepath_or_buffer='emails.csv')


In [5]:
# Column labels of the loaded dataset
print(df.columns)

# Leading 5 rows (the head() default) as a sample of the structure and values
print(df.head(n=5))

# Frequency of each distinct value in the 'Prediction' label column
# (presumably spam vs. not-spam counts, which shows how balanced the classes are)
print(df.loc[:, 'Prediction'].value_counts())

Index(['Email No.', 'the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou',
       ...
       'connevey', 'jay', 'valued', 'lay', 'infrastructure', 'military',
       'allowing', 'ff', 'dry', 'Prediction'],
      dtype='object', length=3002)
  Email No.  the  to  ect  and  for  of    a  you  hou  ...  connevey  jay  \
0   Email 1    0   0    1    0    0   0    2    0    0  ...       0.0  0.0   
1   Email 2    8  13   24    6    6   2  102    1   27  ...       0.0  0.0   
2   Email 3    0   0    1    0    0   0    8    0    0  ...       0.0  0.0   
3   Email 4    0   5   22    0    5   1   51    2   10  ...       0.0  0.0   
4   Email 5    7   6   17    1    5   2   57    0    9  ...       0.0  0.0   

   valued  lay  infrastructure  military  allowing   ff  dry  Prediction  
0     0.0  0.0             0.0       0.0       0.0  0.0  0.0         0.0  
1     0.0  0.0             0.0       0.0       0.0  1.0  0.0         0.0  
2     0.0  0.0             0.0       0.0       0.0  0.0  0.0 

In [4]:
# Target vector 'y': only the 'Prediction' column, i.e. the label the models learn to predict
y = df.loc[:, 'Prediction']

# Feature matrix 'X': everything else, minus the 'Email No.' identifier column,
# which is excluded from the training inputs along with the label
X = df.drop(['Prediction', 'Email No.'], axis='columns')


In [5]:
# Import the train_test_split function to divide data into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (test_size=0.2) for evaluation:
#  - X_train, y_train → fitting the models
#  - X_test, y_test   → scoring them on data not used for fitting
# The fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [6]:
# Import the K-Nearest Neighbors (KNN) classifier algorithm
from sklearn.neighbors import KNeighborsClassifier

# Import functions to evaluate the model (accuracy, confusion matrix, and classification report)
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Initialize the KNN classifier with 5 nearest neighbors
knn = KNeighborsClassifier(n_neighbors=5)

# Train (fit) the KNN model using the training data
knn.fit(X_train, y_train)

# Predict the target values for the held-out test dataset
knn_pred = knn.predict(X_test)

# Section title for this model's results. It has to read "knn": the header was
# previously "svc", the same label the SVM cell prints, which made the two
# result blocks indistinguishable in the output.
print("-------knn----------")

# Accuracy: fraction of test samples whose predicted label matches the true label
print("accuracy: ", accuracy_score(y_test, knn_pred))

# Confusion matrix: rows are true classes, columns are predicted classes
print("confusion matrix: \n", confusion_matrix(y_test, knn_pred))

# Per-class precision, recall, F1-score and support
print("classification report: \n", classification_report(y_test, knn_pred))

-------svc----------
accuracy:  0.8628019323671497
confusion matrix: 
 [[646  93]
 [ 49 247]]
classification report: 
               precision    recall  f1-score   support

           0       0.93      0.87      0.90       739
           1       0.73      0.83      0.78       296

    accuracy                           0.86      1035
   macro avg       0.83      0.85      0.84      1035
weighted avg       0.87      0.86      0.87      1035



In [7]:
# Import the Support Vector Machine (SVM) classifier from sklearn
from sklearn.svm import SVC

# Build a support vector classifier with a linear kernel and fit it on the
# training split (fit() returns the fitted estimator, so the calls can be chained)
svm = SVC(kernel='linear').fit(X_train, y_train)

# Labels predicted by the fitted model for the held-out test split
svm_pred = svm.predict(X_test)

# Section title separating the SVM results from the other models' output
print("-------svc----------")

# Report, in order: accuracy (fraction of correct predictions), the confusion
# matrix (true vs. predicted classes), and the per-class precision / recall /
# F1-score / support table. Each metric is computed right before it is printed.
for label, metric in (
    ("accuracy: ", accuracy_score),
    ("confusion matrix: \n", confusion_matrix),
    ("classification report: \n", classification_report),
):
    print(label, metric(y_test, svm_pred))


-------svc----------
accuracy:  0.9594202898550724
confusion matrix: 
 [[715  24]
 [ 18 278]]
classification report: 
               precision    recall  f1-score   support

           0       0.98      0.97      0.97       739
           1       0.92      0.94      0.93       296

    accuracy                           0.96      1035
   macro avg       0.95      0.95      0.95      1035
weighted avg       0.96      0.96      0.96      1035

