In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Read the CSV (path is relative to the working directory) into a DataFrame
file_path = 'spam.csv'
data = pd.read_csv(filepath_or_buffer=file_path)


In [10]:
# Show the leading rows of the loaded frame under a heading
print("First few rows of the dataset:", data.head(), sep="\n")

First few rows of the dataset:
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [11]:
print("\nInfo about the dataset:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line after the summary.
data.info()



Info about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


In [12]:
# Print the column labels under a heading (leading newline kept as a spacer)
print("\nColumn names in the dataset:", data.columns, sep="\n")


Column names in the dataset:
Index(['Category', 'Message'], dtype='object')


In [27]:
# Names of the text column and the label column; adjust here if the CSV
# uses different headers
email_column, label_column = 'Message', 'Category'


In [28]:
# Data preprocessing: keep only the text and label columns, then discard
# every row that has a missing value in either of them
data = (
    data[[email_column, label_column]]
    .dropna()
)

In [29]:
# Convert labels to binary (0 for ham, 1 for spam).
# A plain .map(dict) turns every value missing from the dict into NaN, so
# re-running this cell (labels already 0/1) or meeting an unexpected label
# would silently wipe the target column. Here already-encoded values pass
# through unchanged and anything unrecognised raises instead.
label_map = {'ham': 0, 'spam': 1}
encoded_labels = data[label_column].map(lambda value: label_map.get(value, value))
unknown_labels = set(encoded_labels.unique()) - set(label_map.values())
if unknown_labels:
    raise ValueError(
        f"Unexpected labels in '{label_column}': {sorted(map(str, unknown_labels))}"
    )
# assign() builds a new frame instead of writing into a frame that was
# derived by slicing, and leaves the column order unchanged.
data = data.assign(**{label_column: encoded_labels.astype('int64')})


In [30]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data[email_column],
    data[label_column],
    test_size=0.2,
    random_state=42,
)

In [31]:
# Vectorize the email text with CountVectorizer (default settings).
# The vectorizer is fitted on the training messages only; the test messages
# are then transformed with that already-fitted vectorizer, so nothing is
# learned from the held-out data.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [32]:
# Function to train and evaluate models
def train_and_evaluate(model, model_name):
    """Fit ``model`` on the training matrix and print its test-set metrics.

    Reads the notebook-level ``X_train_vec``, ``y_train``, ``X_test_vec`` and
    ``y_test``. Prints the accuracy, the classification report and the
    confusion matrix, each labelled with ``model_name``. The model is fitted
    in place and nothing is returned.
    """
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    # One print call, newline-separated: same stdout as three consecutive prints
    print(
        f"\n{model_name} Accuracy: {accuracy_score(y_test, y_pred)}",
        f"Classification Report for {model_name}:\n{classification_report(y_test, y_pred)}",
        f"Confusion Matrix for {model_name}:\n{confusion_matrix(y_test, y_pred)}",
        sep="\n",
    )


In [33]:
# Train and evaluate models
# 1. Logistic Regression (default hyperparameters)
log_reg = LogisticRegression()
train_and_evaluate(model=log_reg, model_name="Logistic Regression")


Logistic Regression Accuracy: 0.9865470852017937
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.90      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115

Confusion Matrix for Logistic Regression:
[[966   0]
 [ 15 134]]


In [34]:
# 2. Random Forest Classifier
# The forest is built with randomness (bootstrap samples, per-split feature
# subsets), so without a seed every run reports slightly different scores.
# Fix the seed (same value as the train/test split) to make the cell reproducible.
rf_clf = RandomForestClassifier(random_state=42)
train_and_evaluate(rf_clf, "Random Forest Classifier")



Random Forest Classifier Accuracy: 0.9811659192825112
Classification Report for Random Forest Classifier:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.86      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix for Random Forest Classifier:
[[966   0]
 [ 21 128]]


In [35]:
# 3. AdaBoosting Classifier
# AdaBoostClassifier takes a random_state that seeds its base estimators;
# fix it (same value as the train/test split) so re-running the notebook
# reproduces the reported scores.
ada_clf = AdaBoostClassifier(random_state=42)
train_and_evaluate(ada_clf, "AdaBoosting Classifier")



AdaBoosting Classifier Accuracy: 0.9704035874439462
Classification Report for AdaBoosting Classifier:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       966
           1       0.93      0.84      0.88       149

    accuracy                           0.97      1115
   macro avg       0.95      0.91      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Confusion Matrix for AdaBoosting Classifier:
[[957   9]
 [ 24 125]]


In [36]:
# 4. K-Nearest Neighbors (default hyperparameters)
knn_clf = KNeighborsClassifier()
train_and_evaluate(model=knn_clf, model_name="K-Nearest Neighbors")


K-Nearest Neighbors Accuracy: 0.9264573991031391
Classification Report for K-Nearest Neighbors:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       966
           1       1.00      0.45      0.62       149

    accuracy                           0.93      1115
   macro avg       0.96      0.72      0.79      1115
weighted avg       0.93      0.93      0.91      1115

Confusion Matrix for K-Nearest Neighbors:
[[966   0]
 [ 82  67]]
