# Setting Up the Environment and Importing Libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# Loading the Dataset

In [13]:
df = pd.read_csv('spam.csv', encoding='latin-1')

In [15]:
if 'Unnamed: 2' in df.columns:
        df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [19]:
df.rename(columns={'v1': 'label', 'v2': 'text'}, inplace=True)
print("Spam detection dataset loaded successfully!")

Spam detection dataset loaded successfully!


In [21]:
print("\nFirst 5 rows of the spam dataset:")
print(df.head())

print("\nRaw Column Names:")
print(df.columns.tolist())


First 5 rows of the spam dataset:
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Raw Column Names:
['label', 'text']


# Initial Data Exploration

In [26]:
print("\nDatset Information:")
df.info()


Datset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [28]:
print("\nMissing values in each column:")
print(df.isnull().sum())


Missing values in each column:
label    0
text     0
dtype: int64


In [30]:
print("\nDistribution of labels (spam vs. ham):")
print(df['label'].value_counts())


Distribution of labels (spam vs. ham):
label
ham     4825
spam     747
Name: count, dtype: int64


In [32]:
df['label_numeric'] = df['label'].map({'ham': 0, 'spam': 1})
print("\nLabel distribution (numeric):")
print(df['label_numeric'].value_counts())


Label distribution (numeric):
label_numeric
0    4825
1     747
Name: count, dtype: int64


# Text Preprocessing and Feature Extraction

In [35]:
X = df['text']
y = df['label_numeric']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining data samples: {len(X_train)}")
print(f"Testing data samples: {len(X_test)}")


Training data samples: 4457
Testing data samples: 1115


In [37]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Model Training and Evaluation

In [40]:
spam_detector_pipeline = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer), # Step 1: Text vectorization
    ('classifier', MultinomialNB()) # Step 2: Classification model
])

In [44]:
print("\nTraining spam detection model...")
spam_detector_pipeline.fit(X_train, y_train)
print("Model training complete.")


Training spam detection model...
Model training complete.


In [46]:
y_pred = spam_detector_pipeline.predict(X_test)

print("\n--- Model Evaluation ---")
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


--- Model Evaluation ---
Accuracy Score: 0.9721973094170404

Classification Report:
               precision    recall  f1-score   support

         Ham       0.97      1.00      0.98       965
        Spam       1.00      0.79      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115


Confusion Matrix:
 [[965   0]
 [ 31 119]]


In [48]:
print("\nPerforming 5-Fold Cross-Validation for Spam Detector...")
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_accuracy_scores = cross_val_score(spam_detector_pipeline, X, y, cv=kf, scoring='accuracy')

print(f"Cross-validation Accuracy Scores: {cv_accuracy_scores}")
print(f"Mean Cross-validation Accuracy: {cv_accuracy_scores.mean():.4f}")
print(f"Standard Deviation of Cross-validation Accuracy: {cv_accuracy_scores.std():.4f}")


Performing 5-Fold Cross-Validation for Spam Detector...
Cross-validation Accuracy Scores: [0.97219731 0.97757848 0.97576302 0.97307002 0.96678636]
Mean Cross-validation Accuracy: 0.9731
Standard Deviation of Cross-validation Accuracy: 0.0037


# Conclusion