Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
import time

Load the Dataset

In [19]:
# Read the combined mail corpus from the CSV file into a DataFrame.
mail_data = pd.read_csv(filepath_or_buffer="data/combined_data.csv")

In [20]:
# Preview the first rows to check the column layout and the 0/1 label values.
mail_data.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [21]:
# Show five randomly chosen rows. The fixed random_state makes the same rows
# appear on every "Restart & Run All", so the displayed sample is reproducible.
mail_data.sample(5, random_state=0)

Unnamed: 0,label,text
56666,0,was the internal ownership policy attachment o...
58730,0,thursday escapenumber escapenumber escapenumbe...
77083,1,and marketing pitches play is a simple balance...
40041,1,the white brilliance of words knowledge things...
37731,1,does size matterescapenumber escapenumber of w...


In [34]:
# Summarize the columns: dtypes, non-null counts and memory usage.
# NOTE(review): the 'Message' column name only exists after the rename cell
# further down; on a fresh top-to-bottom run this cell reports the column as 'text'.
mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83448 entries, 0 to 83447
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    83448 non-null  int64 
 1   Message  83448 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


In [22]:
# Give the text column the name used by the rest of the notebook.
mail_data = mail_data.rename({'text': 'Message'}, axis='columns')

In [23]:
# Count the missing entries in the message column.
mail_data['Message'].isna().sum()

0

In [24]:
# Drop every row that has a missing value in any column.
# Reassigning the result (rather than inplace=True) avoids mutating the frame
# that earlier cells already displayed and keeps the data lineage explicit.
mail_data = mail_data.dropna(axis=0)

In [25]:
# Row/column count after the null-dropping step above.
mail_data.shape

(83448, 2)

In [26]:
# Features are the message text; the target is the 0/1 label column.
X, y = mail_data['Message'], mail_data['label']

In [27]:
# Inspect the feature Series (message text).
X

0        ounce feather bowl hummingbird opec moment ala...
1        wulvob get your medircations online qnb ikud v...
2         computer connection from cnn com wednesday es...
3        university degree obtain a prosperous future m...
4        thanks for all your answers guys i know i shou...
                               ...                        
83443    hi given a date how do i get the last date of ...
83444    now you can order software on cd or download i...
83445    dear valued member canadianpharmacy provides a...
83446    subscribe change profile contact us long term ...
83447    get the most out of life ! viagra has helped m...
Name: Message, Length: 83448, dtype: object

In [28]:
# Inspect the target Series (labels).
y

0        1
1        1
2        0
3        1
4        0
        ..
83443    0
83444    1
83445    1
83446    0
83447    1
Name: label, Length: 83448, dtype: int64

In [29]:
# Convert both Series to plain NumPy arrays so the cross-validation code can
# select rows by integer position.
X, y = np.array(X), np.array(y)

Splitting the data into training data & test data

In [30]:
# Hold out 20% of the data as a test set; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

Vectorization + Models + Voting Pipeline

In [31]:
# Tunable settings for this cell.
MAX_FEATURES = 5000  # vocabulary cap for each TF-IDF vectorizer
N_SPLITS = 8         # number of stratified cross-validation folds


def make_tfidf():
    """Return a new, unfitted TF-IDF vectorizer with the shared settings."""
    return TfidfVectorizer(max_features=MAX_FEATURES)


def run_cv(pipeline, cv, X, y):
    """Fit and score ``pipeline`` on every fold of ``cv``, printing per-fold results.

    Parameters
    ----------
    pipeline : estimator exposing ``fit`` and ``score``; it is fitted again on each fold.
    cv : splitter whose ``split(X, y)`` yields ``(train_idx, val_idx)`` index arrays.
    X, y : NumPy arrays of samples and labels, indexed with the fold indices.

    Returns
    -------
    list
        Validation accuracy of each fold, in fold order.
    """
    scores = []
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
        # perf_counter is a monotonic clock meant for measuring elapsed time.
        start = time.perf_counter()
        pipeline.fit(X[train_idx], y[train_idx])
        score = pipeline.score(X[val_idx], y[val_idx])
        elapsed = time.perf_counter() - start  # covers fit + score, as reported per fold
        print(f"Fold {fold} — Accuracy: {score:.4f}, Time: {elapsed:.2f}s")
        scores.append(score)
    return scores


# Each pipeline gets its OWN vectorizer. With caching disabled, a Pipeline fits the
# step objects it was given (no clone), so a single shared instance would be
# re-fitted by whichever pipeline trained last, leaving the other pipeline's
# classifier paired with a vectorizer fitted on different data.
tfidf = make_tfidf()       # vectorizer of the hard-voting pipeline
soft_tfidf = make_tfidf()  # vectorizer of the soft-voting pipeline

# Base models
svc = LinearSVC()
nb = MultinomialNB()
log_reg = LogisticRegression(max_iter=500)

# Voting classifiers. `nb` appears in both ensembles; VotingClassifier fits clones
# of its estimators, so the two ensembles do not share fitted state through it.
hard_voting = VotingClassifier(
    estimators=[('svc', svc), ('nb', nb)],
    voting='hard',
)
soft_voting = VotingClassifier(
    estimators=[('lr', log_reg), ('nb', nb)],
    voting='soft',
)

# Pipelines: raw text -> TF-IDF features -> voting ensemble
hard_pipeline = Pipeline([
    ('tfidf', tfidf),
    ('voting', hard_voting)
])

soft_pipeline = Pipeline([
    ('tfidf', soft_tfidf),
    ('voting', soft_voting)
])

# Cross-validation on the training set only; the test set stays untouched here.
# The integer seed makes both runs below iterate over the same folds.
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

print("Hard Voting CV (on training data):")
hard_cv_scores = run_cv(hard_pipeline, cv, X_train, Y_train)

print("\nSoft Voting CV (on training data):")
soft_cv_scores = run_cv(soft_pipeline, cv, X_train, Y_train)

Hard Voting CV (on training data):


Fold 1 — Accuracy: 0.9692, Time: 25.49s
Fold 2 — Accuracy: 0.9697, Time: 35.21s
Fold 3 — Accuracy: 0.9728, Time: 36.48s
Fold 4 — Accuracy: 0.9721, Time: 33.40s
Fold 5 — Accuracy: 0.9700, Time: 33.77s
Fold 6 — Accuracy: 0.9729, Time: 34.02s
Fold 7 — Accuracy: 0.9724, Time: 33.56s
Fold 8 — Accuracy: 0.9750, Time: 32.71s

Soft Voting CV (on training data):
Fold 1 — Accuracy: 0.9768, Time: 26.31s
Fold 2 — Accuracy: 0.9753, Time: 33.03s
Fold 3 — Accuracy: 0.9788, Time: 34.29s
Fold 4 — Accuracy: 0.9752, Time: 33.45s
Fold 5 — Accuracy: 0.9750, Time: 33.90s
Fold 6 — Accuracy: 0.9803, Time: 35.89s
Fold 7 — Accuracy: 0.9776, Time: 33.69s
Fold 8 — Accuracy: 0.9788, Time: 30.49s


Accuracy Score and Classification Report

In [32]:
# Final test set evaluation
def evaluate_on_test(pipeline, header, accuracy_label):
    """Fit ``pipeline`` on the full training split and report test-set metrics.

    Parameters
    ----------
    pipeline : estimator exposing ``fit`` and ``predict``.
    header : str printed first to identify the model.
    accuracy_label : str printed in front of the accuracy value.

    Returns
    -------
    Predicted labels for ``X_test``.
    """
    pipeline.fit(X_train, Y_train)
    y_pred = pipeline.predict(X_test)
    # Accuracy is the fraction of matching labels; computing it from y_pred avoids
    # running the whole pipeline over the test set a second time via .score().
    accuracy = float(np.mean(y_pred == Y_test))
    print(header)
    print(accuracy_label, accuracy)
    # Print the report on its own lines so its column header stays aligned
    # with the rows beneath it.
    print("Classification Report:")
    print(classification_report(Y_test, y_pred))
    return y_pred


hard_y_pred = evaluate_on_test(
    hard_pipeline, "Hard Voting:", "\nFinal Accuracy on Test Set:"
)
soft_y_pred = evaluate_on_test(
    soft_pipeline, "\nSoft Voting:", "Final Soft Voting Accuracy on Test Set:"
)

Hard Voting:

Final Accuracy on Test Set: 0.9718993409227082
Classification Report:               precision    recall  f1-score   support

           0       0.95      0.99      0.97      7908
           1       0.99      0.96      0.97      8782

    accuracy                           0.97     16690
   macro avg       0.97      0.97      0.97     16690
weighted avg       0.97      0.97      0.97     16690


Soft Voting:
Final Soft Voting Accuracy on Test Set: 0.9772318753744758
Classification Report:               precision    recall  f1-score   support

           0       0.98      0.97      0.98      7908
           1       0.98      0.98      0.98      8782

    accuracy                           0.98     16690
   macro avg       0.98      0.98      0.98     16690
weighted avg       0.98      0.98      0.98     16690



Predictive System

In [33]:
# Classify one hand-written example with the soft-voting pipeline fitted above.
input_mail = ["Free entry in mall. Do it today!!Win cash and much more."]
input_prediction = soft_pipeline.predict(input_mail)

# Label 0 is reported as ham; any other predicted label is reported as spam.
verdict = "Ham mail" if input_prediction[0] == 0 else "Spam mail"
print(f"{input_mail[0]}  --> {verdict}")

Free entry in mall. Do it today!!Win cash and much more.  --> Spam mail
