Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
import time

Load the Dataset

In [2]:
# Read the labelled e-mail corpus from disk; the path is relative to the
# notebook's working directory.
DATA_PATH = "data/combined_data.csv"
mail_data = pd.read_csv(DATA_PATH)

In [3]:
# Peek at the first five rows to check the columns loaded as expected.
mail_data.head(n=5)

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [4]:
# Draw five random rows for a less ordered look at the data. The seed keeps
# the displayed rows identical across "Restart Kernel -> Run All".
mail_data.sample(n=5, random_state=0)

Unnamed: 0,label,text
15868,1,prozac\nvlagra\nphentermlne\nsoma\namblen\nval...
76344,1,"remember across turn stay using pie "" usedto n..."
54148,1,hi my name is julie . i am from texas . one da...
5472,1,guggenheim roombellamy darn adolphcoachwork\nb...
40452,1,chen said quietly is you may wear it was a wee...


Rename Columns and Remove Null Values

In [5]:
# Give the raw text column a more descriptive name; later cells refer to
# it as 'Message'.
column_renames = {'text': 'Message'}
mail_data = mail_data.rename(columns=column_renames)

In [6]:
# Count how many messages are missing before deciding whether to drop rows.
mail_data['Message'].isna().sum()

0

In [7]:
# Drop every row that has a missing value in any column. The result is
# rebound to the same name instead of using inplace=True, which offers no
# performance benefit and prevents method chaining.
mail_data = mail_data.dropna(axis=0)

In [8]:
# Show (rows, columns) of the frame after the null-dropping step above.
mail_data.shape

(83448, 2)

In [9]:
# Separate the model input (message text) from the target (class label).
X, y = mail_data['Message'], mail_data['label']

In [10]:
# Display the message Series (pandas abbreviates the middle rows).
X

0        ounce feather bowl hummingbird opec moment ala...
1        wulvob get your medircations online qnb ikud v...
2         computer connection from cnn com wednesday es...
3        university degree obtain a prosperous future m...
4        thanks for all your answers guys i know i shou...
                               ...                        
83443    hi given a date how do i get the last date of ...
83444    now you can order software on cd or download i...
83445    dear valued member canadianpharmacy provides a...
83446    subscribe change profile contact us long term ...
83447    get the most out of life ! viagra has helped m...
Name: Message, Length: 83448, dtype: object

In [11]:
# Display the label Series (pandas abbreviates the middle rows).
y

0        1
1        1
2        0
3        1
4        0
        ..
83443    0
83444    1
83445    1
83446    0
83447    1
Name: label, Length: 83448, dtype: int64

In [12]:
# Convert both Series to NumPy arrays so that the integer index arrays
# produced by the cross-validation splitter can be used for plain
# positional indexing later on.
X, y = np.array(X), np.array(y)

Splitting the data into training data & test data

In [13]:
# Hold out 20% of the data as the test set. The split is stratified on the
# labels and seeded so that it is reproducible.
split_options = dict(test_size=0.2, random_state=0, stratify=y)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, **split_options)

Vectorization + Models + Voting Pipeline

In [None]:
# Tunable settings for this section.
MAX_FEATURES = 5000  # cap on the TF-IDF vocabulary size
N_SPLITS = 8         # number of stratified cross-validation folds
CV_SEED = 42         # seed for the fold shuffling


def make_tfidf():
    """Return a new, unfitted TF-IDF vectorizer.

    Every pipeline gets its own instance. A Pipeline fits the step objects it
    was given (it does not clone them), so sharing a single vectorizer between
    two pipelines would let fitting one pipeline silently overwrite the
    vocabulary and IDF weights the other one relies on.
    """
    return TfidfVectorizer(max_features=MAX_FEATURES)


hard_tfidf = make_tfidf()
soft_tfidf = make_tfidf()
# Backward-compatible alias for any later cell that still refers to `tfidf`.
# It points at the soft pipeline's vectorizer, which is the pipeline that the
# original shared instance was fitted by last.
tfidf = soft_tfidf

# Base models
# LinearSVC is seeded so repeated runs give the same model whenever the
# solver shuffles the data.
svc = LinearSVC(random_state=0)
nb = MultinomialNB()
log_reg = LogisticRegression(max_iter=500)

# Voting Classifiers
# Sharing `nb` between the two ensembles is safe: VotingClassifier fits
# clones of the estimators it is given, not the originals.
#
# NOTE: hard voting over exactly two estimators can tie. scikit-learn breaks
# ties by ascending class label, so whenever the two models disagree the
# ensemble predicts class 0; class 1 is predicted only when both agree.
hard_voting = VotingClassifier(estimators=[
    ('svc', svc),
    ('nb', nb)
], voting='hard')

# Soft voting averages predicted probabilities, so it uses LogisticRegression
# (which provides predict_proba) rather than LinearSVC (which does not).
soft_voting = VotingClassifier(estimators=[
    ('lr', log_reg),
    ('nb', nb)
], voting='soft')

# Pipelines
hard_pipeline = Pipeline([
    ('tfidf', hard_tfidf),
    ('voting', hard_voting)
])

soft_pipeline = Pipeline([
    ('tfidf', soft_tfidf),
    ('voting', soft_voting)
])

# Cross-validation on training set only
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CV_SEED)


def run_cv(pipeline, features, labels, splitter, label):
    """Cross-validate `pipeline` fold by fold and print per-fold results.

    Parameters
    ----------
    pipeline : estimator with fit/score
        Refitted from scratch on the training part of every fold.
    features, labels : np.ndarray
        Training data; indexed positionally with the fold index arrays.
    splitter : cross-validation splitter
        Object whose split(features, labels) yields (train_idx, val_idx).
    label : str
        Human-readable model name used in the printed header.

    Returns
    -------
    list of float
        Validation accuracy of each fold, in fold order.
    """
    print(f"{label} CV (on training data):")
    scores = []
    folds = splitter.split(features, labels)
    for fold, (train_idx, val_idx) in enumerate(folds, start=1):
        # perf_counter is a monotonic clock intended for measuring elapsed
        # time. The timing covers both fitting and scoring.
        start = time.perf_counter()
        pipeline.fit(features[train_idx], labels[train_idx])
        score = pipeline.score(features[val_idx], labels[val_idx])
        elapsed = time.perf_counter() - start
        print(f"Fold {fold} — Accuracy: {score:.4f}, Time: {elapsed:.2f}s")
        scores.append(score)
    print(f"Mean accuracy: {np.mean(scores):.4f} (std {np.std(scores):.4f})")
    return scores


hard_cv_scores = run_cv(hard_pipeline, X_train, Y_train, cv, "Hard Voting")
print()
soft_cv_scores = run_cv(soft_pipeline, X_train, Y_train, cv, "Soft Voting")

Hard Voting CV (on training data):
Fold 1 — Accuracy: 0.9688, Time: 26.01s
Fold 2 — Accuracy: 0.9723, Time: 34.08s
Fold 3 — Accuracy: 0.9704, Time: 25.15s
Fold 4 — Accuracy: 0.9729, Time: 33.39s
Fold 5 — Accuracy: 0.9730, Time: 34.07s

Soft Voting CV (on training data):
Fold 1 — Accuracy: 0.9754, Time: 36.81s
Fold 2 — Accuracy: 0.9772, Time: 26.84s
Fold 3 — Accuracy: 0.9743, Time: 27.85s
Fold 4 — Accuracy: 0.9782, Time: 36.18s
Fold 5 — Accuracy: 0.9786, Time: 29.33s


Accuracy Score and Classification Report

In [15]:
# Final test set evaluation
def report_test_performance(name, pipeline):
    """Fit `pipeline` on the full training split and report test metrics.

    Prints the test-set accuracy and the per-class classification report.

    Parameters
    ----------
    name : str
        Model name used in the printed headings.
    pipeline : estimator with fit/predict
        Refitted here on X_train / Y_train.

    Returns
    -------
    np.ndarray
        Predicted labels for X_test.
    """
    pipeline.fit(X_train, Y_train)
    y_pred = pipeline.predict(X_test)
    print(f"{name}:")
    # Reuse the predictions instead of calling pipeline.score(), which would
    # vectorize and classify the whole test set a second time.
    print(f"Final {name} Accuracy on Test Set:", accuracy_score(Y_test, y_pred))
    # Print the report on its own lines; putting it after a label in the same
    # print() call shifts the header row and misaligns the table.
    print("Classification Report:")
    print(classification_report(Y_test, y_pred))
    return y_pred


hard_y_pred = report_test_performance("Hard Voting", hard_pipeline)
print()
soft_y_pred = report_test_performance("Soft Voting", soft_pipeline)

Hard Voting:

Final Accuracy on Test Set: 0.9718993409227082
Classification Report:               precision    recall  f1-score   support

           0       0.95      0.99      0.97      7908
           1       0.99      0.96      0.97      8782

    accuracy                           0.97     16690
   macro avg       0.97      0.97      0.97     16690
weighted avg       0.97      0.97      0.97     16690


Soft Voting:
Final Soft Voting Accuracy on Test Set: 0.9772318753744758
Classification Report:               precision    recall  f1-score   support

           0       0.98      0.97      0.98      7908
           1       0.98      0.98      0.98      8782

    accuracy                           0.98     16690
   macro avg       0.98      0.98      0.98     16690
weighted avg       0.98      0.98      0.98     16690



Predictive System

In [None]:
# Messages to classify. More strings can be added to this list; every one of
# them is reported below, not only the first.
input_mail = ["Free entry in mall. Do it today!!Win cash and much more."]
input_prediction = soft_pipeline.predict(input_mail)

# Output
# Label 0 is reported as ham; any other label is reported as spam.
for mail, prediction in zip(input_mail, input_prediction):
    verdict = "Ham mail" if prediction == 0 else "Spam mail"
    print(f"{mail}  --> {verdict}")

I've been looking for the right words to thank you for this breather.You have been wonderful and a blessing at all times.  --> Spam mail
