Importing Libraries

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
import time

Load the Dataset

In [19]:
from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/My Drive/combined_data.csv'
mail_data = pd.read_csv(file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
mail_data.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [21]:
mail_data.sample(5)

Unnamed: 0,label,text
58455,0,fyi - - - - i am working on getting that overs...
29209,0,okay i know that ubuntu contains a graphical i...
15501,1,submitting your website in search engines may ...
62607,1,did you know you can refinance up to escapenum...
62437,0,"after meeting with aep this morning , some iss..."


Remove Null Values

In [22]:
mail_data = mail_data.rename(columns={'text': 'Message'})

In [23]:
X = mail_data['Message']
y = mail_data['label']

In [24]:
mail_data['Message'].isnull().sum()

np.int64(0)

In [25]:
mail_data.dropna(axis = 0, inplace = True)

In [26]:
X.head()

Unnamed: 0,Message
0,ounce feather bowl hummingbird opec moment ala...
1,wulvob get your medircations online qnb ikud v...
2,computer connection from cnn com wednesday es...
3,university degree obtain a prosperous future m...
4,thanks for all your answers guys i know i shou...


In [27]:
mail_data.shape

(83448, 2)

In [28]:
print(X)

0        ounce feather bowl hummingbird opec moment ala...
1        wulvob get your medircations online qnb ikud v...
2         computer connection from cnn com wednesday es...
3        university degree obtain a prosperous future m...
4        thanks for all your answers guys i know i shou...
                               ...                        
83443    hi given a date how do i get the last date of ...
83444    now you can order software on cd or download i...
83445    dear valued member canadianpharmacy provides a...
83446    subscribe change profile contact us long term ...
83447    get the most out of life ! viagra has helped m...
Name: Message, Length: 83448, dtype: object


In [29]:
print(y)

0        1
1        1
2        0
3        1
4        0
        ..
83443    0
83444    1
83445    1
83446    0
83447    1
Name: label, Length: 83448, dtype: int64


In [30]:
# Replace these with your actual data
X = np.array(X)  # list or array of raw text samples
y = np.array(y)  # labels

Splitting the data into training data & test data

In [31]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state = 0,stratify=y)

Vectorization + Base Models + Voting Pipeline

In [32]:
# Feature extraction
tfidf = TfidfVectorizer(max_features=5000)

# Base models
svc = LinearSVC()
nb = MultinomialNB()
log_reg = LogisticRegression(max_iter=500)

# Voting Classifiers
hard_voting = VotingClassifier(estimators=[
    ('svc', svc),
    ('nb', nb)
], voting='hard')

soft_voting = VotingClassifier(estimators=[
    ('lr', log_reg),
    ('nb', nb)
], voting='soft')

# Pipelines
hard_pipeline = Pipeline([
    ('tfidf', tfidf),
    ('voting', hard_voting)
])

soft_pipeline = Pipeline([
    ('tfidf', tfidf),
    ('voting', soft_voting)
])

# Cross-validation on training set only
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Hard Voting CV (on training data):")
for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, Y_train)):
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = Y_train[train_idx], Y_train[val_idx]
    start = time.time()
    hard_pipeline.fit(X_tr, y_tr)
    score = hard_pipeline.score(X_val, y_val)
    print(f"Fold {fold+1} — Accuracy: {score:.4f}, Time: {time.time() - start:.2f}s")

print("\nSoft Voting CV (on training data):")
for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, Y_train)):
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = Y_train[train_idx], Y_train[val_idx]
    start = time.time()
    soft_pipeline.fit(X_tr, y_tr)
    score = soft_pipeline.score(X_val, y_val)
    print(f"Fold {fold+1} — Accuracy: {score:.4f}, Time: {time.time() - start:.2f}s")

Hard Voting CV (on training data):
Fold 1 — Accuracy: 0.9688, Time: 19.51s
Fold 2 — Accuracy: 0.9721, Time: 17.05s
Fold 3 — Accuracy: 0.9704, Time: 16.83s
Fold 4 — Accuracy: 0.9728, Time: 19.52s
Fold 5 — Accuracy: 0.9729, Time: 16.65s

Soft Voting CV (on training data):
Fold 1 — Accuracy: 0.9754, Time: 15.99s
Fold 2 — Accuracy: 0.9771, Time: 15.60s
Fold 3 — Accuracy: 0.9743, Time: 15.75s
Fold 4 — Accuracy: 0.9783, Time: 18.25s
Fold 5 — Accuracy: 0.9787, Time: 16.55s


Accuracy Score and Classification Report

In [33]:
# Final test set evaluation
hard_pipeline.fit(X_train, Y_train)
print("\nFinal Hard Voting Accuracy on Test Set:", hard_pipeline.score(X_test, Y_test))

soft_pipeline.fit(X_train, Y_train)
print("Final Soft Voting Accuracy on Test Set:", soft_pipeline.score(X_test, Y_test))


Final Hard Voting Accuracy on Test Set: 0.9718993409227082
Final Soft Voting Accuracy on Test Set: 0.9772318753744758


Predictive System

In [34]:
input_mail = ["Free entry in mall. Do it today!!Win cash and much more."]
#input_mail = ["I've been looking for the right words to thank you for this breather.You have been wonderful and a blessing at all times."]
input_prediction = soft_pipeline.predict(input_mail)

# Output
if input_prediction[0] == 0:
    print(f"{input_mail[0]}  --> Ham mail")
else:
    print(f"{input_mail[0]}  --> Spam mail")

Free entry in mall. Do it today!!Win cash and much more.  --> Spam mail
