Importing Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

Load the Dataset

In [None]:
# Read the e-mail corpus from CSV into a DataFrame.
# NOTE(review): the file name looks like a placeholder — point it at the real dataset.
mail_data = pd.read_csv(filepath_or_buffer="your_dataset.csv")

In [3]:
# Preview the first five rows of the freshly loaded frame.
mail_data.head(n=5)

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [4]:
# Peek at five random rows. The fixed seed keeps the preview identical across
# "Restart Kernel -> Run All" executions instead of changing on every run.
mail_data.sample(n=5, random_state=42)

Unnamed: 0,label,text
49073,0,thanks for ordering from amazon . com ! your p...
73322,1,kahlan waved a hand allaying his fears the ob...
9344,0,original message subject r read table from ji...
29832,0,begin pgp signed message hash shaescapenumber...
22449,1,example to prepare for a minimal installation ...


Rename Columns and Remove Null Values

In [62]:
# Rename the raw 'text' column to 'Message', the name the later cells select on.
mail_data = mail_data.rename({'text': 'Message'}, axis='columns')

In [None]:
# Feature column (message text) and target column (label) used for modelling.
X, y = mail_data['Message'], mail_data['label']

In [63]:
# Count how many rows have a missing message text.
mail_data['Message'].isna().sum()

0

In [64]:
# Drop every row that has a missing value in any column. Rebinding the name
# (instead of inplace=True) keeps the cell idempotent and avoids mutating a
# frame that earlier cells have already displayed.
mail_data = mail_data.dropna(axis=0)

# X and y were selected before this clean-up, so they would still contain the
# rows that were just dropped. Re-derive them from the cleaned frame so the
# split and the models only ever see complete rows.
X = mail_data['Message']
y = mail_data['label']

In [65]:
# Preview the first five rows after the rename / null-removal steps.
mail_data.head(n=5)

Unnamed: 0,label,Message
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [66]:
# (row count, column count) of the cleaned frame.
(len(mail_data.index), len(mail_data.columns))

(5171, 2)

In [69]:
# Show the feature Series as the cell's value (notebook idiom) rather than
# via print(); pandas truncates the repr of long Series automatically.
X

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: Message, Length: 5171, dtype: object


In [None]:
# Show the target Series as the cell's value (notebook idiom) rather than
# via print(); pandas truncates the repr of long Series automatically.
y

0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: Category, Length: 5171, dtype: int32


Splitting the Data into Training and Test Sets

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# the same in both splits, matching the StratifiedKFold used for
# cross-validation further down. random_state pins the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

Feature Extraction

In [None]:
# TF-IDF text vectorizer: English stop words are removed and terms that occur
# in more than 95% of the documents are ignored.
tfidf = TfidfVectorizer(
    max_df=0.95,
    stop_words='english',
)

In [75]:
# Cast both label splits to an integer dtype before they reach the classifiers.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

Base Models

In [None]:
# Base learners for the ensembles.
# probability=True lets the SVC expose class probabilities, which the
# soft-voting ensemble built later relies on.
svc = SVC(
    kernel='linear',
    C=1.0,
    probability=True,
    random_state=42,
)
nb = MultinomialNB()

Voting Ensembles (Hard and Soft)

In [None]:
# Hard voting: the ensemble predicts the majority class label of its members.
hard_voting = VotingClassifier(
    voting='hard',
    estimators=[('svc', svc), ('nb', nb)],
)

# Soft voting: the ensemble averages the members' predicted class probabilities.
soft_voting = VotingClassifier(
    voting='soft',
    estimators=[('svc', svc), ('nb', nb)],
)

Pipelines and Cross-Validation Setup

In [None]:
# Pipelines: TF-IDF vectorization followed by a voting ensemble.
# Each pipeline gets its own unfitted copy of the vectorizer. Pipeline.fit()
# fits its steps in place, so sharing one `tfidf` instance would let fitting
# one pipeline silently change the vectorizer inside the other.
hard_pipeline = Pipeline([
    ('tfidf', clone(tfidf)),
    ('voting', hard_voting)
])

soft_pipeline = Pipeline([
    ('tfidf', clone(tfidf)),
    ('voting', soft_voting)
])

# Cross-validation: 5 stratified folds, shuffled with a fixed seed so the
# fold assignment is reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

Cross-Validated Accuracy and Classification Report

In [None]:
# Cross-validate on the training split only, so the held-out test rows stay
# unseen until the final report below.

# Hard Voting Results
hard_scores = cross_val_score(hard_pipeline, X_train, Y_train, cv=cv, scoring='accuracy')
print(f"\n[Hard Voting] Avg Accuracy: {hard_scores.mean():.4f} (+/- {hard_scores.std():.4f})")

# Soft Voting Results
soft_scores = cross_val_score(soft_pipeline, X_train, Y_train, cv=cv, scoring='accuracy')
print(f"[Soft Voting] Avg Accuracy: {soft_scores.mean():.4f} (+/- {soft_scores.std():.4f})")

# Final fit and classification report.
# The model is fitted on the training split and scored on the held-out test
# split. Predicting on the same rows the model was fitted on would only report
# training performance, which overstates how well it generalizes.
print("\nFitting on training split to get final classification report on held-out test split (soft voting)...")
soft_pipeline.fit(X_train, Y_train)
y_pred = soft_pipeline.predict(X_test)
print(classification_report(Y_test, y_pred))

SVC:
Accuracy on training data : 100.0
Accuracy on test data :  99.1304347826087

Naive Bayes:
Accuracy on training data : 96.80851063829788
Accuracy on test data :  91.88405797101449


Predictive System

In [None]:
# Classify a single hand-written example with the fitted soft-voting pipeline.
input_mail = ["Free entry in mall. Do it today!!Win cash and much more."]
#input_mail = ["I've been looking for the right words to thank you for this breather.You have been wonderful and a blessing at all times."]
input_prediction = soft_pipeline.predict(input_mail)

# Output: a predicted label of 0 is reported as ham, anything else as spam.
verdict = "Ham mail" if input_prediction[0] == 0 else "Spam mail"
print(f"{input_mail[0]}  --> {verdict}")

Free entry in mall. Do it today!!Win cash and much more. --> Spam mail
