# Email Spam Detection With ML

Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer    # convert texted data to numerical values
from sklearn.linear_model import LogisticRegression      # allows us to fit a linear model to a dataset, predict new values, and evaluate the model's performance
from sklearn.metrics import accuracy_score      # used to calculate the accuracy of a classification model

Data Collection & Data Pre-Processing

In [2]:
# Load the raw dataset from a CSV file in the working directory.
# NOTE(review): encoding='latin-1' is passed explicitly — presumably the file is not valid UTF-8.
# The previews below contain sequences such as 'å£' and 'Ì_', so the true encoding may differ; confirm.
data = pd.read_csv('spamdataset.csv', encoding='latin-1')

In [3]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
data.isnull()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,False,False,True,True,True
1,False,False,True,True,True
2,False,False,True,True,True
3,False,False,True,True,True
4,False,False,True,True,True
...,...,...,...,...,...
5567,False,False,True,True,True
5568,False,False,True,True,True
5569,False,False,True,True,True
5570,False,False,True,True,True


In [6]:
# Keep every non-null cell as-is and put an empty string wherever a value is missing
has_value = ~data.isnull()
mail_data = data.where(has_value, '')

In [7]:
mail_data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [8]:
# Remove the three 'Unnamed' columns, which appear empty in the previews above.
# errors='ignore' makes this cell safe to re-run: without it, a second execution
# raises KeyError because the columns have already been dropped.
mail_data = mail_data.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)

In [9]:
mail_data.head(5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
mail_data.shape

(5572, 2)

In [11]:
# Give the two remaining columns descriptive names: v1 holds the class, v2 the message text
column_names = {'v1': 'Category', 'v2': 'Message'}
mail_data = mail_data.rename(columns=column_names)

In [12]:
mail_data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

Label Encoding

In [14]:
# Label encoding: spam -> 0, ham -> 1.
# Both row masks are computed first, then the string labels are overwritten in place.
is_spam = mail_data['Category'] == 'spam'
is_ham = mail_data['Category'] == 'ham'
mail_data.loc[is_spam, 'Category'] = 0
mail_data.loc[is_ham, 'Category'] = 1

In [15]:
# Split the frame into model input and target:
#   X -> the message text (input column)
#   Y -> the category (the output column we want to predict)
X, Y = mail_data['Message'], mail_data['Category']

In [16]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [17]:
Y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [18]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible
split_options = {'test_size': 0.2, 'random_state': 3}
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)
print('X_shape:',X.shape)

X_shape: (5572,)


In [19]:
print('X_Train_shape:',X_train.shape)
print('X_Train:',X_train)

X_Train_shape: (4457,)
X_Train: 3075    Mum, hope you are having a great day. Hoping t...
1787                           Yes:)sura in sun tv.:)lol.
1614    Me sef dey laugh you. Meanwhile how's my darli...
4304                Yo come over carlos will be here soon
3266                    Ok then i come n pick u at engin?
                              ...                        
789                          Gud mrng dear hav a nice day
968             Are you willing to go for aptitude class.
1667    So now my dad is gonna call after he gets out ...
3321    Ok darlin i supose it was ok i just worry too ...
1688                     Nan sonathaya soladha. Why boss?
Name: Message, Length: 4457, dtype: object


In [20]:
print('X_Test_shape:',X_test.shape)
print('X_Test:',X_test)

X_Test_shape: (1115,)
X_Test: 2632                       I WILL CAL YOU SIR. In meeting
454     Loan for any purpose å£500 - å£75,000. Homeown...
983     LOOK AT THE FUCKIN TIME. WHAT THE FUCK YOU THI...
1282    Ever green quote ever told by Jerry in cartoon...
4610                                  Wat time Ì_ finish?
                              ...                        
4827    Lol no. Just trying to make your day a little ...
5291      Xy trying smth now. U eat already? We havent...
3325    Huh so fast... Dat means u havent finished pai...
3561    Still chance there. If you search hard you wil...
1136    Dont forget you can place as many FREE Request...
Name: Message, Length: 1115, dtype: object


Feature Extraction

In [21]:
# Transform the text data into feature vectors so they can be used as input to the Logistic Regression model

In [22]:
# TF-IDF vectorizer: lower-cases the text, drops English stop words, keeps every term
# that occurs in at least one document (min_df = 1).
# Note: lowercase must be the boolean True, not the string 'True'.
feature_extraction = TfidfVectorizer(lowercase=True, stop_words='english', min_df=1)

# Learn the vocabulary and weights from the training messages only, then reuse that
# fitted vectorizer to encode the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The labels are still stored as objects at this point; cast both splits to integers
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [23]:
X_test_features

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7766 stored elements and shape (1115, 7510)>

In [24]:
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34758 stored elements and shape (4457, 7510)>
  Coords	Values
  (0, 4513)	0.2909649098524696
  (0, 3380)	0.21807195185332803
  (0, 3262)	0.25877035357606315
  (0, 3136)	0.440116181574609
  (0, 2122)	0.38613577623520473
  (0, 3386)	0.3219352588930141
  (0, 6599)	0.20296878731699391
  (0, 4296)	0.3891385935794867
  (0, 3979)	0.2410582143632299
  (0, 741)	0.3219352588930141
  (1, 7443)	0.35056971070320353
  (1, 6442)	0.5652509076654626
  (1, 6417)	0.4769136859540388
  (1, 6872)	0.4306015894277422
  (1, 4061)	0.380431198316959
  (2, 5825)	0.4917598465723273
  (2, 2226)	0.413484525934624
  (2, 3917)	0.40088501350982736
  (2, 2109)	0.42972812260098503
  (2, 933)	0.4917598465723273
  (3, 7453)	0.5202633571003087
  (3, 1842)	0.3708680641487708
  (3, 1599)	0.5927091854194291
  (3, 6140)	0.4903863168693604
  (4, 1842)	0.36051481797205776
  :	:
  (4452, 4636)	0.4030918768627523
  (4453, 1762)	0.45610005640082985
  (4453, 7273)	0.578773

Training The Model Logistic Regression

In [25]:
# Create the logistic-regression classifier; no arguments are passed, so scikit-learn's defaults apply
model = LogisticRegression()

In [26]:
# Train the classifier on the TF-IDF features of the training messages and their integer labels
model.fit(X_train_features, Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


Evaluating The Trained Model

In [27]:
# Predict on training data

In [28]:
# Score the model on the same data it was trained on; this is the reference value
# that the test accuracy is compared against.
Predicting_on_train_data = model.predict(X_train_features)
accuracy_on_train_data = accuracy_score(
    y_true=Y_train,
    y_pred=Predicting_on_train_data,
)

In [29]:
print('Accuracy on training data is: ', accuracy_on_train_data)

Accuracy on training data is:  0.9661207089970832


In [30]:
# About 96% training accuracy: out of every 100 training mails, the model classifies roughly 96 correctly

In [31]:
# Predict on the held-out test data (messages the model did not see during training)
Predicting_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, Predicting_on_test_data)

In [32]:
print('Accuracy on testing data is: ', accuracy_on_test_data)

Accuracy on testing data is:  0.9623318385650225


In [33]:
# Here the training accuracy and the testing accuracy are similar, so there is no sign of overfitting. Checking for overfitting is the reason we measure accuracy on both the training and the testing data.

Build a Predictive System

In [34]:
def classify_mail(mail_text, vectorizer, classifier):
    """Classify a single message as ham or spam.

    Parameters
    ----------
    mail_text : str
        Raw text of the message to classify.
    vectorizer : object
        Already-fitted text vectorizer exposing ``transform``.
    classifier : object
        Already-fitted model exposing ``predict``.

    Returns
    -------
    tuple
        ``(features, prediction, verdict)``: the feature matrix produced by
        ``vectorizer.transform``, the array returned by ``classifier.predict``,
        and ``'Ham Mail'`` when the first predicted label equals 1, otherwise
        ``'Spam Mail'``.
    """
    features = vectorizer.transform([mail_text])
    prediction = classifier.predict(features)
    verdict = 'Ham Mail' if prediction[0] == 1 else 'Spam Mail'
    return features, prediction, verdict


input_mail = ["Thanks for your subscription to Ringtone UK your mobile will be charged å£5/month Please confirm by replying YES or NO. If you reply NO you will not be charged"]

# Vectorise the message and predict its label with the fitted vectorizer and model
input_mail_feature, prediction, verdict = classify_mail(input_mail[0], feature_extraction, model)

print(prediction)   # label encoding used in this notebook: 0 = spam, 1 = ham
print(verdict)

[0]
Spam Mail


In [35]:
input_mail = ["Wait that's still not all that clear, were you not sure about me being sarcastic or that that's why x doesn't want to live with us"]

# Encode the message with the already-fitted TF-IDF vectorizer
input_mail_feature = feature_extraction.transform(input_mail)

# Predicted label, using this notebook's encoding: 0 = spam, 1 = ham
prediction = model.predict(input_mail_feature)
print(prediction)

# Translate the numeric label into a readable verdict
print('Ham Mail' if prediction[0] == 1 else 'Spam Mail')

[1]
Ham Mail


In [36]:
input_mail = ["Congratulations! You have been selected to receive a FREE iPad! Just click the link below to claim your prize.This offer is only available for a limited time. Don't miss out!"]

# Turn the raw text into the feature vector the model expects
input_mail_feature = feature_extraction.transform(input_mail)

# Run the classifier; labels follow this notebook's encoding (0 = spam, 1 = ham)
prediction = model.predict(input_mail_feature)
print(prediction)

# Anything other than label 1 is reported as spam
if prediction[0] != 1:
  print('Spam Mail')
else:
  print('Ham Mail')

[0]
Spam Mail


In [39]:
input_mail = ["We’re writing to inform you that starting May 12, 2025, you will be required to use multi-factor authentication (MFA), also known as 2-Step Verification (2SV), to access Google Cloud console, gcloud CLI, and Firebase console. You’re receiving this notice because you have access to projects on Google Cloud."]

# Feature vector for the message, built with the fitted TF-IDF vectorizer
input_mail_feature = feature_extraction.transform(input_mail)

# Classifier output; in this notebook 0 stands for spam and 1 for ham
prediction = model.predict(input_mail_feature)
print(prediction)

# Human-readable verdict for the first (and only) prediction
is_ham = prediction[0] == 1
print('Ham Mail' if is_ham else 'Spam Mail')

[1]
Ham Mail


In [40]:
input_mail = ["You have been selected as the lucky winner of the International Lottery! Your email was randomly chosen, and you are entitled to receive $1,000,000.To claim your prize, provide your full name, address, phone number, and bank details. Act fast, or your prize will be forfeited!"]

# Vectorise the text using the vocabulary learned from the training set
input_mail_feature = feature_extraction.transform(input_mail)

# Predict the label (0 = spam, 1 = ham in this notebook's encoding)
prediction = model.predict(input_mail_feature)
print(prediction)

# Report the verdict: label 1 means ham, any other label means spam
verdict = 'Ham Mail' if prediction[0] == 1 else 'Spam Mail'
print(verdict)

[0]
Spam Mail
