### Email spam detection with machine learning

In [1]:
# Importing necessary libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the labelled messages from spam.csv into a DataFrame.
# NOTE(review): ISO-8859-1 is presumably needed because the file is not valid
# UTF-8 — confirm against the data source.
d = pd.read_csv(
    "spam.csv",
    encoding="ISO-8859-1",
    low_memory=False,
)

In [3]:
# Preview the first five rows to see which columns the CSV produced.

d.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Preview the last five rows as well, to check the end of the file parsed cleanly.

d.tail()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [5]:
# List the column labels of the loaded frame.

d.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [6]:
# Summarise each column's dtype and non-null count; this shows how sparsely
# populated each column is.

d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [7]:
# Drop the three "Unnamed" columns: d.info() shows they are non-null in at most
# 50 of the 5572 rows.
# NOTE(review): those few values may be message text that spilled into extra
# columns — confirm nothing needed is discarded here.
d = d.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])

In [8]:
# Count missing values per remaining column.

d.isnull().sum()

v1    0
v2    0
dtype: int64

In [9]:
# Count rows that are exact duplicates of an earlier row (label and message both equal).

d.duplicated().sum()

403

In [10]:
# 403 rows are exact duplicates; remove them, keeping the first occurrence of
# each (the default behaviour of drop_duplicates).
d = d.drop_duplicates()

# Re-count duplicates to confirm none remain.
d.duplicated().sum()

0

In [11]:
# Both columns hold text, so describe() reports count / unique / top / freq
# rather than numeric summary statistics.

d.describe()

Unnamed: 0,v1,v2
count,5169,5169
unique,2,5169
top,ham,"Go until jurong point, crazy.. Available only ..."
freq,4516,1


In [12]:
# Rename the terse v1/v2 headers to descriptive names. rename() returns a new
# frame, which is assigned back to `d` instead of mutating it with
# inplace=True; the result is the same and the cell stays safe to re-run.
d = d.rename(columns={"v1": "Spam or Ham", "v2": "Mail"})
d.head()

Unnamed: 0,Spam or Ham,Mail
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
# Class balance: count how many messages carry each label.

d["Spam or Ham"].value_counts()

ham     4516
spam     653
Name: Spam or Ham, dtype: int64

In [14]:
# Convert the categorical labels to integers with label encoding.
# The preview in the next cell shows the resulting mapping: ham -> 0, spam -> 1.

encoder = LabelEncoder()
d["Spam or Ham"] = encoder.fit_transform(d["Spam or Ham"])

In [15]:
# Preview the frame again to verify the label column now holds integer codes.

d.head()

Unnamed: 0,Spam or Ham,Mail
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
# Separate the inputs from the target: X is the message text, y the encoded label.
X, y = d["Mail"], d["Spam or Ham"]

In [17]:
# Hold out 30% of the messages for testing; random_state is fixed so the split
# is reproducible. The split is not stratified, so the spam share may differ
# slightly between the training and test portions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [18]:
# A naive Bayes classifier is used below. It needs numeric features, so the raw
# text must first be converted by a vectorizer (TF-IDF here; a count vectorizer
# would be the alternative).

tfidf = TfidfVectorizer()

In [19]:
# Fit the vectorizer on the training messages only and encode them as TF-IDF features.
# NOTE(review): despite the "_count" suffix, this holds TF-IDF weights, not raw counts.
X_train_count = tfidf.fit_transform(X_train.values)
# Dense copy built purely for display; the matrix assigned above is what the
# model is trained on.
X_train_count.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
# Train a multinomial naive Bayes classifier on the vectorized training messages.
# fit() returns the estimator itself, so construction and fitting are chained.
model = MultinomialNB().fit(X_train_count, y_train)
model

MultinomialNB()

In [21]:
# Sanity check: classify one example message that is expected to be ham (class 0).
mail_ham = ["Mmm so yummy babe ... Nice jolt to the suzy"]

# Encode it with the already-fitted vectorizer (transform only, no refit).
mail_ham_count = tfidf.transform(mail_ham)

y_pred = model.predict(mail_ham_count)
y_pred

array([0])

In [22]:
# Evaluate on the held-out split. The test messages are only transformed (not
# re-fitted), so they are encoded with what the vectorizer learned from the
# training data.

X_test_count = tfidf.transform(X_test)
# score() reports accuracy on the given features and labels.
model.score(X_test_count,y_test)

0.9445519019987105

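The model scores 0.94 on the held-out test set, i.e. about 94% accuracy. Note, however, that the classes are imbalanced (4,516 ham vs. 653 spam after de-duplication), so a model that always predicts "ham" would already be about 87% accurate. Accuracy alone therefore overstates the model's quality; precision and recall for the spam class (or a confusion matrix) should be checked before concluding that it predicts well.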