# Email Spam Detection with Machine Learning

## Import the libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (classification_report, accuracy_score, 
                             confusion_matrix, precision_score, 
                             recall_score, f1_score, 
                             roc_auc_score, roc_curve)

## Load the data and print the first 5 rows

In [2]:
# Load the e-mail dataset from the working directory
df = pd.read_csv(filepath_or_buffer="emails.csv")
# Preview the first five rows
df.head(n=5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
# (rows, columns) of the frame as loaded, before any cleaning
df.shape

(5728, 2)

In [4]:
# Column labels available in the loaded frame
df.columns

Index(['text', 'spam'], dtype='object')

## Check for duplicates and remove them

In [5]:
# Drop exact duplicate rows. Rebinding `df` (instead of inplace=True) keeps the
# step chainable and does not mutate the frame object that was loaded above.
df = df.drop_duplicates()
print(df.shape)

(5695, 2)


## Count the missing values in each column

In [6]:
# Per-column tally of missing entries
print(df.isna().sum())

text    0
spam    0
dtype: int64


## Download the stop words

In [7]:
# Fetch NLTK's "stopwords" corpus, which process() reads through
# stopwords.words('english'); the call reports when the package is already up to date
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Define the process function to clean and tokenize the text
def process(text):
    """Tokenize a message: strip punctuation, split on whitespace, drop stopwords.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The remaining tokens in their original order and original casing.
        Stopword matching is case-insensitive, but the tokens themselves are
        not lowercased.
    """
    # Build the stopword set once per call. Previously stopwords.words('english')
    # was invoked inside the comprehension, i.e. once for every token, and the
    # resulting list was then scanned linearly for each membership test.
    stop_words = set(stopwords.words('english'))

    # Delete every character listed in string.punctuation
    nopunc = text.translate(str.maketrans('', '', string.punctuation))

    # Keep only tokens whose lowercase form is not a stopword
    return [word for word in nopunc.split() if word.lower() not in stop_words]

# Sanity-check the tokenizer on the first five messages only
# (the full 'text' column is vectorized in a later step)
tokenized_text = df['text'].head(5).apply(process)

# Show the resulting token lists
print(tokenized_text)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object


## Convert the text into a matrix of token counts

In [9]:
# Build a bag-of-words vectorizer that tokenizes with process(), and keep a
# reference to it: the fitted vocabulary is needed to transform any new text
# into the same feature space the classifier is trained on.
# NOTE(review): the vocabulary is learned from every row, including rows that
# later land in the test split; fit on the training rows only if a strictly
# held-out evaluation is required.
vectorizer = CountVectorizer(analyzer=process)
message = vectorizer.fit_transform(df['text'])

In [10]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
xtrain, xtest, ytrain, ytest = train_test_split(
    message,
    df['spam'],
    test_size=0.2,
    random_state=0,
)

# Report the dimensions (rows, columns) of the full token-count matrix
print(message.shape)

(5695, 37229)


In [11]:
# Build a multinomial Naive Bayes model, then fit it on the training split
classifier = MultinomialNB()
classifier.fit(xtrain, ytrain)

# Show the model's predictions for the training rows ...
print(classifier.predict(xtrain))

# ... followed by the true labels of those same rows
print(ytrain.values)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


## Compare the classifier's predictions with the actual values on the training data set

In [12]:
# Evaluate the model on the training dataset (the same rows it was fitted on,
# so these scores describe fit quality rather than generalization)
pred = classifier.predict(xtrain)

# Print the classification report, which includes precision, recall, F1-score, and support
print("Classification Report on Training Data:\n", classification_report(ytrain, pred))

# Print the confusion matrix (rows: actual class, columns: predicted class)
print("\nConfusion Matrix on Training Data:\n", confusion_matrix(ytrain, pred))

# Print the accuracy score (fraction of training rows predicted correctly)
print("\nAccuracy on Training Data:", accuracy_score(ytrain, pred))

Classification Report on Training Data:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       0.99      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556


Confusion Matrix on Training Data:
 [[3445   12]
 [   1 1098]]

Accuracy on Training Data: 0.9971466198419666


In [13]:
# Print the model's predictions for the held-out test rows
print(classifier.predict(xtest))

# Print the true labels of those same test rows for a side-by-side look
print(ytest.values)

[1 0 0 ... 0 0 0]
[1 0 0 ... 0 0 0]


## Evaluate the model on the test data set

In [15]:
# Evaluate the model on the held-out test dataset
# (this rebinds `pred`, which previously held the training-set predictions)
pred = classifier.predict(xtest)

# Print the classification report, which includes precision, recall, F1-score, and support
print("Classification Report on Test Data:\n", classification_report(ytest, pred))

Classification Report on Test Data:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139



In [16]:
# Recompute the test-set predictions inside this cell. `pred` is also assigned
# the training-set predictions in an earlier cell, so without this line the
# result here would depend on which cell happened to run last.
pred = classifier.predict(xtest)

# Print the confusion matrix (rows: actual class, columns: predicted class)
print("\nConfusion Matrix on Test Data:\n", confusion_matrix(ytest, pred))

# Print the accuracy score (fraction of test rows predicted correctly)
print("\nAccuracy on Test Data:", accuracy_score(ytest, pred))


Confusion Matrix on Test Data:
 [[862   8]
 [  1 268]]

Accuracy on Test Data: 0.9920983318700615
