# Machine Learning and Deep Mining Spam Email Detection Project

SOFE 4620U - Machine Learning & Data Mining Final Project

---

### Problem Statement

The problem we wish to address is the filtering of spam emails from one's personal email address. Spam is a common issue for most individuals as various sites leak user’s email information to nefarious companies who then “spam” the users email with repetitive and often inappropriate advertisements. Although solutions for this problem already exist, we wish to build an email filtering system with categorization of spam emails to further our understanding of natural language processing and machine learning as a whole. 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
from nltk.corpus import stopwords
import nltk
import re
import enchant
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer



In [None]:
df = pd.read_csv("spam_nospam.csv", header=None)
df[:]

## Dataset information and statistics
---

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df.columns

In [None]:
df = df.rename(columns={0:"Type", 1:"Message", 2:"Filename"})
df = df.drop(columns={"Filename"})
df = df.drop(0)
df[:]

In [None]:
df["Length"] = df["Message"].apply(len)
df[:]

In [None]:
plt.figure(figsize = (10, 5))
x = df.Type.value_counts()
sns.countplot(x = "Type", data = df)

## Cleaning the dataset
---

In [None]:
df[:]

In [None]:
html_content_tags = ["html", "body", "head", "footer", "b", "br", "font", "http", "com", "www"]

In [None]:
# Removing punctuation
df["Message"] = df["Message"].str.replace("\W", " ", regex=True)

# Lower casing all letters
df["Message"] = df["Message"].str.lower()

df[:]

In [None]:
#non_spam = df[df.Type == 0]
#spam = df[df.Type == 1]

#non_spam_text = " ".join(non_spam.Message.to_numpy().tolist())
#spam_text = " ".join(spam.Message.to_numpy().tolist())

non_spam_amount = []
spam_amount = []
counter = 0

# Add spam and non-spam messages to their own array list

for i in df["Type"]:
    if (i == "0"):
        non_spam_amount.append(df["Message"].iloc[counter])
    elif (i == "1"):
        spam_amount.append(df["Message"].iloc[counter])
    counter = counter + 1

### Cleaning Non-Spam Words
---

In [None]:
# Split each non-spam message into individual words and add them to an array list

for i in non_spam_amount:
    split_words_nonspam = i.split()

In [None]:
# Array list that contains cleaned non-spam words
word_array_nonspam = []
j = 0

# Further clean words within non-spam array by removing stop words and html tag related words

while j <= len(split_words_nonspam):
    for words in split_words_nonspam:
        
        # Remove stop words such as "a", "the", ...
        if words.lower() in stopwords.words("english"):
            split_words_nonspam.remove(words)
        
        # Remove html tag words such as "html", "br", ...
        if words in html_content_tags or len(words) <= 1:
            continue
        
        #d = enchant.Dict("en_US")
        #if d.check(words) == False:
            #continue
            
        word_array_nonspam.append(words)
    j = j + 1
    
# Utilize counter function to determine most common non-spam words
word_set_nonspam = Counter(word_array_nonspam)
frequent_words_nonspam = word_set_nonspam.most_common(15)
print(frequent_words_nonspam)

In [None]:
# Contains cleaned non-spam words
word_array_nonspam

In [None]:
# Display most occurring non-spam words
nonspam_amount = pd.DataFrame(frequent_words_nonspam)
nonspam_amount = nonspam_amount.rename(columns={0:"Words", 1:"Occurrences"})
nonspam_amount

### Cleaning Spam Words
---

In [None]:
# Split each spam message into individual words and add them to an array list
for i in spam_amount:
    split_words_spam = i.split()

In [None]:
# Array list that contains cleaned spam words
word_array_spam = []
k = 0

# Further clean words within spam array by removing stop words and html tag related words

while k <= len(split_words_spam):
    for words in split_words_spam:
        
        # Remove stop words such as "a", "the", ...
        if words.lower() in stopwords.words("english"):
            split_words_spam.remove(words)
            
        # Remove html tag words such as "html", "br", ...
        if words in html_content_tags or len(words) <= 1:
            continue
        
        #d = enchant.Dict("en_US")
        #if d.check(words) == False:
            #continue
            
        word_array_spam.append(words)
    k = k + 1

# Utilize counter function to determine most common spam words
word_set_spam = Counter(word_array_spam)
frequent_words_spam = word_set_spam.most_common(15)
print(frequent_words_spam)

In [None]:
# Contains cleaned spam words
word_array_spam

In [None]:
# Display most occurring spam words
spam_amount = pd.DataFrame(frequent_words_spam)
spam_amount = spam_amount.rename(columns={0:"Words", 1:"Occurrences"})
spam_amount

In [None]:
len(word_array_spam)

In [None]:
len(word_array_nonspam)

### Clean and Update Messages within Table
---

In [None]:
counter_message = 1
max_message_amount = len(df["Message"])

# Clean and update the messages within the table

for message in df["Message"]:
    # Split the message into an array of words
    message_new = message.split()
    
    # Cycle through the array of words
    for words in message_new:
        
        # Remove stop words such as "a", "the", ...
        if words in stopwords.words("english"):
            message_new.remove(words)

    # Update the message in the table with the new message that contains no stop words
    df.loc[counter_message, "Message"] = ' '.join(message_new)
    message_new.clear()
    counter_message = counter_message + 1
    
    # If reached end of the table
    if (counter_message == max_message_amount):
        break

# Update the length of each message
df["Length"] = df["Message"].str.len()

# Save to csv 
df[:].to_csv('cleaned_data.csv', index=False)
df[:]

## Most common words with wordcloud
---

In [None]:
spam_words = " ".join(list(df[df["Type"] == "1"]["Message"]))
spam_wordcloud = WordCloud(width = 512, height = 512).generate(spam_words)
plt.figure(figsize = (10, 8), facecolor="k")
plt.imshow(spam_wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

In [None]:
non_spam_words = " ".join(list(df[df["Type"] == "0"]["Message"]))
non_spam_wordcloud = WordCloud(width = 512, height = 512).generate(non_spam_words)
plt.figure(figsize = (10, 8), facecolor="k")
plt.imshow(non_spam_wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

## Training and Testing Different Models 
---

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [2]:
cleaned_data = pd.read_csv('cleaned_data.csv')
cleaned_data[:]

Unnamed: 0,Type,Message,Length
0,1,dear homeowner interest rates at lowest point ...,447
1,1,attention is must computer users new special p...,855
2,1,multi part message mime format _nextpart_000_1...,4479
3,1,important information new domain names finally...,882
4,1,bottom line give away cd free people like 80 1...,1267
...,...,...,...
5791,0,m one the 30 000 it working well week the tes ...,693
5792,0,damien morton quoted w3c approves html 4 emoti...,199
5793,0,mon 2002 07 22 06 50 che wrote thats correct l...,322
5794,0,upon time manfred wrote would like install rpm...,1012


In [3]:
X = cleaned_data['Message']  # message column
y = cleaned_data['Type']  # type column (spam/ham)

# split the data into training and testing sets with a 80:20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# check the shapes of the resulting vectors
print("X_train shape:", X_train_vectorized.shape)
print("X_test shape:", X_test_vectorized.shape)

X_train shape: (4636, 93895)
X_test shape: (1160, 93895)


Naive Bayes

In [4]:
from sklearn.naive_bayes import MultinomialNB

# Train a Naive Bayes classifier
nb = MultinomialNB()
nb.fit(X_train_vectorized, y_train)

# Predict the class of new emails
y_pred = nb.predict(X_test_vectorized)

# Evaluate Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.925
Precision: 0.987460815047022
Recall: 0.7914572864321608
F1: 0.8786610878661089


Support Vector Machines | SVM

In [5]:
from sklearn.svm import SVC

# Train an SVM classifier
svm = SVC(kernel='linear')
svm.fit(X_train_vectorized, y_train)

# Predict the class of new emails
y_pred = svm.predict(X_test_vectorized)

# Evaluate Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)


Accuracy: 0.9922413793103448
Precision: 0.9899244332493703
Recall: 0.9874371859296482
F1: 0.988679245283019


Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_vectorized, y_train)

# Predict the class of new emails
y_pred = rf.predict(X_test_vectorized)

# Evaluate Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.9801724137931035
Precision: 0.9844961240310077
Recall: 0.957286432160804
F1: 0.9707006369426752


K-Nearest Neighbors | KNN

In [7]:
from sklearn.neighbors import KNeighborsClassifier

# Train a KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_vectorized, y_train)

# Predict the class of new emails
y_pred = knn.predict(X_test_vectorized)

# Evaluate Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.971551724137931
Precision: 0.9573934837092731
Recall: 0.9597989949748744
F1: 0.958594730238394


Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

# Train a Logistic Regression classifier
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train_vectorized, y_train)

# Predict the class of new emails
y_pred = lr.predict(X_test_vectorized)

# Evaluate Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.9827586206896551
Precision: 0.9846153846153847
Recall: 0.964824120603015
F1: 0.9746192893401016
