This is the model made using Multinomial Naive Bayes Classifier and the data is pre-processed in this case. 


In [30]:
# IMPORT PACKAGES
import pandas as pd   # reads the csv file 
import numpy as np   # array related functions
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

In [31]:
# nltk.download('punkt')
# nltk.download('stopwords')

In [32]:
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove punctuation and special characters
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Perform stemming
    stemmer = PorterStemmer() # changes the word to the root word
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

In [33]:
# import dataset
spam_df = pd.read_csv("spam.csv")
spam_df

Unnamed: 0,Category,Message
0,spam,Congratulations! You've been selected as the w...
1,ham,"Hey there, just wanted to remind you about our..."
2,spam,URGENT: Your bank account security has been co...
3,ham,Hi! Attached is the agenda for our team's offs...
4,spam,Hurry! Limited time offer - get 50% off all pu...
...,...,...
5867,spam,This is the 2nd time we have tried 2 contact u...
5868,ham,Will ü b going to esplanade fr home?
5869,ham,"Pity, * was in mood for that. So...any other s..."
5870,ham,The guy did some bitching but I acted like i'd...


In [34]:
# preprocess the data
spam_df['Preprocessed_Message'] = spam_df['Message'].apply(preprocess_text)
spam_df

Unnamed: 0,Category,Message,Preprocessed_Message
0,spam,Congratulations! You've been selected as the w...,congratul 've select winner exclus prize draw ...
1,ham,"Hey there, just wanted to remind you about our...",hey want remind meet tomorrow 10 n't forget br...
2,spam,URGENT: Your bank account security has been co...,urgent bank account secur compromis pleas log ...
3,ham,Hi! Attached is the agenda for our team's offs...,hi attach agenda team 's offsit retreat next m...
4,spam,Hurry! Limited time offer - get 50% off all pu...,hurri limit time offer get 50 purchas special ...
...,...,...,...
5867,spam,This is the 2nd time we have tried 2 contact u...,2nd time tri 2 contact u. u £750 pound prize 2...
5868,ham,Will ü b going to esplanade fr home?,ü b go esplanad fr home
5869,ham,"Pity, * was in mood for that. So...any other s...",piti mood ... suggest
5870,ham,The guy did some bitching but I acted like i'd...,guy bitch act like 'd interest buy someth els ...


In [35]:
# inspect data 
# spam_df = pd.read_csv("spam.csv")
spam_df.groupby('Category').describe()

# duplicates can be checked

Unnamed: 0_level_0,Message,Message,Message,Message,Preprocessed_Message,Preprocessed_Message,Preprocessed_Message,Preprocessed_Message
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ham,4974,4557,"Sorry, I'll call later",30,4974,4533,sorri 'll call later,30
spam,898,682,Congratulations! You're our 1000th visitor tod...,11,898,672,n't miss flash sale get 70 select item shop ht...,11


In [36]:
# turn spam/ham into numerical data, creating a new column called 'spam'
spam_df['spam'] = spam_df['Category'].apply(lambda x:1 if x == 'spam' else 0)  # values are being checked from the 'Category' column and accordingly 1/0 is alotted to the values
spam_df

Unnamed: 0,Category,Message,Preprocessed_Message,spam
0,spam,Congratulations! You've been selected as the w...,congratul 've select winner exclus prize draw ...,1
1,ham,"Hey there, just wanted to remind you about our...",hey want remind meet tomorrow 10 n't forget br...,0
2,spam,URGENT: Your bank account security has been co...,urgent bank account secur compromis pleas log ...,1
3,ham,Hi! Attached is the agenda for our team's offs...,hi attach agenda team 's offsit retreat next m...,0
4,spam,Hurry! Limited time offer - get 50% off all pu...,hurri limit time offer get 50 purchas special ...,1
...,...,...,...,...
5867,spam,This is the 2nd time we have tried 2 contact u...,2nd time tri 2 contact u. u £750 pound prize 2...,1
5868,ham,Will ü b going to esplanade fr home?,ü b go esplanad fr home,0
5869,ham,"Pity, * was in mood for that. So...any other s...",piti mood ... suggest,0
5870,ham,The guy did some bitching but I acted like i'd...,guy bitch act like 'd interest buy someth els ...,0


values are being checked from the 'Category' column and accordingly 1/0 is alotted to the values


1-> spam

0-> ham

In [37]:
# create train/test split 
x_train, x_test, y_train, y_test = train_test_split(spam_df.Preprocessed_Message, spam_df.spam)
# x_train, x_test, y_train, y_test = train_test_split(spam_df['Preprocessed_Message'], spam_df['spam'])

In [38]:
x_train.describe()

count                     4404
unique                    3978
top       sorri 'll call later
freq                        23
Name: Preprocessed_Message, dtype: object

75% - 25% split in training data and testing data 

In [39]:
# find word count and store data as a matrix 
cv = CountVectorizer(preprocessor=preprocess_text)
x_train_count = cv.fit_transform(x_train.values)

important component  (used Count vectorizer)

In [40]:
x_train_count.toarray()  # numpy used / CV used for understanding the data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [41]:
# training the model
model = MultinomialNB()
model.fit(x_train_count, y_train) # cv 
# model.fit(x_train_tfidf, y_train) # tf-idf

In [44]:
# pre-test spam/ ham
# email_spam = ["Dear students, given below are the details for your upcoming Btest"]
email_spam = [input("Enter the email message")]
# email_spam = [input("Enter the email here!")]
email_spam_count = cv.transform(email_spam)   # pandas used
model.predict(email_spam_count)

if model.predict(email_spam_count) == 0: ###
    print("Given email is not spam")
elif model.predict(email_spam_count) == 1:
    print("Given email is spam")

Given email is spam


Here, 9 emails were used as test cases. This particular version of the model got 2 wrong. i.e. test cases 5 & 6

TEST CASES:

Test Case 1: Ham Email

Input: "Hey, how are you doing?"
Expected Output: "Given email is not spam"
Test Case 2: Spam Email

Input: "Click here to claim your prize!"
Expected Output: "Given email is spam"
Test Case 3: Neutral Email

Input: "Meeting reminder: Tomorrow at 10 AM"
Expected Output: "Given email is not spam"
Test Case 4: Empty Email

Input: ""
Expected Output: "Given email is not spam" (assuming an empty email is considered not spam)
Test Case 5: Email with Numbers

Input: "Buy now! Offer ends in 3 days."
Expected Output: "Given email is spam" (assuming emails with numbers are often spam)
Test Case 6: Email with Special Characters

Input: "𝑆𝑝𝑒𝑐𝑖𝑎𝑙 𝐷𝑖𝑠𝑐𝑜𝑢𝑛𝑡
SpecialDiscount"
Expected Output: "Given email is spam" (assuming emails with excessive special characters are spam)
Test Case 7: Email with Mixed Case

Input: "cLiCk HeRe To ClAiM YoUr PrIzE!"
Expected Output: "Given email is spam" (assuming emails with mixed case are often spam)
Test Case 8: Email with URLs

Input: "Visit our website at www.example.com"
Expected Output: "Given email is spam" (assuming emails with URLs are often spam)
Test Case 9: Email with Uncommon Words

Input: "Supercalifragilisticexpialidocious"
Expected Output: "Given email is not spam" (assuming uncommon words are not indicative of spam)





In [43]:
# test model  (CV)
x_test_count = cv.transform(x_test)
model.score(x_test_count, y_test)
Accuracy = model.score(x_test_count, y_test) * 100 
print(f"The current accuracy of the model is {Accuracy}%")

The current accuracy of the model is 97.95640326975476%
