In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
from spellchecker import SpellChecker
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB, MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp
from sklearn.pipeline import Pipeline
import joblib
from sklearn.ensemble import RandomForestClassifier
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\S H
[nltk_data]     K\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\S H
[nltk_data]     K\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\S H
[nltk_data]     K\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\S H
[nltk_data]     K\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# **Data Collection**

In [5]:
email_dataset = pd.read_csv("spam_ham_dataset.csv")

In [7]:
email_dataset.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [9]:
email_dataset.drop(columns=["Unnamed: 0", "label"], inplace=True)

In [11]:
email_dataset.rename(columns={"text": "Email", "label_num": "Spam"}, inplace=True)

In [13]:
email_dataset.head()

Unnamed: 0,Email,Spam
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


In [15]:
email_dataset.head()

Unnamed: 0,Email,Spam
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


In [17]:
len(email_dataset)

5171

In [19]:
email_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Email   5171 non-null   object
 1   Spam    5171 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 80.9+ KB


# **Data Preprocessing**

In [22]:
spam_words = spam_keywords = [
    "100% more", "100% free", "100% satisfied", "Additional income", "Be your own boss", "Best price",
    "Big bucks", "Billion", "Cash bonus", "Cents on the dollar", "Consolidate debt", "Double your cash",
    "Double your income", "Earn extra cash", "Earn money", "Eliminate bad credit", "Extra cash",
    "Extra income", "Expect to earn", "Fast cash", "Financial freedom", "Free access", "Free consultation",
    "Free gift", "Free hosting", "Free info", "Free investment", "Free membership", "Free money",
    "Free preview", "Free quote", "Free trial", "Full refund", "Get out of debt", "Get paid", "Giveaway",
    "Guaranteed", "Increase sales", "Increase traffic", "Incredible deal", "Lower rates", "Lowest price",
    "Make money", "Million dollars", "Miracle", "Money back", "Once in a lifetime", "One time",
    "Pennies a day", "Potential earnings", "Prize", "Promise", "Pure profit", "Risk-free",
    "Satisfaction guaranteed", "Save big money", "Save up to", "Special promotion", "Act now", "Apply now",
    "Become a member", "Call now", "Click below", "Click here", "Get it now", "Do it today", "Don’t delete",
    "Exclusive deal", "Get started now", "Important information regarding", "Information you requested",
    "Instant", "Limited time", "New customers only", "Order now", "Please read", "See for yourself",
    "Sign up free", "Take action", "This won’t last", "Urgent", "What are you waiting for?",
    "While supplies last", "Will not believe your eyes", "Winner", "Winning", "You are a winner",
    "You have been selected", "Bulk email", "Buy direct", "Cancel at any time", "Check or money order",
    "Congratulations", "Confidentiality", "Cures", "Dear friend", "Direct email", "Direct marketing",
    "Hidden charges", "Human growth hormone", "Internet marketing", "Lose weight", "Mass email",
    "Meet singles", "Multi-level marketing", "No catch", "No cost", "No credit check", "No fees",
    "No gimmick", "No hidden costs", "No hidden fees", "No interest", "No investment", "No obligation",
    "No purchase necessary", "No questions asked", "No strings attached", "Not junk", "Notspam",
    "Obligation", "Passwords", "Requires initial investment", "Social security number",
    "This isn’t a scam", "This isn’t junk", "This isn’t spam", "Undisclosed", "Unsecured credit",
    "Unsecured debt", "Unsolicited", "Valium", "Viagra", "Vicodin", "We hate spam", "Weight loss", "Xanax",
    "Accept credit cards", "Ad", "All new", "As seen on", "Bargain", "Beneficiary", "Billing", "Bonus",
    "Cards accepted", "Cash", "Certified", "Cheap", "Claims", "Clearance", "Compare rates",
    "Credit card offers", "Deal", "Debt", "Discount", "Fantastic", "In accordance with laws", "Income",
    "Investment", "Join millions", "Lifetime", "Loans", "Luxury", "Marketing solution",
    "Message contains", "Mortgage rates", "Name brand", "Offer", "Online marketing", "Opt in",
    "Pre-approved", "Quote", "Rates", "Refinance", "Removal", "Reserves the right", "Score",
    "Search engine", "Sent in compliance", "Subject to…", "Terms and conditions", "Trial", "Unlimited",
    "Warranty", "Web traffic", "Work from home"
]


In [24]:
for i in range(len(spam_words)):
    spam_words[i] = spam_words[i].lower()

In [26]:
email_dataset["Email"] = email_dataset["Email"].str.lower()

In [28]:
email_dataset["URL"] = email_dataset["Email"].str.contains(r'https?://\S+|www\.\S+|\S+\.\S+\.\S+', regex=True, na=False).astype(int)

In [29]:
email_dataset["URL"].value_counts()

URL
0    5171
Name: count, dtype: int64

In [30]:
special_chars = ["\t", "\n", "\r", "\v", "\f", "\\", "\'", "\"", "\a", "\b", "\e"]

In [31]:
def remove_special_characters(email_msg):
    email_msg = ''.join(ch if (ch not in special_chars and ch not in string.punctuation) else ' ' for ch in email_msg)
    return ' '.join(email_msg.split())
    

In [36]:
email_dataset["Email"] = email_dataset["Email"].apply(remove_special_characters)

In [37]:
def remove_hyperlinks(email_msg):
    url_pattern = r'https?://\S+|www\.\S+|\S+\.\S+\.\S+'
    clean_text = re.sub(url_pattern, '', email_msg)
    return ' '.join(clean_text.split())

In [38]:
email_dataset["Email"] = email_dataset["Email"].apply(remove_hyperlinks)

In [39]:
spell = SpellChecker()

In [40]:
spell.word_frequency.load_text_file("english_dictionary.txt")

In [41]:
def count_spelling_mistakes(email_msg):
    words = email_msg.split()
    mistakes = spell.unknown(words)
    return len(mistakes)


In [42]:
email_dataset["Spelling Mistake"] = email_dataset["Email"].apply(count_spelling_mistakes)

In [43]:
email_dataset.head()

Unnamed: 0,Email,Spam,URL,Spelling Mistake
0,subject enron methanol meter 988291 this is a ...,0,0,0
1,subject hpl nom for january 9 2001 see attache...,0,0,2
2,subject neon retreat ho ho ho we re around to ...,0,0,4
3,subject photoshop windows office cheap main tr...,1,0,6
4,subject re indian springs this deal is to book...,0,0,3


In [46]:
def count_spam_words(email_msg):
    count = sum(1 for word in spam_words if re.search(rf"\b{re.escape(word)}\b", email_msg))
    return count

In [47]:
email_dataset["Spam Words"] = email_dataset["Email"].apply(count_spam_words)

In [50]:
email_dataset.head()

Unnamed: 0,Email,Spam,URL,Spelling Mistake,Spam Words
0,subject enron methanol meter 988291 this is a ...,0,0,0,0
1,subject hpl nom for january 9 2001 see attache...,0,0,2,0
2,subject neon retreat ho ho ho we re around to ...,0,0,4,0
3,subject photoshop windows office cheap main tr...,1,0,6,1
4,subject re indian springs this deal is to book...,0,0,3,1


In [53]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(email_msg):
    if isinstance(email_msg, str):
        return " ".join([lemmatizer.lemmatize(word) for word in email_msg.split()])
    return text

In [54]:
email_dataset["Email"] = email_dataset["Email"].apply(lemmatize_text)

In [58]:
email_dataset.head()

Unnamed: 0,Email,Spam,URL,Spelling Mistake,Spam Words
0,subject enron methanol meter 988291 this is a ...,0,0,0,0
1,subject hpl nom for january 9 2001 see attache...,0,0,2,0
2,subject neon retreat ho ho ho we re around to ...,0,0,4,0
3,subject photoshop window office cheap main tre...,1,0,6,1
4,subject re indian spring this deal is to book ...,0,0,3,1


In [59]:
stop_words = set(stopwords.words("english"))

def remove_stopwords(email_msg):
    if isinstance(email_msg, str):
        return " ".join([word for word in email_msg.split() if word not in stop_words])
    return text

In [60]:
email_dataset["Email"] = email_dataset["Email"].apply(remove_stopwords)

# **Model Training and Evaluation**

In [69]:
email_dataset["Tokenized_Email"] = email_dataset["Email"].apply(word_tokenize)

In [70]:
email_dataset.head()

Unnamed: 0,Email,Spam,URL,Spelling Mistake,Spam Words,Tokenized_Email
0,subject enron methanol meter 988291 follow not...,0,0,0,0,"[subject, enron, methanol, meter, 988291, foll..."
1,subject hpl nom january 9 2001 see attached fi...,0,0,2,0,"[subject, hpl, nom, january, 9, 2001, see, att..."
2,subject neon retreat ho ho ho around wonderful...,0,0,4,0,"[subject, neon, retreat, ho, ho, ho, around, w..."
3,subject photoshop window office cheap main tre...,1,0,6,1,"[subject, photoshop, window, office, cheap, ma..."
4,subject indian spring deal book teco pvr reven...,0,0,3,1,"[subject, indian, spring, deal, book, teco, pv..."


In [71]:
w2v_model = Word2Vec(sentences=email_dataset["Tokenized_Email"], vector_size=300, window=5, min_count=2, workers=4)

In [72]:
word_vectors = w2v_model.wv

In [73]:
def get_email_vector(email_msg):
    words = word_tokenize(email_msg)
    word_vecs = [word_vectors[word] for word in words if word in word_vectors]
    return np.mean(word_vecs, axis=0) if word_vecs else np.zeros(100)

In [74]:
email_dataset["Email_Vector"] = email_dataset["Email"].apply(get_email_vector)

In [75]:
email_dataset.head()

Unnamed: 0,Email,Spam,URL,Spelling Mistake,Spam Words,Tokenized_Email,Email_Vector
0,subject enron methanol meter 988291 follow not...,0,0,0,0,"[subject, enron, methanol, meter, 988291, foll...","[0.009583299, 0.31352717, -0.39792824, -0.1951..."
1,subject hpl nom january 9 2001 see attached fi...,0,0,2,0,"[subject, hpl, nom, january, 9, 2001, see, att...","[-0.44266564, 0.4444116, -0.32168856, 0.405939..."
2,subject neon retreat ho ho ho around wonderful...,0,0,4,0,"[subject, neon, retreat, ho, ho, ho, around, w...","[-0.09661402, 0.20648262, -0.02235691, -0.0803..."
3,subject photoshop window office cheap main tre...,1,0,6,1,"[subject, photoshop, window, office, cheap, ma...","[0.019383883, 0.060368568, 0.053865988, 0.0294..."
4,subject indian spring deal book teco pvr reven...,0,0,3,1,"[subject, indian, spring, deal, book, teco, pv...","[-0.033463363, 0.2360083, -0.2184775, -0.17886..."


In [76]:
email_dataset["Email_Vector"].iloc[0]

array([ 0.0095833 ,  0.31352717, -0.39792824, -0.19511938,  0.41276166,
       -0.2889797 ,  0.33318615,  0.7951047 ,  0.3319862 , -0.1555453 ,
        0.18518488, -0.30883843,  0.18163106, -0.17324704, -0.2549048 ,
       -0.08500267,  0.09471945, -0.09840708, -0.07034989, -0.03501974,
       -0.1373469 , -0.34116346, -0.1381763 , -0.12146737,  0.38180113,
       -0.14877991, -0.19221927, -0.13339092, -0.18081123, -0.11899041,
        0.38627544, -0.24100685,  0.09287103, -0.14252694,  0.19600525,
        0.02888428,  0.00136885,  0.02733692,  0.0775645 ,  0.04176567,
       -0.11778327, -0.04574978,  0.01549056, -0.14071207,  0.36981505,
        0.29220247,  0.4059696 ,  0.3130415 ,  0.36824062,  0.54591274,
        0.38639328,  0.09322078, -0.14190361,  0.1330455 , -0.03950164,
        0.62007064, -0.17392385,  0.33887017,  0.1916046 , -0.08643954,
        0.17757772, -0.06928024, -0.1862529 ,  0.36438325, -0.01786491,
        0.1004214 ,  0.14148478,  0.11025131, -0.30563518,  0.02

In [77]:
email_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Email             5171 non-null   object
 1   Spam              5171 non-null   int64 
 2   URL               5171 non-null   int32 
 3   Spelling Mistake  5171 non-null   int64 
 4   Spam Words        5171 non-null   int64 
 5   Tokenized_Email   5171 non-null   object
 6   Email_Vector      5171 non-null   object
dtypes: int32(1), int64(3), object(3)
memory usage: 262.7+ KB


In [78]:
scaler = MinMaxScaler()
email_dataset[['Spam Words', 'Spelling Mistake']] = scaler.fit_transform(email_dataset[['Spam Words', 'Spelling Mistake']])

In [79]:
email_dataset.head()

Unnamed: 0,Email,Spam,URL,Spelling Mistake,Spam Words,Tokenized_Email,Email_Vector
0,subject enron methanol meter 988291 follow not...,0,0,0.0,0.0,"[subject, enron, methanol, meter, 988291, foll...","[0.009583299, 0.31352717, -0.39792824, -0.1951..."
1,subject hpl nom january 9 2001 see attached fi...,0,0,0.001887,0.0,"[subject, hpl, nom, january, 9, 2001, see, att...","[-0.44266564, 0.4444116, -0.32168856, 0.405939..."
2,subject neon retreat ho ho ho around wonderful...,0,0,0.003774,0.0,"[subject, neon, retreat, ho, ho, ho, around, w...","[-0.09661402, 0.20648262, -0.02235691, -0.0803..."
3,subject photoshop window office cheap main tre...,1,0,0.00566,0.1,"[subject, photoshop, window, office, cheap, ma...","[0.019383883, 0.060368568, 0.053865988, 0.0294..."
4,subject indian spring deal book teco pvr reven...,0,0,0.00283,0.1,"[subject, indian, spring, deal, book, teco, pv...","[-0.033463363, 0.2360083, -0.2184775, -0.17886..."


In [80]:
data_inputs = email_dataset[["URL", "Spelling Mistake", "Spam Words", "Email_Vector"]]

In [81]:
X_word2vec = np.array(data_inputs["Email_Vector"].tolist(), dtype=np.float64)

In [82]:
X_numerical = data_inputs[["URL", "Spelling Mistake", "Spam Words"]].values

In [85]:
X_combined = np.hstack((X_word2vec, X_numerical))

In [86]:
target_column = email_dataset["Spam"]

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X_combined, target_column, test_size=0.2, random_state=42)

In [88]:
gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train, y_train)

In [89]:
y_pred = gnb_classifier.predict(X_test)

In [90]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8801932367149758
              precision    recall  f1-score   support

           0       0.97      0.86      0.91       742
           1       0.72      0.93      0.81       293

    accuracy                           0.88      1035
   macro avg       0.85      0.90      0.86      1035
weighted avg       0.90      0.88      0.88      1035



In [91]:
bernoulli_classifier = BernoulliNB()
bernoulli_classifier.fit(X_train, y_train)

In [92]:
y_pred2 = bernoulli_classifier.predict(X_test)

In [93]:
print("Accuracy:", accuracy_score(y_test, y_pred2))
print(classification_report(y_test, y_pred2))

Accuracy: 0.9188405797101449
              precision    recall  f1-score   support

           0       0.97      0.91      0.94       742
           1       0.81      0.93      0.87       293

    accuracy                           0.92      1035
   macro avg       0.89      0.92      0.90      1035
weighted avg       0.93      0.92      0.92      1035



In [94]:
param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0], 
    'binarize': [0.0, 0.2, 0.5, 1.0, -1.0], 
    'fit_prior': [True, False]
}

grid_search = GridSearchCV(BernoulliNB(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'alpha': 0.01, 'binarize': 0.0, 'fit_prior': False}


60 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "E:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\Anaconda\Lib\site-packages\sklearn\naive_bayes.py", line 748, in fit
    self._validate_params()
  File "E:\Anaconda\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "E:\Anaconda\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParamete

In [95]:
best_bnb = BernoulliNB(alpha=0.01, binarize=0.0, fit_prior=True)
best_bnb.fit(X_train, y_train)
y_pred3 = best_bnb.predict(X_test)

In [96]:
print("Accuracy:", accuracy_score(y_test, y_pred3))
print(classification_report(y_test, y_pred3))

Accuracy: 0.9227053140096618
              precision    recall  f1-score   support

           0       0.97      0.92      0.94       742
           1       0.82      0.93      0.87       293

    accuracy                           0.92      1035
   macro avg       0.90      0.93      0.91      1035
weighted avg       0.93      0.92      0.92      1035



In [97]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(email_dataset["Email"])

In [98]:
X_numeric = email_dataset[["URL", "Spelling Mistake", "Spam Words"]].values
X = sp.hstack((X_tfidf, X_numeric), format="csr")

In [102]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, target_column, test_size=0.2, random_state=42)

In [103]:
bnb_tfidf = BernoulliNB(alpha=0.01, binarize=0.0, fit_prior=True)
bnb_tfidf.fit(X_train2, y_train2)

In [104]:
y_pred4 = bnb_tfidf.predict(X_test2)

print("Accuracy:", accuracy_score(y_test2, y_pred4))
print(classification_report(y_test2, y_pred4))

Accuracy: 0.9410628019323671
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       742
           1       0.91      0.88      0.89       293

    accuracy                           0.94      1035
   macro avg       0.93      0.92      0.93      1035
weighted avg       0.94      0.94      0.94      1035



In [105]:
multinomial_classifier = MultinomialNB()
multinomial_classifier.fit(X_train2, y_train2)

In [106]:
y_pred5 = multinomial_classifier.predict(X_test2)

print("Accuracy:", accuracy_score(y_test2, y_pred5))
print(classification_report(y_test2, y_pred5))

Accuracy: 0.9516908212560387
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       742
           1       0.90      0.93      0.92       293

    accuracy                           0.95      1035
   macro avg       0.94      0.95      0.94      1035
weighted avg       0.95      0.95      0.95      1035



In [107]:
multinomial_classifier = MultinomialNB(alpha=0.0001)
multinomial_classifier.fit(X_train2, y_train2)
y_pred6 = multinomial_classifier.predict(X_test2)

print("Accuracy:", accuracy_score(y_test2, y_pred6))
print(classification_report(y_test2, y_pred6))

Accuracy: 0.9623188405797102
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       742
           1       0.93      0.94      0.93       293

    accuracy                           0.96      1035
   macro avg       0.95      0.96      0.95      1035
weighted avg       0.96      0.96      0.96      1035



In [139]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train2, y_train2)
y_pred7 = rf_classifier.predict(X_test2)

print("Accuracy:", accuracy_score(y_test2, y_pred7))
print(classification_report(y_test2, y_pred7))

Accuracy: 0.9806763285024155
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       742
           1       0.96      0.98      0.97       293

    accuracy                           0.98      1035
   macro avg       0.97      0.98      0.98      1035
weighted avg       0.98      0.98      0.98      1035



In [143]:
rf_classifier2 = RandomForestClassifier()
rf_classifier2.fit(X_train, y_train)
y_pred8 = rf_classifier2.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred8))
print(classification_report(y_test, y_pred8))

Accuracy: 0.9690821256038648
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       742
           1       0.94      0.96      0.95       293

    accuracy                           0.97      1035
   macro avg       0.96      0.97      0.96      1035
weighted avg       0.97      0.97      0.97      1035



In [145]:
param_grid = {
    'n_estimators': [50, 100, 200], 
    'max_depth': [None, 10, 20],     
    'min_samples_split': [2, 5, 10], 
    'min_samples_leaf': [1, 2, 4],  
    'criterion': ['gini', 'entropy'] 
}

rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train2, y_train2)

print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [149]:
best_rf = RandomForestClassifier(criterion="entropy", max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200)
best_rf.fit(X_train2, y_train2)
y_pred9 = best_rf.predict(X_test2)

print("Accuracy:", accuracy_score(y_test2, y_pred9))
print(classification_report(y_test2, y_pred9))

Accuracy: 0.9835748792270531
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       742
           1       0.96      0.99      0.97       293

    accuracy                           0.98      1035
   macro avg       0.98      0.98      0.98      1035
weighted avg       0.98      0.98      0.98      1035



# **Saving the Best Model**

In [151]:
joblib.dump(best_rf, 'spam_classifier.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_spam_classifier.pkl')
joblib.dump(scaler, 'spam_classifier_scaler.pkl')

['spam_classifier_scaler.pkl']

In [153]:
joblib.dump(spell, 'spam_classifier_spell_checker.pkl')
joblib.dump(lemmatizer, 'spam_classifier_lemmatizer.pkl')

['spam_classifier_lemmatizer.pkl']

# **Predicting Values**

In [205]:
def predictSpam(email):
    email = email.lower()
    URL = int(bool(re.search(r'https?://\S+|www\.\S+|\S+\.\S+\.\S+', email)))
    email = remove_special_characters(email)
    email = remove_hyperlinks(email)
    Spelling_Mistake = count_spelling_mistakes(email)
    Spam_Words = count_spam_words(email)
    email = lemmatize_text(email)
    email = remove_stopwords(email)
    scaled_values = scaler.transform(np.array([[Spam_Words, Spelling_Mistake]])) 
    Spam_Words, Spelling_Mistake = scaled_values[0]
    TFIDF = tfidf_vectorizer.transform([email])
    INPUT = sp.hstack([TFIDF, np.array([[URL, Spelling_Mistake, Spam_Words]])], format='csr')
    pred = best_rf.predict(INPUT)

    if pred[0]:
        print("Model Predicts Spam")
    else:
        print("Model Predicts Not Spam")

In [207]:
test_email_msg = """Subject: Urgent Notification: You Have Won $1,000,000 

Dear User,

We are pleased to inform you that you have been selected as the winner of our $1,000,000 grand prize. This reward has been allocated to you as part of our annual customer appreciation program.

To claim your prize, please follow these steps immediately:

Click on the secure link below to verify your details:
Claim Your Prize Now

Complete the verification form with your full name, address, and banking details.

Receive your funds within 24 hours.

This is a limited-time offer, and failure to respond within the next 24 hours will result in the forfeiture of your winnings.

For any questions, contact our support team at support@globalwinners.com.

Sincerely,
The Global Lottery Team
Customer Support Department
+1-800-FAKE-NUMBER" """

In [209]:
predictSpam(test_email_msg)

Model Predicts Spam




In [211]:
test_email_msg_2 = """Subject: Meeting Reminder for Project Discussion  

Hi John,  

I hope you're doing well. This is a reminder about our upcoming meeting scheduled for **Monday, April 1st, at 3:00 PM**. We'll be discussing the progress of the project and the next steps.  

Please let me know if you're available at this time or if we need to reschedule. Looking forward to your input.  

Best regards,  
Sameer Khawar  
Project Manager  
sameer.khawar@example.com   """

In [213]:
predictSpam(test_email_msg_2)

Model Predicts Not Spam


