# Naive Bayes Classifier for spam email

Dataset: [Spam Email Dataset on Kaggle](https://www.kaggle.com/datasets/jackksoncsie/spam-email-dataset/)

In [131]:
from pathlib import Path

import kagglehub
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

### Load data

In [132]:
# Download the latest version of the dataset. The returned `path` is used by
# the next cell as the directory that contains emails.csv.
path = kagglehub.dataset_download("jackksoncsie/spam-email-dataset")



In [133]:
# Load the raw emails from the downloaded dataset directory.
# The path is joined with pathlib rather than string concatenation so the
# separator is handled for us.
df = pd.read_csv(Path(path) / 'emails.csv')

# Preview the first few rows; leaving the frame as the last expression lets
# the notebook render it as a table instead of plain printed text.
df.head()

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1


### Exploratory Data Analysis

In [134]:
# Column dtypes and non-null counts: a quick check that both columns loaded completely.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [135]:
# Summary statistics for the numeric `spam` label; with 0/1 labels the mean
# is the share of emails marked as spam.
df.describe()

Unnamed: 0,spam
count,5728.0
mean,0.238827
std,0.426404
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [136]:
# Count missing values per column (isna is the same check as isnull)
df.isna().sum()

text    0
spam    0
dtype: int64

In [143]:
# Hold out part of the data so the model can be scored on emails it has not seen.
# Features are the raw email text, the target is the spam label.
X, y = df['text'], df['spam']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,      # keep the spam / non-spam ratio the same in both splits
    random_state=1,  # fixed seed so the split is reproducible
)

In [138]:
# Bag-of-words word counts, ignoring common English stop words.
vectorizer = CountVectorizer(stop_words='english')

# Build the document-term matrices (DTM). The vocabulary is fitted on the
# training emails only; the unseen test emails are merely transformed with it.
X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)

# Both results are compressed sparse row (CSR) matrices, for efficient storage.
X_train_dtm.shape, X_test_dtm.shape

((4296, 32973), (1432, 32973))

### Train a Naive Bayes model

In [139]:
# Fit multinomial Naive Bayes on the training word counts (fit returns the
# estimator itself), then predict labels for the held-out test emails.
nb = MultinomialNB().fit(X_train_dtm, y_train)
y_pred = nb.predict(X_test_dtm)

### Evaluate accuracy

In [140]:
# Per-class precision / recall / F1 on the test set, printed under a heading.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# Overall accuracy, shown as the cell's output value.
accuracy_score(y_test, y_pred)


Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.99      0.99      1090
        True       0.97      0.99      0.98       342

    accuracy                           0.99      1432
   macro avg       0.99      0.99      0.99      1432
weighted avg       0.99      0.99      0.99      1432



0.9909217877094972

In [141]:
# Demonstrate out-of-sample predictions on a small, readable subset of the
# test set: the first 10 spam emails followed by the first 10 non-spam emails.
# Comparing against 1 / 0 works whether the label column is int or bool.
spam_ix = y_test[y_test == 1].index[:10]
ham_ix = y_test[y_test == 0].index[:10]
sampleix = spam_ix.append(ham_ix)

# Vectorize the selected emails with the already-fitted vocabulary and
# make predictions on these 20.
X_sample_test_dtm = vectorizer.transform(X_test.loc[sampleix])
y_sample_pred = nb.predict(X_sample_test_dtm)

# Show text, true label and prediction side by side.
# The table gets its own name on purpose: rebinding `df` here would replace
# the full dataset with these 20 rows, so re-running any earlier cell that
# reads `df` (e.g. the train/test split) would silently use the sample.
sample_results = (
    X_test.loc[sampleix]
    .to_frame()
    .assign(spam=y_test.loc[sampleix], pred=y_sample_pred)
)
sample_results

Unnamed: 0,text,spam,pred
587,Subject: porn p . o . : your 10 free pictures ...,True,True
156,Subject: do i require an attorney to use this ...,True,True
1045,"Subject: all graphics software available , che...",True,True
529,Subject: hi how to save on customer your medl...,True,True
1234,Subject: want to make women adore you ? click ...,True,True
1211,Subject: you can gain from lowest interest rat...,True,True
209,Subject: the next move higher for strong marke...,True,True
571,Subject: adv oil and gas investment tgym how ...,True,True
443,Subject: more site sales do you take credit c...,True,True
1204,Subject: better sex ! better relationship ! a...,True,True


### Demonstrate generalization of model

In [142]:
# Hand-written messages that are not part of the dataset, used as a quick
# check of how the fitted model behaves on brand-new text.
t=[
   # Double-quoted so the apostrophe survives: inside single quotes a doubled
   # '' is not an escape, Python joins the two adjacent literals into "youre".
   "Dear Mr. Kenobi, Subject: Help, you're my only hope!  Best Regards, Princess Laia",
   'buy viagra from our online pharmacy.',
   'Dear customer, Try our luxurious body lotions today, risk free!',
   'You may regret missing this promo!',
   'Subject: Mr. Levenstein, Your statement is ready. View your first statement now.  Sign in at onlinebank.com.  Thank you for being a valued customer.',
   'Hey there, I just wanted to touch base about the upcoming meeting. Give me a call at 123-456-7890',
  ]
# Transform with the vocabulary learned from the training set (no refitting),
# then predict one label per message, in the same order as the list above.
t_dtm=vectorizer.transform(t)
nb.predict(t_dtm)

array([False,  True,  True,  True,  True, False])