## Implementing a Naive Bayes classifier in Python using scikit-learn to classify emails as spam or non-spam based on their content.

In [1]:
import numpy as np #linear algebra
import pandas as pd #data processing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the spam/ham dataset into a DataFrame.
# The raw-string prefix already keeps backslashes literal, so each path
# separator is written once instead of being doubled.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook on another machine.
df = pd.read_csv(r"C:\Users\admin\Desktop\tycs 14\DW\spam_ham_dataset.csv")

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [4]:
# Preview the last five rows of the frame.
df.tail(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0
5170,4807,spam,Subject: important online banking alert\r\ndea...,1


In [5]:
# Count how many emails carry each value of the "label" column.
label_column = df["label"]
s = label_column.value_counts()
s

ham     3672
spam    1499
Name: label, dtype: int64

In [6]:
# Remove the two columns that are not needed, in a single call,
# then display the resulting frame.
df = df.drop(columns=['Unnamed: 0', 'label'])
df

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...
5166,Subject: put the 10 on the ft\r\nthe transport...,0
5167,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,Subject: industrial worksheets for august 2000...,0


In [7]:
# Confirm the remaining columns by looking at the first five rows.
df.head(n=5)

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


In [8]:
#Creating a new feature, extracting subject of each email
def _first_line_without_subject(text):
    """Return the part of ``text`` before its first carriage return, with
    every occurrence of the word "Subject" removed.

    If ``text`` contains no carriage return, the whole string is used.
    Only the word "Subject" is stripped, so the ": " that follows it in the
    raw email stays at the start of the returned value.
    """
    first_line = text.split('\r', 1)[0]
    return first_line.replace("Subject", "")


# Iterate over the column values directly: this avoids the label-based lookup
# df['text'][i] (which only works with the default 0..n-1 index) and the inner
# loop variable that shadowed the outer one.
subjects = [_first_line_without_subject(email_text) for email_text in df['text']]

In [9]:
# Attach the extracted subject lines as a new "Subject" column.
df = df.assign(Subject=subjects)

In [10]:
# Give the three columns descriptive names (assigned by position).
renamed_columns = ["Email_text", "Label", "Email_Subject"]
df.columns = renamed_columns

In [11]:
# Check the renamed columns and the new subject feature on the first five rows.
df.head(n=5)

Unnamed: 0,Email_text,Label,Email_Subject
0,Subject: enron methanol ; meter # : 988291\r\n...,0,: enron methanol ; meter # : 988291
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,": hpl nom for january 9 , 2001"
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,: neon retreat
3,"Subject: photoshop , windows , office . cheap ...",1,": photoshop , windows , office . cheap . main ..."
4,Subject: re : indian springs\r\nthis deal is t...,0,: re : indian springs


In [12]:
#Split Email dataset
# Hold out 10% of the emails for testing. A fixed random_state makes the split,
# and therefore every metric computed from it, repeatable across kernel restarts.
X_train,X_test,y_train,y_test = train_test_split(
    df['Email_text'],
    df['Label'],
    test_size=0.1,
    random_state=42,
)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training emails and turn them into
# a document-term count matrix.
Vectorizer = CountVectorizer()
train_documents = X_train.values
count = Vectorizer.fit_transform(train_documents)

In [14]:
# Train a multinomial Naive Bayes model on the training count matrix.
targets = y_train.values
Spam_Detection = MultinomialNB().fit(count, targets)
# fit() returns the estimator itself, so ending the cell with it keeps the
# fitted model as the cell's displayed value.
Spam_Detection

In [15]:
# Vectorize the held-out emails with the already-fitted vocabulary,
# then predict their labels.
test_counts = Vectorizer.transform(X_test)
y_predict = Spam_Detection.predict(test_counts)

In [16]:
# Fraction of held-out emails whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_predict)

0.971042471042471

In [17]:
# Per-class precision, recall and F1 on the held-out emails.
report = classification_report(y_true=y_test, y_pred=y_predict)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       365
           1       0.97      0.93      0.95       153

    accuracy                           0.97       518
   macro avg       0.97      0.96      0.96       518
weighted avg       0.97      0.97      0.97       518



In [19]:
# Try the model on a hand-written email that is not part of the dataset.
Email = ['Are you experienced Python Developer seeking a high-high- paying job? Your dream job awaits you!! We’ve received your resume and would love to set up an online interview. Click here [Link] or call us at [Phone Number] at your earliest convenience.']
email_counts = Vectorizer.transform(Email)
y_predict = Spam_Detection.predict(email_counts)
print(y_predict)

[1]


In [20]:
# Try the model on a second hand-written email (a non-spam example).
Email = ['All the students are hereby informed that your examination will start from 21st march,2024.']
email_counts = Vectorizer.transform(Email)
y_predict = Spam_Detection.predict(email_counts)
print(y_predict)

[0]
