In [55]:
import pandas as pd
import re
import pickle
import joblib
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB

In [56]:
data=pd.read_csv("Email Spam Detection.csv",encoding='latin-1')
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [57]:
#Data Cleaning 
data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1,inplace=True)
data.drop_duplicates(inplace=True)
data['v2']=data['v2'].apply(lambda x:x.lower())
data

Unnamed: 0,v1,v2
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will ì_ b going to esplanade fr home?
5569,ham,"pity, * was in mood for that. so...any other s..."
5570,ham,the guy did some bitching but i acted like i'd...


In [58]:
#Tokenization
def preprocess(text):
    text=re.sub(r"[^a-zA-Z]"," ",text)
    words=word_tokenize(text)
    stop_words=set(stopwords.words('english'))
    words=[word for word in words if word not in stop_words]
    return ' '.join(words)

data['tokenized_text']=data['v2'].apply(preprocess)
data

Unnamed: 0,v1,v2,tokenized_text
0,ham,"go until jurong point, crazy.. available only ...",go jurong point crazy available bugis n great ...
1,ham,ok lar... joking wif u oni...,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say so early hor... u c already then say...,u dun say early hor u c already say
4,ham,"nah i don't think he goes to usf, he lives aro...",nah think goes usf lives around though
...,...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...,nd time tried contact u u pound prize claim ea...
5568,ham,will ì_ b going to esplanade fr home?,b going esplanade fr home
5569,ham,"pity, * was in mood for that. so...any other s...",pity mood suggestions
5570,ham,the guy did some bitching but i acted like i'd...,guy bitching acted like interested buying some...


In [59]:
#TF-IDF Vectorization
tfidf_vectorizer=TfidfVectorizer()
X=tfidf_vectorizer.fit_transform(data['tokenized_text'])

In [60]:
#Data splitting
X_train,X_test,Y_train,Y_test=train_test_split(X,data['v1'],test_size=0.2,random_state=42)

In [61]:
#Training a Multinomial Naive Bayes Model 
multiNB=MultinomialNB()
multiNB.fit(X_train,Y_train)
multiNBPre=multiNB.predict(X_test)

In [62]:
#Training a Bernoulli Naive Bayes Model 
BerNB=BernoulliNB()
BerNB.fit(X_train,Y_train)
BerNBPre=BerNB.predict(X_test)

In [63]:
#Training a Gaussian Naive Bayes Model 
gaussNB=GaussianNB()
gaussNB.fit(X_train.toarray(),Y_train)
gaussNBPre=gaussNB.predict(X_test.toarray())

In [64]:
#Evaluating the accuracy on the test set for the models
print("From Multinomial model",accuracy_score(Y_test,multiNBPre))
print("From Bernoulli model",accuracy_score(Y_test,BerNBPre))
print("From Gaussian model",accuracy_score(Y_test,gaussNBPre))

From Multinomial model 0.9690522243713733
From Bernoulli model 0.9709864603481625
From Gaussian model 0.8733075435203095


In [65]:
#According to the Accuracy Score the best model for email spam detection is Bernoulli Naive Bayes Model

In [66]:
#Saving the best model
joblib.dump(BerNB,'model_joblib.pkl')

['model_joblib.pkl']

In [67]:
#Saving the TF-IDF vectorizer using pickle
with open('tfidf_vectorizer.pkl','wb') as file:
    pickle.dump(tfidf_vectorizer,file)