In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile

# Data Preprocessing

In [3]:
df =pd.read_csv(r"D:\Data Science\spam.csv",encoding='latin-1')

In [4]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1,inplace=True)

In [6]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df.shape

(5572, 2)

In [10]:
x_train, x_test, y_train, y_test = train_test_split(df.v2,df.v1,test_size = 0.25)

In [12]:
y_train.head()

515      ham
3109    spam
2088    spam
5462    spam
93      spam
Name: v1, dtype: object

In [13]:
x_train.head()

515                            S:)no competition for him.
3109    Good Luck! Draw takes place 28th Feb 06. Good ...
2088    Well done ENGLAND! Get the official poly ringt...
5462    Txt: CALL to No: 86888 & claim your reward of ...
93      Please call our customer service representativ...
Name: v2, dtype: object

In [14]:
vectorizer = TfidfVectorizer()

In [15]:
x_train_transformed = vectorizer.fit_transform(x_train)
x_test_transformed = vectorizer.transform(x_test)

In [16]:
features_names = vectorizer.get_feature_names()

In [17]:
len(features_names)

7599

In [19]:
selector = SelectPercentile(percentile=10)
selector.fit(x_train_transformed, y_train)
x_train_transformed = selector.transform(x_train_transformed).toarray()
x_test_transformed = selector.transform(x_test_transformed).toarray()


In [20]:
x_train_transformed 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Applying Naive Bayes

In [22]:
m1 = GaussianNB()

m1.fit(x_train_transformed,y_train)
y_predict = m1.predict(x_test_transformed)
y_predict

accuracy_score(y_test,y_predict)

0.9720028715003589

In [23]:
np.mean(y_test == y_predict)

0.9720028715003589

In [24]:
confusion_matrix(y_test,y_predict)

array([[1208,   24],
       [  15,  146]], dtype=int64)

In [25]:
(1177+166)/(1177+166+24+26)

0.9641062455132807

In [27]:
model_bernb = BernoulliNB()

model_bernb.fit(x_train_transformed,y_train)
y_predict = model_bernb.predict(x_test_transformed)


accuracy_score(y_test,y_predict)

0.9806173725771715

In [28]:
model_mulnb = MultinomialNB()

model_mulnb.fit(x_train_transformed,y_train)
y_predict = model_mulnb.predict(x_test_transformed)


accuracy_score(y_test,y_predict)

0.9576453697056713

In [29]:
newEmail = pd.Series('On Time Painting with Money Back* Guarantee')

In [30]:
newEmail_transformed = vectorizer.transform(newEmail)
newEmail_transformed = selector.transform(newEmail_transformed).toarray()

In [31]:
m1.predict(newEmail_transformed)

array(['ham'], dtype='<U4')