In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile

In [2]:
# encoding='latin-1' is used to download all special characters and everything in python
data = pd.read_csv('../input/spam.csv',encoding='latin-1')

In [3]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [5]:
data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1,inplace=True)

In [7]:
data1 = data['v1']
data.shape

(5572, 2)

In [8]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
X_train, X_test,y_train, y_test = train_test_split(data.v2,data.v1,test_size=0.3)

In [10]:
X_train.shape

(3900,)

In [11]:
y_train.shape

(3900,)

In [12]:
y_test.shape

(1672,)

In [13]:
X_train.head()

703     Thats a bit weird, even ?- where is the do sup...
2658                           Not yet chikku..wat abt u?
754     Realy sorry-i don't recognise this number and ...
4273    Ball is moving a lot.will spin in last :)so ve...
2995    They released vday shirts and when u put it on...
Name: v2, dtype: object

In [14]:
vectorizer = TfidfVectorizer()

In [15]:
#don't fit model on test data
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

In [16]:
X_train_transformed

<3900x7222 sparse matrix of type '<class 'numpy.float64'>'
	with 51684 stored elements in Compressed Sparse Row format>

In [17]:
features_names = vectorizer.get_feature_names()

In [18]:
len(features_names)

7222

In [19]:
selector = SelectPercentile(percentile=5)
selector.fit(X_train_transformed, y_train)
X_train_transformed = selector.transform(X_train_transformed).toarray()
X_test_transformed = selector.transform(X_test_transformed).toarray()

In [20]:
X_train_transformed 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [21]:
## Applying Naive Bayes

In [22]:
m1 = GaussianNB()

m1.fit(X_train_transformed,y_train)
y_predict = m1.predict(X_test_transformed)
y_predict # Predicted Value
y_test # Actual Value
accuracy_score(y_test,y_predict)
#y_predict
#accuracy_score(y_test,y_predict)
np.mean(y_test == y_predict)
confusion_matrix(y_test,y_predict)

array([[1432,   21],
       [  31,  188]])

In [23]:
#from sklearn library used to learn accuracy
accuracy_score(y_test,y_predict)

0.9688995215311005

In [24]:
y_test.shape

(1672,)

In [25]:
np.mean(y_test == y_predict)

0.9688995215311005

In [26]:
confusion_matrix(y_test,y_predict)

array([[1432,   21],
       [  31,  188]])

In [27]:
(1435+187)/(1435+187+30+20)

0.9700956937799043

In [28]:
model_bernb = BernoulliNB()

model_bernb.fit(X_train_transformed,y_train)
y_predict = model_bernb.predict(X_test_transformed)


accuracy_score(y_test,y_predict)



0.9778708133971292

In [29]:
newEmail = pd.Series('hello how are you')

In [30]:
newEmail

0    hello how are you
dtype: object

In [31]:
newEmail_transformed = vectorizer.transform(newEmail)
newEmail_transformed = selector.transform(newEmail_transformed).toarray()

In [32]:
m1.predict(newEmail_transformed)

array(['ham'], dtype='<U4')

In [33]:
newEmail2 = pd.Series('WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.')

In [34]:
newEmail2

0    WINNER!! As a valued network customer you have...
dtype: object

In [35]:
newEmail_transformed2 = vectorizer.transform(newEmail2)
newEmail_transformed2 = selector.transform(newEmail_transformed2).toarray()

In [36]:
m1.predict(newEmail_transformed2)

array(['spam'], dtype='<U4')