# Naive Bayes

In [24]:
# Import necessary packages
import pandas as pd
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
import numpy as np
from pandas import Series, DataFrame
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer # Convert to vectors
from sklearn.feature_selection import SelectPercentile

In [2]:
# Load the data: a CSV of messages labelled ham/spam.
# latin-1 is passed explicitly as the file encoding
# (presumably the file is not valid UTF-8 — TODO confirm against spam.csv).
data = pd.read_csv('spam.csv', encoding = 'latin-1')
# Show the first two rows to inspect the column layout.
data.head(2)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,


In [3]:
# Basic checks: (rows, columns) of the raw frame as loaded from the CSV.
data.shape

(5572, 5)

In [4]:
# Keep only the label (v1) and message text (v2) columns by dropping the three
# 'Unnamed' columns. Reassigning the result instead of using inplace=True,
# together with errors='ignore', keeps this cell idempotent: re-running it no
# longer raises KeyError once the columns are already gone.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Count missing values per remaining column (isna is the pandas alias of isnull).
data.isna().sum()

v1    0
v2    0
dtype: int64

In [6]:
# Hold out 30% of the messages for evaluation; the fixed random_state makes
# the split repeatable. Features are the message text (v2), labels are v1.
X_train, X_test, y_train, y_test = train_test_split(
    data['v2'],
    data['v1'],
    test_size=0.3,
    random_state=10,
)

In [7]:
# Number of messages in the training portion of the split.
X_train.shape

(3900,)

In [8]:
# Number of messages held out for testing.
X_test.shape

(1672,)

In [9]:
# Turn the raw message text into TF-IDF feature vectors.
vectorizer = TfidfVectorizer()
# Fit the vocabulary/IDF weights on the training messages only, then encode them.
X_train_transformed = vectorizer.fit_transform(X_train)
# Encode the test messages with the already-fitted vectorizer (no refit), so
# nothing from the held-out set influences the learned vocabulary.
X_test_transformed = vectorizer.transform(X_test)

In [10]:
# Display the repr of the TF-IDF training matrix (dimensions and storage format).
X_train_transformed

<3900x7230 sparse matrix of type '<class 'numpy.float64'>'
	with 51794 stored elements in Compressed Sparse Row format>

In [11]:
# Vocabulary terms learned by the TF-IDF vectorizer, one per matrix column.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
# The result is kept as a plain list, matching what the old method returned.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

In [12]:
# Number of vocabulary terms, i.e. the TF-IDF feature count before selection.
len(feature_names)

7230

In [13]:
# Keep the top 10% of TF-IDF features as ranked by SelectPercentile's default
# scoring function.
#
# The TF-IDF matrices are re-derived from the fitted vectorizer inside this cell
# rather than read from X_train_transformed / X_test_transformed. The original
# cell overwrote its own inputs, so running it a second time refit the selector
# on the already-reduced dense array and then failed on `.toarray()`, leaving a
# selector that no longer matched the vectorizer's vocabulary.
selector = SelectPercentile(percentile=10)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit the selector on training data only, then apply it to both splits.
selector.fit(X_train_tfidf, y_train)

# Densify the reduced matrices: GaussianNB does not accept sparse input.
X_train_transformed = selector.transform(X_train_tfidf).toarray()
X_test_transformed = selector.transform(X_test_tfidf).toarray()

In [15]:
# Gaussian Naive Bayes: fit on the selected training features, then label the
# held-out messages. fit() returns the estimator itself, so the calls chain.
model_gaussianb = GaussianNB().fit(X_train_transformed, y_train)
y_predict = model_gaussianb.predict(X_test_transformed)

In [16]:
# Fraction of test messages labelled correctly by whichever model produced
# y_predict most recently (the Gaussian model when cells are run in order).
accuracy_score(y_true=y_test, y_pred=y_predict)

0.9688995215311005

In [17]:
# Confusion table: rows are the true labels, columns the predicted labels.
pd.crosstab(index=y_test, columns=y_predict)

col_0,ham,spam
v1,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,1412,34
spam,18,208


In [18]:
# Multinomial Naive Bayes on the same selected features. Note that y_predict is
# overwritten here, so the evaluation cells below now describe this model.
model_mulnb = MultinomialNB().fit(X_train_transformed, y_train)
y_predict = model_mulnb.predict(X_test_transformed)

In [19]:
# Test-set accuracy for the current y_predict (the Multinomial model when the
# notebook is run top to bottom).
accuracy_score(y_true=y_test, y_pred=y_predict)

0.9407894736842105

In [20]:
# True labels (rows) against predicted labels (columns) for the current y_predict.
pd.crosstab(index=y_test, columns=y_predict)

col_0,ham,spam
v1,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,1446,0
spam,99,127


In [21]:
# Bernoulli Naive Bayes on the same selected features. y_predict is replaced
# again, so the following evaluation cells refer to this model.
model_bernb = BernoulliNB().fit(X_train_transformed, y_train)
y_predict = model_bernb.predict(X_test_transformed)

In [22]:
# Test-set accuracy for the current y_predict (the Bernoulli model when the
# notebook is run top to bottom).
accuracy_score(y_true=y_test, y_pred=y_predict)

0.9814593301435407

In [23]:
# Rows: actual label; columns: label predicted for the current y_predict.
pd.crosstab(index=y_test, columns=y_predict)

col_0,ham,spam
v1,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,1445,1
spam,30,196


In [32]:
# A single unseen message, wrapped in a Series so it can be passed to the
# fitted vectorizer the same way the training text was.
newEmail = pd.Series(['I got free tickets, lets go!!'])

In [33]:
# Push the new message through the same fitted steps as the training data:
# TF-IDF vectorizer -> feature selector -> dense array -> Bernoulli NB.
newEmail_tfidf = vectorizer.transform(newEmail)
newEmail_transformed = selector.transform(newEmail_tfidf).toarray()
model_bernb.predict(newEmail_transformed)

array(['ham'], dtype='<U4')