#  Naive Bayes Classifiers

In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

## Naive Bayes
### Using Naive Bayes to predict spam

In [3]:
#Use Latin encoding as the Data has non UFT-8 Chars
data = pd.read_csv("spam.csv",encoding='latin-1')

In [14]:
data.shape

(5572, 5)

In [36]:
data[data.v1=='spam']


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",,,
12,spam,URGENT! You have won a 1 week FREE membership ...,,,
15,spam,"XXXMobileMovieClub: To use your credit, click ...",,,
19,spam,England v Macedonia - dont miss the goals/team...,,,
34,spam,Thanks for your subscription to Ringtone UK yo...,,,
42,spam,07732584351 - Rodger Burns - MSG = We tried to...,,,


In [9]:
X =  data.v2
y = data.v1

In [11]:
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [13]:
X_test.shape

(1393,)

In [15]:
vectorizer = TfidfVectorizer()

In [16]:
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed  = vectorizer.transform(X_test)
feature_names = vectorizer.get_feature_names()

In [17]:
len(feature_names)

7369

In [18]:
#### slim the data for training and testing
selector = SelectPercentile( percentile=10)
selector.fit(X_train_transformed, y_train)
X_train_transformed_per = selector.transform(X_train_transformed).toarray()
X_test_transformed_per  = selector.transform(X_test_transformed).toarray()

In [21]:
X_test_transformed_per.shape

(1393, 737)

In [22]:
clf = GaussianNB()
clf.fit(X_train_transformed_per, y_train)
y_predict = clf.predict(X_test_transformed_per)

In [23]:
print(accuracy_score(y_test, y_predict))

0.9727207465900933


In [24]:
confusion_matrix(y_test, y_predict)


array([[1180,   22],
       [  16,  175]])

In [55]:
print(classification_report(y_test, y_predict))

             precision    recall  f1-score   support

        ham       0.99      0.98      0.99      1211
       spam       0.89      0.91      0.90       182

avg / total       0.97      0.97      0.97      1393



In [39]:
NewEmail = pd.Series(["Urgent UR awarded a complimentary trip"])
NewEmail


0    Urgent UR awarded a complimentary trip
dtype: object

In [40]:
NewEmail_transformed = vectorizer.transform(NewEmail)
NewEmail_transformed  = selector.transform(NewEmail_transformed).toarray()
clf.predict(NewEmail_transformed)

array(['spam'], dtype='<U4')

In [76]:
clf_mul = MultinomialNB()
clf_mul.fit(X_train_transformed_per, y_train)
y_predict_mul = clf_mul.predict(X_test_transformed_per)

In [77]:
confusion_matrix(y_test, y_predict_mul)


array([[1211,    0],
       [  54,  128]])

In [78]:
accuracy_score(y_test, y_predict_mul)

0.9612347451543432

In [79]:
clf_ber = BernoulliNB()
clf_ber.fit(X_train_transformed_per, y_train)
y_predict_ber = clf_ber.predict(X_test_transformed_per)

In [80]:
accuracy_score(y_test, y_predict_ber)

0.9842067480258435

In [81]:
confusion_matrix(y_test, y_predict_ber)

array([[1208,    3],
       [  19,  163]])