#  Naive Bayes Classifiers

In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

## Naive Bayes
### Using Naive Bayes to predict spam

In [24]:
# The CSV contains bytes that are not valid UTF-8, so decode it as Latin-1.
data = pd.read_csv("spamham.csv", encoding="latin-1")
print(data.shape)  # (rows, columns) as a quick sanity check
data.head()

(5572, 2)


Unnamed: 0,type,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [25]:
# Raw message text is the model input; the ham/spam column is the target.
X1 = data["email"]
y = data["type"]

## Vectorization: Transforming Text to Vectors

In [26]:
# TF-IDF (term frequency-inverse document frequency): each message becomes a
# row of per-term weights over the vocabulary learned from the corpus.
# NOTE(review): the vectorizer is fitted on ALL messages before the train/test
# split, so the test messages influence the vocabulary/IDF weights - consider
# fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X1)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. It returns an ndarray, so convert to a
# list to keep the list-style display used by later cells. Fall back to the
# old method on scikit-learn versions that predate the new one.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [27]:
# Number of distinct terms in the vocabulary extracted by the vectorizer.
len(feature_names)

8672

In [38]:
# Peek at ten consecutive vocabulary entries (positions 2000-2009) as a sanity check.
feature_names[2000:2010]

['chez',
 'chg',
 'chgs',
 'chic',
 'chick',
 'chicken',
 'chickened',
 'chief',
 'chik',
 'chikku']

In [29]:
# Convert the sparse TF-IDF matrix to a dense ndarray.
# The guard makes this cell safe to re-run: once X is already an ndarray it has
# no .toarray() method and an unguarded call would raise AttributeError.
# NOTE(review): the dense copy is large (rows x vocabulary floats); the
# scikit-learn Naive Bayes estimators also accept sparse input, so this step
# may be unnecessary - confirm before removing.
if hasattr(X, "toarray"):
    X = X.toarray()

In [30]:
# Feature matrix dimensions: (number of messages, vocabulary size).
X.shape

(5572, 8672)

In [31]:
# Number of labels; this should match the row count of X.
y.shape

(5572,)

In [32]:
# Hold out 30% of the messages for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [35]:
# Fit a Naive Bayes classifier on the training split and predict the test split.
# BernoulliNB is already imported in the notebook's first cell, so the import
# is not repeated here (scattered imports hide dependencies and break re-runs).
# NOTE: BernoulliNB binarizes its input (default binarize=0.0), so the TF-IDF
# weights are reduced to "term present / absent" before fitting.
# alpha is the additive smoothing parameter.
# NOTE(review): how the value 0.74 was chosen is not recorded here - TODO document.
model = BernoulliNB(alpha=0.74)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)


In [36]:
# accuracy_score and classification_report are already imported in the
# notebook's first cell, so no import is repeated here.
print(accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))
# Confusion table of true labels (rows) against predicted labels (columns).
# The axes are named explicitly so the display does not fall back to the
# uninformative default "col_0" column header.
pd.crosstab(y_test, y_predict, rownames=["actual"], colnames=["predicted"])

0.9874401913875598
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1446
        spam       1.00      0.91      0.95       226

    accuracy                           0.99      1672
   macro avg       0.99      0.95      0.97      1672
weighted avg       0.99      0.99      0.99      1672



col_0,ham,spam
type,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,1446,0
spam,21,205


In [12]:
# Preview the first ten training rows.
X_train[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Checking new email for spam

In [39]:
# Message to classify. To try a different input, swap the active line with one
# of the commented alternatives below.
# NewEmail = pd.Series(["Hi team, We have meeting tomorrow"])
# NewEmail = pd.Series(['**FREE MESSAGE**Thanks for using the Auction Subscription Service. 18 . 150p/MSGRCVD 2 Skip an Auction txt OUT. 2 Unsubscribe txt STOP CustomerCare 08718726270'])
NewEmail = pd.Series(["Hi .. This is Deepa. Are you available this Friday"])
NewEmail


0    Hi .. This is Deepa. Are you available this Fr...
dtype: object

In [40]:
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the new
# message is mapped onto the same vocabulary columns the model was trained on.
NewEmail_transformed = vectorizer.transform(NewEmail)

In [45]:
# Display the vectorized message; the bare expression shows its repr.
NewEmail_transformed

<1x8672 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [41]:
# One row for the single message; the column count should equal the vocabulary size.
NewEmail_transformed.shape

(1, 8672)

In [42]:
# Predict the class label for the new message with the fitted model.
model.predict(NewEmail_transformed)

array(['ham'], dtype='<U4')

In [43]:
# Display the held-out feature matrix produced by train_test_split.
X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])