In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data= pd.read_csv("sms_raw_NB.csv")
data.head()

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or 澹10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...


In [3]:
data.shape

(5559, 2)

In [4]:
data.drop_duplicates(keep='first')

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or 澹10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...
...,...,...
5554,ham,You are a great role model. You are giving so ...
5555,ham,"Awesome, I remember the last time we got someb..."
5556,spam,"If you don't, your prize will go to another cu..."
5557,spam,"SMS. ac JSco: Energy is high, but u may not kn..."


In [5]:
#Data cleaning and preprocessing
#re=Regular Expression
import re

In [6]:
# when ever ur using stopwords,lematization,bag of words
import nltk


In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
#StopWords are used to remove words that are of no use [to,of,for,etc..]
from nltk.corpus import stopwords


In [10]:
#PorterStemmer is used for Stemming and stemming is used to find the base root of the word
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [11]:
corpus=[]

In [13]:
for i in range(0 ,len(data['text'])):
    review=re.sub('[^a-zA-Z]',' ',data['text'][i])#remove all other than a-zA-Z
    review=review.lower()
    review=review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [14]:
#train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(corpus,data.type,test_size=0.3)

In [15]:
#TfidfVectorizer
vectorizer=TfidfVectorizer()

In [16]:
X_train_transformed=vectorizer.fit_transform(X_train)
X_test_transformed=vectorizer.transform(X_test)

In [17]:
X_train_transformed=(X_train_transformed).toarray()
X_test_transformed=(X_test_transformed).toarray()

In [18]:
#GaussianNB
m1=GaussianNB()

m1.fit(X_train_transformed,Y_train)
y_predict1=m1.predict(X_test_transformed)
y_predict1

array(['ham', 'ham', 'spam', ..., 'spam', 'ham', 'spam'], dtype='<U4')

In [19]:
accuracy_score(Y_test,y_predict1)

0.8597122302158273

In [20]:
confusion_matrix(Y_test,y_predict1)

array([[1230,  211],
       [  23,  204]])

In [21]:
np.mean(Y_test==y_predict1)

0.8597122302158273

In [22]:
#BernoulliNB
m2=BernoulliNB()
m2.fit(X_train_transformed,Y_train)


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [23]:
pred=m2.predict(X_test_transformed)


In [24]:
accuracy_score(Y_test,pred)

0.9784172661870504

In [25]:
#MultinomialNB
m3 =MultinomialNB()
m3.fit(X_train_transformed,Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
pred1=m3.predict(X_test_transformed)

In [27]:
accuracy_score(Y_test,pred1)

0.9688249400479616