# Build a naive Bayes model on the data set for classifying the ham and spam

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer


In [3]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


In [4]:
email_data = pd.read_csv(r"C:\\Excelr Data\\Assignments\\Navie Byes\\sms_raw_NB.csv",encoding = "ISO-8859-1")

email_data

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or å£10,000..."
4,spam,okmail: Dear Dave this is your final notice to...
...,...,...
5554,ham,You are a great role model. You are giving so ...
5555,ham,"Awesome, I remember the last time we got someb..."
5556,spam,"If you don't, your prize will go to another cu..."
5557,spam,"SMS. ac JSco: Energy is high, but u may not kn..."


In [5]:
# cleaning data 
import re

In [6]:
#"this is awsome 1231312 $#%$# a i he yu nwj"
import re

In [7]:
def cleaning_text(i):
    i = re.sub("[^A-Za-z" "]+"," ",i).lower()
    i = re.sub("[0-9" "]+"," ",i)
    w = []
    for word in i.split(" "):
        if len(word)>2:
            w.append(word)
    return (" ".join(w))


In [8]:
email_data.text = email_data.text.apply(cleaning_text)
email_data.text

0             hope you are having good week just checking
1                                        give back thanks
2                        also doing cbe only but have pay
3       complimentary star ibiza holiday cash needs yo...
4       okmail dear dave this your final notice collec...
                              ...                        
5554    you are great role model you are giving much a...
5555    awesome remember the last time got somebody hi...
5556    you don your prize will another customer www b...
5557    sms jsco energy high but may not know where ch...
5558                      shall call now dear having food
Name: text, Length: 5559, dtype: object

In [9]:
# removing empty rows 
email_data.shape


(5559, 2)

In [10]:
email_data = email_data.loc[email_data.text != " ",:]
email_data 

Unnamed: 0,type,text
0,ham,hope you are having good week just checking
1,ham,give back thanks
2,ham,also doing cbe only but have pay
3,spam,complimentary star ibiza holiday cash needs yo...
4,spam,okmail dear dave this your final notice collec...
...,...,...
5554,ham,you are great role model you are giving much a...
5555,ham,awesome remember the last time got somebody hi...
5556,spam,you don your prize will another customer www b...
5557,spam,sms jsco energy high but may not know where ch...


In [11]:
def split_into_words(i):
    return (i.split(" "))


In [12]:
# splitting data into train and test data sets 
from sklearn.model_selection import train_test_split


In [13]:
email_train,email_test = train_test_split(email_data,test_size=0.3)


In [14]:
# Preparing email texts into word count matrix format 
emails_bow = CountVectorizer(analyzer=split_into_words).fit(email_data.text)
emails_bow

CountVectorizer(analyzer=<function split_into_words at 0x000001785FE2E558>)

In [15]:
# For all messages
all_emails_matrix = emails_bow.transform(email_data.text)
all_emails_matrix

<5559x7429 sparse matrix of type '<class 'numpy.int64'>'
	with 57658 stored elements in Compressed Sparse Row format>

In [16]:
all_emails_matrix.shape 

(5559, 7429)

In [17]:
# For training messages
train_emails_matrix = emails_bow.transform(email_train.text)
train_emails_matrix

<3891x7429 sparse matrix of type '<class 'numpy.int64'>'
	with 40193 stored elements in Compressed Sparse Row format>

In [18]:
train_emails_matrix.shape 


(3891, 7429)

In [19]:
# For testing messages
test_emails_matrix = emails_bow.transform(email_test.text)
test_emails_matrix

<1668x7429 sparse matrix of type '<class 'numpy.int64'>'
	with 17465 stored elements in Compressed Sparse Row format>

In [20]:
test_emails_matrix.shape 


(1668, 7429)

## Without TFIDF matrices ########################
# Preparing a naive bayes model on training data set 


In [21]:
from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.naive_bayes import GaussianNB as GB


In [22]:
# Multinomial Naive Bayes
classifier_mb = MB()
classifier_mb.fit(train_emails_matrix,email_train.type)


MultinomialNB()

In [23]:
train_pred_m = classifier_mb.predict(train_emails_matrix)
train_pred_m

array(['ham', 'ham', 'ham', ..., 'spam', 'ham', 'spam'], dtype='<U4')

In [24]:
accuracy_train_m = np.mean(train_pred_m==email_train.type) 
accuracy_train_m

0.9894628630172192

In [25]:
test_pred_m = classifier_mb.predict(test_emails_matrix)
test_pred_m

array(['ham', 'ham', 'ham', ..., 'spam', 'ham', 'spam'], dtype='<U4')

In [26]:
accuracy_test_m = np.mean(test_pred_m==email_test.type) 
accuracy_test_m

0.9742206235011991

### Gaussian Naive Bayes 


In [27]:
classifier_gb = GB()
classifier_gb.fit(train_emails_matrix.toarray(),email_train.type.values) 

# we need to convert tfidf into array format which is compatible for gaussian naive bayes

GaussianNB()

In [28]:
train_pred_g = classifier_gb.predict(train_emails_matrix.toarray())
train_pred_g

array(['ham', 'ham', 'ham', ..., 'spam', 'ham', 'spam'], dtype='<U4')

In [29]:
accuracy_train_g = np.mean(train_pred_g==email_train.type) 
accuracy_train_g

0.9362631714212285

In [30]:
test_pred_g = classifier_gb.predict(test_emails_matrix.toarray())
test_pred_g 

array(['ham', 'ham', 'ham', ..., 'spam', 'ham', 'spam'], dtype='<U4')

In [31]:
accuracy_test_g = np.mean(test_pred_g==email_test.type) 
accuracy_test_g

0.8794964028776978

In [32]:
# Learning Term weighting and normalizing on entire emails
tfidf_transformer = TfidfTransformer().fit(all_emails_matrix)
tfidf_transformer

TfidfTransformer()

In [33]:
# Preparing TFIDF for train emails
train_tfidf = tfidf_transformer.transform(train_emails_matrix)
train_tfidf

<3891x7429 sparse matrix of type '<class 'numpy.float64'>'
	with 40193 stored elements in Compressed Sparse Row format>

In [34]:
train_tfidf.shape 

(3891, 7429)

In [35]:
# Preparing TFIDF for test emails
test_tfidf = tfidf_transformer.transform(test_emails_matrix)
test_tfidf

<1668x7429 sparse matrix of type '<class 'numpy.float64'>'
	with 17465 stored elements in Compressed Sparse Row format>

In [36]:
test_tfidf.shape 

(1668, 7429)

#### Preparing a naive bayes model on training data set 


In [37]:
from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.naive_bayes import GaussianNB as GB


In [38]:
# Multinomial Naive Bayes


In [39]:
classifier_mb = MB()
classifier_mb.fit(train_tfidf,email_train.type)


MultinomialNB()

In [40]:
train_pred_m = classifier_mb.predict(train_tfidf)
train_pred_m

array(['ham', 'ham', 'ham', ..., 'spam', 'ham', 'spam'], dtype='<U4')

In [41]:
accuracy_train_m = np.mean(train_pred_m==email_train.type) 
accuracy_train_m

0.9647905422770496

In [42]:
test_pred_m = classifier_mb.predict(test_tfidf)
accuracy_test_m = np.mean(test_pred_m==email_test.type) 
accuracy_test_m

0.9580335731414868

In [43]:
# Gaussian Naive Bayes 


In [44]:
classifier_gb = GB()
classifier_gb.fit(train_tfidf.toarray(),email_train.type.values)

# we need to convert tfidf into array format which is compatible for gaussian naive bayes


GaussianNB()

In [45]:
train_pred_g = classifier_gb.predict(train_tfidf.toarray())
accuracy_train_g = np.mean(train_pred_g==email_train.type) 
accuracy_train_g

0.9362631714212285

In [46]:
test_pred_g = classifier_gb.predict(test_tfidf.toarray())
accuracy_test_g = np.mean(test_pred_g==email_test.type) 
accuracy_test_g

0.8764988009592326

In [47]:
# inplace of tfidf we can also use train_emails_matrix and test_emails_matrix instead of term inverse document 
# frequency matrix 
