### In this project, we will develop a Naive Bayes model that classifies SMS messages as spam or not spam.
### The model will be trained on labelled training data that we provide.


Download the dataset from the link below:

https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [1]:
import numpy as np
import pandas as pd
# Read the tab-separated SMS file into a two-column frame. The column names
# are supplied explicitly, so the first line of the file is treated as data.
# NOTE(review): quote handling is left at the pandas default. If the row count
# ever disagrees with the dataset's documented size, check whether stray `"`
# characters inside messages are merging lines (quoting=csv.QUOTE_NONE) — confirm.
column_names = ['label', 'sms_message']
df = pd.read_table('SMSSpamCollection', sep='\t', names=column_names)
# Preview the first rows.
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Preprocessing

In [2]:
# Encode the text labels as integers: 'ham' -> 0, 'spam' -> 1.
# Values that are not keys of the mapping (for example labels already encoded
# by an earlier run of this cell) are passed through unchanged, so re-running
# the cell no longer turns the whole column into NaN the way a plain
# `.map(dict)` does on already-encoded values.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))
# Dataset shape as (rows, columns)
df.shape

(5572, 2)

### Bag of Words is a term used to describe problems in which a collection of text data needs to be processed. The idea is to take a piece of text and count how often each word occurs in it, ignoring the order of the words.


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Four short sample documents used to illustrate the bag-of-words idea
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]
# Create a vectorizer with default settings and show its repr
count_vector = CountVectorizer()
print(count_vector)

CountVectorizer()


In [4]:
# Learn the vocabulary of the sample documents.
count_vector.fit(documents)
# `get_feature_names()` was removed from recent scikit-learn releases in favour
# of `get_feature_names_out()`, which returns a NumPy array; `.tolist()` turns
# it back into a plain list of Python strings so `names` keeps its original
# type. Older releases that lack the new method fall back to the original call.
if hasattr(count_vector, 'get_feature_names_out'):
    names = count_vector.get_feature_names_out().tolist()
else:
    names = count_vector.get_feature_names()
names

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [5]:
# Encode the sample documents against the fitted vocabulary, then convert the
# result to a dense NumPy array so it can be displayed directly.
doc_term_counts = count_vector.transform(documents)
doc_array = doc_term_counts.toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [6]:
# Tabulate the counts: one row per document, one column per vocabulary word
frequency_matrix = pd.DataFrame(doc_array, columns=names)
frequency_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


# Splitting the Dataset into Training and Testing Sets


In [7]:
from sklearn.model_selection import train_test_split

# Hold out part of the messages for evaluation; random_state pins the shuffle
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['sms_message'],
    df['label'],
    random_state=1,
)

# Report how many rows ended up in each subset.
total_rows = len(df)
train_rows = len(X_train)
test_rows = len(X_test)
print('Number of rows in the total set: {}'.format(total_rows))
print('Number of rows in the training set: {}'.format(train_rows))
print('Number of rows in the test set: {}'.format(test_rows))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [8]:
# Start from a fresh vectorizer (this rebinds the one fitted on the sample documents)
count_vector = CountVectorizer()
# Learn the vocabulary from the training messages and encode them in one step
training_data = count_vector.fit_transform(raw_documents=X_train)
# Encode the test messages with that same vocabulary; the vectorizer is
# deliberately not re-fitted on the test split.
testing_data = count_vector.transform(raw_documents=X_test)

In [9]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

# Train a multinomial Naive Bayes classifier on the word-count features.
# fit() hands back the estimator, so it can be chained onto the constructor;
# the bare name on the last line keeps the estimator repr as the cell output.
naive_bayes = MultinomialNB().fit(training_data, y_train)
naive_bayes

MultinomialNB()

In [10]:
# Predict a label for every message in the held-out test matrix
predictions = naive_bayes.predict(X=testing_data)

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the held-out predictions with four classification metrics.
# Each label is passed to print() as its own argument so the output keeps the
# same spacing between the label and the value.
metric_report = (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
)
for metric_label, metric_fn in metric_report:
    print(metric_label, format(metric_fn(y_test, predictions)))

Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562


In [12]:
import os
import urllib.request

# Remote source of the Spambase data and the local file it is read from.
url="https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
spambase_path = "spambase.data"
# Previously `url` was never used and the cell failed unless the file had been
# downloaded by hand. Fetch it once when it is not already on disk; an existing
# local copy is used as-is and no network access happens.
if not os.path.exists(spambase_path):
    urllib.request.urlretrieve(url, spambase_path)
# Parse the comma-separated numeric file into a 2-D float array.
dataset=np.loadtxt(spambase_path, delimiter=',')
# Show the first row as a sanity check.
dataset[0]

array([  0.   ,   0.64 ,   0.64 ,   0.   ,   0.32 ,   0.   ,   0.   ,
         0.   ,   0.   ,   0.   ,   0.   ,   0.64 ,   0.   ,   0.   ,
         0.   ,   0.32 ,   0.   ,   1.29 ,   1.93 ,   0.   ,   0.96 ,
         0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,
         0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,
         0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,
         0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,
         0.   ,   0.   ,   0.778,   0.   ,   0.   ,   3.756,  61.   ,
       278.   ,   1.   ])

In [13]:
# Dimensions of the loaded array as (rows, columns)
np.shape(dataset)

(4601, 58)

In [13]:
# Features: the first 48 columns. Target: the last column.
# NOTE(review): the array has 58 columns, so columns 48-56 are left out of X —
# confirm that dropping them is intended.
X = dataset[:, :48]
y = dataset[:, -1]
# Print both shapes, separated by blank lines.
print(X.shape, '\n', '\n', y.shape)

(4601, 48) 
 
 (4601,)


In [14]:
# Reserve 33% of the rows for testing; random_state fixes the shuffle.
# Note: this rebinds X_train / X_test / y_train / y_test, which earlier cells
# used for the SMS split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=17,
)


In [15]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB


In [24]:
# Bernoulli Naive Bayes needs binary features; `binarize` is the numeric
# threshold above which a feature value is mapped to 1. The original passed the
# boolean `True`, which reads like an on/off switch but acts as a threshold of
# 1. Writing 1.0 makes that threshold explicit without changing which values
# are binarized to 1.
bern=BernoulliNB(binarize=1.0)
bern.fit(X_train,y_train)
print(bern)
# Compare the predictions for the test split with the held-out labels.
y_expect = y_test
y_pred=bern.predict(X_test)
accuracy_score(y_expect,y_pred)

BernoulliNB(alpha=1.0, binarize=True, class_prior=None, fit_prior=True)


0.8558262014483212

In [25]:
# Multinomial Naive Bayes with default settings on the same split.
# fit() hands back the estimator, so construction and training are chained.
multi = MultinomialNB().fit(X_train, y_train)
print(multi)
# Accuracy of the test-split predictions against the held-out labels
y_expect = y_test
y_pred = multi.predict(X_test)
accuracy_score(y_true=y_expect, y_pred=y_pred)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


0.8736010533245556

In [26]:
# Gaussian Naive Bayes with default settings on the same split.
# fit() hands back the estimator, so construction and training are chained.
gauss = GaussianNB().fit(X_train, y_train)
print(gauss)
# Accuracy of the test-split predictions against the held-out labels
y_expect = y_test
y_pred = gauss.predict(X_test)
accuracy_score(y_true=y_expect, y_pred=y_pred)

GaussianNB(priors=None, var_smoothing=1e-09)


0.8130348913759052

In [27]:
# Bernoulli Naive Bayes again, this time binarizing features at a threshold of 0.1.
# This rebinds `bern`, replacing the previously fitted Bernoulli model.
binarize_threshold = 0.1
bern = BernoulliNB(binarize=binarize_threshold).fit(X_train, y_train)
print(bern)
# Accuracy of the test-split predictions against the held-out labels
y_expect = y_test
y_pred = bern.predict(X_test)
accuracy_score(y_true=y_expect, y_pred=y_pred)

BernoulliNB(alpha=1.0, binarize=0.1, class_prior=None, fit_prior=True)


0.8953258722843976