# Spam Classifier Using Naive Bayes

In [1]:
import numpy as np
import pandas as pd
# Load the tab-separated SMS corpus. The file carries no header row, so the
# two column names are supplied explicitly.
dataset = pd.read_table(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['Label', 'message'],
)

dataset.head()

Unnamed: 0,Label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
# List the working directory to confirm that the data file loaded earlier
# ('SMSSpamCollection') sits next to this notebook.
!ls

Bayesian_Inference-Copy1.ipynb
SMSSpamCollection
Spam_Classifier.ipynb


In [3]:
# Encode the text labels numerically: ham -> 0, spam -> 1.
dicti = {'ham': 0, 'spam': 1}
# Look every value up with itself as the fallback so that re-running this cell
# is a no-op. A plain `.map(dicti)` returns NaN for keys missing from the
# dict, which would wipe out the already-encoded 0/1 labels on a second run.
dataset['Label'] = dataset['Label'].map(lambda label: dicti.get(label, label))
dataset.head()

Unnamed: 0,Label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# (rows, columns) of the loaded frame; the two columns are Label and message.
dataset.shape

(5572, 2)

In [5]:
# Toy corpus used to walk through a bag-of-words pipeline by hand.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

# Step 1: normalise case so that 'Hello' and 'hello' count as the same token.
lower_case_documents = [text.lower() for text in documents]

print(lower_case_documents)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [6]:
import string

# Step 2: strip punctuation.
# A single translation table deletes every character in string.punctuation in
# one pass per document. This replaces a character-by-character loop that ran
# one str.replace per punctuation mark on each character; the resulting
# strings are the same.
_punctuation_remover = str.maketrans('', '', string.punctuation)
sans_punctuation_documents = [
    text.translate(_punctuation_remover) for text in lower_case_documents
]

print(sans_punctuation_documents)
        

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [7]:
# Step 3: tokenise each cleaned document on single spaces.
preprocessed_documents = [text.split(" ") for text in sans_punctuation_documents]

print(preprocessed_documents)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [8]:
from collections import Counter
import pprint  # Pretty Print

# Step 4: count how often each token occurs, one Counter per document.
frequency_list = [Counter(tokens) for tokens in preprocessed_documents]

pprint.pprint(frequency_list)

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


In [9]:
# The same four toy messages, restated as the input for the scikit-learn
# version of the bag-of-words steps.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer constructed with no arguments (library defaults); it is fitted on
# the toy `documents` list in the cells that follow.
count_vector = CountVectorizer()


In [11]:
# Show the estimator's repr to inspect how it is configured.
print(count_vector)

CountVectorizer()


In [12]:
# Learn the vocabulary from the toy corpus.
count_vector.fit(documents)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# `get_feature_names_out()` is its replacement and returns an ndarray, so
# `.tolist()` keeps the plain list-of-strings display this cell shows.
count_vector.get_feature_names_out().tolist()

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

The `get_feature_names()` method (renamed `get_feature_names_out()` in scikit-learn 1.0 and later) returns the feature names for this dataset, i.e. the set of distinct words that make up the vocabulary learned from `documents`.

In [13]:
# Encode each document as a row of token counts over the fitted vocabulary,
# then densify the result so it can be displayed and wrapped in a DataFrame.
_document_term_counts = count_vector.transform(documents)
doc_array = _document_term_counts.toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [14]:
# Label the count matrix with the vocabulary so each column reads as a word.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is the supported replacement.
frequency_matrix = pd.DataFrame(
    doc_array,
    columns=count_vector.get_feature_names_out(),
)
frequency_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


In [15]:
from sklearn.model_selection import train_test_split

# Split messages and labels into train/test partitions; the fixed
# random_state makes the shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['message'],
    dataset['Label'],
    random_state=1,
)

print(f'Number of rows in the total set: {dataset.shape[0]}')
print(f'Number of rows in the training set: {X_train.shape[0]}')
print(f'Number of rows in the test set: {X_test.shape[0]}')

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [16]:
# A fresh vectorizer for the real SMS data, separate from the toy
# `count_vector` fitted earlier.
count_vectorizer = CountVectorizer()
# Learn the vocabulary from the training messages only and encode them.
training_data = count_vectorizer.fit_transform(X_train)
# Encode the test messages with that same vocabulary; nothing is refitted on
# the test split.
testing_data = count_vectorizer.transform(X_test)

In [17]:
# Bayes' theorem warm-up: a diabetes test.

# P(D): prior probability of having diabetes
p_diabetes = 0.01

# P(~D): prior probability of not having diabetes
p_no_diabetes = 0.99

# P(Pos|D): sensitivity of the test
p_pos_diabetes = 0.9

# P(Neg|~D): specificity of the test
p_neg_no_diabetes = 0.9

# P(Pos) = P(D) * P(Pos|D) + P(~D) * P(Pos|~D), with P(Pos|~D) = 1 - specificity
_pos_and_diabetes = p_diabetes * p_pos_diabetes
_pos_and_no_diabetes = p_no_diabetes * (1 - p_neg_no_diabetes)
p_pos = _pos_and_diabetes + _pos_and_no_diabetes
print(f"Probability of getting a positive result is {p_pos}")

Probability of getting a positive result is 0.10799999999999998


In [18]:
# Posterior via Bayes' theorem: P(D|Pos) = P(D) * P(Pos|D) / P(Pos)
_pos_and_diabetes = p_diabetes * p_pos_diabetes
p_diabetes_pos = _pos_and_diabetes / p_pos
print(f"Probability of an individual having diabetes, given that that individual got a positive test result is:{p_diabetes_pos}")

Probability of an individual having diabetes, given that that individual got a positive test result is:0.08333333333333336


In [19]:
# P(Pos|~D): the false-positive rate, i.e. the complement of the specificity
# P(Neg|~D). It must not be derived from the sensitivity P(Pos|D); the two
# only give the same number here because both rates are set to 0.9.
p_pos_no_diabetes = 1 - p_neg_no_diabetes
# P(~D|Pos) = P(~D) * P(Pos|~D) / P(Pos)
p_no_diabetes_pos = (p_no_diabetes) * p_pos_no_diabetes / p_pos
# The value printed is the posterior of NOT having diabetes, so the message
# says so.
print('Probability of an individual not having diabetes, given that that individual got a positive test result is:{}'.format(p_no_diabetes_pos))

Probability of an individual having diabetes, given that that individual got a positive test result is:0.9166666666666666


In [20]:
# Naive Bayes by hand, candidate J (Jill Stein).

# P(J): prior for the candidate
p_j = 0.5

# P(F|J): probability of the word "freedom" given J
p_f_j = 0.1

# P(I|J): probability of the word "immigration" given J
p_i_j = 0.1

# Joint under the naive independence assumption: P(J) * P(F|J) * P(I|J)
p_j_text = p_j * p_f_j * p_i_j
print(p_j_text)


0.005000000000000001


In [21]:
# Naive Bayes by hand, candidate G (Gary Johnson).

# P(G): prior for the candidate
p_g = 0.5

# P(F|G): probability of the word "freedom" given G
p_f_g = 0.7

# P(I|G): probability of the word "immigration" given G
p_i_g = 0.2

# Joint under the naive independence assumption: P(G) * P(F|G) * P(I|G).
# This must use G's own prior `p_g`; the other candidate's prior `p_j` only
# produced the same number because both priors are 0.5.
p_g_text = p_g * p_f_g * p_i_g
print(p_g_text)

0.06999999999999999


In [22]:
# P(F, I): total probability of the two words, summed over both candidates'
# joint terms.
p_f_i = p_j_text + p_g_text
print(f'Probability of words freedom and immigration being said are: {p_f_i}')

Probability of words freedom and immigration being said are: 0.075


In [23]:
# Posterior P(J|F,I) = P(J) * P(F|J) * P(I|J) / P(F,I)
_stein_joint = p_j * p_f_j * p_i_j
p_j_fi = _stein_joint / p_f_i
print(f'The probability of Jill Stein saying the words Freedom and Immigration: {p_j_fi}')

The probability of Jill Stein saying the words Freedom and Immigration: 0.06666666666666668


In [24]:
# Posterior P(G|F,I) = P(G) * P(F|G) * P(I|G) / P(F,I)
_johnson_joint = p_g * p_f_g * p_i_g
p_g_fi = _johnson_joint / p_f_i
print(f'The probability of Gary Johnson saying the words Freedom and Immigration: {p_g_fi}')

The probability of Gary Johnson saying the words Freedom and Immigration: 0.9333333333333332


In [25]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default constructor arguments, fitted on the
# training count matrix and the 0/1 (ham/spam) labels.
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)


MultinomialNB()

In [26]:
# Predicted labels for the held-out test messages.
predictions = naive_bayes.predict(testing_data)


In [27]:
from sklearn.metrics import accuracy_score , precision_score, recall_score , f1_score
# Score the test-set predictions with each metric in turn. The captions and
# the two-argument print are kept so the output text is unchanged.
for caption, metric in (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
):
    print(caption, format(metric(y_test, predictions)))

Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562
