In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import BernoulliNB
import re

sms_spam = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])

print(sms_spam.shape)
sms_spam.head()

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
sms_spam['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [3]:
# Randomize the dataset
data_randomized = sms_spam.sample(frac=1, random_state=1)

# Calculate index for split
training_test_index = round(len(data_randomized) * 0.8)

# Training/Test split
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)


(4458, 2)
(1114, 2)


In [4]:
training_set['Label'].value_counts(normalize=True, dropna = False)

ham     0.86541
spam    0.13459
Name: Label, dtype: float64

In [5]:
test_set['Label'].value_counts(normalize = True, dropna = False)

ham     0.868043
spam    0.131957
Name: Label, dtype: float64

In [6]:
training_set.head()

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [7]:
#Remove all punctuation marks
training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ')

#Change all letters to small case
training_set['SMS'] = training_set['SMS'].str.lower()

  training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ')


In [8]:
training_set.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [9]:
#split the sms column on white space and convert each row to a list
training_set['SMS'] = training_set['SMS'].str.split()

vocabulary = []
for sms in training_set['SMS']:
    for word in sms:
        vocabulary.append(word)

#Set returns tuple of unique words
vocabulary = list(set(vocabulary))
len(vocabulary)

7783

In [10]:

#Create a default dictionary with each unique word a count of zero
word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}
#print(list(word_counts_per_sms.items())[:4])


In [11]:
#Loop over the training set and count the number of times each unique word occurs
for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1
#print(list(word_counts_per_sms.items())[1:2])

In [12]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,edu,fraction,tnc,awww,demand,food,deluxe,taunton,net,directors,...,sculpture,see,xavier,cya,impatient,url,othrs,winning,mountains,spoken
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
word_counts['smth'].value_counts(dropna = False)

0    4444
1      13
2       1
Name: smth, dtype: int64

In [14]:
word_counts['the'].value_counts(dropna = False)

0     3622
1      661
2      133
3       29
4       10
8        1
5        1
10       1
Name: the, dtype: int64

In [15]:
word_counts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4458 entries, 0 to 4457
Columns: 7783 entries, edu to spoken
dtypes: int64(7783)
memory usage: 264.7 MB


In [16]:
training_set.head()
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Label,SMS,edu,fraction,tnc,awww,demand,food,deluxe,taunton,...,sculpture,see,xavier,cya,impatient,url,othrs,winning,mountains,spoken
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
training_set.head()

Unnamed: 0,Label,SMS
0,ham,"[yep, by, the, pretty, sculpture]"
1,ham,"[yes, princess, are, you, going, to, make, me,..."
2,ham,"[welp, apparently, he, retired]"
3,ham,[havent]
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,..."


In [18]:
# Isolating spam and ham messages first
spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']

# P(Spam) and P(Ham)
p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)

# N_Spam
n_words_per_spam_message = spam_messages['SMS'].apply(len)
#print(n_words_per_spam_message)
#print(spam_messages)
n_spam = n_words_per_spam_message.sum()

# N_Ham
n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1


In [19]:
# Initiate parameters
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

# Calculate parameters
for word in vocabulary:
    n_word_given_spam = spam_messages[word].sum()   # spam_messages already defined in a cell above
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + (alpha*n_vocabulary))
    parameters_spam[word] = p_word_given_spam
    
    n_word_given_ham = ham_messages[word].sum()   # ham_messages already defined in a cell above
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + (alpha*n_vocabulary))
    parameters_ham[word] = p_word_given_ham

In [20]:
# Initial parameter for bernoulli_nb i.e multivariant

parameters_spam_NB = {unique_word:0 for unique_word in vocabulary}
parameters_ham_NB = {unique_word:0 for unique_word in vocabulary}
#pdb.set_trace()

# Calculate parameters
spam_messages_df = pd.DataFrame(spam_messages)
ham_messages_df = pd.DataFrame(ham_messages)

n_spam_messages_doc_count=len(spam_messages)
n_ham_messages_doc_count= len(ham_messages)

for word in vocabulary:
    n_doc_spam_word= (spam_messages_df[word] != 0).sum()
    
    #fraction of spam doc which has the word 
    p_word_given_spam_NB = (n_doc_spam_word+1)/(n_spam_messages_doc_count+2)
    parameters_spam_NB[word] = p_word_given_spam_NB
    
    #no of ham doc having the word
    n_doc_ham_word = (np.count_nonzero(ham_messages_df[word]))
    
    #fraction of ham doc which has the word 
    p_word_given_ham_NB = (n_doc_ham_word+1)/(n_ham_messages_doc_count+2)     
    parameters_ham_NB[word] = p_word_given_ham_NB
    


In [21]:
def classify_test_set(message):    
    '''
    message: a string
    '''
    
    #remove all punctuations
    message = re.sub('\W', ' ', message)
    
    #lower and split all words on white space
    message = message.lower().split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
            
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
    
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'Not able to estimate'

In [22]:
import math
def classify_test_set_multivariant(message):    
    '''
    message: a string
    '''
    
    #remove all punctuations
    message = re.sub('\W', ' ', message)
    
    #lower and split all words on white space
    message = message.lower().split()
    
    #p(c=spam)
    p_spam_given_message_NB = math.log(p_spam)
    #p(c=ham)
    p_ham_given_message_NB = math.log(p_ham)
    
    #p(C=k)p(D|C=k) 
    # p(Di∣C)= bit*p(wt∣C)+(1−bit)*(1−p(wt∣C))
    
    #bit  is either 0 or 1 representing the absence or presence of word wt in the ith document.
    
    for word in vocabulary:
        # check if word is in message
        if word in message:
            p_spam_given_message_NB += math.log(parameters_spam_NB[word])
            #pdb.set_trace()
            p_ham_given_message_NB += math.log(parameters_ham_NB[word])
        else:
            #word not in vocabulary ie bit=0
            p_spam_given_message_NB += math.log(1-parameters_spam_NB[word])
            p_ham_given_message_NB += math.log (1-parameters_ham_NB[word])       
         
    
    if p_ham_given_message_NB > p_spam_given_message_NB:
        return 'ham'
    elif p_spam_given_message_NB > p_ham_given_message_NB:
        return 'spam'
    else:
        return 'We are not able to estimate'

In [23]:
#Create a new column showing the result of our algorithm
test_set['predicted_multinomial'] = test_set['SMS'].apply(classify_test_set)
test_set.head()

Unnamed: 0,Label,SMS,predicted_multinomial
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [24]:
correct = 0
total = test_set.shape[0]
    
for row in test_set.iterrows():
    row = row[1]
    if row['Label'] == row['predicted_multinomial']:
        correct += 1
print("Multinomial")        
print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', str(correct/total * 100) + '%')

Multinomial
Correct: 1100
Incorrect: 14
Accuracy: 98.74326750448833%


In [25]:
#Create a new column showing the result of our algorithm
test_set['predicted_multivariant'] = test_set['SMS'].apply(classify_test_set_multivariant)
test_set.head()

Unnamed: 0,Label,SMS,predicted_multinomial,predicted_multivariant
0,ham,Later i guess. I needa do mcat study too.,ham,ham
1,ham,But i haf enuff space got like 4 mb...,ham,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham,ham


In [26]:
correct = 0
total = test_set.shape[0]
    
for row in test_set.iterrows():
    row = row[1]
    if row['Label'] == row['predicted_multivariant']:
        correct += 1
print("Multivariant")        
print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', str(correct/total * 100) + '%')

Multivariant
Correct: 1094
Incorrect: 20
Accuracy: 98.20466786355476%
