## Spam filter using Naive Bayes from scratch 

### Multinomial Naive Bayes 

In [42]:
import pandas as pd

sms_data = pd.read_csv('smsspamcollection/SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])

sms_data.shape

(5572, 2)

In [44]:
sms_data.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [45]:
sms_data['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [46]:
# Before splitting dataset into training and testing dataset, 
# I will be shuffling the dataset and randomizing it to split training and test dataset 
#with balanced labels (having same ham:spam in test dataset and train dataset both)

# Splitting with 70% training and 30% test dataset

# randomizing  dataset
randomized_data = sms_data.sample(frac=1, random_state=1)

# calculating the index for splitting
index = round(len(randomized_data) * 0.7)

# splitting into training and test datasets
train_data = randomized_data[:index].reset_index(drop=True)
test_data = randomized_data[index:].reset_index(drop=True)


In [47]:
print(train_data.shape)
print(test_data.shape)

(3900, 2)
(1672, 2)


In [48]:
train_data['Label'].value_counts(normalize=True)

ham     0.865897
spam    0.134103
Name: Label, dtype: float64

In [49]:
test_data['Label'].value_counts(normalize=True)

ham     0.866029
spam    0.133971
Name: Label, dtype: float64

In [50]:
#Data cleaning is required as all words need to be extracted separately. 
#Words in the SMS are the features 

#Removing punctuations 
train_data['SMS'] = train_data['SMS'].str.replace('\W', ' ')

# all words to lower
train_data['SMS'] = train_data['SMS'].str.lower()


In [51]:
# Creating bag of words/ vocabulary from words in SMS 

train_data['SMS'] = train_data['SMS'].str.split()


In [53]:
vocab = []
for sms in train_data['SMS']:
    for w in sms:
#         print(w)
        vocab.append(w)

vocab = list(set(vocab))


In [58]:
len(vocab)

7212

In [59]:
vocab

['true',
 '0870753331018',
 'priority',
 'nokia',
 'xy',
 '89693',
 'bid',
 '300',
 'lover',
 'rcd',
 'body',
 'urination',
 'land',
 '88600',
 'prayers',
 'tough',
 'hoped',
 'nasty',
 '25p',
 'u2moro',
 'power',
 'prospects',
 'remembr',
 'promise',
 'scary',
 'sweets',
 'edison',
 'mayb',
 'busy',
 'speechless',
 'charlie',
 '24m',
 'totes',
 'burrito',
 'banks',
 'whispers',
 'youre',
 'reacting',
 'geeee',
 'student',
 'velly',
 'uv',
 'plaid',
 '87021',
 'thread',
 '42810',
 'childrens',
 'meive',
 'area',
 '946',
 'monkey',
 'themob',
 'reverse',
 'dined',
 'cheese',
 'strongly',
 'science',
 'sh',
 'hundred',
 'wants',
 'beerage',
 '08718726270',
 'cardiff',
 'uawake',
 'meal',
 'breathe1',
 'suzy',
 'pale',
 'patrick',
 '07973788240',
 'infernal',
 'email',
 'raed',
 'filthyguys',
 'eighth',
 'complacent',
 'sack',
 'butt',
 'got',
 'wahala',
 'fooled',
 'hour',
 'local',
 'realised',
 'yaxx',
 'landline',
 'prey',
 'toll',
 'appy',
 'soryda',
 'role',
 'helping',
 '20',
 'lun

In [60]:
# Now need to create a table of frequency of each word per SMS 

word_freq_per_sms = {word: [0] * len(train_data['SMS']) for word in vocab}

for i, sms in enumerate(train_data['SMS']):
    for word in sms:
        word_freq_per_sms[word][i] += 1

In [92]:
word_freq = pd.DataFrame(word_freq_per_sms)
word_freq.head()

Unnamed: 0,true,0870753331018,priority,nokia,xy,89693,bid,300,lover,rcd,...,blocked,vomiting,argue,curious,latr,juliana,pobox202,poortiyagi,dough,gving
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
train_data_cleaned = pd.concat([train_data, word_freq], axis=1)
train_data_cleaned.head()

Unnamed: 0,Label,SMS,true,0870753331018,priority,nokia,xy,89693,bid,300,...,blocked,vomiting,argue,curious,latr,juliana,pobox202,poortiyagi,dough,gving
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
#Now to use multinomial naive bayes algorithm, need to calculate two probabilities:
# P(spam | w1,w2,w3,....,wn) and P(ham | w1,w2,w3,....,wn)

# first separating spam and ham messages 
spam_sms = train_data_cleaned[train_data_cleaned['Label'] == 'spam']
ham_sms = train_data_cleaned[train_data_cleaned['Label'] == 'ham']

In [95]:
# P(Spam) and P(Ham)
p_spam = len(spam_sms) / len(train_data)
p_ham = len(ham_sms) / len(train_data)

# N_Spam
n_words_per_spam_sms = spam_sms['SMS'].apply(len)
n_spam = n_words_per_spam_sms.sum()

# N_Ham
n_words_per_ham_sms = ham_sms['SMS'].apply(len)
n_ham = n_words_per_ham_sms.sum()

# N_Vocabulary
n_vocab = len(vocab)

# Laplace smoothing
alpha = 1

In [100]:
# initiating parameters
param_spam = {word:0 for word in vocab}
param_ham = {word:0 for word in vocab}

In [101]:
# calculating parameters
for word in vocab:
    n_word_given_spam = spam_sms[word].sum() # spam_messages already defined
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocab)
    param_spam[word] = p_word_given_spam

    n_word_given_ham = ham_sms[word].sum() # ham_messages already defined
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocab)
    param_ham[word] = p_word_given_ham

In [108]:
# function to classify new messages 
import re

def classification(sms):
    sms = re.sub('\W', ' ', sms)
    sms = sms.lower().split()

    p_spam_sms = p_spam
    p_ham_sms = p_ham

    for word in sms:
        if word in param_spam:
            p_spam_sms *= param_spam[word]

        if word in param_ham: 
            p_ham_sms *= param_ham[word]

    print('P(Spam|sms):', p_spam_sms)
    print('P(Ham|sms):', p_ham_sms)

    if p_ham_sms > p_spam_sms:
        print('Label: Ham')
    elif p_ham_sms < p_spam_sms:
        print('Label: Spam')
    else:
        print('Equal probabilities!')
    

In [109]:
classification('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|sms): 5.904203202197771e-26
P(Ham|sms): 2.9520580445641527e-27
Label: Spam


In [110]:
classification("Sounds good, Tom, then see u there")

P(Spam|sms): 2.1098884398714407e-25
P(Ham|sms): 4.453247150467117e-21
Label: Ham


In [111]:
#method to test classification of naive bayes

def test_classification(sms):
    sms = re.sub('\W', ' ', sms)
    sms = sms.lower().split()

    p_spam_sms = p_spam
    p_ham_sms = p_ham

    for word in sms:
        if word in param_spam:
            p_spam_sms *= param_spam[word]

        if word in param_ham: 
            p_ham_sms *= param_ham[word]

    if p_ham_sms > p_spam_sms:
        return 'ham'
    elif p_ham_sms < p_spam_sms:
        return 'spam'
    else:
        return 'undetermined'
    


In [112]:
test_data['predicted'] = test_data['SMS'].apply(test_classification)
test_data.head()

Unnamed: 0,Label,SMS,predicted
0,ham,"Camera quite good, 10.1mega pixels, 3optical a...",ham
1,ham,At 4. Let's go to bill millers,ham
2,ham,Is there coming friday is leave for pongal?do ...,ham
3,spam,WINNER! As a valued network customer you hvae ...,spam
4,ham,Yar... I tot u knew dis would happen long ago ...,ham


In [115]:
#Evaluate classification function 
# by calculating accuracy ( acc = correctly classified / total classfied)

correct = 0 
total = test_data.shape[0]

for row in test_data.iterrows():
    row = row[1]
    if row['Label'] == row['predicted']:
        correct += 1

print('Accuracy:', correct/total)


Accuracy: 0.986244019138756


### Achieved an accuracy of 98.6 % using multinomial naive bayes classifier 