# Spam Filter with Naive Bayes
### We will build a spam filter of SMS messages using the multinomial Naive Bayes algorithm. The goal is for the spam filter to evaluate the message as spam or ham with an accuracy of 80% or better.

### The training data will consist of a 5,572 SMS message dataset of mobile phone communications from the SMS Spam Collection Data Set of the UCI Machine Learning Repository.  Tiago A. Almeida and José María Gómez Hidalgo assembled the repository from free or free for research resources on the internet.


In [1]:
import pandas as pd
sms_spam_df = pd.read_csv('SMSSpamCollection',sep='\t',header=None,names=['label','SMS'])

In [2]:
print(sms_spam_df.shape)
sms_spam_df.describe()

(5572, 2)


Unnamed: 0,label,SMS
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [3]:
sms_spam_df.head()

Unnamed: 0,label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Next, the percentage of spam is 13.4% and "ham" 86.6%. This seems resonable as most messages are normally non-spam.

In [4]:
sms_spam_df['label'].value_counts(normalize=True) * 100

ham     86.593683
spam    13.406317
Name: label, dtype: float64

## Split the Data into a Training Set and a Test Set

In [5]:
# Randomize the dataset
sms_spam_randomized = sms_spam_df.sample(frac=1, random_state=1)
# find index for 80/20 split between training set and test set
train_ndx_range = round(len(sms_spam_randomized) * 0.8)
# create a training dataframe and a test dataframe
training_set = sms_spam_randomized[:train_ndx_range].reset_index(drop=True)
test_set = sms_spam_randomized[train_ndx_range:].reset_index(drop=True)

In [6]:
# The test set and training set are very similiar in terms of the proportion of 
#     'Spam' vs. 'Ham' labelled messages, consistent with the random sampling
print(training_set['label'].value_counts(normalize=True) * 100)
print(test_set['label'].value_counts(normalize=True) * 100)

ham     86.54105
spam    13.45895
Name: label, dtype: float64
ham     86.804309
spam    13.195691
Name: label, dtype: float64


In [7]:
training_set.head()

Unnamed: 0,label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [8]:
test_set.head()

Unnamed: 0,label,SMS
0,ham,Later i guess. I needa do mcat study too.
1,ham,But i haf enuff space got like 4 mb...
2,spam,Had your mobile 10 mths? Update to latest Oran...
3,ham,All sounds good. Fingers . Makes it difficult ...
4,ham,"All done, all handed in. Don't know if mega sh..."


## Data Cleaning: Removing Punctuation, Lower Casing

In [9]:
training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ')
training_set['SMS'] = training_set['SMS'].str.lower()

In [10]:
training_set.head()

Unnamed: 0,label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [11]:
test_set['SMS'] = test_set['SMS'].str.replace('\W', ' ')
test_set['SMS'] = test_set['SMS'].str.lower()

In [12]:
test_set.head()

Unnamed: 0,label,SMS
0,ham,later i guess i needa do mcat study too
1,ham,but i haf enuff space got like 4 mb
2,spam,had your mobile 10 mths update to latest oran...
3,ham,all sounds good fingers makes it difficult ...
4,ham,all done all handed in don t know if mega sh...


## Create The Vocabulary

In [13]:
training_set['SMS'] = training_set['SMS'].str.split()
training_set.head()

Unnamed: 0,label,SMS
0,ham,"[yep, by, the, pretty, sculpture]"
1,ham,"[yes, princess, are, you, going, to, make, me,..."
2,ham,"[welp, apparently, he, retired]"
3,ham,[havent]
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,..."


#### The vocabulary consists of 7783 words

In [14]:
vocabulary = []
for sms in training_set['SMS']:
    for word in sms:
        vocabulary.append(word)
vocabulary = list(set(vocabulary))
len(vocabulary)

7783

In [15]:
# build dictionary word_counts_per_sms for tracking count of word occurences
training_set_len = len(training_set['SMS'])
word_counts_per_sms = {unique_word: [0] * training_set_len 
                       for unique_word in vocabulary}
for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [16]:
# Print first five items in word_counts_per_sms dictionary
i= 0
for word,nlist in word_counts_per_sms.items():
        i += 1
        if i > 5:
            break
        print(word,': ',nlist,'\n')

consider :  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [17]:
word_counts = pd.DataFrame(word_counts_per_sms)
training_set_clean = pd.concat([training_set,word_counts],axis=1)
training_set_clean.head()

Unnamed: 0,label,SMS,0,00,000,000pes,008704050406,0089,01223585334,02,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


### Naive Bayes Algorithm: Calculate Probabilities

In [18]:
training_set_clean.shape

(4458, 7785)

We're now done with cleaning the training set, and we can begin creating the spam filter. The Naive Bayes algorithm will need to answer these two probability questions to be able to classify new messages:

$$
P(Spam | w_1,w_2, ..., w_n) \propto P(Spam) \cdot \prod_{i=1}^{n}P(w_i|Spam)
$$$$
P(Ham | w_1,w_2, ..., w_n) \propto P(Ham) \cdot \prod_{i=1}^{n}P(w_i|Ham)
$$
Also, to calculate P(wi|Spam) and P(wi|Ham) inside the formulas above, we'll need to use these equations:

$$
P(w_i|Spam) = \frac{N_{w_i|Spam} + \alpha}{N_{Spam} + \alpha \cdot N_{Vocabulary}}
$$$$
P(w_i|Ham) = \frac{N_{w_i|Ham} + \alpha}{N_{Ham} + \alpha \cdot N_{Vocabulary}}
$$
Some of the terms in the four equations above will have the same value for every new message. We can calculate the value of these terms once and avoid doing the computations again when a new messages comes in. Below, we'll use our training set to calculate:

P(Spam) and P(Ham)
NSpam, NHam, NVocabulary
We'll also use Laplace smoothing and set $\alpha = 1$.

In [19]:
# Separate messages into spam and ham items
spam_messages = training_set_clean[training_set_clean['label']=='spam']
ham_messages = training_set_clean[training_set_clean['label']=='ham']

# probability of spam vs. ham:  p(spam) and p(ham)
p_spam = len(spam_messages)/len(training_set_clean)
p_ham = len(ham_messages)/len(training_set_clean)

#  n_spam
n_words_per_spam_message = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()

# n_ham
n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()

# n_vocabulary
n_vocabulary = len(vocabulary)
#
alpha = 1

In [20]:
print(n_spam)

15190


In [21]:
# Calculate parameters
parameters_spam = {unique_word: 0 for unique_word in vocabulary}
parameters_ham = {unique_word: 0 for unique_word in vocabulary}
for word in vocabulary:
    # p(word|spam)
    n_word_given_spam = spam_messages[word].sum()
    p_word_given_spam = (n_word_given_spam + alpha)/(n_spam + alpha * n_vocabulary)
    # p(word|ham)
    n_word_given_ham = ham_messages[word].sum()
    p_word_given_ham = (n_word_given_ham + alpha)/(n_ham + (alpha * n_vocabulary))
    # update parameters
    parameters_spam[word] = p_word_given_spam
    parameters_ham[word] = p_word_given_ham

In [22]:
import re

def classify(message):
    
    '''
    message: a string
    '''

    message = re.sub('\W', ' ', message)
    message = message.lower().split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
   
    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [23]:
message1 ='WINNER!! This is the secret code to unlock the money: C3421.'
classify(message1)

P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam


In [24]:
message2 = "Sounds good, Tom, then see u there"
classify(message1)

P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam


In [25]:
# Final version of Classify that returns evaluation of message

def classify_test_set(message):
    
    '''
    message: a string
    '''

    message = re.sub('\W', ' ', message)
    message = message.lower().split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
   
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [26]:
# Make Predictions
test_set['predicted'] = test_set['SMS'].apply(classify_test_set)
test_set.head()

Unnamed: 0,label,SMS,predicted
0,ham,later i guess i needa do mcat study too,ham
1,ham,but i haf enuff space got like 4 mb,ham
2,spam,had your mobile 10 mths update to latest oran...,spam
3,ham,all sounds good fingers makes it difficult ...,ham
4,ham,all done all handed in don t know if mega sh...,ham


In [27]:
# initialize variables 
correct = 0
total = len(test_set)
print(total)

1114


In [28]:
for row in test_set.iterrows():
    row = row[1]
    if row['predicted'] == row['label']:
        correct += 1

accuracy = correct/total
print(accuracy)

0.9874326750448833
