# Spam filter for SMS messages

Our first task is to "teach" the computer how to classify messages. To do that, we'll use the multinomial Naive Bayes algorithm along with a dataset of 5,572 SMS messages that are already classified by humans.

In [1]:
import pandas as pd

In [2]:
sms_spam = pd.read_csv('SMSSpamCollection', sep='\t', 
                                  header=None, 
                                  names=['Label', 'SMS']
                                 )

In [3]:
sms_spam.shape

(5572, 2)

In [4]:
sms_spam.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Percentage of spam and non-spam in the df
percentage_spam = sms_spam['Label'].value_counts(normalize=True)
percentage_spam

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [6]:
# Randomizing the entire dataset
data_randomized = sms_spam.sample(frac=1, random_state=1)

# Split the randomized dataset into a training and a test set
test_number = round(len(data_randomized)* 0.8)
print(test_number)

4458


In [7]:
# Test dataframe and training dataframe
training_set = data_randomized[:test_number].reset_index(drop=True)
test_set = data_randomized[test_number:].reset_index(drop=True)

print(len(training_set))
print(len(test_set))

4458
1114


In [8]:
# Find the percentage of spam in the training and the test set. 
training_set['Label'].value_counts(normalize=True)

ham     0.86541
spam    0.13459
Name: Label, dtype: float64

In [9]:
test_set['Label'].value_counts(normalize=True)

ham     0.868043
spam    0.131957
Name: Label, dtype: float64

# Data cleaning

In [10]:
# Remove all the punctuation from the SMS column. 
training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ')
training_set['SMS'] = training_set['SMS'].str.lower()
training_set.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


# Create the Vocabulary

In [11]:
# transforming each message from the SMS column into a list
training_set['SMS'] = training_set['SMS'].str.split()

In [12]:
# Initiate an empty list named vocabulary.
vocabulary = []
# Using a nested loop, iterate each message in the SMS column
for sms in training_set['SMS']:
    for word in sms:
        vocabulary.append(word)
print(len(vocabulary))

72427


In [13]:
# Transform the vocabulary list into a set. This will remove duplicates.
# Transform the vocabulary set back into a list 
vocabulary = list(set(vocabulary))
print(len(vocabulary))

7783


# The Final Training Set

In [14]:
# The code [0] * len(training_set['SMS']) outputs 
# a list of the length of training_set['SMS'], 
# where each element in the list will be a 0 0]:

word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}

#cada columna de la palabra tiene una longitud de 0 igual al número de filas del df

In [15]:
# We loop over training_set['SMS'] using the enumerate() 
# to get both the index and the SMS message (index and sms).

for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [16]:
word_counts = pd.DataFrame(word_counts_per_sms)

In [17]:
word_counts.head()

Unnamed: 0,china,help08714742804,08718711108,guessing,do,attended,okors,08000407165,bfore,explicit,...,xclusive,adjustable,sucks,fact,tactful,09050002311,rest,kusruthi,these,84122
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Label,SMS,china,help08714742804,08718711108,guessing,do,attended,okors,08000407165,...,xclusive,adjustable,sucks,fact,tactful,09050002311,rest,kusruthi,these,84122
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Calculate P(Spam) and P(Ham)
spam_sms = training_set_clean[training_set_clean['Label'] == 'spam']
ham_sms = training_set_clean[training_set_clean['Label'] == 'ham']

p_spam = len(spam_sms) / len(training_set_clean)
p_ham = len(ham_sms) / len(training_set_clean)

In [20]:
# N_Spam
n_words_spam = spam_sms['SMS'].apply(len)
n_spam = n_words_spam.sum()

# N_Ham
n_words_ham = ham_sms['SMS'].apply(len)
n_ham = n_words_ham.sum()

In [21]:
# N_Vocabulary
n_vocabulary = len(vocabulary)

In [22]:
# Laplace smoothing
alpha = 1

In [23]:
# Initialize two dictionaries, where each key-value pair 
# is a unique word (from our vocabulary) represented 
# as a string, and the value is 0.

parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

In [25]:
# Isolate the spam and the ham messages in the training set into two different DataFrames. 
# spam_sms and ham_sms

# Iterate over the vocabulary, and, for each word, calculate P(wi|Spam) and P(wi|Ham)

for word in vocabulary:
    n_word_given_spam = spam_sms[word].sum()
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha * n_vocabulary)
    parameters_spam[word] = p_word_given_spam
    
    n_word_given_ham = ham_sms[word].sum()
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha * n_vocabulary)
    parameters_ham[word] = p_word_given_ham

In [26]:
parameters_ham

{'china': 4.6139649338665025e-05,
 'help08714742804': 1.537988311288834e-05,
 '08718711108': 1.537988311288834e-05,
 'guessing': 3.075976622577668e-05,
 'do': 0.004860043063672716,
 'attended': 3.075976622577668e-05,
 'okors': 3.075976622577668e-05,
 '08000407165': 1.537988311288834e-05,
 'bfore': 3.075976622577668e-05,
 'explicit': 1.537988311288834e-05,
 'hut': 3.075976622577668e-05,
 'tescos': 3.075976622577668e-05,
 'anyplaces': 3.075976622577668e-05,
 'stubborn': 3.075976622577668e-05,
 'emc1': 1.537988311288834e-05,
 'm39m51': 1.537988311288834e-05,
 'al': 0.00015379883112888343,
 'bud': 7.689941556444171e-05,
 'chinchillas': 3.075976622577668e-05,
 'slices': 3.075976622577668e-05,
 '80122300p': 1.537988311288834e-05,
 'box139': 1.537988311288834e-05,
 'department': 4.6139649338665025e-05,
 'moon': 9.227929867733005e-05,
 'ayn': 3.075976622577668e-05,
 '5digital': 3.075976622577668e-05,
 'but': 0.005444478621962473,
 'wizzle': 3.075976622577668e-05,
 'fulfil': 3.075976622577668e-

# Classifying A New Message

The spam filter can be understood as a function that:

* Takes in as input a new message (w1, w2, ..., wn)
* Calculates P(Spam|w1, w2, ..., wn) and P(Ham|w1, w2, ..., wn)
* Compares the values of P(Spam|w1, w2, ..., wn) and P(Ham|w1, w2, ..., wn), and:
    * If P(Ham|w1, w2, ..., wn) > P(Spam|w1, w2, ..., wn), then the message is classified as ham.
    * If P(Ham|w1, w2, ..., wn) < P(Spam|w1, w2, ..., wn), then the message is classified as spam.
    * If P(Ham|w1, w2, ..., wn) = P(Spam|w1, w2, ..., wn), then the algorithm may request human help.

In [27]:
import re

def classify(message):
    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)
    
    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal probabilities, have a human classify this!')


In [28]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 1.3481290211300841e-25
P(Ham|message) 1.9368049028589875e-27
Label: Spam


In [29]:
classify("Sounds good, Tom, then see u there")

P(Spam|message): 2.4372375665888117e-25
P(Ham|message) 3.687530435009238e-21
Label: Ham


# Testing our spam filter

In [30]:
# we'll change the classify() function that we wrote previously to return the labels instead of printing them. 

def classify_test_set(message):
    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
    
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'need human classification'

In [31]:
test_set['predicted'] = test_set['SMS'].apply(classify_test_set)
test_set.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [32]:
correct = 0
total = len(test_set)

for row in test_set.iterrows():
    row = row[1] # el contenido de cada fila sin el índice, forma tupla
    if row['Label'] == row['predicted']:
        correct += 1

accuracy = correct/total*100
print('Correct: ', correct)
print('Total: ', total)
print('Percentage accuracy: ', round(accuracy, 2))

Correct:  1100
Total:  1114
Percentage accuracy:  98.74


Accuracy is better we expected (above 80%).

In [34]:
# Isolate the 14 messages that were classified incorrectly and try to figure out why the algorithm reached the wrong conclusions.
incorrect = 0

for row in test_set.iterrows():
    row = row[1] # el contenido de cada fila sin el índice, forma tupla
    if row['Label'] != row['predicted']:
        incorrect += 1

print(incorrect)

14


In [35]:
test_set.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [36]:
incorrect_set = test_set[test_set['Label'] != test_set['predicted']]

In [37]:
incorrect_set

Unnamed: 0,Label,SMS,predicted
114,spam,Not heard from U4 a while. Call me now am here...,ham
135,spam,More people are dogging in your area now. Call...,ham
152,ham,Unlimited texts. Limited minutes.,spam
159,ham,26th OF JULY,spam
284,ham,Nokia phone is lovly..,spam
293,ham,A Boy loved a gal. He propsd bt she didnt mind...,need human classification
302,ham,No calls..messages..missed calls,spam
319,ham,We have sent JD for Customer Service cum Accou...,spam
504,spam,Oh my god! I've found your number again! I'm s...,ham
546,spam,"Hi babe its Chloe, how r u? I was smashed on s...",ham
