# Building a Spam Filter with Naive Bayes

In [1]:
import math

import pandas as pd

In [2]:
# Load the tab-separated SMS file. It has no header row (header=None), so the
# two column names are supplied explicitly.
emails = pd.read_csv('SMSSpamCollection', sep='\t', header=None, 
                    names=['Label','SMS'])
emails.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


`ham` means the message is not spam (i.e. it is a legitimate message).

In [3]:
# Report the number of rows (messages) and columns of the loaded frame.
n_observations, n_variables = emails.shape
print("Dataset has {} observations and {} variables.".format(n_observations, n_variables))

Dataset has 5572 observations and 2 variables.


In [4]:
# Share of each label, expressed as a percentage of all messages.
emails['Label'].value_counts(normalize=True).mul(100)

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

#### Split the data into train and test sets

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Keep 80% of the messages for training and hold out the remaining 20% for
# testing; random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(emails['SMS'], emails['Label'],
                                                    random_state=1, train_size=.8)

In [7]:
y_test.value_counts(normalize=True)

ham     0.868161
spam    0.131839
Name: Label, dtype: float64

In [8]:
y_train.value_counts(normalize=True)

ham     0.86538
spam    0.13462
Name: Label, dtype: float64

In [9]:
X_train.head()

1642    Hi , where are you? We're at  and they're not ...
2899          If you r @ home then come down within 5 min
480     When're you guys getting back? G said you were...
3485    Tell my  bad character which u Dnt lik in me. ...
157                           I'm leaving my house now...
Name: SMS, dtype: object

Convert the text to lower case, remove all punctuation marks, and separate words by a single space.

In [10]:
# Normalise the training text: every character that is not an ASCII letter or
# digit becomes a space, the text is lower-cased, and runs of whitespace are
# collapsed to a single space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently skip the cleaning.
# Raw strings avoid the invalid "\s" escape in a normal string literal.
# NOTE: a leading/trailing space can remain after this step.
X_train = X_train.str.replace(r"[^A-Za-z0-9]", " ", regex=True).str.lower()
X_train = X_train.str.replace(r"\s+", " ", regex=True)
X_train.head()

1642    hi where are you we re at and they re not keen...
2899            if you r home then come down within 5 min
480     when re you guys getting back g said you were ...
3485    tell my bad character which u dnt lik in me i ...
157                             i m leaving my house now 
Name: SMS, dtype: object

### It may be a good idea to get familiar with the NLTK library for natural language processing

In [11]:
# Tokenise each cleaned message on single spaces. A message that still has a
# leading or trailing space yields an empty-string token, which therefore also
# ends up in the vocabulary built from these lists.
X_train = X_train.str.split(" ")

In [12]:
X_train.head()

1642    [hi, where, are, you, we, re, at, and, they, r...
2899    [if, you, r, home, then, come, down, within, 5...
480     [when, re, you, guys, getting, back, g, said, ...
3485    [tell, my, bad, character, which, u, dnt, lik,...
157                     [i, m, leaving, my, house, now, ]
Name: SMS, dtype: object

In [13]:
# Flatten the per-message token lists into one long list of training tokens.
words_list = [token for tokens in X_train.tolist() for token in tokens]
print('Total words: {}'.format(len(words_list)))

# The vocabulary is the set of distinct tokens, kept as a list.
vocabulary = list(set(words_list))
print("Unique words: {}".format(len(vocabulary)))

Total words: 74513
Unique words: 7747


In [14]:
# For every vocabulary word keep one counter per training message
# (position i in the list corresponds to the i-th message in X_train).
n_messages = len(X_train)
word_counts_per_sms = {}
for unique_word in vocabulary:
    word_counts_per_sms[unique_word] = [0] * n_messages

# Walk the messages in order and tally each token occurrence.
for position, tokens in enumerate(X_train):
    for token in tokens:
        per_message_counts = word_counts_per_sms[token]
        per_message_counts[position] += 1

IDEA: Remove all columns containing numbers

In [15]:
# One row per training message (re-using X_train's index so it can be aligned
# with y_train) and one column per vocabulary word; each cell is the number of
# times that word occurs in that message.
word_counts = pd.DataFrame(word_counts_per_sms, index=X_train.index)
word_counts.head()

Unnamed: 0,Unnamed: 1,0,00,000,008704050406,0121,01223585236,01223585334,0125698789,02,...,zealand,zebra,zed,zeros,zhong,zindgi,zoe,zoom,zouk,zyada
1642,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2899,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3485,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
157,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Put the label next to the word counts; concatenating along axis=1 joins the
# two objects column-wise on their shared index.
final_df = pd.concat([y_train, word_counts], axis=1)
final_df.head()

Unnamed: 0,Label,Unnamed: 2,0,00,000,008704050406,0121,01223585236,01223585334,0125698789,...,zealand,zebra,zed,zeros,zhong,zindgi,zoe,zoom,zouk,zyada
1642,ham,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2899,ham,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
480,ham,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3485,ham,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
157,ham,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Implement the Naive Bayes algorithm

\begin{equation}
P(Spam | w_1,w_2, ..., w_n) \propto P(Spam) \cdot \prod_{i=1}^{n}P(w_i|Spam) \\
P(Ham | w_1,w_2, ..., w_n) \propto P(Ham) \cdot \prod_{i=1}^{n}P(w_i|Ham)
\end{equation}

\begin{equation}
P(w_i|Spam) = \frac{N_{w_i|Spam} + \alpha}{N_{Spam} + \alpha \cdot N_{Vocabulary}} \\
P(w_i|Ham) = \frac{N_{w_i|Ham} + \alpha}{N_{Ham} + \alpha \cdot N_{Vocabulary}}
\end{equation}

In [17]:
# Class priors P(Ham) and P(Spam), estimated as label frequencies in the
# training set. They are looked up by label rather than unpacked by position:
# value_counts() orders by frequency, so positional unpacking would silently
# swap the priors if spam were ever the majority class.
class_priors = y_train.value_counts(normalize=True)
p_ham, p_spam = class_priors['ham'], class_priors['spam']

# Additive smoothing constant (the alpha in the formulas above).
alpha = 1
print(p_ham, p_spam)

0.8653803006506618 0.13461969934933812


In [18]:
# N_Spam: total number of tokens across all spam training messages.
is_spam = y_train == 'spam'
words_in_spam_message = X_train[is_spam].map(len)
total_words_in_spam = words_in_spam_message.sum()
total_words_in_spam

15463

In [19]:
# N_Ham: total number of tokens across all ham training messages.
is_ham = y_train == 'ham'
words_in_ham_message = X_train[is_ham].map(len)
total_words_in_ham = words_in_ham_message.sum()
total_words_in_ham

59050

In [20]:
# N_Vocabulary: despite its name, total_words is the number of *distinct*
# tokens (the vocabulary size), not the total token count.
total_words = len(vocabulary)
total_words

7747

In [21]:
# Create vocabularies for spam and ham
params_spam = {word:0 for word in vocabulary}
params_ham = {word:0 for word in vocabulary}

In [22]:
# Split the labelled word-count table into its spam rows and its ham rows.
spam = final_df.loc[final_df['Label'] == 'spam']
ham = final_df.loc[final_df['Label'] == 'ham']

In [23]:
# Smoothed estimates P(word | class) = (N_word|class + alpha) / (N_class + alpha * N_Vocabulary).
# The denominators do not depend on the word, so compute them once.
spam_denominator = total_words_in_spam + alpha*total_words
ham_denominator = total_words_in_ham + alpha*total_words

# Occurrences of every vocabulary word within each class, summed column-wise.
spam_word_totals = spam[vocabulary].sum()
ham_word_totals = ham[vocabulary].sum()

for word, n_words_given_spam in spam_word_totals.items():
    params_spam[word] = (n_words_given_spam + alpha) / spam_denominator

for word, n_words_given_ham in ham_word_totals.items():
    params_ham[word] = (n_words_given_ham + alpha) / ham_denominator

### Classify a new message

In [24]:
import re
import pdb

In [29]:
def classify(message):
    """Print the Naive Bayes spam and ham scores for ``message`` and the verdict.

    The message is tokenised the same way as the training text: every
    character that is not an ASCII letter or digit becomes a space, whitespace
    runs are collapsed, and the text is lower-cased. Tokens that are not keys
    of ``params_spam`` / ``params_ham`` are ignored.

    Scores are accumulated as sums of logarithms instead of products of
    probabilities. Multiplying many small probabilities underflows to 0.0 for
    long messages, which made both classes compare equal. The printed values
    are the exponentiated scores, so they may still show as 0.0 for very long
    messages, but the verdict is decided in log space and remains reliable.

    Assumes the priors and all word probabilities are strictly positive
    (true when the smoothing constant ``alpha`` is greater than zero).

    Parameters
    ----------
    message : str
        Raw text of the message to classify.

    Returns
    -------
    None
        The two scores and the verdict are printed.
    """
    message = re.sub(r"[^A-Za-z0-9]", " ", message)
    message = re.sub(r"\s+", " ", message)
    message = message.lower().split(" ")

    log_p_spam_given_message = math.log(p_spam)
    log_p_ham_given_message = math.log(p_ham)

    for word in message:
        if word in params_spam:
            log_p_spam_given_message += math.log(params_spam[word])
        if word in params_ham:
            log_p_ham_given_message += math.log(params_ham[word])

    print("P(spam|message)={}".format(math.exp(log_p_spam_given_message)))
    print("P(ham|message)={}".format(math.exp(log_p_ham_given_message)))

    # log is monotonic, so comparing log-scores gives the same ordering as
    # comparing the probabilities themselves.
    if log_p_spam_given_message > log_p_ham_given_message:
        print("It's a spam.")
    elif log_p_spam_given_message < log_p_ham_given_message:
        print("It's a ham.")
    else:
        print("Let human decide.")

In [30]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(spam|message)=9.649596494455047e-28
P(ham|message)=7.135044728292059e-29
It's a spam.


In [31]:
classify("Sounds good, Tom, then see u there")

P(spam|message)=4.519761185634337e-25
P(ham|message)=2.6761048427485217e-21
It's a ham.


# Test on test data

In [36]:
from sklearn.metrics import precision_score

In [48]:
def classiffier(message):
    """Return the Naive Bayes label (``'spam'`` or ``'ham'``) for ``message``.

    The message is tokenised the same way as the training text: every
    character that is not an ASCII letter or digit becomes a space, whitespace
    runs are collapsed, and the text is lower-cased. Tokens that are not keys
    of ``params_spam`` / ``params_ham`` are ignored.

    Scores are accumulated as sums of logarithms instead of products of
    probabilities, because the product underflows to 0.0 for long messages and
    would make both classes compare equal.

    Assumes the priors and all word probabilities are strictly positive
    (true when the smoothing constant ``alpha`` is greater than zero).

    Parameters
    ----------
    message : str
        Raw text of the message to classify.

    Returns
    -------
    str
        ``'spam'`` when the spam score is strictly greater than the ham score,
        otherwise ``'ham'`` (ties are resolved as ham).
    """
    message = re.sub(r"[^A-Za-z0-9]", " ", message)
    message = re.sub(r"\s+", " ", message)
    message = message.lower().split(" ")

    log_p_spam_given_message = math.log(p_spam)
    log_p_ham_given_message = math.log(p_ham)

    for word in message:
        if word in params_spam:
            log_p_spam_given_message += math.log(params_spam[word])
        if word in params_ham:
            log_p_ham_given_message += math.log(params_ham[word])

    # log is monotonic, so this ordering matches the ordering of the
    # underlying probabilities.
    if log_p_spam_given_message > log_p_ham_given_message:
        return 'spam'
    return 'ham'

In [49]:
# Label every held-out message; the raw test text is passed in because
# classiffier does its own cleaning and tokenising.
predicted  = X_test.apply(classiffier)
predicted.head()

1078    ham
4028    ham
958     ham
4642    ham
4674    ham
Name: SMS, dtype: object

In [60]:
# Compare predictions with the true labels element-wise (both Series share
# X_test's index) and summarise the outcome.
is_match = y_test == predicted
correct = is_match.sum()
incorrect = (~is_match).sum()
accuracy = correct / len(y_test)

print("Correct: %d" % correct)
print("Incorrect: %d" % incorrect)
print("Accuracy: %.5f" % accuracy)

Correct: 1105
Incorrect: 10
Accuracy: 0.99103


A few further steps:

- Isolate the 10 messages that were classified incorrectly and try to figure out why the algorithm reached the wrong conclusions.
- Make the filtering process more complex by making the algorithm sensitive to letter case.