# Naive Bayes spam filtering

###### see pdf file for spam email filter example

In [129]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

NLTK (Natural Language Toolkit) is a suite of Python libraries for Natural Language Processing (NLP).

In [130]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\strck\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Stop words are the most common words in a language which don't carry much information. We will filter them before NLP.

In [131]:
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords[:5])

['i', 'me', 'my', 'myself', 'we']


A word can appear in many variations with the same meaning (e.g. "cook" and "cooking"). We will use NLTK's `stem` package to normalize each word to its root form.

In [132]:
from nltk.stem import PorterStemmer
Ps = PorterStemmer()
Ps.stem('cook'), Ps.stem('cooking') #returning the root word

('cook', 'cook')

We also need to remove punctuation, since it is not informative for our classification.

In [133]:
import string
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Load the data:

In [134]:
data = pd.read_csv('spam.csv')

In [135]:
data.head()

Unnamed: 0,Class,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Change categorical data into numbers

In [136]:
pd.get_dummies(data.Class)

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5164,0,1
5165,1,0
5166,1,0
5167,1,0


We are going to drop the first column of the dummies df because all we care about is detecting spam

In [137]:
# Drop the first dummy column (ham) and keep the remaining 'spam' indicator as the label:
# 1 for spam, 0 for ham. Selecting the 'spam' column by name assigns a Series rather than
# a one-column DataFrame, and dtype='uint8' keeps the codes numeric on pandas versions
# where get_dummies would otherwise return bool columns.
data['class_code'] = pd.get_dummies(data.Class, drop_first=True, dtype='uint8')['spam']

In [138]:
data.head()

Unnamed: 0,Class,Text,class_code
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [139]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5169 entries, 0 to 5168
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Class       5169 non-null   object
 1   Text        5169 non-null   object
 2   class_code  5169 non-null   uint8 
dtypes: object(2), uint8(1)
memory usage: 85.9+ KB


In [140]:
def train_test_split(dataframe, test_size=0.3, rs=None):
    """
    Split a pandas DataFrame into a training part and a test part.

    Parameters
    ----------
    dataframe : the frame to split.
    test_size : fraction of the rows placed in the test part (passed to ``sample`` as ``frac``).
    rs : random state forwarded to ``sample``. With the default None the split differs on
        every call; set it to a fixed value to reproduce the same split.

    Returns
    -------
    (train, test) : two DataFrames, each re-indexed from 0.
    """
    # Draw the test rows at random from the frame that was passed in.
    held_out = dataframe.sample(frac=test_size, random_state=rs)

    # Every index label that was not drawn belongs to the training part.
    train_labels = dataframe.index.difference(held_out.index)
    remainder = dataframe.loc[train_labels]

    # Both parts still carry their original index labels, so renumber them.
    return remainder.reset_index(drop=True), held_out.reset_index(drop=True)

In [141]:
df_train, df_test = train_test_split(data, rs=4)

In [142]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3618 entries, 0 to 3617
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Class       3618 non-null   object
 1   Text        3618 non-null   object
 2   class_code  3618 non-null   uint8 
dtypes: object(2), uint8(1)
memory usage: 60.2+ KB


In [143]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1551 entries, 0 to 1550
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Class       1551 non-null   object
 1   Text        1551 non-null   object
 2   class_code  1551 non-null   uint8 
dtypes: object(2), uint8(1)
memory usage: 25.9+ KB


Cleaning one of the text messages as an example

In [144]:
message = df_train.Text[46]
print(message)

Your gonna have to pick up a $1 burger for yourself on your way home. I can't even move. Pain is killing me.


In [145]:
# convert to lower case
message = message.lower()

In [146]:
# remove all punctuation
message = "".join([x for x in message if x not in punctuations]) #the join method puts it back in sentence form
message

'your gonna have to pick up a 1 burger for yourself on your way home i cant even move pain is killing me'

In [147]:
message = [x for x in message.split() if x not in stopwords]
print(message)

['gonna', 'pick', '1', 'burger', 'way', 'home', 'cant', 'even', 'move', 'pain', 'killing']


In [148]:
message=[Ps.stem(x) for x in message]
print(message) # compare output with previous line

['gonna', 'pick', '1', 'burger', 'way', 'home', 'cant', 'even', 'move', 'pain', 'kill']


In [149]:
print(Counter(message))

Counter({'gonna': 1, 'pick': 1, '1': 1, 'burger': 1, 'way': 1, 'home': 1, 'cant': 1, 'even': 1, 'move': 1, 'pain': 1, 'kill': 1})


Now put them together in a function to perform the cleanup on every line in the df

In [150]:
def clean_message(message):
    """Normalise a raw text message and count the word stems that remain.

    The text is lower-cased, stripped of every character listed in ``punctuations``,
    split on whitespace, filtered against ``stopwords`` and stemmed with ``Ps``.
    Returns a ``Counter`` mapping each stem to its number of occurrences.
    """
    # Translation table that deletes each punctuation character.
    strip_table = str.maketrans('', '', punctuations)
    tokens = message.lower().translate(strip_table).split()
    return Counter(Ps.stem(token) for token in tokens if token not in stopwords)

In [151]:
print(df_train.Text[80])
print(clean_message(df_train.Text[80]))

What is the plural of the noun research?
Counter({'plural': 1, 'noun': 1, 'research': 1})


Apply the function to the entire training data set

In [152]:
df_train['bag_of_words'] = df_train['Text'].apply(clean_message) # apply method applies our function to every row in the Text column of df_train
df_train.head()

Unnamed: 0,Class,Text,class_code,bag_of_words
0,ham,"Go until jurong point, crazy.. Available only ...",0,"{'go': 1, 'jurong': 1, 'point': 1, 'crazi': 1,..."
1,ham,Ok lar... Joking wif u oni...,0,"{'ok': 1, 'lar': 1, 'joke': 1, 'wif': 1, 'u': ..."
2,ham,U dun say so early hor... U c already then say...,0,"{'u': 2, 'dun': 1, 'say': 2, 'earli': 1, 'hor'..."
3,spam,FreeMsg Hey there darling it's been 3 week's n...,1,"{'freemsg': 1, 'hey': 1, 'darl': 1, '3': 1, 'w..."
4,ham,Even my brother is not like to speak with me. ...,0,"{'even': 1, 'brother': 1, 'like': 2, 'speak': ..."


In [153]:
bows = df_train['bag_of_words']

In [154]:
bows

0       {'go': 1, 'jurong': 1, 'point': 1, 'crazi': 1,...
1       {'ok': 1, 'lar': 1, 'joke': 1, 'wif': 1, 'u': ...
2       {'u': 2, 'dun': 1, 'say': 2, 'earli': 1, 'hor'...
3       {'freemsg': 1, 'hey': 1, 'darl': 1, '3': 1, 'w...
4       {'even': 1, 'brother': 1, 'like': 2, 'speak': ...
                              ...                        
3613    {'ard': 1, '6': 1, 'like': 1, 'dat': 1, 'lor': 1}
3614    {'remind': 1, 'o2': 1, 'get': 1, '250': 1, 'po...
3615    {'2nd': 1, 'time': 1, 'tri': 1, '2': 2, 'conta...
3616     {'piti': 1, 'mood': 1, 'soani': 1, 'suggest': 1}
3617                    {'rofl': 1, 'true': 1, 'name': 1}
Name: bag_of_words, Length: 3618, dtype: object

In [155]:
# Training data only: pull out the bag_of_words entries of each class.
# ham rows are those with class_code == 0
bows_ham = df_train.loc[df_train['class_code'] == 0, 'bag_of_words']
print(bows_ham)

# spam rows are those with class_code == 1
bows_spam = df_train.loc[df_train['class_code'] == 1, 'bag_of_words']

0       {'go': 1, 'jurong': 1, 'point': 1, 'crazi': 1,...
1       {'ok': 1, 'lar': 1, 'joke': 1, 'wif': 1, 'u': ...
2       {'u': 2, 'dun': 1, 'say': 2, 'earli': 1, 'hor'...
4       {'even': 1, 'brother': 1, 'like': 2, 'speak': ...
5       {'per': 1, 'request': 1, 'mell': 2, 'oru': 1, ...
                              ...                        
3611    {'arent': 1, 'next': 1, 'ltgt': 1, 'hour': 1, ...
3612    {'get': 1, 'dump': 1, 'heap': 1, 'mom': 1, 'de...
3613    {'ard': 1, '6': 1, 'like': 1, 'dat': 1, 'lor': 1}
3616     {'piti': 1, 'mood': 1, 'soani': 1, 'suggest': 1}
3617                    {'rofl': 1, 'true': 1, 'name': 1}
Name: bag_of_words, Length: 3144, dtype: object


In [156]:
# Build the vocabulary: the union of the keys of every bag in bows (ham and spam alike),
# stored as a list with one entry per distinct word.
words = list(set().union(*bows))

In [157]:
# Start every vocabulary word at 1 (add-one smoothing) so that a word never seen in a
# ham message still ends up with a non-zero count.
freq_ham = {key: 1 for key in words}

# One pass over the ham bags, adding each bag's counts to the running totals. This yields
# the same totals as testing every vocabulary word against every bag, without the
# len(words) * len(bows_ham) membership checks. Every word met here is already a key,
# because words is the union of all training bags.
for bow in bows_ham:
    for word, count in bow.items():
        freq_ham[word] += count

In [158]:
freq_ham['flea']

2

In [159]:
# Start every vocabulary word at 1 (add-one smoothing) so that a word never seen in a
# spam message still ends up with a non-zero count.
freq_spam = {key: 1 for key in words}

# One pass over the spam bags, adding each bag's counts to the running totals. This yields
# the same totals as testing every vocabulary word against every bag, without the
# len(words) * len(bows_spam) membership checks. Every word met here is already a key,
# because words is the union of all training bags.
for bow in bows_spam:
    for word, count in bow.items():
        freq_spam[word] += count

In [160]:
freq_spam['free']

143

Probability of a word given that the text is ham or spam (likelihood)

In [161]:
# Likelihoods P(word | class): each word's count divided by the total count of its class.
# The two totals are computed once up front instead of being re-summed for every word.
total_ham = sum(freq_ham.values())
total_spam = sum(freq_spam.values())

P_word_h = {key: count / total_ham for key, count in freq_ham.items()}
P_word_s = {key: count / total_spam for key, count in freq_spam.items()}

max(P_word_h.values()) # the matching word is max(P_word_h, key=P_word_h.get)

0.018320328162107147

Finding the priors

In [162]:
# Class priors: the share of training messages that carry each label.
P_h = len(bows_ham) / len(bows)
P_s = len(bows_spam) / len(bows)
print(P_h, P_s)

0.8689883913764511 0.1310116086235489


Define the main classifier function

In [163]:
def classifier_nb(doc):
    """Classify one message as ham (0) or spam (1) with naive Bayes.

    The message is cleaned with ``clean_message``. Every distinct stem that is in the
    training vocabulary contributes its likelihood once (repeat counts inside the
    message are not used), and stems that were never seen in training are skipped.

    The class scores are accumulated as sums of logarithms rather than products of
    probabilities: multiplying many small likelihoods underflows to 0.0 on long
    messages, which made the normalisation step divide 0.0 by 0.0.

    Returns 0 when ham is strictly more probable than spam, otherwise 1.
    """
    # create bag of words
    doc_bag_of_words = clean_message(doc)

    # initialise the log-posteriors with the log-priors
    log_P_doc_h = np.log(P_h)
    log_P_doc_s = np.log(P_s)

    # add the log-likelihood of every known stem
    for key in doc_bag_of_words:
        # the likelihood dicts are keyed by the vocabulary, so a dict lookup replaces
        # the linear scan of the words list
        if key in P_word_h:
            log_P_doc_h += np.log(P_word_h[key])
            log_P_doc_s += np.log(P_word_s[key])

    # The normalised P(ham | doc) exceeds 0.5 exactly when the unnormalised ham score
    # exceeds the spam score, and log preserves that ordering.
    if log_P_doc_h > log_P_doc_s:
        return 0
    return 1

classifier_nb = np.vectorize(classifier_nb)

In [164]:
classifier_nb('Congratulations! You won $500.')

array(1)

In [165]:
classifier_nb("Let's apply this model to the test sample")

array(0)

In [166]:
predictions = classifier_nb(df_test['Text'].values)

In [167]:
# defining the truth
T = df_test['class_code']

In [168]:
# Tally the four confusion-matrix cells, with spam (1) as the positive class.
# Truth and prediction are paired by position with zip, so the tally does not depend on
# T carrying the default 0..n-1 index labels (T[i] on a Series is a label lookup).
TP, TN, FP, FN = 0, 0, 0, 0

for actual, predicted in zip(T, predictions):
    if actual == 1:
        if predicted == 1:
            TP += 1
        elif predicted == 0:
            FN += 1
    elif actual == 0:
        if predicted == 1:
            FP += 1
        elif predicted == 0:
            TN += 1

Confusion matrix (rows: predicted spam, predicted ham; columns: actually spam, actually ham)

In [170]:
print(np.array([[TP, FP], [FN, TN]]))

[[ 158    9]
 [  21 1363]]


In [179]:
# calculate precision and recall/sensitivity
# precision: share of the messages predicted as spam that are truly spam
precision = TP / (TP + FP)
# recall: share of the truly-spam messages that were predicted as spam
recall = TP / (TP + FN)
# str.format alternative: print("precision: {}".format(precision))
print("precision:", precision)
print("recall:", recall)

precision: 0.9461077844311377
recall: 0.88268156424581


In [177]:
F1_score = 2*precision*recall / (precision+recall)
print("F1_score:", F1_score)

F1_score: 0.9132947976878613
