In [2]:
import numpy as np 
import pandas as pd
import re
import os
from sklearn.model_selection import train_test_split

In [3]:
# load data and get some basic information about it...
df = pd.read_csv('SPAM text message 20170820 - Data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [3]:
# let's show how the dataset look like.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
print('percentages of Category column categories: ')
print(df['Category'].value_counts(normalize=True)*100)

percentages of Category column categories: 
ham     86.593683
spam    13.406317
Name: Category, dtype: float64


<h1> Split Data into Train / Validation Datasets </h1>

In [5]:
x_train,x_test,y_train,y_test=train_test_split(df['Message'],df['Category'],
                                               test_size=0.2,random_state=42)
training_set=pd.concat([x_train,y_train],axis=1)
testing_set=pd.concat([x_test,y_test],axis=1)

In [6]:
print('training set shape: ',training_set.shape)
print('testing set shape: ',testing_set.shape)

training set shape:  (4457, 2)
testing set shape:  (1115, 2)


In [7]:
print('percentages of Category column categories in Training Dataset: \n',training_set['Category'].value_counts(normalize=True)*100)
print('percentages of Category column categories in Validation Dataset: \n',testing_set['Category'].value_counts(normalize=True)*100)

percentages of Category column categories in Training Dataset: 
 ham     86.582903
spam    13.417097
Name: Category, dtype: float64
percentages of Category column categories in Validation Dataset: 
 ham     86.636771
spam    13.363229
Name: Category, dtype: float64


<p>we notice here that training_set & testing_set have the same percentages of the original dataframe</p>

<h1> Text Processing and Data Cleaning </h1>

<p> let's now remove any non-word character and lowercasing the letters. </p>

In [8]:
training_set['Message'] = training_set['Message'].str.replace('\W', ' ')
testing_set['Message'] = testing_set['Message'].str.replace('\W', ' ')
training_set['Message'] = training_set['Message'].str.lower()
testing_set['Message'] = testing_set['Message'].str.lower()

  """Entry point for launching an IPython kernel.
  


<p> and also split the statments into words. </p>

In [9]:
training_set['Message']=training_set['Message'].str.split()

<p> now let's tokenize the statements, although you can use NLTK or Spacy to tokenize the statements but I am going to tokenize them from scratch. </p>

In [10]:
# 1- collecting all of the words in all messages....
vocab=[]
for lst in training_set['Message']:
    for i in lst:
        vocab.append(i)
vocab=list(set(vocab))

In [11]:
# 2- counting the number of words in each message and merge this into the training dataframe..
word_counts_per_sms={unique_word: [0]*len(training_set['Message']) for unique_word in vocab}
for idx, lst in enumerate(training_set['Message']):
    for word in lst:
        word_counts_per_sms[word][idx] +=1
word_counter=pd.DataFrame(word_counts_per_sms)
training_set_clean = pd.concat([training_set, word_counter], axis=1)
training_set_clean.head()

Unnamed: 0,Message,Category,visiting,mist,timing,flowing,spoilt,torch,jod,stream,...,msn,doggin,abta,use,07808726822,grace,railway,tnc,internet,birla
0,"[go, until, jurong, point, crazy, available, o...",ham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[ok, lar, joking, wif, u, oni]",ham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[free, entry, in, 2, a, wkly, comp, to, win, f...",spam,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[u, dun, say, so, early, hor, u, c, already, t...",ham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[nah, i, don, t, think, he, goes, to, usf, he,...",ham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<h1> Naïve Bayes Algorithm </h1>

<img src = 'https://latex.codecogs.com/gif.latex?\boldsymbol{\mathbf{P(Spam%20|%20w_1,w_2,%20...,%20w_n)%20\propto%20P(Spam)%20\cdot%20\prod_{i=1}^{n}P(w_i|Spam)}}'>
<img src = 'https://latex.codecogs.com/gif.latex?\boldsymbol{\mathbf{P(Ham%20|%20w_1,w_2,%20...,%20w_n)%20\propto%20P(Ham)%20\cdot%20\prod_{i=1}^{n}P(w_i|Ham)}}'>

<p>our multinomial Naive Bayes algorithm will make the classification based on the results it gets to these two equations below, where "w1" is the first word, and w1,w2, ..., wn is the entire message .</p>
<p>If P(Spam | w1,w2, ..., wn) > P(Ham | w1,w2, ..., wn), then the message is spam.</p>
<p>To calculate P(wi|Spam) and P(wi|Ham)</p>
<img src = 'https://latex.codecogs.com/gif.latex?\boldsymbol{\mathbf{P(w_i|Spam)%20=%20\frac{N_{w_i|Spam}%20+%20\alpha}{N_{Spam}%20+%20\alpha%20\cdot%20N_{Vocabulary}}}}'>
<img src = 'https://latex.codecogs.com/gif.latex?\boldsymbol{\mathbf{P(w_i|Ham)%20=%20\frac{N_{w_i|Ham}%20+%20\alpha}{N_{Ham}%20+%20\alpha%20\cdot%20N_{Vocabulary}}}}'>
<p> N(W(i) | Spam): the number of times the word W(i) occurs in spam message. </p>
<p> N(W(i) | Ham): the number of times the word W(i) occurs in Ham message. </p>
<p> N(Spam): number of words in Spam messages.</p> 
<p> N(Ham): number of words in Ham messages.</p> 
<p> N(vocab): number of words in vocabulary.</p>
<p> alpha: equal to 1 called smothing parameter.</p>

In [12]:
# 1- what is the probability of Spam and Ham messages in Training/Testing Dataset
spam_length = len(training_set[training_set['Category'] == 'spam'])
ham_length = len(training_set[training_set['Category'] == 'ham'])
print('number of spam messages in Training DataFrame: ', spam_length)
print('number of ham messages in Training DataFrame: ', ham_length)
p_spam = spam_length / len(training_set)
p_ham = ham_length / len(training_set)
print('probability of spam messages in Training DataFrame: ', p_spam)
print('probability of ham messages in Training DataFrame: ', p_ham)

number of spam messages in Training DataFrame:  598
number of ham messages in Training DataFrame:  3859
probability of spam messages in Training DataFrame:  0.13417096701817366
probability of ham messages in Training DataFrame:  0.8658290329818263


In [13]:
# 2- what is the value of the liklihood probability of message when Spam/Ham is True
n_words_spam_message = training_set[training_set['Category'] == 'spam']['Message'].apply(len)
n_words_ham_message = training_set[training_set['Category'] == 'ham']['Message'].apply(len)
#N(spam)
n_spam = n_words_spam_message.sum()
#N(Ham)
n_ham = n_words_ham_message.sum()
##
spam_dict={word:0 for word in vocab}
ham_dict={word:0 for word in vocab}
alpha = 1
for word in vocab:
    # N(W(i) | Spam)
    n_word_given_spam = training_set_clean[training_set_clean['Category'] == 'spam'][word].sum()
    # P(W(i) | Spam)
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha * len(vocab))
    spam_dict[word] = p_word_given_spam
    n_word_given_ham = training_set_clean[training_set_clean['Category'] == 'ham'][word].sum()
    # P(W(i) | Ham)
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha * len(vocab))
    ham_dict[word] = p_word_given_ham

In [14]:
# now let's write a function to classify messages based on the probability of word given spam/ham
def bayes_filter(message):
    # text processing
    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()
    # set p_spam_given_word / set_ham_given_word equal to p spam and p ham
    p_spam_given_word = p_spam
    p_ham_given_word = p_ham
    for word in message:
        if word in spam_dict.keys() or word in ham_dict.keys():
            p_spam_given_word *= spam_dict[word]
            p_ham_given_word *= ham_dict[word]
    print('P(Spam | W) = ', p_spam_given_word)
    print('P(Ham | W) = ', p_ham_given_word)
    if p_spam_given_word > p_ham_given_word:
        print('Category: Spam')
    elif p_spam_given_word < p_ham_given_word:
        print('Category: Ham')
    else:
        print('Maybe one of Them.')

In [15]:
bayes_filter('U dun say so early hor... U c already then say')

P(Spam | W) =  5.285960360968486e-37
P(Ham | W) =  1.7090184267541062e-33
Category: Ham


In [16]:
bayes_filter("Sounds good, Tom, then see u there")

P(Spam | W) =  7.225048739561725e-25
P(Ham | W) =  8.50585359063866e-22
Category: Ham


In [17]:
def bayes_classifier(message):
    # text processing
    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()
    # set p_spam_given_word / set_ham_given_word equal to p spam and p ham
    p_spam_given_word = p_spam
    p_ham_given_word = p_ham
    for word in message:
        if word in spam_dict.keys() or word in ham_dict.keys():
            p_spam_given_word *= spam_dict[word]
            p_ham_given_word *= ham_dict[word]
    if p_spam_given_word > p_ham_given_word:
        return 'spam'
    elif p_spam_given_word < p_ham_given_word:
        return 'ham'
    else:
        return 'maybe both'

In [18]:
testing_set['predicted'] = testing_set['Message'].apply(bayes_classifier)
testing_set.head()

Unnamed: 0,Message,Category,predicted
3245,squeeeeeze this is christmas hug if u lik ...,ham,ham
944,and also i ve sorta blown him off a couple tim...,ham,ham
1044,mmm thats better now i got a roast down me i ...,ham,ham
2484,mm have some kanji dont eat anything heavy ok,ham,ham
812,so there s a ring that comes with the guys cos...,ham,ham


In [19]:
correct = 0
total = testing_set.shape[0]
    
for row in testing_set.iterrows():
    row = row[1]
    if row['Category'] == row['predicted']:
        correct += 1
        
print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 964
Incorrect: 151
Accuracy: 0.8645739910313901
