# Spam Filtering


### Importing libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re

In [3]:
data = pd.read_csv("SMSSpamCollection", sep='\t', header=None, names=['Label', 'SMS'])

In [4]:
data.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.shape

(5572, 2)

Check for null values


In [6]:
data.isnull().sum()

Label    0
SMS      0
dtype: int64

In [7]:
data['Label'].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

In [8]:
data['Label'].value_counts(normalize = True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

From here, we can say that about 87% of the messages are non-spam(ham) and the remaining ones are spam.

## Splitting the data

In [9]:
random_data = data.sample(frac=1, random_state=1)

In [10]:
train_size = 0.8
test_size = 0.2

train_data, test_data = train_test_split(random_data, test_size=test_size, train_size=train_size, random_state=1)

In [11]:
print(train_data.shape)
print(test_data.shape)

(4457, 2)
(1115, 2)


In [12]:
train_data['Label'].value_counts(normalize = True)

ham     0.866951
spam    0.133049
Name: Label, dtype: float64

In [13]:
test_data['Label'].value_counts(normalize = True)

ham     0.861883
spam    0.138117
Name: Label, dtype: float64

## Data cleaning

Changing the case to lower and removing the punctuation marks.

In [14]:
train_data['SMS'] = train_data['SMS'].str.replace('\W', ' ') 
train_data['SMS'] = train_data['SMS'].str.lower()

  train_data['SMS'] = train_data['SMS'].str.replace('\W', ' ')


In [15]:
train_data.head()

Unnamed: 0,Label,SMS
1673,spam,urgent we are trying to contact u todays dra...
4369,ham,1 i don t have her number and 2 its gonna be a...
4540,ham,party s at my place at usf no charge but if ...
2996,ham,mm not entirely sure i understood that text bu...
1814,ham,yes we are chatting too


In [16]:
train_data = train_data.reset_index(drop = True)

In [17]:
test_data = test_data.reset_index(drop = True)


Now we need to create a list with all the unique words in our training set.

In [18]:
train_data['SMS'] = train_data['SMS'].str.split()
print(train_data['SMS'])

0       [urgent, we, are, trying, to, contact, u, toda...
1       [1, i, don, t, have, her, number, and, 2, its,...
2       [party, s, at, my, place, at, usf, no, charge,...
3       [mm, not, entirely, sure, i, understood, that,...
4                           [yes, we, are, chatting, too]
                              ...                        
4452    [i, am, not, sure, about, night, menu, i, know...
4453    [i, like, dis, sweater, fr, mango, but, no, mo...
4454    [wife, how, she, knew, the, time, of, murder, ...
4455    [thanx, u, darlin, im, cool, thanx, a, few, bd...
4456    [yeah, there, s, quite, a, bit, left, i, ll, s...
Name: SMS, Length: 4457, dtype: object


In [19]:
words = set()
for msg in train_data['SMS'] :
    for word in msg :
        words.add(word)
print(words)

{'upto', 'lobby', 'green', 'dictionary', 'nigeria', '2getha', 'beauty', 'chart', 'dangerous', 'lotto', 'cup', 'bbdeluxe', 'takin', 'xxxxx', 'radio', 'ipad', 'youre', 'tickets', 'panasonic', 'directors', 'ore', 'hearing', 'mad1', 'royal', 'trav', 'star', 'thrown', 'rayan', 'loo', 'everytime', 'winterstone', 'allowed', 'send', 'safe', '2wu', '09065394514', '2docd', '1205', 'tension', 'sips', 'browsin', 'clas', 'conference', 'nig', 'miracle', 'atlast', '000pes', '4403ldnw1a7rw18', 'mudyadhu', 'gn', 'wrnog', 'shakespeare', '4wrd', '09061104283', 'ps3', 'ringtone', 'necesity', 'amla', '89938', 'purchase', 'wud', 'wihtuot', 'lrg', 'lift', 'xuhui', 'cell', '1680', 'irritating', 'kerala', 'favourite', 'unni', 'ternal', 'you', 'lays', '85222', 'gei', 'wikipedia', 'dis', 'manky', 'hourish', 'compulsory', 'cool', 'ujhhhhhhh', 'jolly', 'teasing', '2rcv', 'appendix', 'avenue', 'pattern', 'bedreal', 'ticket', 'activate', 'off', 'bro', 'different', 'misbehaved', 'what', 'bought', 'chocolate', 'lackin




Number of unique words in train dataset

In [20]:
len(words)

7755

In [21]:
word_counts_per_sms = {unique_word: [0] * len(train_data['SMS']) for unique_word in words}
for index, sms in enumerate(train_data['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [22]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,upto,lobby,green,dictionary,nigeria,2getha,beauty,chart,dangerous,lotto,...,fav,real,phones,debating,bloomberg,chinchillas,dehydrated,maintain,had,hoping
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
final_train_data = pd.concat([train_data, word_counts], axis=1)
final_train_data.head()

Unnamed: 0,Label,SMS,upto,lobby,green,dictionary,nigeria,2getha,beauty,chart,...,fav,real,phones,debating,bloomberg,chinchillas,dehydrated,maintain,had,hoping
0,spam,"[urgent, we, are, trying, to, contact, u, toda...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[1, i, don, t, have, her, number, and, 2, its,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[party, s, at, my, place, at, usf, no, charge,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,"[mm, not, entirely, sure, i, understood, that,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[yes, we, are, chatting, too]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
spam = final_train_data[final_train_data['Label'] == 'spam']
ham = final_train_data[final_train_data['Label'] == 'ham']

Now, we need to calculate the probability of spam and ham messages,
i.e, P(spam) and P(ham)

In [25]:
p_spam = len(spam)/len(final_train_data)
p_ham = len(ham)/len(final_train_data)
print(p_spam, p_ham)

0.1330491361902625 0.8669508638097375


In [26]:
n_words_per_spam_message = spam['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()

In [27]:
n_words_per_ham_message = ham['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()

In [28]:
total_words = len(words)
alpha = 1

## Calculating the parameters

In [29]:
parameters_spam = {unique_word:0 for unique_word in words}
parameters_ham = {unique_word:0 for unique_word in words}

In [30]:
for word in words:
   n_word_given_spam = spam[word].sum() # spam_messages already defined
   p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*total_words)
   parameters_spam[word] = p_word_given_spam

   n_word_given_ham = ham[word].sum() # ham_messages already defined
   p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*total_words)
   parameters_ham[word] = p_word_given_ham


In [31]:
def classify(sms) :
    msg = re.sub('/W',' ',sms)
    msg = msg.lower().split()
    
    p_spam_msg = p_spam
    p_ham_msg = p_ham
    
    for word in msg :
        if word in parameters_spam :
            p_spam_msg = p_spam_msg * parameters_spam[word]
        if word in parameters_ham :
            p_ham_msg = p_ham_msg * parameters_ham[word]
    print( f'P(Spam|message) : {p_spam_msg}')
    print( f'P(Ham|message) : {p_ham_msg}')
    
    if p_spam_msg > p_ham_msg :
        print("Label :- Spam")
    elif p_spam_msg < p_ham_msg :
        print("Label :- Ham")
    else :
        print("A human has to classify as it have equal probabilities")

In [32]:
classify("Happy to hear form you Sush. See you soon.")

P(Spam|message) : 1.0709139805078056e-22
P(Ham|message) : 3.800727152627211e-19
Label :- Ham


In [33]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message) : 1.3965687857573679e-18
P(Ham|message) : 2.1041524592961919e-19
Label :- Spam


Now let's test the model with our test data

## Measuring the accuracy

In [34]:
def classify_test_data(sms) :
    msg = re.sub('/W',' ',sms)
    msg = msg.lower().split()
    
    p_spam_msg = p_spam
    p_ham_msg = p_ham
    
    for word in msg :
        if word in parameters_spam :
            p_spam_msg = p_spam_msg * parameters_spam[word]
        if word in parameters_ham :
            p_ham_msg = p_ham_msg * parameters_ham[word]
    #print( f'P(Spam|message) : {p_spam_msg}')
    #print( f'P(Ham|message) : {p_ham_msg}')
    
    if p_spam_msg > p_ham_msg :
        return 'spam'
    elif p_spam_msg < p_ham_msg :
        return 'ham'
    else :
        return 'need human to classify'

In [35]:
test_data['predicted'] = test_data['SMS'].apply(classify_test_data)
test_data.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Good night my dear.. Sleepwell&amp;Take care,ham
1,ham,Sen told that he is going to join his uncle fi...,ham
2,ham,Thank you baby! I cant wait to taste the real ...,ham
3,ham,When can ü come out?,ham
4,ham,No. Thank you. You've been wonderful,ham


We need to compare the predicted values with the actual values to measure how good our spam filter is with classifying new messages. 

Accuracy = number of correctly classified messages / total classified messages

In [36]:
crct = 0
total = len(test_data)

for row in test_data.iterrows():
   row = row[1]
   if row['Label'] == row['predicted']:
      crct += 1
print(f"Correct : {crct}")
print(f"Incorrect : {total-crct}")
print(f"Accuracy : {crct/total}")

Correct : 1092
Incorrect : 23
Accuracy : 0.979372197309417


Accuracy is close to 97.93%