# Spam Classification

## `1.` Processing the Data

In [1]:
#importing the libraries

import math
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# loading the tab-separated SMS corpus; the file has no header row,
# so the two column names are supplied explicitly
df = pd.read_csv(
    "SMSSpamCollection",
    sep='\t',
    header=None,
    names=['Label', 'SMS'],
)

In [4]:
# 5572 total examples, 2 columns (Label, SMS)
df.shape

(5572, 2)

In [5]:
# displaying the first five rows to check that labels and messages landed in separate columns
df.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# dataset is imbalanced: about 87% ham vs 13% spam, i.e. a ham-to-spam ratio of roughly 6.5:1
df["Label"].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [8]:
# shuffling (fixed seed, so the split is reproducible) and then splitting
# into train and test in 80:20 ratio
df = df.sample(frac=1, random_state=1)

# derive the cut point from the data instead of hard-coding 4457, so the
# 80:20 split stays correct if the dataset size changes (5572 * 4 // 5 == 4457)
n_train = len(df) * 4 // 5

# reset_index() keeps the pre-shuffle row number as an extra 'index' column
train = df.iloc[:n_train, :].reset_index()
test = df.iloc[n_train:, :].reset_index()

In [9]:
# first few rows of the training split; the 'index' column holds each row's position before shuffling
train.head()

Unnamed: 0,index,Label,SMS
0,1078,ham,"Yep, by the pretty sculpture"
1,4028,ham,"Yes, princess. Are you going to make me moan?"
2,958,ham,Welp apparently he retired
3,4642,ham,Havent.
4,4674,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [10]:
# fraction of ham and spam in train (normalize=True returns proportions, not percentages)
train["Label"].value_counts(normalize=True)

ham     0.86538
spam    0.13462
Name: Label, dtype: float64

In [11]:
# fraction of ham and spam in test (normalize=True returns proportions, not percentages)
test["Label"].value_counts(normalize=True)

ham     0.868161
spam    0.131839
Name: Label, dtype: float64

In [12]:
# processing SMS column: replace every non-word character with a space,
# lower-case the text, then split it into a list of tokens.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default (which would leave all punctuation in place);
# the raw string avoids Python's invalid-escape-sequence warning.
train["SMS"] = train["SMS"].str.replace(r'\W', ' ', regex=True)
train["SMS"] = train["SMS"].str.lower()
train["SMS"] = train["SMS"].str.split()

In [16]:
# each message is now a list of lower-cased tokens
train["SMS"].head()

0                    [yep, by, the, pretty, sculpture]
1    [yes, princess, are, you, going, to, make, me,...
2                      [welp, apparently, he, retired]
3                                             [havent]
4    [i, forgot, 2, ask, ü, all, smth, there, s, a,...
Name: SMS, dtype: object

In [17]:
# generating a vocabulary of all the unique words in the training messages.
# A set comprehension removes duplicates in one pass; sorting makes the word
# order (and hence the column order of the word-count matrix built from it)
# reproducible instead of depending on set iteration order.
vocabulary = sorted({word for sms in train["SMS"] for word in sms})

In [19]:
# building a word-count matrix: one column per vocabulary word, one row per
# training message, each cell holding how often the word occurs in that message
n_messages = len(train['SMS'])

word_counts_per_sms = {}
for token in vocabulary:
    # every word gets its own zero-filled list (one slot per message)
    word_counts_per_sms[token] = [0] * n_messages

for row, tokens in enumerate(train['SMS']):
    for token in tokens:
        word_counts_per_sms[token][row] += 1

word_count = pd.DataFrame(word_counts_per_sms)

In [21]:
# the resulting dataframe has 7782 columns (one per vocabulary word) and one row per training SMS
word_count.head()

Unnamed: 0,0,00,000,000pes,008704050406,0089,01223585334,02,0207,02072069400,...,zindgi,zoe,zogtorius,zouk,zyada,é,ú1,ü,〨ud,鈥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [22]:
# concatenating the original training data with this dataframe to form final training data.
# axis=1 places the word counts side by side with each message (both frames share
# the same 0..n-1 index); the default axis=0 would stack the counts underneath as
# extra rows, leaving every word column NaN on the labelled rows.
final_train = pd.concat([train, word_count], axis=1)

## `2.` Building the Model

In [23]:
# class priors: share of ham and spam messages among the labelled rows
label_shares = final_train["Label"].value_counts(normalize=True)
p_ham = label_shares["ham"]
p_spam = 1 - p_ham

# Laplace smoothing constant and vocabulary size
alpha = 1
n_vocabulary = len(vocabulary)

In [None]:
# total number of words across all spam messages
# (each SMS cell is a list of tokens, so len() gives its word count)
temp = final_train.loc[final_train["Label"] == 'spam', 'SMS']
k = temp.apply(len)
n_spam = k.sum()

In [None]:
# splitting the training data by label; both subsets are reused when
# computing the per-word probabilities
temp_spam = final_train.loc[final_train["Label"] == 'spam']
temp_ham = final_train.loc[final_train["Label"] == 'ham']

# total number of words across all ham messages
k = temp_ham['SMS'].apply(len)
n_ham = k.sum()

In [None]:
# per-word probability tables, keyed by vocabulary word and initialised to 0
spam = dict.fromkeys(vocabulary, 0)
ham = dict.fromkeys(vocabulary, 0)

In [None]:
# list_sms_spam = spam_train["SMS"].to_list()
# flatten_sms_spam = lambda t: [item for sublist in list_sms_spam for item in sublist]

In [None]:
# Laplace-smoothed P(word | class) for every vocabulary word.
# The denominators do not depend on the word, so they are computed once.
spam_denominator = n_spam + alpha * n_vocabulary
ham_denominator = n_ham + alpha * n_vocabulary

for word in vocabulary:
    # number of times this word appears across all spam / ham training messages
    n_word_given_spam = temp_spam[word].sum()
    n_word_given_ham = temp_ham[word].sum()

    # storing the smoothed conditional probability for each class
    spam[word] = (n_word_given_spam + alpha) / spam_denominator
    ham[word] = (n_word_given_ham + alpha) / ham_denominator

In [None]:
def _log_prob(p):
    '''log of a probability; a zero probability maps to -inf instead of raising'''
    return math.log(p) if p > 0 else float('-inf')


def classify(message):

    '''
    function to classify a message as spam or ham with the naive bayes model
    input : a string message
    output : 'spam' or 'ham'; the two unnormalised posterior scores and the
             label are also printed

    relies on the module-level word-probability tables `spam` / `ham` and the
    priors `p_spam` / `p_ham`.
    '''

    # applying same processing steps as with the training data
    message = re.sub(r'\W', ' ', message) # replacing everything that is not a word or digit with space
    message = message.lower()
    message = message.split()

    # Scores are accumulated in log space: multiplying many small probabilities
    # underflows to 0.0 for long messages, which would turn every long message
    # into a 0 == 0 tie and silently label it ham.
    # The shared denominator P(message) is ignored as it is the same for both classes.
    log_spam_given_message = _log_prob(p_spam)
    log_ham_given_message = _log_prob(p_ham)

    for word in message:
        # `spam` and `ham` share the vocabulary as keys, so a dict lookup
        # replaces the linear scan of the vocabulary list; unseen words are skipped
        if word in spam:
            # applying bayes rule, this is called naive because - given the message is spam,
            # the conditional probability of a word being present in a sentence is considered
            # to be independent of other words in the sentence.
            log_spam_given_message += _log_prob(spam[word])
            log_ham_given_message += _log_prob(ham[word])

    # the printed probabilities may still show 0.0 for very long messages;
    # the decision below is taken on the log scores and is unaffected
    print('P(Spam|message):', math.exp(log_spam_given_message))
    print('P(Ham|message):', math.exp(log_ham_given_message))

    # ties go to ham
    if log_ham_given_message >= log_spam_given_message:
        print('Label: Ham')
        return 'ham'

    print('Label: Spam')
    return 'spam'
    

In [None]:
# checking a sample sentence; classify prints both class scores and the predicted label
classify('WINNER!! This is the secret code to unlock the money: C3421.')

In [None]:
# checking a second sample sentence; the surrounding quotes and commas are stripped by classify's \W substitution
classify('"Sounds good, Tom, then see u there"')

In [None]:
# applying predictions on the test set
def classify_test_set(message):
    '''
    silent counterpart of classify() for bulk prediction
    input : a raw SMS string
    output : 'spam' or 'ham' (ties go to 'ham', as in classify)

    relies on the module-level word-probability tables `spam` / `ham` and the
    priors `p_spam` / `p_ham`.
    '''

    # same processing steps as with the training data
    tokens = re.sub(r'\W', ' ', message).lower().split()

    # log space avoids floating-point underflow on long messages; Laplace
    # smoothing (alpha > 0) keeps every word probability strictly positive,
    # so taking the log is safe
    log_spam = math.log(p_spam)
    log_ham = math.log(p_ham)

    for word in tokens:
        # words never seen in training carry no information and are skipped
        if word in spam:
            log_spam += math.log(spam[word])
            log_ham += math.log(ham[word])

    return 'ham' if log_ham >= log_spam else 'spam'


test['predicted'] = test['SMS'].apply(classify_test_set)
test.head()