# Spam classification from SMS.

This project consists in creating a filter for spam SMS.

In [30]:
import numpy as np
import pandas as pd
import re
%matplotlib inline


In [31]:
df = pd.read_csv('SMSSpamCollection', sep='\t', header = None, names = ['Label', 'SMS'])
df

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [32]:
df.shape

(5572, 2)

In [33]:
#Percentage of SMS that are spam vs non spam
df['Label'].value_counts(normalize = True)*100, 2

(ham     86.593683
 spam    13.406317
 Name: Label, dtype: float64,
 2)

In [34]:
dset = df.sample(frac= 1, random_state= 1)

In [35]:
#Generate a train dataset that accounts for 80% of our data
dtrain = dset.iloc[:round(len(dset)*0.8)]
dtrain.shape

(4458, 2)

In [36]:
#Generate a test dataset that accounts for 20% of our data
dtest = dset.iloc[round(len(dset)*0.8):]
dtest.shape

(1114, 2)

In [37]:
dtrain['Label'].value_counts(normalize = True)*100

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [38]:
dtest['Label'].value_counts(normalize = True)*100

ham     86.804309
spam    13.195691
Name: Label, dtype: float64

In [39]:
dtrain['SMS'] = dtrain['SMS'].map(lambda x: re.sub('\W',' ',x))
dtrain['SMS'] = dtrain['SMS'].str.lower()
dtrain.reset_index(drop= True, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [40]:
#Let's create a list with all the unique words in the SMSs
dtrain['SMS'] = dtrain['SMS'].str.split()
vocabulary = []
for item in dtrain['SMS']:
    for word in item:
        vocabulary.append(word)
#remove duplicates from vocabulary        
vocabulary = set(vocabulary)
vocabulary = list(vocabulary)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [41]:
#Create a dictionary to de number of times each word appears on each SMS
word_counts = {word:[0] * len(dtrain['SMS']) for word in vocabulary }


In [42]:
for index, sms in enumerate(dtrain['SMS']):
    for word in sms:
        word_counts[word][index] +=1

In [43]:
word_count = pd.DataFrame(word_counts)
word_count

Unnamed: 0,doit,bugis,doctor,cherthala,bck,straight,agalla,joined,issue,throat,...,limits,shes,wadebridge,jen,fraction,spelling,fifteen,09095350301,bat,emigrated
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
df_joined = pd.concat([dtrain, word_count ], axis = 1)
df_joined

Unnamed: 0,Label,SMS,doit,bugis,doctor,cherthala,bck,straight,agalla,joined,...,limits,shes,wadebridge,jen,fraction,spelling,fifteen,09095350301,bat,emigrated
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,ham,"[sorry, i, ll, call, later, in, meeting, any, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,ham,"[babe, i, fucking, love, you, too, you, know, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,spam,"[u, ve, been, selected, to, stay, in, 1, of, 2...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,ham,"[hello, my, boytoy, geeee, i, miss, you, alrea...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
p_spam = round((df_joined['Label'].value_counts(normalize = True)*100)['spam'],3)
p_spam

13.459

In [46]:
p_ham = round((df_joined['Label'].value_counts(normalize = True)*100)['ham'], 3)
p_ham

86.541

In [47]:
Nspam = df_joined[df_joined['Label'] == 'spam'].loc[:,'joining':].sum().sum()
Nspam

12276

In [48]:
Nham = df_joined[df_joined['Label'] == 'ham'].loc[:,'joining':].sum().sum()
Nham

47556

In [49]:
Nvocabulary = len(vocabulary)
Nvocabulary

7783

In [50]:
#Laplace smoothing
alpha = 1

In [51]:
prob_spam = {word: 0 for word in vocabulary}


In [52]:
prob_ham = {word: 0 for word in vocabulary}

In [53]:
spam = df_joined[df_joined['Label'] == 'spam']
ham = df_joined[df_joined['Label'] == 'ham']

In [54]:
for word in vocabulary:
    n_word_given_spam = spam[word].sum()   # spam_messages already defined in a cell above
    p_word_given_spam = (n_word_given_spam + alpha) / (Nspam + alpha*Nvocabulary)
    prob_spam[word] = p_word_given_spam
    
    n_word_given_ham = ham[word].sum()   # ham_messages already defined in a cell above
    p_word_given_ham = (n_word_given_ham + alpha) / (Nham + alpha*Nvocabulary)
    prob_ham[word] = p_word_given_ham

In [58]:
def classify(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

   

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    for word in message:
        if word in prob_spam:
            p_spam_given_message *= prob_spam[word]
        if word in prob_ham:
            p_ham_given_message *= prob_ham[word]

        else:
            continue
        

    #return ('P(Spam|message):', p_spam_given_message)
    #return('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return'needs human classification'

In [59]:
classify("Sounds good, Tom, then see u there")

'ham'

dtest['predicted'] = dtest['SMS'].apply(classify)
dtest

In [61]:
correct = 0
total = len(dtest)


1114

In [64]:
for index, row in dtest.iterrows():
    if row['Label'] == row['predicted']:
        correct += 1
    else:
        continue
accuracy = correct/total
accuracy

0.9883303411131059

This filter has 98.8% accuracy classifying the SMS from our test dataset. A good next step can be try to find why it failed on the SMS that failed