In [2]:
import numpy as np
import pandas as pd

In [3]:
sms_data = pd.read_csv('SMSSpamCollection' , sep = '\t' , header = None , names = ['Label' , 'SMS'])

In [4]:
sms_data.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
sms_data.groupby('Label').count()

Unnamed: 0_level_0,SMS
Label,Unnamed: 1_level_1
ham,4825
spam,747


In [6]:
sms_data_clean = sms_data.copy()

In [7]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.replace('\W+', ' ').str.replace('\s+', ' ').str.strip()

In [8]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.lower()

In [9]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.split()

In [10]:
def train_test_split(data,test_size):
    data = data.sample(frac = 1)
    train_size = int((1-test_size)*len(data))
    train_data = data.iloc[:train_size].reset_index(drop=True)
    test_data = data.iloc[train_size:].reset_index(drop=True)
    return train_data,test_data

In [11]:
train_data,test_data = train_test_split(sms_data_clean,0.2)

In [12]:
(train_data['Label'].value_counts()/train_data.shape[0])*100

ham     86.60534
spam    13.39466
Name: Label, dtype: float64

In [13]:
vocabulary=list(set(train_data['SMS'].sum()))

In [14]:
len(vocabulary)

7767

In [15]:
len(train_data['SMS'].sum())

71633

In [16]:
bag_of_words_ham = list(train_data.loc[train_data['Label'] == 'ham','SMS'].sum())

In [17]:
len(bag_of_words_ham)

56346

In [18]:
bag_of_words_spam = list(train_data.loc[train_data['Label'] == 'spam','SMS'].sum())

In [19]:
len(bag_of_words_spam)

15287

## P(H='spam'|E= words in sentence)=
### P(H='spam') * P(E='you' and 'have' and 'won' and 'a' and 'lottery'|H='spam')
#### = P(H='spam') * P(E='you' |H='spam') * P(E='have' and 'won' and 'a' and 'lottery'|H='spam')
#### = P(H='spam') * P(E='you' |H='spam') * P(E='have' |H='spam') * P(E= 'won' | H='spam') * P(E='a' |H='spam') * P(E='lottery' |H='spam')

alpha = 1
total number of words in spam = N
#### P(E='you' |H='spam') = (number of 'you' in spam + aplha)/ (N + N * alpha  )

In [20]:
freq_ham = {}
freq_spam = {}
for word in vocabulary:
    freq_ham[word]=bag_of_words_ham.count(word)
    freq_spam[word]=bag_of_words_spam.count(word)

In [21]:
Pspam = train_data['Label'].value_counts()['spam'] / train_data.shape[0]
Pham = train_data['Label'].value_counts()['ham'] / train_data.shape[0]

In [22]:
Nvoc = len(bag_of_words_ham)+len(bag_of_words_spam)

In [28]:
alpha = 1

In [24]:
spam_count=train_data.loc[train_data['Label'] == 'spam','SMS'].count()
ham_count=train_data.loc[train_data['Label'] == 'ham','SMS'].count()

In [37]:
Nspam = train_data.loc[train_data['Label'] == 'spam', 'SMS'].apply(len).sum()
Nham = train_data.loc[train_data['Label'] == 'ham', 'SMS'].apply(len).sum()

In [29]:
def p_w_spam(word,freq_spam = freq_spam):
    if word in freq_spam.keys():
        return (freq_spam[word]+ alpha) / (Nspam + alpha*Nvoc)
    else:
        return 1

In [45]:
def p_w_ham(word,freq_ham = freq_ham):
    if word in freq_ham.keys():
        return (freq_ham[word]+ alpha) / (Nham + alpha*Nvoc)
    else:
        return 1

In [58]:
def classify(message):
    p_h_w = Pham
    for word in message:
        p_h_w = p_h_w*p_w_ham(word)
    p_s_w = Pspam
    for word in message:
        p_s_w = p_s_w*p_w_spam(word)
    if p_s_w >= p_h_w:
        return 'spam'
    else:
        return 'ham'

In [62]:
test_data['predicted'] = test_data['SMS'].apply(classify)

In [60]:
test_data.head(15)

Unnamed: 0,Label,SMS,predicted
0,ham,"[2, celebrate, my, b, day, y, else]",ham
1,spam,"[win, urgent, your, mobile, number, has, been,...",spam
2,ham,"[dunno, i, juz, askin, cos, i, got, a, card, g...",ham
3,ham,"[what, today, sunday, sunday, is, holiday, so,...",ham
4,spam,"[had, your, contract, mobile, 11, mnths, lates...",spam
5,ham,"[for, the, first, time, in, the, history, need...",ham
6,ham,"[but, that, s, on, ebay, it, might, be, less, ...",ham
7,ham,"[height, of, recycling, read, twice, people, s...",ham
8,ham,"[i, m, now, but, have, to, wait, till, 2, for,...",ham
9,ham,"[ela, kano, il, download, come, wen, ur, free]",ham


In [64]:
sum(test_data['predicted'] == test_data['Label'])/test_data.shape[0]

0.9650224215246637

In [65]:
test_data.loc[test_data['predicted']!=test_data['Label']]

Unnamed: 0,Label,SMS,predicted
52,spam,"[more, people, are, dogging, in, your, area, n...",ham
83,spam,"[you, are, being, contacted, by, our, dating, ...",ham
91,spam,"[bank, of, granite, issues, strong, buy, explo...",ham
103,spam,"[missed, call, alert, these, numbers, called, ...",ham
159,spam,"[please, call, 08712402779, immediately, as, t...",ham
196,spam,"[natalie, 20, f, is, inviting, you, to, be, he...",ham
229,spam,"[thanks, for, the, vote, now, sing, along, wit...",ham
233,spam,"[rct, thnq, adrian, for, u, text, rgds, vatian]",ham
251,spam,"[sms, ac, sun0819, posts, hello, you, seem, co...",ham
267,spam,"[please, call, 08712402902, immediately, as, t...",ham


In [67]:
len(test_data.loc[test_data['predicted']!=test_data['Label']])

39

In [68]:
test_data.shape[0]

1115