In [1]:
import numpy as np
import pandas as pd

In [2]:
sms_data=pd.read_csv('SMSSpamCollection', sep='\t', header=None,names=['label','sms'])

In [3]:
sms_data.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
sms_data.groupby('label').count()

Unnamed: 0_level_0,sms
label,Unnamed: 1_level_1
ham,4825
spam,747


In [5]:
sms_data_clean=sms_data.copy()

In [6]:
sms_data_clean['sms'].replace(to_replace=r'\W+\s+', value=' ', regex=True)

0       Go until jurong point crazy Available only in ...
1                              Ok lar Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3          U dun say so early hor U c already then say...
4       Nah I don't think he goes to usf he lives arou...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity was in mood for that So...any other sugge...
5570    The guy did some bitching but I acted like i'd...
5571                            Rofl Its true to its name
Name: sms, Length: 5572, dtype: object

In [7]:
sms_data_clean['sms']=sms_data_clean['sms'].str.lower()

In [8]:
sms_data_clean['sms']=sms_data_clean['sms'].str.split()

In [9]:
sms_data_clean['sms']

0       [go, until, jurong, point,, crazy.., available...
1                    [ok, lar..., joking, wif, u, oni...]
2       [free, entry, in, 2, a, wkly, comp, to, win, f...
3       [u, dun, say, so, early, hor..., u, c, already...
4       [nah, i, don't, think, he, goes, to, usf,, he,...
                              ...                        
5567    [this, is, the, 2nd, time, we, have, tried, 2,...
5568        [will, ü, b, going, to, esplanade, fr, home?]
5569    [pity,, *, was, in, mood, for, that., so...any...
5570    [the, guy, did, some, bitching, but, i, acted,...
5571                    [rofl., its, true, to, its, name]
Name: sms, Length: 5572, dtype: object

In [10]:
def train_test_split(data,test_size,return_label=True):
    data=data.sample(frac=1)
    train_size = int((1-test_size)*len(data))
    train_data = data.iloc[:train_size].reset_index(drop=True)
    test_data = data.iloc[train_size:].reset_index(drop=True)
    return train_data,test_data

In [11]:
train_data,test_data=train_test_split(sms_data_clean,0.2)

In [12]:
vocab=list(set(train_data['sms'].sum()))

In [13]:
len(vocab)

11800

In [14]:
len(train_data['sms'].sum())

69507

In [15]:
bag_of_words_ham = list(train_data.loc[train_data['label'] == 'ham','sms'].sum())

In [16]:
len(bag_of_words_ham)

54945

In [17]:
bag_of_words_spam = list(train_data.loc[train_data['label'] == 'spam','sms'].sum())

In [18]:
len(bag_of_words_spam)

14562

In [None]:
freq_ham = {}
freq_spam = {}
for word in vocab:
    freq_ham[word]=bag_of_words_ham.count(word)
    freq_spam[word]=bag_of_words_spam.count(word)

In [26]:
values_ham_sum = sum(freq_ham.values())
values_spam_sum= sum(freq_spam.values())

In [28]:
prob_word={}
for key in freq_ham:
        prob_word[key]=freq_ham[key]/values_ham_sum

In [30]:
prob=values_ham_sum/(values_ham_sum+values_spam_sum)

In [31]:
prob

0.7904959212741163

In [32]:
spam_count=train_data.loc[train_data['label'] == 'spam']
ham_count=train_data.loc[train_data['label'] == 'ham']

In [33]:
len(spam_count)


609

In [34]:
len(ham_count)

3848

In [35]:
prob_spam=len(spam_count)/(len(spam_count)+len(ham_count))

In [36]:
prob_spam

0.1366389948395782

In [40]:
Pspam = train_data['label'].value_counts()['spam'] / train_data.shape[0]
Pham = train_data['label'].value_counts()['ham'] / train_data.shape[0]
nspam=train_data.loc[train_data['label'] == 'spam','sms'].apply(len).sum()
nham=train_data.loc[train_data['label'] == 'ham','sms'].apply(len).sum()

In [41]:
Pspam,Pham

(0.1366389948395782, 0.8633610051604218)

In [83]:
alpha=1
def p_w_spam(word,freq_spam=freq_spam):
    if word in freq_spam.keys():
        return (freq_spam[word])/(nspam)
    else:
        return 1

In [84]:
p_w_spam('you')

0.014146408460376321

In [85]:
def p_w_ham(word,freq_ham=freq_ham):
    if word in freq_ham.keys():
        return (freq_ham[word])/(nham)
    else:
        return 1

In [86]:
p_w_ham('you')

0.024733824733824735

In [87]:
strt='you have won a lottery';
to_prdt=strt.split()
for word in to_prdt:
    print(word)

you
have
won
a
lottery


In [88]:
def predict(sentence):
    p_spam_sen=Pspam
    p_ham_sen=Pham
    for word in sentence:
        p_spam_sen=p_spam_sen*p_w_spam(word)
        p_ham_sen=p_ham_sen*p_w_ham(word)
        if(p_ham_sen > p_spam_sen):
            return 'ham';
        else:
            return 'spam'

In [89]:
predict(strt)

'ham'

In [90]:
test_data['predicted'] = test_data['sms'].apply(predict)

In [91]:
test_data.head(15)

Unnamed: 0,label,sms,predicted
0,spam,"[xmas, iscoming, &, ur, awarded, either, £500,...",spam
1,spam,"[your, next, amazing, xxx, picsfree1, video, w...",ham
2,ham,"[jay, says, that, you're, a, double-faggot]",ham
3,ham,"[yes.he, have, good, crickiting, mind]",ham
4,ham,"[i, will, come, with, karnan, car., please, wa...",ham
5,ham,"[although, i, told, u, dat, i'm, into, baig, f...",ham
6,ham,"[just, curious, because, my, cuz, asked, what,...",ham
7,ham,"[you, intrepid, duo, you!, have, a, great, tim...",ham
8,ham,"[why, i, come, in, between, you, people]",ham
9,ham,"[k..k...from, tomorrow, onwards, started, ah?]",ham


In [92]:
sum(test_data['predicted'] == test_data['label'])/test_data.shape[0]

0.9103139013452914