In [1]:
import pandas as pd

In [2]:
sms_data = pd.read_csv('SMSSpamCollection', header=None, sep='\t', names=['Label', 'SMS'])

In [3]:
sms_data.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
sms_data.shape

(5572, 2)

In [5]:
sms_data.groupby('Label').count()

Unnamed: 0_level_0,SMS
Label,Unnamed: 1_level_1
ham,4825
spam,747


In [6]:
sms_data_clean = sms_data.copy()

In [7]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.replace('\W+', ' ').str.replace('\s+', ' ').str.strip()

In [8]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.lower()

In [9]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.split()

In [10]:
sms_data_clean['SMS'].head()

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, don, t, think, he, goes, to, usf, he,...
Name: SMS, dtype: object

In [11]:
sms_data_clean['Label'].value_counts() / sms_data.shape[0] * 100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

In [12]:
train_data = sms_data_clean.sample(frac=0.8,random_state=1).reset_index(drop=True)
test_data = sms_data_clean.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

In [13]:
train_data['Label'].value_counts() / train_data.shape[0] * 100

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [14]:
train_data.shape

(4458, 2)

In [15]:
test_data['Label'].value_counts() / test_data.shape[0] * 100

ham     86.983842
spam    13.016158
Name: Label, dtype: float64

In [16]:
test_data.shape

(1114, 2)

In [17]:
test_data.head()

Unnamed: 0,Label,SMS
0,ham,"[aight, should, i, just, plan, to, come, up, l..."
1,ham,"[die, i, accidentally, deleted, e, msg, i, sup..."
2,spam,"[welcome, to, uk, mobile, date, this, msg, is,..."
3,ham,"[this, is, wishing, you, a, great, day, moji, ..."
4,ham,"[thanks, again, for, your, reply, today, when,..."


In [18]:
vocabulary = list(set(train_data['SMS'].sum()))

In [None]:
vocabulary

In [20]:
len(vocabulary)

7783

In [21]:
word_counts_per_sms = pd.DataFrame([
    [row[1].count(word) for word in vocabulary]
    for _, row in train_data.iterrows()], columns=vocabulary)

In [22]:
train_data = pd.concat([train_data.reset_index(), word_counts_per_sms], axis=1).iloc[:,1:]

In [23]:
train_data.shape

(4458, 7785)

In [24]:
train_data.head()

Unnamed: 0,Label,SMS,ink,katexxx,welcome,diamond,youuuuu,scotland,7zs,muhommad,shola,dormitory,ello,works,amrita,flirt,gobi,papa,tv,waliking,aah,write,creative,type,smashed,secured,upload,child,ride,dudette,woot,whats,gonna,charlie,someone,kickboxing,motive,easy,download,marketing,...,help,bslvyl,seeking,telediscount,listening2the,havn,index,07090298926,dd,mutations,qbank,glorious,find,wendy,beauty,sum,listn,haf,fresh,pert,rajas,fb,reveal,small,seem,fighting,apply,08718720201,nasdaq,gynae,workin,support,screaming,envy,skip,gender,t,adding,foot,asjesus
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [46]:
train_data.to_csv('converted.csv')

In [25]:
alpha = 1

In [26]:
Nvoc = len(train_data.columns) - 3

In [27]:
Pspam = train_data['Label'].value_counts()['spam'] / train_data.shape[0]

In [28]:
Pham = train_data['Label'].value_counts()['ham'] / train_data.shape[0]

In [29]:
Nspam = train_data.loc[train_data['Label'] == 'spam', 'SMS'].apply(len).sum()

In [30]:
Nham = train_data.loc[train_data['Label'] == 'ham', 'SMS'].apply(len).sum()


In [31]:
def p_w_spam(word):
    if word in train_data.columns:
        return (train_data.loc[train_data['Label'] == 'spam', word].sum() + alpha) / (Nspam + alpha*Nvoc)
    else:
        return 1

In [32]:
def p_w_ham(word):
    if word in train_data.columns:
        return (train_data.loc[train_data['Label'] == 'ham', word].sum() + alpha) / (Nham + alpha*Nvoc)
    else:
        return 1

In [33]:
def classify(message):
    p_spam_given_message = Pspam
    p_ham_given_message = Pham
    for word in message:
        p_spam_given_message *= p_w_spam(word)
        p_ham_given_message *= p_w_ham(word)
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [34]:
classify('secret')

'ham'

In [35]:
classify(['secret', 'source', 'of', 'infinite', 'power'])

'ham'

In [36]:
test_data['predicted'] = test_data['SMS'].apply(classify)

In [37]:
test_data.head()

Unnamed: 0,Label,SMS,predicted
0,ham,"[aight, should, i, just, plan, to, come, up, l...",ham
1,ham,"[die, i, accidentally, deleted, e, msg, i, sup...",ham
2,spam,"[welcome, to, uk, mobile, date, this, msg, is,...",spam
3,ham,"[this, is, wishing, you, a, great, day, moji, ...",ham
4,ham,"[thanks, again, for, your, reply, today, when,...",ham


In [38]:

correct = (test_data['predicted'] == test_data['Label']).sum() / test_data.shape[0] * 100

In [39]:
test_data.loc[test_data['predicted'] != test_data['Label']]

Unnamed: 0,Label,SMS,predicted
56,spam,"[money, i, have, won, wining, number, 946, wot...",ham
99,ham,"[gettin, rdy, to, ship, comp]",spam
142,ham,"[have, you, laid, your, airtel, line, to, rest]",spam
218,spam,"[hi, babe, its, chloe, how, r, u, i, was, smas...",ham
245,ham,[anytime],spam
404,ham,"[nokia, phone, is, lovly]",spam
491,spam,"[hi, this, is, amy, we, will, be, sending, you...",ham
588,ham,"[we, have, sent, jd, for, customer, service, c...",spam
646,ham,"[a, boy, loved, a, gal, he, propsd, bt, she, d...",needs human classification
912,spam,"[dating, i, have, had, two, of, these, only, s...",ham


In [40]:

correct

99.10233393177738

In [41]:
s='hello there'

In [42]:
classify(s)

'ham'

In [43]:
classify('aight should i smit bhatt ghcdk')

'ham'

In [44]:
classify('smit')

'ham'