In [1]:
import pandas as pd
import numpy as np
from collections import Counter as ctr

In [2]:
!ls

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
data = pd.read_csv('spam.csv', 
                 skiprows=1, 
                 names = ['type','text','c3','c4','c5'], 
                 delimiter = ',',
                 encoding = 'ISO-8859-1')
data.head()
## text is an entire email
## type is "spam" (bad) & "ham" (good)
# Shouldent we use a SVM?

Unnamed: 0,type,text,c3,c4,c5
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
data['clean'] = data.text.apply(lambda x: x.lower().split()) 
## normalize it to lowercase and split on white space (to do tab put "\t")

In [5]:
data.drop(['c3', 'c4', 'c5'], axis = 1, inplace = True)
## axis is 0 is row, 1 is column

In [6]:
data.head()

Unnamed: 0,type,text,clean
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point,, crazy.., available..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar..., joking, wif, u, oni...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor..., u, c, already..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, don't, think, he, goes, to, usf,, he,..."


In [7]:
data.groupby('type').count()

Unnamed: 0_level_0,text,clean
type,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,4825,4825
spam,747,747


In [8]:
type_ctr = ctr(data.type)
print(" ham (good):", type_ctr['ham']/len(data),"\n",
      "spam (bad):", type_ctr['spam']/len(data)
)
## dont we need to balance the data?

 ham (good): 0.8659368269921034 
 spam (bad): 0.13406317300789664


In [9]:
## take note how to print two things from the same cell...
type_ctr['ham']/len(data), type_ctr['spam']/len(data)

(0.8659368269921034, 0.13406317300789664)

In [10]:
test = data.sample(frac = 0.1)
# train data is mutually exclusive from the test
train = data[~data.index.isin(test.index)]

In [11]:
test.head(),test.shape, train.head(), train.shape

(      type                                               text  \
 1580   ham  I shall book chez jules for half eight, if tha...   
 4816   ham  Me too. Mark is taking forever to pick up my p...   
 765   spam  UR awarded a City Break and could WIN a å£200 ...   
 3442   ham  Your board is working fine. The issue of overh...   
 418    ham  Alright, I'll head out in a few minutes, text ...   
 
                                                   clean  
 1580  [i, shall, book, chez, jules, for, half, eight...  
 4816  [me, too., mark, is, taking, forever, to, pick...  
 765   [ur, awarded, a, city, break, and, could, win,...  
 3442  [your, board, is, working, fine., the, issue, ...  
 418   [alright,, i'll, head, out, in, a, few, minute...  ,
 (557, 3),
    type                                               text  \
 0   ham  Go until jurong point, crazy.. Available only ...   
 1   ham                      Ok lar... Joking wif u oni...   
 2  spam  Free entry in 2 a wkly comp to win FA

### Prior Probability

In [12]:
ham_spam_ctr = ctr(train.type)

def Pa(X=''): #probability of prior
    return ham_spam_ctr[X] / len(train)

In [13]:
Pa('spam')

0.1333998005982054

# Normalization

In [14]:
words_ctr = ctr([word for row in train.clean for word in row])

## P(B)
def Pb(W=''):
    if W not in words_ctr: return 10E-10 ## if a word is not in the corpus assign it a tiny value
    return words_ctr[W] / sum(words_ctr.values())

## Maximum likelihood P(B|A) 

In [15]:
words_ham_spam = {}

for col_name in list(set(data.type)):
    sub_df = train[train.type == col_name] ## masking the dataframe
    words_ham_spam[col_name] = []
    for row in sub_df.clean:
        for word in row:
            words_ham_spam[col_name].append(word) ## makes 1 long list of all the words and we can count them
    words_ham_spam[col_name] = ctr(words_ham_spam[col_name])
    
def Pba(W='', X=''):
    t = words_ham_spam[X]
    if W not in t: return 10E-10
    return t[W] / sum(t.values())

In [16]:
words_ham_spam['spam'].most_common(10)

[('to', 603),
 ('a', 342),
 ('call', 311),
 ('your', 233),
 ('you', 224),
 ('the', 183),
 ('for', 181),
 ('or', 175),
 ('free', 166),
 ('2', 154)]

In [17]:
words_ham_spam['ham'].most_common(10)

[('i', 1947),
 ('you', 1501),
 ('to', 1383),
 ('the', 999),
 ('a', 941),
 ('u', 790),
 ('and', 756),
 ('in', 702),
 ('my', 669),
 ('is', 637)]

In [18]:
Pba(W='the', X='ham')

0.01616949646342845

In [19]:
def Pab(X='', W=''):
    return Pba(W, X) * Pa(X) / Pb(W)

In [20]:
Pab(X='ham', W='the')

0.9217419913694499

In [21]:
def Ps(T, X=''):
    return np.prod([Pab(X=X,W=word) for word in T])

In [22]:
Ps(T = ['the', 'day', 'ahoy'], X = 'ham'), Ps(T = ['the', 'day', 'ahoy'], X = 'spam')

(0.8183465817290482, 0.0005280595565358195)

# Evaluate

In [49]:
test['ham'] = round(test.clean.apply(lambda x: Ps(x, 'ham')),5)
test['spam'] = round(test.clean.apply(lambda x: Ps(x, 'spam')), 5)

In [50]:
test

Unnamed: 0,type,text,clean,ham,spam,winner
1580,ham,"I shall book chez jules for half eight, if tha...","[i, shall, book, chez, jules, for, half, eight...",0.36486,0.0,ham
4816,ham,Me too. Mark is taking forever to pick up my p...,"[me, too., mark, is, taking, forever, to, pick...",0.91525,0.0,ham
765,spam,UR awarded a City Break and could WIN a å£200 ...,"[ur, awarded, a, city, break, and, could, win,...",0.00000,0.0,spam
3442,ham,Your board is working fine. The issue of overh...,"[your, board, is, working, fine., the, issue, ...",0.29939,0.0,ham
418,ham,"Alright, I'll head out in a few minutes, text ...","[alright,, i'll, head, out, in, a, few, minute...",0.34303,0.0,ham
...,...,...,...,...,...,...
183,ham,"He will, you guys close?","[he, will,, you, guys, close?]",0.78710,0.0,ham
2081,ham,I'm aight. Wat's happening on your side.,"[i'm, aight., wat's, happening, on, your, side.]",0.64026,0.0,ham
1702,ham,I have lost 10 kilos as of today!,"[i, have, lost, 10, kilos, as, of, today!]",0.04400,0.0,ham
5458,ham,"Sorry, I'll call later","[sorry,, i'll, call, later]",0.51855,0.0,ham


What do the columns of ham and spam mean? They are > 0, but not < 0.
Its >1 because we have the 0.0001 assigned to undefined words.

In [51]:
test['winner'] = test.ham / test.spam
test['winner'] = test.winner.apply(lambda x: 'ham' if x > 1 else 'spam')

In [52]:
test.head()

Unnamed: 0,type,text,clean,ham,spam,winner
1580,ham,"I shall book chez jules for half eight, if tha...","[i, shall, book, chez, jules, for, half, eight...",0.36486,0.0,ham
4816,ham,Me too. Mark is taking forever to pick up my p...,"[me, too., mark, is, taking, forever, to, pick...",0.91525,0.0,ham
765,spam,UR awarded a City Break and could WIN a å£200 ...,"[ur, awarded, a, city, break, and, could, win,...",0.0,0.0,spam
3442,ham,Your board is working fine. The issue of overh...,"[your, board, is, working, fine., the, issue, ...",0.29939,0.0,ham
418,ham,"Alright, I'll head out in a few minutes, text ...","[alright,, i'll, head, out, in, a, few, minute...",0.34303,0.0,ham


In [53]:
result = test.winner == test.type

In [54]:
sum(result) / len(test)

0.9676840215439856

most common class baseline - ie 'ham' is 86% of the time, so if we just classed 'ham' then we'd get 86% accuracy. We are 1% more accurate.  We can try to remove stop words, stem the words, spelling correction, balancing by the 'type', etc.


In [55]:
test.text[1580]

"I shall book chez jules for half eight, if that's ok with you?"

In [56]:
test.clean[1580]

['i',
 'shall',
 'book',
 'chez',
 'jules',
 'for',
 'half',
 'eight,',
 'if',
 "that's",
 'ok',
 'with',
 'you?']

P(B|A) = P(B|A)*P(A) / P(B)

In [57]:
# "I shall book chez jules for half eight, if that's ok with you?"
(Pba(W = 'i', X = 'ham') * Pa(X = 'ham') / Pb(W = 'i')) * (Pba(W = 'shall', X = 'ham') * Pa(X = 'ham') / Pb(W = 'shall')) *(Pba(W = 'book', X = 'ham') * Pa(X = 'ham') / Pb(W = 'book')) *(Pba(W = 'chez', X = 'ham') * Pa(X = 'ham') / Pb(W = 'chez')) *(Pba(W = 'jules', X = 'ham') * Pa(X = 'ham') / Pb(W = 'jules')) *(Pba(W = 'for', X = 'ham') * Pa(X = 'ham') / Pb(W = 'for')) *(Pba(W = 'half', X = 'ham') * Pa(X = 'ham') / Pb(W = 'half')) *(Pba(W = 'eight,', X = 'ham') * Pa(X = 'ham') / Pb(W = 'eight,')) *(Pba(W = 'if', X = 'ham') * Pa(X = 'ham') / Pb(W = 'if')) *(Pba(W = "that's", X = 'ham') * Pa(X = 'ham') / Pb(W = "that's")) *(Pba(W = 'ok', X = 'ham') * Pa(X = 'ham') / Pb(W = 'ok')) *(Pba(W = 'with', X = 'ham') * Pa(X = 'ham') / Pb(W = 'with')) *(Pba(W = 'you?', X = 'ham') * Pa(X = 'ham') / Pb(W = 'you?')) 


0.3648593520785536

In [58]:
Pab(X='ham', W='i') ## this is greater than 1 due to the correction provided by non-defined words

1.0718717803144129

In [59]:
Ps(T = ['i',
 'shall',
 'book',
 'chez',
 'jules',
 'for',
 'half',
 'eight,',
 'if',
 "that's",
 'ok',
 'with',
 'you?'], X = 'ham')

0.3648593520785536