In [1]:
import pandas as pd
import numpy as np
from collections import Counter as ctr

In [2]:
!ls

[34mA1[m[m                               data.tar.gz
A1.tar.xz                        naive-bayes-2021.ipynb
Assignment1_Sarah_Ewing.ipynb    pip_install.ipynb
NLTK_ch1.ipynb                   probability_theory-discrete.html
NLTK_ch2.ipynb                   python_fun.ipynb
catsM.csv                        readme.txt
[34mdata[m[m                             spam.csv


In [3]:
data = pd.read_csv('spam.csv', 
                 skiprows=1, 
                 names = ['type','text','c3','c4','c5'], 
                 delimiter = ',',
                 encoding = 'ISO-8859-1')
data.head()
## text is an entire email
## type is "spam" (bad) & "ham" (good)
# Shouldent we use a SVM?

Unnamed: 0,type,text,c3,c4,c5
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
data['clean'] = data.text.apply(lambda x: x.lower().split()) 
## normalize it to lowercase and split on white space (to do tab put "\t")

In [5]:
data.drop(['c3', 'c4', 'c5'], axis = 1, inplace = True)
## axis is 0 is row, 1 is column

In [6]:
data.head()

Unnamed: 0,type,text,clean
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point,, crazy.., available..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar..., joking, wif, u, oni...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor..., u, c, already..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, don't, think, he, goes, to, usf,, he,..."


In [7]:
data.groupby('type').count()

Unnamed: 0_level_0,text,clean
type,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,4825,4825
spam,747,747


In [8]:
type_ctr = ctr(data.type)
print(" ham (good):", type_ctr['ham']/len(data),"\n",
      "spam (bad):", type_ctr['spam']/len(data)
)
## dont we need to balance the data?

 ham (good): 0.8659368269921034 
 spam (bad): 0.13406317300789664


In [9]:
## take note how to print two things from the same cell...
type_ctr['ham']/len(data), type_ctr['spam']/len(data)

(0.8659368269921034, 0.13406317300789664)

In [10]:
test = data.sample(frac = 0.1)
# train data is mutually exclusive from the test
train = data[~data.index.isin(test.index)]

In [11]:
test.head(),test.shape, train.head(), train.shape

(      type                                               text  \
 2877  spam  Hey Boys. Want hot XXX pics sent direct 2 ur p...   
 2915   ham    What time should I tell my friend to be around?   
 4741   ham              Ya very nice. . .be ready on thursday   
 514   spam  You are guaranteed the latest Nokia Phone, a 4...   
 1292   ham  Da my birthdate in certificate is in april but...   
 
                                                   clean  
 2877  [hey, boys., want, hot, xxx, pics, sent, direc...  
 2915  [what, time, should, i, tell, my, friend, to, ...  
 4741     [ya, very, nice., ., .be, ready, on, thursday]  
 514   [you, are, guaranteed, the, latest, nokia, pho...  
 1292  [da, my, birthdate, in, certificate, is, in, a...  ,
 (557, 3),
    type                                               text  \
 0   ham  Go until jurong point, crazy.. Available only ...   
 2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
 3   ham  U dun say so early hor... U c already

### Prior Probability

In [12]:
ham_spam_ctr = ctr(train.type)

def Pa(X=''): #probability of prior
    return ham_spam_ctr[X] / len(train)

In [13]:
Pa('spam')

0.13359920239282153

# Normalization

In [14]:
words_ctr = ctr([word for row in train.clean for word in row])

## P(B)
def Pb(W=''):
    if W not in words_ctr: return 10E-10 ## if a word is not in the corpus assign it a tiny value
    return words_ctr[W] / sum(words_ctr.values())

## Maximum likelihood P(B|A) 

In [15]:
words_ham_spam = {}

for col_name in list(set(data.type)):
    sub_df = train[train.type == col_name] ## masking the dataframe
    words_ham_spam[col_name] = []
    for row in sub_df.clean:
        for word in row:
            words_ham_spam[col_name].append(word) ## makes 1 long list of all the words and we can count them
    words_ham_spam[col_name] = ctr(words_ham_spam[col_name])
    
def Pba(W='', X=''):
    t = words_ham_spam[X]
    if W not in t: return 10E-10
    return t[W] / sum(t.values())

In [16]:
words_ham_spam['spam'].most_common(10)

[('to', 623),
 ('a', 336),
 ('call', 302),
 ('your', 236),
 ('you', 222),
 ('for', 181),
 ('the', 180),
 ('or', 165),
 ('free', 161),
 ('2', 160)]

In [17]:
words_ham_spam['ham'].most_common(10)

[('i', 1981),
 ('you', 1507),
 ('to', 1374),
 ('the', 991),
 ('a', 924),
 ('u', 782),
 ('and', 755),
 ('in', 704),
 ('my', 689),
 ('is', 622)]

In [18]:
Pba(W='the', X='ham')

0.016087662337662338

In [19]:
def Pab(X='', W=''):
    return Pba(W, X) * Pa(X) / Pb(W)

In [20]:
Pab(X='ham', W='the')

0.924252763394579

In [21]:
def Ps(T, X=''):
    return np.prod([Pab(X=X,W=word) for word in T])

In [22]:
Ps(T = ['the', 'day', 'ahoy'], X = 'ham'), Ps(T = ['the', 'day', 'ahoy'], X = 'spam')

(0.8127111764284565, 0.0006066885959318133)

# Evaluate

In [23]:
test['ham'] = test.clean.apply(lambda x: Ps(x, 'ham'))
test['spam'] = test.clean.apply(lambda x: Ps(x, 'spam'))

In [24]:
test

Unnamed: 0,type,text,clean,ham,spam
2877,spam,Hey Boys. Want hot XXX pics sent direct 2 ur p...,"[hey, boys., want, hot, xxx, pics, sent, direc...",1.917566e-20,2.428565e-30
2915,ham,What time should I tell my friend to be around?,"[what, time, should, i, tell, my, friend, to, ...",8.018706e-01,2.844581e-18
4741,ham,Ya very nice. . .be ready on thursday,"[ya, very, nice., ., .be, ready, on, thursday]",1.034931e+00,1.777306e-23
514,spam,"You are guaranteed the latest Nokia Phone, a 4...","[you, are, guaranteed, the, latest, nokia, pho...",2.634663e-57,2.656855e-11
1292,ham,Da my birthdate in certificate is in april but...,"[da, my, birthdate, in, certificate, is, in, a...",1.381174e-01,7.201132e-75
...,...,...,...,...,...
2399,ham,Oh... Kay... On sat right?,"[oh..., kay..., on, sat, right?]",9.982615e-01,1.282663e-19
4206,ham,"Lets use it next week, princess :)","[lets, use, it, next, week,, princess, :)]",6.101299e-01,1.978136e-17
379,ham,I taught that Ranjith sir called me. So only i...,"[i, taught, that, ranjith, sir, called, me., s...",3.276566e-01,4.909528e-65
690,ham,Was the farm open?,"[was, the, farm, open?]",9.235823e-01,2.939036e-09


What do the columns of ham and spam mean? They are > 0, but not < 0.
Its >1 because we have the 0.0001 assigned to undefined words.

In [25]:
test['winner'] = test.ham / test.spam
test['winner'] = test.winner.apply(lambda x: 'ham' if x > 1 else 'spam')

In [26]:
result = test.winner == test.type

In [27]:
sum(result) / len(test)

0.9640933572710951

most common class baseline - ie 'ham' is 86% of the time, so if we just classed 'ham' then we'd get 86% accuracy. We are 1% more accurate.  We can try to remove stop words, stem the words, spelling correction, balancing by the 'type', etc.
