In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 

In [2]:
# Tab-separated file without a header row: column 0 is the label, column 1 the
# message text, so the column names are supplied explicitly.
# NOTE(review): default quote handling is used here; if any message contains an
# unbalanced double quote, adjacent rows could be merged -- confirm the row
# count against the data set's documentation.
path = "smsspamcollection/smsspamcollection"
dfa = pd.read_csv(
    path,
    sep='\t',
    header=None,
    names=['Label', 'SMS'],
)

In [3]:
# Show the loaded frame (the notebook renders the cell's last expression).
dfa

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Absolute number of messages per label.
dfa['Label'].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

In [5]:
# Same breakdown as proportions: normalize=True returns fractions, not counts.
dfa['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [6]:
# Shuffle every row: frac=1 samples the whole frame, and the fixed random_state
# makes the shuffle reproducible across runs.
dfa_random = dfa.sample(frac=1, random_state=1)

In [7]:
# Row position of the 80 % mark in the shuffled data, used as the cut-off
# between the two partitions.
split = round(0.8 * len(dfa_random))

split

4458

In [8]:
# Positional cut of the shuffled frame: rows before `split` become the training
# set, the remainder the test set. Both get a fresh 0..n-1 index.
training_set = dfa_random.iloc[:split].reset_index(drop=True)
test_set = dfa_random.iloc[split:].reset_index(drop=True)

In [9]:
# (rows, columns) of the training partition.
training_set.shape

(4458, 2)

In [10]:
# (rows, columns) of the test partition.
test_set.shape

(1114, 2)

In [11]:
# Label proportions inside the training partition.
training_set['Label'].value_counts(normalize=True)

ham     0.86541
spam    0.13459
Name: Label, dtype: float64

In [12]:
# Label proportions inside the test partition.
test_set['Label'].value_counts(normalize=True)

ham     0.868043
spam    0.131957
Name: Label, dtype: float64

In [13]:
# Normalise the message text: every non-word character (anything other than a
# letter, digit or underscore) becomes a space, then the text is lowercased.
#
# The pattern is a raw string and regex=True is passed explicitly. Relying on
# the implicit default makes pandas emit a warning on older versions, and on
# pandas >= 2.0 (where the default is regex=False) '\W' would be treated as a
# literal two-character string, so no punctuation would be removed at all.
training_set['SMS'] = (
    training_set['SMS']
    .str.replace(r'\W', ' ', regex=True)
    .str.lower()
)

  """Entry point for launching an IPython kernel.


In [14]:
# Inspect the training set after the text normalisation step.
training_set

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...
...,...,...
4453,ham,sorry i ll call later in meeting any thing re...
4454,ham,babe i fucking love you too you know fuck...
4455,spam,u ve been selected to stay in 1 of 250 top bri...
4456,ham,hello my boytoy geeee i miss you already a...


In [15]:
# Turn each message into a list of whitespace-separated tokens.
training_set['SMS'] = training_set['SMS'].str.split()

# Collect every distinct token that occurs anywhere in the training messages.
vocabulary = set()
for tokens in training_set['SMS']:
    vocabulary.update(tokens)

len(vocabulary)

7783

In [16]:
# Inspect the training set now that each SMS is a list of tokens.
training_set

Unnamed: 0,Label,SMS
0,ham,"[yep, by, the, pretty, sculpture]"
1,ham,"[yes, princess, are, you, going, to, make, me,..."
2,ham,"[welp, apparently, he, retired]"
3,ham,[havent]
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,..."
...,...,...
4453,ham,"[sorry, i, ll, call, later, in, meeting, any, ..."
4454,ham,"[babe, i, fucking, love, you, too, you, know, ..."
4455,spam,"[u, ve, been, selected, to, stay, in, 1, of, 2..."
4456,ham,"[hello, my, boytoy, geeee, i, miss, you, alrea..."


In [17]:
# One zero-filled list per vocabulary word, with one slot per training message.
# Each word gets its own list object so counts are not shared between words.
n_messages = len(training_set['SMS'])
word_counts_per_sms = {}
for unique_word in vocabulary:
    word_counts_per_sms[unique_word] = [0] * n_messages

# Tally how often each token occurs in each message (row position = message).
for row, tokens in enumerate(training_set['SMS']):
    for token in tokens:
        per_message = word_counts_per_sms[token]
        per_message[row] = per_message[row] + 1

In [18]:
# Dict of equal-length lists -> frame with one column per vocabulary word and
# one row per training message.
word_counts = pd.DataFrame(word_counts_per_sms)

In [19]:
# Inspect the per-message word-count matrix.
word_counts

Unnamed: 0,argentina,3750,wocay,involved,pizza,dabooks,usf,teasing,holla,copied,...,camry,cramps,application,restrictions,mid,cream,kettoda,asks,pierre,3mobile
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Re-display the tokenised training set before joining it with the counts.
training_set

Unnamed: 0,Label,SMS
0,ham,"[yep, by, the, pretty, sculpture]"
1,ham,"[yes, princess, are, you, going, to, make, me,..."
2,ham,"[welp, apparently, he, retired]"
3,ham,[havent]
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,..."
...,...,...
4453,ham,"[sorry, i, ll, call, later, in, meeting, any, ..."
4454,ham,"[babe, i, fucking, love, you, too, you, know, ..."
4455,spam,"[u, ve, been, selected, to, stay, in, 1, of, 2..."
4456,ham,"[hello, my, boytoy, geeee, i, miss, you, alrea..."


In [22]:
# Put the word-count columns to the right of Label / SMS. Rows are matched on
# the index, which both frames share as 0..n-1.
training_set_clean = pd.concat(
    [training_set, word_counts],
    axis='columns',
)
training_set_clean

Unnamed: 0,Label,SMS,argentina,3750,wocay,involved,pizza,dabooks,usf,teasing,...,camry,cramps,application,restrictions,mid,cream,kettoda,asks,pierre,3mobile
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,ham,"[sorry, i, ll, call, later, in, meeting, any, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,ham,"[babe, i, fucking, love, you, too, you, know, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,spam,"[u, ve, been, selected, to, stay, in, 1, of, 2...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,ham,"[hello, my, boytoy, geeee, i, miss, you, alrea...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Partition the training rows by label with boolean masks.
spam_messages = training_set_clean.loc[training_set_clean['Label'].eq('spam')]
ham_messages = training_set_clean.loc[training_set_clean['Label'].eq('ham')]

In [24]:
# Class priors: the share of training messages carrying each label.
n_training_messages = len(training_set_clean)
p_spam = len(spam_messages) / n_training_messages
p_ham = len(ham_messages) / n_training_messages

In [25]:
# Total number of words (tokens) across all spam messages.
n_words_per_spam_message = spam_messages['SMS'].map(len)
n_spam = n_words_per_spam_message.sum()

# Total number of words (tokens) across all ham messages.
n_words_per_ham_message = ham_messages['SMS'].map(len)
n_ham = n_words_per_ham_message.sum()

# Number of distinct words in the vocabulary.
n_vocabulary = len(vocabulary)

# Additive (Laplace) smoothing constant.
alpha = 1

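Parameters to estimate — one conditional probability per vocabulary word $w_i$ ($i = 1, \dots, n$) for each class, using additive smoothing with constant $\alpha$:

$$P(w_i \mid \text{spam}) = \frac{N_{w_i \mid \text{spam}} + \alpha}{N_{\text{spam}} + \alpha \cdot N_{\text{vocabulary}}}$$

i.e. $P(w_1 \mid \text{spam}),\ P(w_2 \mid \text{spam}),\ \dots,\ P(w_n \mid \text{spam})$


$$P(w_i \mid \text{ham}) = \frac{N_{w_i \mid \text{ham}} + \alpha}{N_{\text{ham}} + \alpha \cdot N_{\text{vocabulary}}}$$

i.e. $P(w_1 \mid \text{ham}),\ P(w_2 \mid \text{ham}),\ \dots,\ P(w_n \mid \text{ham})$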



In [28]:
# Initialise one entry per vocabulary word for each class. The shared default
# is the immutable int 0, so the entries stay independent of each other.
parameters_spam = dict.fromkeys(vocabulary, 0)
parameters_ham = dict.fromkeys(vocabulary, 0)

In [27]:
# calculate parameters 

In [29]:
# The smoothing denominators do not depend on the word, so compute them once.
spam_denominator = n_spam + alpha * n_vocabulary
ham_denominator = n_ham + alpha * n_vocabulary

for word in vocabulary:
    # Smoothed P(word | spam): occurrences of the word in spam messages plus
    # alpha, over the total spam word count plus alpha * vocabulary size.
    n_word_given_spam = spam_messages[word].sum()
    parameters_spam[word] = (n_word_given_spam + alpha) / spam_denominator

    # Same estimate for the ham class.
    n_word_given_ham = ham_messages[word].sum()
    parameters_ham[word] = (n_word_given_ham + alpha) / ham_denominator