In [35]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string

In [36]:
# Read Data

#read dataset
spam_df = pd.read_csv('spam.csv', encoding="ISO-8859-1")
spam_df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [37]:
#subset and rename columns
spam_df = spam_df[['v1', 'v2']]
spam_df.rename(columns={'v1': 'spam', 'v2': 'text'}, inplace=True)

#convert spam column to binary
spam_df.spam = spam_df.spam.apply(lambda s: True if s=='spam' else False)

#lowercase everything and remove punctuation

spam_df.text = spam_df.text.apply(lambda t: t.lower())

for punctuation in string.punctuation:
    spam_df.text = spam_df.text.apply(lambda t: t.replace(punctuation, '')) 
    
#spam_df.text = spam_df.text.apply(lambda t: t.lower().translate(str.maketrans('', '', string.punctuation)))

#shuffle
spam_df = spam_df.sample(frac=1)

In [38]:
spam_df

Unnamed: 0,spam,text
2183,False,chinatown got porridge claypot rice yam cake f...
2328,False,that day you asked about anand number why
3417,True,life has never been this much fun and great un...
5392,False,ooooooh i forgot to tell u i can get on yovill...
2283,False,i reach home safe n sound liao
...,...,...
1510,False,peteis this your phone still its jenny from co...
2230,False,hey thk we juz go accordin to wat we discussed...
3916,False,eh ur laptop got no stock lei he say mon muz c...
306,False,jos ask if u wana meet up


In [39]:
for t in spam_df[spam_df.spam == True].iloc[:5].text:
    print(t)
    print('-------')

life has never been this much fun and great until you came in you made it truly special for me i wont forget you enjoy  one gbpsms
-------
upgrdcentre orange customer you may now claim your free camera phone upgrade for your loyalty call now on 0207 153 9153 offer ends 26th july tcs apply optout available
-------
from wwwapplausestorecom monthlysubscription50pmsg max6month tcsc web age16 2stop txt stop
-------
urgent call 09066649731from landline your complimentary 4 ibiza holiday or å£10000 cash await collection sae tcs po box 434 sk3 8wp 150ppm 18
-------
get 3 lions england tone reply lionm 4 mono or lionp 4 poly 4 more go 2 wwwringtonescouk the original n best tones 3gbp network operator rates apply
-------


In [40]:
for t in spam_df[spam_df.spam == False].iloc[:5].text:
    print(t)
    print('-------')

chinatown got porridge claypot rice yam cake fishhead beehoon either we eat cheap den go cafe n tok or go nydc or somethin
-------
that day you asked about anand number why
-------
ooooooh i forgot to tell u i can get on yoville on my phone
-------
i reach home safe n sound liao
-------
hello hows you and how did saturday go i was just texting to see if youd decided to do anything tomo not that im trying to invite myself or anything
-------


In [41]:
#get training set
train_spam_df = spam_df.iloc[:int(len(spam_df)*0.7)]

#get testing set
test_spam_df = spam_df.iloc[int(len(spam_df)*0.7):]

In [43]:
train_spam_df

Unnamed: 0,spam,text
2183,False,chinatown got porridge claypot rice yam cake f...
2328,False,that day you asked about anand number why
3417,True,life has never been this much fun and great un...
5392,False,ooooooh i forgot to tell u i can get on yovill...
2283,False,i reach home safe n sound liao
...,...,...
3355,False,ok not a problem will get them a taxi c ing t...
1573,True,sms auction a brand new nokia 7250 is up 4 au...
1996,False,lol boo i was hoping for a laugh
1063,True,we have new local dates in your area lots of ...


In [42]:
FRAC_SPAM_TEXTS = train_spam_df.spam.mean()
print(FRAC_SPAM_TEXTS)

0.1341025641025641


# Create Spam Bag of Words and Non-Spam Bag of Words

In [24]:
spam_text = ' '.join(train_spam_df[train_spam_df.spam == True].text).split(' ')
len(spam_text)

12152

In [25]:
#get all words from spam and non-spam datasets
train_spam_words = ' '.join(train_spam_df[train_spam_df.spam == True].text).split(' ')
train_non_spam_words = ' '.join(train_spam_df[train_spam_df.spam == False].text).split(' ')

common_words = set(train_spam_words).intersection(set(train_non_spam_words))
len(common_words)

858

In [26]:
train_spam_bow = dict()
for w in common_words:
    train_spam_bow[w] = train_spam_words.count(w) / len(train_spam_words)

In [27]:
train_non_spam_bow = dict()
for w in common_words:
    train_non_spam_bow[w] = train_non_spam_words.count(w) / len(train_non_spam_words)

# Predict on Test Set

### $ P(\text{SPAM} | \text{"urgent please call this number"}) $
### $\propto P(\text{"urgent please call this number"} | \text{SPAM}) \times P(\text{SPAM}) $
### $= P(\text{"urgent"} | \text{SPAM}) \times P(\text{"please"} | \text{SPAM}) \times \dots \times P(\text{SPAM})$

### Due to numerical issues, equivalently  compute:

### $log(P(\text{"urgent"} | \text{SPAM}) \times P(\text{"please"} | \text{SPAM}) \times \dots \times P(\text{SPAM}))$
### $ = log(P(\text{"urgent"} | \text{SPAM})) + log(P(\text{"please"} | \text{SPAM})) + \dots + log(P(\text{SPAM}))$

In [44]:
def predict_text(t, verbose=False):
    #if some word doesnt appear in either spam or non-spam BOW, disregard it
    valid_words = [w for w in t if w in train_spam_bow]
    
    #get the probabilities of each valid word showing up in spam and non-spam BOW
    spam_probs = [train_spam_bow[w] for w in valid_words]
    non_spam_probs = [train_non_spam_bow[w] for w in valid_words]
    
    #print probs if requested
    if verbose:
        data_df = pd.DataFrame()
        data_df['word'] = valid_words
        data_df['spam_prob'] = spam_probs
        data_df['non_spam_prob'] = non_spam_probs
        data_df['ratio'] = [s/n if n > 0 else np.inf for s,n in zip(spam_probs, non_spam_probs)]
        print(data_df)
     
    #calculate spam score as sum of logs for all probabilities
    spam_score = sum([np.log(p) for p in spam_probs]) + np.log(FRAC_SPAM_TEXTS)
    
    #calculate non-spam score as sum of logs for all probabilities
    non_spam_score = sum([np.log(p) for p in non_spam_probs]) + np.log(1-FRAC_SPAM_TEXTS)
    
    #if verbose, report the two scores
    if verbose:
        print('Spam Score: %s'%spam_score)
        print('Non-Spam Score: %s'%non_spam_score)
   
    #if spam score is higher, mark this as spam
    return (spam_score >= non_spam_score)

In [45]:
predict_text('urgent call this number'.split(), verbose=True)

     word  spam_prob  non_spam_prob       ratio
0  urgent   0.003621       0.000020  176.847268
1    call   0.019256       0.003337    5.769975
2    this   0.004855       0.003747    1.295826
3  number   0.001399       0.001003    1.394436
Spam Score: -23.479885007917353
Non-Spam Score: -29.134315669682003


True

In [46]:
predict_text('hey do you want to go a movie tonight'.split(), verbose=True)

      word  spam_prob  non_spam_prob     ratio
0      hey   0.000411       0.001433  0.287090
1       do   0.000987       0.005733  0.172254
2      you   0.014730       0.027517  0.535303
3     want   0.001481       0.002539  0.583440
4       to   0.038759       0.022788  1.700871
5       go   0.001646       0.003706  0.444117
6        a   0.022136       0.015130  1.463031
7    movie   0.000165       0.000348  0.472854
8  tonight   0.000165       0.000839  0.196061
Spam Score: -58.35269353228213
Non-Spam Score: -50.03873506241979


False

In [47]:
predict_text('offer for unlimited money call now'.split(), verbose=True)

        word  spam_prob  non_spam_prob      ratio
0      offer   0.001481       0.000102  14.469322
1        for   0.010780       0.007289   1.478996
2  unlimited   0.000576       0.000082   7.033698
3      money   0.000247       0.000717   0.344508
4       call   0.019256       0.003337   5.769975
5        now   0.010286       0.004218   2.438869
Spam Score: -37.34691826222573
Non-Spam Score: -42.07442664080335


True

In [48]:
predict_text('are you at class yet'.split(), verbose=True)

  word  spam_prob  non_spam_prob     ratio
0  are   0.003703       0.006019  0.615192
1  you   0.014730       0.027517  0.535303
2   at   0.001564       0.005344  0.292590
3  yet   0.000165       0.000594  0.277190
Spam Score: -26.99851208981779
Non-Spam Score: -21.51057079570987


False

In [49]:
test_spam_df

Unnamed: 0,spam,text
2221,False,i notice you like looking in the shit mirror y...
3065,True,get a brand new mobile phone by being an agent...
5070,False,as in different styles
4708,False,haha awesome i might need to take you up on th...
4539,False,urgh coach hot smells of chip fat thanks again...
...,...,...
1510,False,peteis this your phone still its jenny from co...
2230,False,hey thk we juz go accordin to wat we discussed...
3916,False,eh ur laptop got no stock lei he say mon muz c...
306,False,jos ask if u wana meet up


In [51]:
predictions = test_spam_df.text.apply(lambda t: predict_text(t.split()))
predictions

2221    False
3065     True
5070    False
4708    False
4539     True
        ...  
1510    False
2230    False
3916    False
306     False
1942    False
Name: text, Length: 1672, dtype: bool

# Metric 1: Fraction Spam Correctly Detected (True positive rate / Recall) 

In [62]:
np.sum((predictions == True) & (test_spam_df.spam == True))

203

In [66]:
 np.sum(test_spam_df.spam == True)

224

In [56]:
frac_spam_messages_correctly_detected = np.sum((predictions == True) & (test_spam_df.spam == True)) / np.sum(test_spam_df.spam == True)
print('Fraction Spam Correctly Detected: %s'%frac_spam_messages_correctly_detected)

Fraction Spam Correctly Detected: 0.90625


### Alternative code to calculate Fraction Spam Correctly Detected

In [52]:
pred_true = (predictions == True)
pred_true

2221    False
3065     True
5070    False
4708    False
4539     True
        ...  
1510    False
2230    False
3916    False
306     False
1942    False
Name: text, Length: 1672, dtype: bool

In [54]:
test_spam_true = (test_spam_df.spam == True)
test_spam_true
combined_pred_test = np.logical_and(pred_true, test_spam_true)
combined_pred_test

2221    False
3065     True
5070    False
4708    False
4539    False
        ...  
1510    False
2230    False
3916    False
306     False
1942    False
Length: 1672, dtype: bool

In [55]:
np.sum(combined_pred_test)

203

In [56]:
frac_spam_messages_correctly_detected_2 = np.sum(np.logical_and(predictions, test_spam_df.spam)) / np.sum(test_spam_df.spam == True)
print('Fraction Spam Correctly Detected: %s'%frac_spam_messages_correctly_detected_2)

Fraction Spam Correctly Detected: 0.8844444444444445


# Metric 2: Fraction of valid messages sent to spam (False positive rate)

In [61]:
frac_valid_sent_to_spam = np.sum((predictions == True) & (test_spam_df.spam == False)) / np.sum(test_spam_df.spam == False)
print('Fraction Valid Messages Sent to Spam: %s'%frac_valid_sent_to_spam)

Fraction Valid Messages Sent to Spam: 0.020027624309392266
