In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string

In [2]:
#Read dataset
spam_df = pd.read_csv('spam.csv', encoding="ISO-8859-1")
spam_df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
#subset and rename columns
spam_df = spam_df[['v1', 'v2']]
spam_df.rename(columns={'v1': 'spam', 'v2': 'text'}, inplace=True)

#convert spam column to binary
spam_df.spam = spam_df.spam.apply(lambda s: True if s=='spam' else False)

#lowercase everything and remove punctuation

spam_df.text = spam_df.text.apply(lambda t: t.lower())

for punctuation in string.punctuation:
    spam_df.text = spam_df.text.apply(lambda t: t.replace(punctuation, '')) 


#shuffle
spam_df = spam_df.sample(frac=1)

In [4]:
spam_df

Unnamed: 0,spam,text
4355,False,great so should i send you my account number
3838,False,erm û ill pick you up at about 645pm thatll g...
4049,False,ok ok take care i can understand
4320,False,k jason says hes gonna be around so ill be up ...
3660,False,well then you have a great weekend
...,...,...
2434,False,uncle boye i need movies oh guide me plus you ...
3356,False,brainless baby dolld vehicle sariyag drive mad...
1564,False,tmrw im finishing 9 doors
3974,False,do u think that any girl will propose u today ...


In [5]:
for t in spam_df[spam_df.spam == True].iloc[:5].text:
    print(t)
    print('-------')

burger king  wanna play footy at a top stadium get 2 burger king before 1st sept and go large or super with cocacola and walk out a winner
-------
back 2 work 2morro half term over can u c me 2nite 4 some sexy passion b4 i have 2 go back chat now 09099726481 luv dena calls å£1minmobsmorelkpobox177hp51fl
-------
jamster to get your free wallpaper text heart to 88888 now tc apply 16 only need help call 08701213186
-------
fancy a shag i dointerested sextextukcom txt xxuk suzy to 69876 txts cost 150 per msg tncs on website x
-------
u 447801259231 have a secret admirer who is looking 2 make contact with ufind out who they rreveal who thinks ur so specialcall on 09058094597
-------


In [6]:
for t in spam_df[spam_df.spam == False].iloc[:5].text:
    print(t)
    print('-------')

great so should i send you my account number
-------
erm û ill pick you up at about 645pm thatll give enough time to get there park and that
-------
ok ok take care i can understand
-------
k jason says hes gonna be around so ill be up there around  ltgt
-------
well then you have a great weekend
-------


In [7]:
#get training set
train_spam_df = spam_df.iloc[:int(len(spam_df)*0.7)]

#get testing set
test_spam_df = spam_df.iloc[int(len(spam_df)*0.7):]

In [8]:
train_spam_df

Unnamed: 0,spam,text
4355,False,great so should i send you my account number
3838,False,erm û ill pick you up at about 645pm thatll g...
4049,False,ok ok take care i can understand
4320,False,k jason says hes gonna be around so ill be up ...
3660,False,well then you have a great weekend
...,...,...
3045,False,so hows the weather over there
51,False,a gram usually runs like ltgt a half eighth ...
4170,False,pls whats the full name of jokes school cos fe...
988,False,yun ahthe ubi one say if ì wan call by tomorro...


In [9]:
FRAC_SPAM_TEXTS = train_spam_df.spam.mean()
print(FRAC_SPAM_TEXTS)

0.13256410256410256


# Create Spam Bag of Words and Non-Spam Bag of Words

In [10]:
spam_text = ' '.join(train_spam_df[train_spam_df.spam == True].text).split(' ')
len(spam_text)

12405

In [11]:
#get all words from spam and non-spam datasets
train_spam_words = ' '.join(train_spam_df[train_spam_df.spam == True].text).split(' ')
train_non_spam_words = ' '.join(train_spam_df[train_spam_df.spam == False].text).split(' ')

common_words = set(train_spam_words).intersection(set(train_non_spam_words))
len(common_words)

855

In [12]:
train_spam_bow = dict()
for w in common_words:
    train_spam_bow[w] = train_spam_words.count(w) / len(train_spam_words)

In [13]:
train_non_spam_bow = dict()
for w in common_words:
    train_non_spam_bow[w] = train_non_spam_words.count(w) / len(train_non_spam_words)

# Predict on Test Set

### $ P(\text{SPAM} | \text{"urgent please call this number"}) $
### $\propto P(\text{"urgent please call this number"} | \text{SPAM}) \times P(\text{SPAM}) $
### $= P(\text{"urgent"} | \text{SPAM}) \times P(\text{"please"} | \text{SPAM}) \times \dots \times P(\text{SPAM})$

### Due to numerical issues, equivalently  compute:

### $log(P(\text{"urgent"} | \text{SPAM}) \times P(\text{"please"} | \text{SPAM}) \times \dots \times P(\text{SPAM}))$
### $ = log(P(\text{"urgent"} | \text{SPAM})) + log(P(\text{"please"} | \text{SPAM})) + \dots + log(P(\text{SPAM}))$

In [14]:
def predict_text(t, verbose=False):
    #if some word doesnt appear in either spam or non-spam BOW, disregard it
    valid_words = [w for w in t if w in train_spam_bow]
    
    #get the probabilities of each valid word showing up in spam and non-spam BOW
    spam_probs = [train_spam_bow[w] for w in valid_words]
    non_spam_probs = [train_non_spam_bow[w] for w in valid_words]
    
    #print probs if requested
    if verbose:
        data_df = pd.DataFrame()
        data_df['word'] = valid_words
        data_df['spam_prob'] = spam_probs
        data_df['non_spam_prob'] = non_spam_probs
        data_df['ratio'] = [s/n if n > 0 else np.inf for s,n in zip(spam_probs, non_spam_probs)]
        print(data_df)
     
    #calculate spam score as sum of logs for all probabilities
    spam_score = sum([np.log(p) for p in spam_probs]) + np.log(FRAC_SPAM_TEXTS)
    
    #calculate non-spam score as sum of logs for all probabilities
    non_spam_score = sum([np.log(p) for p in non_spam_probs]) + np.log(1-FRAC_SPAM_TEXTS)
    
    #if verbose, report the two scores
    if verbose:
        print('Spam Score: %s'%spam_score)
        print('Non-Spam Score: %s'%non_spam_score)
   
    #if spam score is higher, mark this as spam
    return (spam_score >= non_spam_score)

In [15]:
predict_text('urgent call this number'.split(), verbose=True)

     word  spam_prob  non_spam_prob       ratio
0  urgent   0.002821       0.000020  138.645707
1    call   0.018944       0.003419    5.541112
2    this   0.005079       0.003460    1.468013
3  number   0.001370       0.000895    1.530505
Spam Score: -23.732826896031593
Non-Spam Score: -29.307976197265926


True

In [16]:
predict_text('hey do you want to go a movie tonight'.split(), verbose=True)

      word  spam_prob  non_spam_prob     ratio
0      hey   0.000403       0.001465  0.275091
1       do   0.001612       0.005454  0.295620
2      you   0.015558       0.026842  0.579630
3     want   0.002096       0.002442  0.858283
4       to   0.038452       0.022711  1.693139
5       go   0.001612       0.003867  0.416980
6        a   0.020476       0.015181  1.348756
7    movie   0.000081       0.000305  0.264087
8  tonight   0.000161       0.000875  0.184247
Spam Score: -58.33369589780714
Non-Spam Score: -50.175791350886016


False

In [17]:
predict_text('offer for unlimited money call now'.split(), verbose=True)

        word  spam_prob  non_spam_prob      ratio
0      offer   0.001290       0.000102  12.676179
1        for   0.011366       0.007468   1.521919
2  unlimited   0.000403       0.000061   6.602177
3      money   0.000161       0.000794   0.203144
4       call   0.018944       0.003419   5.541112
5        now   0.011205       0.004009   2.795033
Spam Score: -38.157825181477875
Non-Spam Score: -42.27264413636038


True

In [18]:
predict_text('are you at class yet'.split(), verbose=True)

  word  spam_prob  non_spam_prob     ratio
0  are   0.003950       0.006044  0.653549
1  you   0.015558       0.026842  0.579630
2   at   0.001532       0.005596  0.273690
3  yet   0.000161       0.000773  0.208490
Spam Score: -26.93201189537102
Non-Spam Score: -21.21920969872474


False

In [19]:
test_spam_df

Unnamed: 0,spam,text
4509,False,this weekend is fine an excuse not to do too m...
798,False,ok i msg u b4 i leave my house
122,True,todays voda numbers ending 7548 are selected t...
5563,False,ard 6 like dat lor
4563,False,tell me again what your address is
...,...,...
2434,False,uncle boye i need movies oh guide me plus you ...
3356,False,brainless baby dolld vehicle sariyag drive mad...
1564,False,tmrw im finishing 9 doors
3974,False,do u think that any girl will propose u today ...


In [20]:
predictions = test_spam_df.text.apply(lambda t: predict_text(t.split()))
predictions

4509    False
798     False
122      True
5563    False
4563    False
        ...  
2434    False
3356    False
1564    False
3974    False
4471     True
Name: text, Length: 1672, dtype: bool

# Metric 1: Fraction Spam Correctly Detected (True positive rate / Recall) 

In [21]:
np.sum((predictions == True) & (test_spam_df.spam == True))

211

In [22]:
 np.sum(test_spam_df.spam == True)

230

In [23]:
frac_spam_messages_correctly_detected = np.sum((predictions == True) & (test_spam_df.spam == True)) / np.sum(test_spam_df.spam == True)
print('Fraction Spam Correctly Detected: %s'%frac_spam_messages_correctly_detected)

Fraction Spam Correctly Detected: 0.9173913043478261


### Alternative code to calculate Fraction Spam Correctly Detected

In [24]:
pred_true = (predictions == True)
pred_true

4509    False
798     False
122      True
5563    False
4563    False
        ...  
2434    False
3356    False
1564    False
3974    False
4471     True
Name: text, Length: 1672, dtype: bool

In [25]:
test_spam_true = (test_spam_df.spam == True)
test_spam_true
combined_pred_test = np.logical_and(pred_true, test_spam_true)
combined_pred_test

4509    False
798     False
122      True
5563    False
4563    False
        ...  
2434    False
3356    False
1564    False
3974    False
4471     True
Length: 1672, dtype: bool

In [26]:
np.sum(combined_pred_test)

211

In [27]:
frac_spam_messages_correctly_detected_2 = np.sum(np.logical_and(predictions, test_spam_df.spam)) / np.sum(test_spam_df.spam == True)
print('Fraction Spam Correctly Detected: %s'%frac_spam_messages_correctly_detected_2)

Fraction Spam Correctly Detected: 0.9173913043478261


# Metric 2: Fraction of valid messages sent to spam (False positive rate)

In [28]:
frac_valid_sent_to_spam = np.sum((predictions == True) & (test_spam_df.spam == False)) / np.sum(test_spam_df.spam == False)
print('Fraction Valid Messages Sent to Spam: %s'%frac_valid_sent_to_spam)

Fraction Valid Messages Sent to Spam: 0.027045769764216365
