In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string

# Read Data

In [2]:
#read dataset
spam_df = pd.read_csv('spam.csv', encoding="ISO-8859-1")

#subset and rename columns
spam_df = spam_df[['v1', 'v2']]
spam_df.rename(columns={'v1': 'spam', 'v2': 'text'}, inplace=True)

#convert spam column to binary
spam_df.spam = spam_df.spam.apply(lambda s: True if s=='spam' else False)

#lowercase everything and remove punctuation
spam_df.text = spam_df.text.apply(lambda t: t.lower().translate(str.maketrans('', '', string.punctuation)))

#shuffle
spam_df = spam_df.sample(frac=1)

In [3]:
spam_df

Unnamed: 0,spam,text
3781,False,how r ì going to send it to me
3668,False,yeah imma come over cause jay wants to do some...
769,False,lol i know theyre so dramatic schools already ...
3814,False,not yet chikkuk then wat abt tht guy did he st...
3110,False,short but cute be a good person
...,...,...
2827,False,wifehow she knew the time of murder exactly
3714,False,i am lateso call you tomorrow morningtake care...
1186,False,goodmorning today i am late for ltgt min
2141,False,alright took the morphine back in yo


In [4]:
for t in spam_df[spam_df.spam == True].iloc[:5].text:
    print(t)
    print('-------')

do you want a new video handset 750 anytime any network mins half price line rental camcorder reply or call 08000930705 for delivery tomorrow
-------
yes the only place in town to meet exciting adult singles is now in the uk txt chat to 86688 now 150pmsg
-------
08714712388 between 10am7pm cost 10p
-------
urgent this is our 2nd attempt to contact u your å£900 prize from yesterday is still awaiting collection to claim call now 09061702893
-------
married local women looking for discreet action now 5 real matches instantly to your phone text match to 69969 msg cost 150p 2 stop txt stop bcmsfwc1n3xx
-------


In [5]:
for t in spam_df[spam_df.spam == False].iloc[:5].text:
    print(t)
    print('-------')

how r ì going to send it to me
-------
yeah imma come over cause jay wants to do some drugs
-------
lol i know theyre so dramatic schools already closed for tomorrow apparently we cant drive in the inch of snow were supposed to get
-------
not yet chikkuk then wat abt tht guy did he stopped irritating or msging to u
-------
short but cute   be a good person 
-------


In [6]:
#get training set
train_spam_df = spam_df.iloc[:int(len(spam_df)*0.7)]

#get testing set
test_spam_df = spam_df.iloc[int(len(spam_df)*0.7):]

In [7]:
FRAC_SPAM_TEXTS = train_spam_df.spam.mean()
print(FRAC_SPAM_TEXTS)

0.13384615384615384


# Create Spam Bag of Words and Non-Spam Bag of Words

In [29]:
spam_text = ' '.join(train_spam_df[train_spam_df.spam == True].text).split(' ')
len(spam_text)

12336

In [30]:
#get all words from spam and non-spam datasets
train_spam_words = ' '.join(train_spam_df[train_spam_df.spam == True].text).split(' ')
train_non_spam_words = ' '.join(train_spam_df[train_spam_df.spam == False].text).split(' ')

common_words = set(train_spam_words).intersection(set(train_non_spam_words))
len(common_words)

837

In [31]:
train_spam_bow = dict()
for w in common_words:
    train_spam_bow[w] = train_spam_words.count(w) / len(train_spam_words)

In [32]:
train_non_spam_bow = dict()
for w in common_words:
    train_non_spam_bow[w] = train_non_spam_words.count(w) / len(train_non_spam_words)

# Predict on Test Set

# $ P(\text{SPAM} | \text{"urgent please call this number"}) $
# $\propto P(\text{"urgent please call this number"} | \text{SPAM}) \times P(\text{SPAM}) $
# $= P(\text{"urgent"} | \text{SPAM}) \times P(\text{"please"} | \text{SPAM}) \times \dots \times P(\text{SPAM})$

# Due to numerical issues, equivalently  compute:

# $log(P(\text{"urgent"} | \text{SPAM}) \times P(\text{"please"} | \text{SPAM}) \times \dots \times P(\text{SPAM}))$
# $ = log(P(\text{"urgent"} | \text{SPAM})) + log(P(\text{"please"} | \text{SPAM})) + \dots + log(P(\text{SPAM}))$

In [27]:
def predict_text(t, verbose=False):
    #if some word doesnt appear in either spam or non-spam BOW, disregard it
    valid_words = [w for w in t if w in train_spam_bow]
    
    #get the probabilities of each valid word showing up in spam and non-spam BOW
    spam_probs = [train_spam_bow[w] for w in valid_words]
    non_spam_probs = [train_non_spam_bow[w] for w in valid_words]
    
    #print probs if requested
    if verbose:
        data_df = pd.DataFrame()
        data_df['word'] = valid_words
        data_df['spam_prob'] = spam_probs
        data_df['non_spam_prob'] = non_spam_probs
        data_df['ratio'] = [s/n if n > 0 else np.inf for s,n in zip(spam_probs, non_spam_probs)]
        print(data_df)
     
    #calculate spam score as sum of logs for all probabilities
    spam_score = sum([np.log(p) for p in spam_probs]) + np.log(FRAC_SPAM_TEXTS)
    
    #calculate non-spam score as sum of logs for all probabilities
    non_spam_score = sum([np.log(p) for p in non_spam_probs]) + np.log(1-FRAC_SPAM_TEXTS)
    
    #if verbose, report the two scores
    if verbose:
        print('Spam Score: %s'%spam_score)
        print('Non-Spam Score: %s'%non_spam_score)
   
    #if spam score is higher, mark this as spam
    return (spam_score >= non_spam_score)

In [12]:
predict_text('urgent call this number'.split(), verbose=True)

     word  spam_prob  non_spam_prob       ratio
0  urgent   0.003879       0.000021  188.634600
1    call   0.018929       0.003311    5.717620
2    this   0.005275       0.003537    1.491529
3  number   0.001629       0.001049    1.553461
Spam Score: -23.16448028206801
Non-Spam Score: -29.15569965721826


True

In [13]:
predict_text('hey do you want to go a movie tonight'.split(), verbose=True)

      word  spam_prob  non_spam_prob     ratio
0      hey   0.000310       0.001522  0.203929
1       do   0.001241       0.005223  0.237650
2      you   0.016447       0.025992  0.632762
3     want   0.001474       0.002365  0.623314
4       to   0.039488       0.022311  1.769862
5       go   0.001474       0.003619  0.407279
6        a   0.021567       0.014950  1.442653
7    movie   0.000155       0.000288  0.538956
8  tonight   0.000078       0.000864  0.089826
Spam Score: -59.20117350818938
Non-Spam Score: -50.42256289899204


False

In [14]:
predict_text('offer for unlimited money call now'.split(), verbose=True)

        word  spam_prob  non_spam_prob      ratio
0      offer   0.001552       0.000144  10.779120
1        for   0.010939       0.007197   1.519856
2  unlimited   0.000698       0.000062  11.318076
3      money   0.000078       0.000802   0.096736
4       call   0.018929       0.003311   5.717620
5        now   0.010784       0.003989   2.703114
Spam Score: -38.192756947863074
Non-Spam Score: -41.98513617615185


True

In [15]:
predict_text('are you at class yet'.split(), verbose=True)

    word  spam_prob  non_spam_prob     ratio
0    are   0.004422       0.006005  0.736450
1    you   0.016447       0.025992  0.632762
2     at   0.001396       0.005778  0.241667
3  class   0.000155       0.000658  0.235793
4    yet   0.000078       0.000576  0.134739
Spam Score: -36.318752270659445
Non-Spam Score: -28.85333457585457


False

In [47]:
test_spam_df

Unnamed: 0,spam,text
1863,False,ill let you know when it kicks in
85,False,i call you later dont have network if urgnt sm...
4498,True,sms services for your inclusive text credits p...
4458,True,welcome to ukmobiledate this msg is free givin...
1639,True,freemsgfeelin kinda lnly hope u like 2 keep me...
...,...,...
2827,False,wifehow she knew the time of murder exactly
3714,False,i am lateso call you tomorrow morningtake care...
1186,False,goodmorning today i am late for ltgt min
2141,False,alright took the morphine back in yo


In [33]:
predictions = test_spam_df.text.apply(lambda t: predict_text(t.split()))
predictions

1863    False
85      False
4498     True
4458     True
1639    False
        ...  
2827    False
3714    False
1186    False
2141    False
4088    False
Name: text, Length: 1672, dtype: bool

In [40]:
frac_spam_messages_correctly_detected = np.sum((predictions == True) & (test_spam_df.spam == True)) / np.sum(test_spam_df.spam == True)
print('Fraction Spam Correctly Detected: %s'%frac_spam_messages_correctly_detected)

Fraction Spam Correctly Detected: 0.8844444444444445


In [62]:
np.sum((predictions == True) & (test_spam_df.spam == True))

199

In [51]:
pred_true = (predictions == True)
pred_true
test_spam_true = (test_spam_df.spam == True)
test_spam_true
combined_pred_test = np.logical_and(pred_true, test_spam_true)
np.sum(combined_pred_test)

199

In [56]:
frac_spam_messages_correctly_detected_2 = np.sum(np.logical_and(predictions, test_spam_df.spam)) / np.sum(test_spam_df.spam == True)
print('Fraction Spam Correctly Detected: %s'%frac_spam_messages_correctly_detected_2)

Fraction Spam Correctly Detected: 0.8844444444444445


In [43]:
temp = np.sum((predictions == True) & (test_spam_df.spam == True))
temp

199

In [45]:
np.sum(predictions == True)

234

In [46]:
np.sum(test_spam_df.spam == True)

225

In [18]:
frac_valid_sent_to_spam = np.sum((predictions == True) & (test_spam_df.spam == False)) / np.sum(test_spam_df.spam == False)
print('Fraction Valid Messages Sent to Spam: %s'%frac_valid_sent_to_spam)

Fraction Valid Messages Sent to Spam: 0.02323991797676008


In [59]:
test_spam_df.spam

1863    False
85      False
4498     True
4458     True
1639     True
        ...  
2827    False
3714    False
1186    False
2141    False
4088    False
Name: spam, Length: 1672, dtype: bool

In [60]:
(test_spam_df.spam == False)

1863     True
85       True
4498    False
4458    False
1639    False
        ...  
2827     True
3714     True
1186     True
2141     True
4088     True
Name: spam, Length: 1672, dtype: bool

In [58]:
pred_true = (predictions == True)

test_spam_false = (test_spam_df.spam == False)
combined_pred_test_2 = np.logical_and(pred_true, test_spam_false)
qbools = pd.DataFrame()
qbools['pred_true'] = pred_true
qbools['test_spam_false'] = test_spam_false
qbools['and_result'] = combined_pred_test_2
qbools.head(10)



Unnamed: 0,pred_true,test_spam_false,and_result
1863,False,True,False
85,False,True,False
4498,True,False,False
4458,True,False,False
1639,False,False,False
3021,False,True,False
4436,False,True,False
3734,False,True,False
5519,False,True,False
1954,False,True,False


In [None]:
frac_valid_sent_to_spam_2 = np.sum(np.logical_and(predictions, test_spam_df.spam)) 