In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string

# Read Data

In [2]:
#read dataset
spam_df = pd.read_csv('spam.csv', encoding="ISO-8859-1")

#subset and rename columns
spam_df = spam_df[['v1', 'v2']]
spam_df.rename(columns={'v1': 'spam', 'v2': 'text'}, inplace=True)

#convert spam column to binary
spam_df.spam = spam_df.spam.apply(lambda s: True if s=='spam' else False)

#lowercase everything and remove punctuation
spam_df.text = spam_df.text.apply(lambda t: t.lower().translate(str.maketrans('', '', string.punctuation)))

#shuffle
spam_df = spam_df.sample(frac=1)

In [3]:
spam_df

Unnamed: 0,spam,text
5195,False,darren was saying dat if u meeting da ge den w...
5533,False,hey chief can you give me a bell when you get ...
1348,False,nothing much chillin at home any super bowl plan
3125,False,u coming 2 pick me
865,True,congratulations ur awarded either a yrs supply...
...,...,...
4351,False,hows the pain deary r u smiling
1898,False,wat would u like 4 ur birthday
4993,False,my drive can only be read i need to write
283,False,okie


In [4]:
for t in spam_df[spam_df.spam == True].iloc[:5].text:
    print(t)
    print('-------')

congratulations ur awarded either a yrs supply of cds from virgin records or a mystery gift guaranteed call 09061104283 tscs wwwsmsconet å£150pm approx 3mins
-------
please call our customer service representative on freephone 0808 145 4742 between 9am11pm as you have won a guaranteed å£1000 cash or å£5000 prize
-------
this msg is for your mobile content order it has been resent as previous attempt failed due to network error queries to customersqueriesnetvisionukcom
-------
urgent we are trying to contact u todays draw shows that you have won a å£800 prize guaranteed call 09050003091 from land line claim c52 valid 12hrs only
-------
sms ac sun0819 posts helloyou seem cool
-------


In [5]:
for t in spam_df[spam_df.spam == False].iloc[:5].text:
    print(t)
    print('-------')

darren was saying dat if u meeting da ge den we dun meet 4 dinner cos later u leave xy will feel awkward den u meet him 4 lunch lor
-------
hey chief can you give me a bell when you get this need to talk to you about this royal visit on the 1st june 
-------
nothing much chillin at home any super bowl plan
-------
u coming 2 pick me
-------
see i knew giving you a break a few times woul lead to you always wanting to miss curfew i was gonna gibe you til one but a midnight movie is not gonna get out til after 2 you need to come home you need to getsleep and if anything you need to b studdying ear training
-------


In [6]:
#get training set
train_spam_df = spam_df.iloc[:int(len(spam_df)*0.7)]

#get testing set
test_spam_df = spam_df.iloc[int(len(spam_df)*0.7):]

In [7]:
FRAC_SPAM_TEXTS = train_spam_df.spam.mean()
print(FRAC_SPAM_TEXTS)

0.13794871794871794


# Create Spam Bag of Words and Non-Spam Bag of Words

In [8]:
#get all words from spam and non-spam datasets
train_spam_words = ' '.join(train_spam_df[train_spam_df.spam == True].text).split(' ')
train_non_spam_words = ' '.join(train_spam_df[train_spam_df.spam == False].text).split(' ')

common_words = set(train_spam_words).intersection(set(train_non_spam_words))

In [9]:
train_spam_bow = dict()
for w in common_words:
    train_spam_bow[w] = train_spam_words.count(w) / len(train_spam_words)

In [10]:
train_non_spam_bow = dict()
for w in common_words:
    train_non_spam_bow[w] = train_non_spam_words.count(w) / len(train_non_spam_words)

# Predict on Test Set

# $ P(\text{SPAM} | \text{"urgent please call this number"}) $
# $\propto P(\text{"urgent please call this number"} | \text{SPAM}) \times P(\text{SPAM}) $
# $= P(\text{"urgent"} | \text{SPAM}) \times P(\text{"please"} | \text{SPAM}) \times \dots \times P(\text{SPAM})$

# Due to numerical issues, equivalently  compute:

# $log(P(\text{"urgent"} | \text{SPAM}) \times P(\text{"please"} | \text{SPAM}) \times \dots \times P(\text{SPAM}))$
# $ = log(P(\text{"urgent"} | \text{SPAM})) + log(P(\text{"please"} | \text{SPAM})) + \dots + log(P(\text{SPAM}))$

In [11]:
def predict_text(t, verbose=False):
    #if some word doesnt appear in either spam or non-spam BOW, disregard it
    valid_words = [w for w in t if w in train_spam_bow]
    
    #get the probabilities of each valid word showing up in spam and non-spam BOW
    spam_probs = [train_spam_bow[w] for w in valid_words]
    non_spam_probs = [train_non_spam_bow[w] for w in valid_words]
    
    #print probs if requested
    if verbose:
        data_df = pd.DataFrame()
        data_df['word'] = valid_words
        data_df['spam_prob'] = spam_probs
        data_df['non_spam_prob'] = non_spam_probs
        data_df['ratio'] = [s/n if n > 0 else np.inf for s,n in zip(spam_probs, non_spam_probs)]
        print(data_df)
     
    #calculate spam score as sum of logs for all probabilities
    spam_score = sum([np.log(p) for p in spam_probs]) + np.log(FRAC_SPAM_TEXTS)
    
    #calculate non-spam score as sum of logs for all probabilities
    non_spam_score = sum([np.log(p) for p in non_spam_probs]) + np.log(1-FRAC_SPAM_TEXTS)
    
    #if verbose, report the two scores
    if verbose:
        print('Spam Score: %s'%spam_score)
        print('Non-Spam Score: %s'%non_spam_score)
   
    #if spam score is higher, mark this as spam
    return (spam_score >= non_spam_score)

In [12]:
predict_text('urgent call this number'.split(), verbose=True)

     word  spam_prob  non_spam_prob       ratio
0  urgent   0.003879       0.000021  188.634600
1    call   0.018929       0.003311    5.717620
2    this   0.005275       0.003537    1.491529
3  number   0.001629       0.001049    1.553461
Spam Score: -23.16448028206801
Non-Spam Score: -29.15569965721826


True

In [13]:
predict_text('hey do you want to go a movie tonight'.split(), verbose=True)

      word  spam_prob  non_spam_prob     ratio
0      hey   0.000310       0.001522  0.203929
1       do   0.001241       0.005223  0.237650
2      you   0.016447       0.025992  0.632762
3     want   0.001474       0.002365  0.623314
4       to   0.039488       0.022311  1.769862
5       go   0.001474       0.003619  0.407279
6        a   0.021567       0.014950  1.442653
7    movie   0.000155       0.000288  0.538956
8  tonight   0.000078       0.000864  0.089826
Spam Score: -59.20117350818938
Non-Spam Score: -50.42256289899204


False

In [14]:
predict_text('offer for unlimited money call now'.split(), verbose=True)

        word  spam_prob  non_spam_prob      ratio
0      offer   0.001552       0.000144  10.779120
1        for   0.010939       0.007197   1.519856
2  unlimited   0.000698       0.000062  11.318076
3      money   0.000078       0.000802   0.096736
4       call   0.018929       0.003311   5.717620
5        now   0.010784       0.003989   2.703114
Spam Score: -38.192756947863074
Non-Spam Score: -41.98513617615185


True

In [15]:
predict_text('are you at class yet'.split(), verbose=True)

    word  spam_prob  non_spam_prob     ratio
0    are   0.004422       0.006005  0.736450
1    you   0.016447       0.025992  0.632762
2     at   0.001396       0.005778  0.241667
3  class   0.000155       0.000658  0.235793
4    yet   0.000078       0.000576  0.134739
Spam Score: -36.318752270659445
Non-Spam Score: -28.85333457585457


False

In [16]:
predictions = test_spam_df.text.apply(lambda t: predict_text(t.split()))

In [17]:
frac_spam_messages_correctly_detected = np.sum((predictions == True) & (test_spam_df.spam == True)) / np.sum(test_spam_df.spam == True)
print('Fraction Spam Correctly Detected: %s'%frac_spam_messages_correctly_detected)

Fraction Spam Correctly Detected: 0.9234449760765551


In [18]:
frac_valid_sent_to_spam = np.sum((predictions == True) & (test_spam_df.spam == False)) / np.sum(test_spam_df.spam == False)
print('Fraction Valid Messages Sent to Spam: %s'%frac_valid_sent_to_spam)

Fraction Valid Messages Sent to Spam: 0.02323991797676008
