In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string

### Read the data

In [2]:
#read dataset
spam_df = pd.read_csv('archive/spam.csv', encoding="ISO-8859-1")

#subset and rename columns
spam_df = spam_df[['v1', 'v2']]
spam_df.rename(columns={'v1': 'spam', 'v2': 'text'}, inplace=True)

#convert spam column to binary
spam_df.spam = spam_df.spam.apply(lambda s: True if s=='spam' else False)

#lowercase everything and remove punctuation
spam_df.text = spam_df.text.apply(lambda t: t.lower().translate(str.maketrans('', '', string.punctuation)))

#shuffle
spam_df = spam_df.sample(frac=1)

In [8]:
spam_df.describe

<bound method NDFrame.describe of        spam                                               text
5415  False  you should get more chicken broth if you want ...
5006  False                         oh k  i will come tomorrow
393   False  yes i think so i am in office but my lap is in...
4343  False     hidid you asked to waheeda fathima about leave
5412  False  gr8 so how do you handle the victoria island t...
...     ...                                                ...
4246   True  text pass to 69669 to collect your polyphonic ...
228   False  life is more strict than teacher bcoz teacher ...
1400  False  no i decided that only people who care about s...
4712  False  big brotherû÷s really scraped the barrel with...
5301  False                at what time should i come tomorrow

[5572 rows x 2 columns]>

In [6]:
# Get a glimpse on the spam like texts
for t in spam_df[spam_df.spam == True].iloc[:5].text:
    print(t)
    print('-------')

someone has contacted our dating service and entered your phone because they fancy you to find out who it is call from a landline 09111032124  pobox12n146tf150p
-------
cmon babe make me horny turn me on txt me your fantasy now babe  im hot sticky and need you now all replies cost å£150 2 cancel send stop
-------
great new offer  double mins  double txt on best orange tariffs and get latest camera phones 4 free call mobileupd8 free on 08000839402 now or 2stoptxt tcs
-------
how come it takes so little time for a child who is afraid of the dark to become a teenager who wants to stay out all night
-------
thanks 4 your continued support your question this week will enter u in2 our draw 4 å£100 cash name the new us president txt ans to 80082
-------


In [9]:
# Get a glimpse on the not spam like texts
for t in spam_df[spam_df.spam == False].iloc[:5].text:
    print(t)
    print('-------')

you should get more chicken broth if you want ramen unless theres some i dont know about
-------
oh k  i will come tomorrow
-------
yes i think so i am in office but my lap is in room i think thats on for the last few days i didnt shut that down
-------
hidid you asked to waheeda fathima about leave
-------
gr8 so how do you handle the victoria island traffic plus whens the album due
-------


In [11]:
#get training set
train_spam_df = spam_df.iloc[:int(len(spam_df)*0.7)]

#get testing set
test_spam_df = spam_df.iloc[int(len(spam_df)*0.7):]

In [12]:
FRAC_SPAM_TEXTS = train_spam_df.spam.mean()
print(FRAC_SPAM_TEXTS)

0.13307692307692306


# Create Spam Bag of Words and Non-Spam Bag of Words

In [29]:
#get all words from spam and non-spam datasets
train_spam_words = ' '.join(train_spam_df[train_spam_df.spam == True].text).split(' ')
train_non_spam_words = ' '.join(train_spam_df[train_spam_df.spam == False].text).split(' ')
# Why do we focus on the intersection :
# If a word occurs in a spam text and not in a non spam message, it's a little bit difficult
# for us to say whether it implies that it's a spam/non spam email
common_words = set(train_spam_words).intersection(set(train_non_spam_words))

In [16]:
train_spam_bow = dict()
for w in common_words:
    train_spam_bow[w] = train_spam_words.count(w) / len(train_spam_words)

In [18]:
train_non_spam_bow = dict()
for w in common_words:
    train_non_spam_bow[w] = train_non_spam_words.count(w) / len(train_non_spam_words)

# Predict the class of the email using the BOW representation

In [21]:
def predict_text(t, verbose=False):
    #if some word doesnt appear in either spam or non-spam BOW, disregard it
    valid_words = [w for w in t if w in train_spam_bow]
    
    #get the probabilities of each valid word showing up in spam and non-spam BOW
    spam_probs = [train_spam_bow[w] for w in valid_words]
    non_spam_probs = [train_non_spam_bow[w] for w in valid_words]
    
    #print probs if requested
    if verbose:
        data_df = pd.DataFrame()
        data_df['word'] = valid_words
        data_df['spam_prob'] = spam_probs
        data_df['non_spam_prob'] = non_spam_probs
        data_df['ratio'] = [s/n if n > 0 else np.inf for s,n in zip(spam_probs, non_spam_probs)]
        print(data_df)
     
    #calculate spam score as sum of logs for all probabilities
    spam_score = sum([np.log(p) for p in spam_probs]) + np.log(FRAC_SPAM_TEXTS)
    
    #calculate non-spam score as sum of logs for all probabilities
    non_spam_score = sum([np.log(p) for p in non_spam_probs]) + np.log(1-FRAC_SPAM_TEXTS)
    
    #if verbose, report the two scores
    if verbose:
        print('Spam Score: %s'%spam_score)
        print('Non-Spam Score: %s'%non_spam_score)
   
    #if spam score is higher, mark this as spam
    return (spam_score >= non_spam_score)

In [22]:
predict_text('urgent call this number'.split(), verbose=True)

     word  spam_prob  non_spam_prob      ratio
0  urgent   0.003806       0.000041  91.920202
1    call   0.020324       0.002981   6.817958
2    this   0.005263       0.003540   1.486827
3  number   0.001862       0.000973   1.914137
Spam Score: -23.016991290509715
Non-Spam Score: -28.62936041776485


True

In [23]:
predict_text('hey do you want to go a movie tonight'.split(), verbose=True)

      word  spam_prob  non_spam_prob     ratio
0      hey   0.000243       0.001532  0.158574
1       do   0.001377       0.005506  0.249983
2      you   0.017166       0.026684  0.643319
3     want   0.001700       0.002339  0.726916
4       to   0.036275       0.022419  1.618053
5       go   0.001781       0.003581  0.497416
6        a   0.022348       0.015029  1.487016
7    movie   0.000081       0.000269  0.300884
8  tonight   0.000162       0.000766  0.211432
Spam Score: -58.9672119502868
Non-Spam Score: -50.53002048942636


False

In [24]:
predict_text('offer for unlimited money call now'.split(), verbose=True)

        word  spam_prob  non_spam_prob      ratio
0      offer   0.001377       0.000083  16.623866
1        for   0.011093       0.007328   1.513772
2  unlimited   0.000729       0.000041  17.601741
3      money   0.000081       0.000766   0.105716
4       call   0.020324       0.002981   6.817958
5        now   0.010121       0.004099   2.469380
Spam Score: -38.241110477633946
Non-Spam Score: -43.037058871736974


True

In [25]:
predict_text('are you at class yet'.split(), verbose=True)

    word  spam_prob  non_spam_prob     ratio
0    are   0.004534       0.005879  0.771281
1    you   0.017166       0.026684  0.643319
2     at   0.001457       0.005279  0.276106
3  class   0.000162       0.000497  0.325958
4    yet   0.000162       0.000642  0.252355
Spam Score: -35.46528057480925
Non-Spam Score: -29.10556299169752


False

In [26]:
predictions = test_spam_df.text.apply(lambda t: predict_text(t.split()))

In [27]:
frac_spam_messages_correctly_detected = np.sum((predictions == True) & (test_spam_df.spam == True)) / np.sum(test_spam_df.spam == True)
print('Fraction Spam Correctly Detected: %s'%frac_spam_messages_correctly_detected)

Fraction Spam Correctly Detected: 0.8903508771929824


In [28]:
frac_valid_sent_to_spam = np.sum((predictions == True) & (test_spam_df.spam == False)) / np.sum(test_spam_df.spam == False)
print('Fraction Valid Messages Sent to Spam: %s'%frac_valid_sent_to_spam)

Fraction Valid Messages Sent to Spam: 0.01592797783933518
