In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
from sklearn import svm
import re
import nltk, nltk.stem.porter


In [26]:
def preProcess(email):
    email=email.lower() # lower the letter
    email=re.sub("<[^<>]+>"," ",email) #strip html tags replace with a space
    email=re.sub("[0-9]+","number",email) #replce numbers with "number"
    email=re.sub("(http|https)://[^s]*","httpaddr",email) # anything with http/https:// replace with "httpaddr"
    email=re.sub("[^\s]+@[^\s]+","emailadd",email)
    email=re.sub("[$]+","dollar",email)
    return email

In [5]:
def email2TokenList(raw_email):
    stemmer =nltk.stem.porter.PorterStemmer()
    email=preProcess( raw_email)
    tokens = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email)
    tokenlist=[]
    for token in tokens:
        token=re.sub("[^a-zA-Z0-9]","",token)
        stemmed=stemmer.stem(token)
        if not len(token):
            continue
        tokenlist.append(stemmed)
    return tokenlist


In [15]:
def getVocabDict(reverse=False):
    vocab_dict={}
    with open("vocab.txt") as f:
        for line in f:
            (val,key)=line.split()
            if not reverse:
                vocab_dict[key]=int(val)
            else:
                vocab_dict[int(val)]=key
    return vocab_dict


In [19]:
#returns a list of indices corresponding to the location in vocab_dict for each stemmed word 
def emailVocabIndices(raw_email,vocab_dict):
    tokenlist=email2TokenList(raw_email)
    index_list=[vocab_dict[token] for token in tokenlist if token in vocab_dict]
    return index_list

In [22]:

def email2FeatureVector( raw_email, vocab_dict ):
    # returns a vector of shape(n,1) where n is the size of the vocab_dict.
    #he first element in this vector is 1 if the vocab word with index == 1 is in raw_email, else 0
    n = len(vocab_dict)
    result = np.zeros((n,1))
    vocab_indices = emailVocabIndices( email_contents, vocab_dict )
    for idx in vocab_indices:
        result[idx] = 1
    return result

In [27]:
vocab_dict = getVocabDict()
email_contents = open( 'emailSample1.txt', 'r' ).read()
test_fv = email2FeatureVector( email_contents, vocab_dict )

In [30]:
print ("Length of feature vector is %d" % len(test_fv))
print( "Number of non-zero entries is: %d" % sum(test_fv==1))

Length of feature vector is 1899
Number of non-zero entries is: 44


In [35]:
#svm for spam classification
datafile = 'spamTrain.mat'
mat = scipy.io.loadmat( datafile )
X, y = mat['X'], mat['y']
# Test set
datafile = 'spamTest.mat'
mat = scipy.io.loadmat( datafile )
Xtest, ytest = mat['Xtest'], mat['ytest']
pos = np.array([X[i] for i in range(X.shape[0]) if y[i] == 1])
neg = np.array([X[i] for i in range(X.shape[0]) if y[i] == 0])


In [36]:
# First we make an instance of an SVM with C=0.1 and 'linear' kernel
linear_svm = svm.SVC(C=0.1, kernel='linear')

# Now we fit the SVM to our X matrix, given the labels y
linear_svm.fit( X, y.flatten() )

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [37]:
train_predictions = linear_svm.predict(X).reshape((y.shape[0],1))
train_acc = 100. * float(sum(train_predictions == y))/y.shape[0]

In [38]:
print ('Training accuracy = %0.2f%%' % train_acc)

Training accuracy = 99.83%


In [39]:
test_predictions = linear_svm.predict(Xtest).reshape((ytest.shape[0],1))
test_acc = 100. * float(sum(test_predictions == ytest))/ytest.shape[0]
print ('Test set accuracy = %0.2f%%' % test_acc)

Test set accuracy = 98.90%


In [40]:
# Determine the words most likely to indicate an e-mail is a spam
# From the trained SVM we can get a list of the weight coefficients for each
# word (technically, each word index)

vocab_dict_flipped = getVocabDict(reverse=True)

#Sort indicies from most important to least-important (high to low weight)
sorted_indices = np.argsort( linear_svm.coef_, axis=None )[::-1]
print( "The 15 most important words to classify a spam e-mail are:")
print ([ vocab_dict_flipped[x] for x in sorted_indices[:15] ])

print ("The 15 least important words to classify a spam e-mail are:")
print ([ vocab_dict_flipped[x] for x in sorted_indices[-15:] ])
print

# Most common word (mostly to debug):
most_common_word = vocab_dict_flipped[sorted_indices[0]]
print( '# of spam containing \"%s\" = %d/%d = %0.2f%%'% \
    (most_common_word, sum(pos[:,1190]),pos.shape[0],  \
     100.*float(sum(pos[:,1190]))/pos.shape[0]))
print( '# of NON spam containing \"%s\" = %d/%d = %0.2f%%'% \
    (most_common_word, sum(neg[:,1190]),neg.shape[0],      \
100.*float(sum(neg[:,1190]))/neg.shape[0]))

The 15 most important words to classify a spam e-mail are:
['otherwis', 'clearli', 'remot', 'gt', 'visa', 'base', 'doesn', 'wife', 'previous', 'player', 'mortgag', 'natur', 'll', 'futur', 'hot']
The 15 least important words to classify a spam e-mail are:
['http', 'toll', 'xp', 'ratio', 'august', 'unsubscrib', 'useless', 'numberth', 'round', 'linux', 'datapow', 'wrong', 'urgent', 'that', 'spam']
# of spam containing "otherwis" = 804/1277 = 62.96%
# of NON spam containing "otherwis" = 301/2723 = 11.05%
