In [39]:
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from sklearn import svm
import re
import nltk, nltk.stem.porter

In [153]:
#preprocess sample eamil
def processEmail(text):
    
    word_indices = []
    #lower case
    text = text.lower()
    #strip all html
    text = re.sub('<[^<>]+>', ' ', text)
    #0-9 to number
    text = re.sub('[0-9]+', 'number', text)
    #http:// to httpaddr
    text = re.sub('(http|https)://[^\s]*', 'httpaddr', text)
    # handle addresses
    text = re.sub('[^\s]+@[^\s]+', 'emailaddr', text)
    # handle sign
    text = re.sub('[$]+', 'dollar', text)
    return text

In [52]:
def email2TokenList(raw_text):
    stemmer = nltk.stem.porter.PorterStemmer()#提取词干
    email = processEmail(raw_text)
    tokens = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email)
    tokenlist = []
    for token in tokens: 
        #Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)
        #Use the Porter stemmer to stem the word
        stemmed = stemmer.stem(token)
        #Throw out empty tokens
        if not len(token): continue  
        #Store a list of all unique stemmed words
        tokenlist.append(stemmed)
    return tokenlist


In [51]:
def getVocabDict(reverse=False):
    vocab_dict = {}
    with open("vocab.txt") as f:
        for line in f:
            (val, key) = line.split()
            if not reverse:
                vocab_dict[key] = int(val)
            else:
                vocab_dict[int(val)] = key     
    return vocab_dict

In [55]:
def email2VocabIndices(raw_text, vocab_dict):
    tokenlist = email2TokenList(raw_text)
    index_list = [ vocab_dict[token] for token in tokenlist if token in vocab_dict ]
    return index_list

In [154]:
def wordIndices(file):
    with open(file, 'r') as f:
        word_indices = []
        vocab_list = getVocabDict()
        for i in f:
            a=email2VocabIndices(i, vocab_list)
            word_indices.extend(a)
    return word_indices

#2.2 Extracting Features from Emails

In [75]:
def emailFeatures(indices, vocab_dict):
    word_vec = np.zeros((len(vocab_dict), 1))
    for index in indices:
        word_vec[index, 0] = 1 
    return word_vec

In [157]:
word_indices = wordIndices('emailSample2.txt')
vocab_list = getVocabDict()
sample1_fea = emailFeatures(word_indices, vocab_list)
print(sample1_fea.shape, sample1_fea[np.where(sample1_fea==1)].shape)

(1899, 1) (128,)


#2.3 Training SVM for Spam Classification

In [95]:
data = sio.loadmat('spamTrain.mat')
data_test = sio.loadmat('spamTest.mat')

In [100]:
X = data['X']
y = data['y']
X_test = data_test['Xtest']
y_test = data_test['ytest']

In [90]:
email_svm = svm.SVC(C=1, kernel='linear')
email_svm.fit(X, y.flatten())

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [101]:
train_pred = email_svm.predict(X).reshape(len(y), 1)
train_acc = (train_pred==y).mean()
print('Train accuracy:', train_acc)
test_pred = email_svm.predict(X_test).reshape(len(y_test), 1)
test_acc = (test_pred==y_test).mean()
print('Test accuracy:', test_acc)

Train accuracy: 0.99975
Test accuracy: 0.978


#2.4 Top Predictors for Spam


In [147]:
vocab_dict_flipped = getVocabDict(reverse=True)

#Sort indicies from most important to least-important (high to low weight)
sorted_indices = np.argsort( -email_svm.coef_, axis=1 )
print("The 15 most important words to classify a spam e-mail are:")
print([ vocab_dict_flipped[x] for x in sorted_indices[0,:15] ])
print("The 15 least important words to classify a spam e-mail are:")
print([ vocab_dict_flipped[x] for x in sorted_indices[0,-15:] ])


# Most common word (mostly to debug):


The 15 most important words to classify a spam e-mail are:
['otherwis', 'flag', 'why', 'numberanumb', 'remot', 'visa', 'clearli', 'board', 'gt', 'seminar', 'technolog', 'institut', 'dollarac', 'titl', 'base']
The 15 least important words to classify a spam e-mail are:
['success', 'useless', 'that', 'http', 'et', 'urgent', 'instant', 'datapow', 'spam', 'steve', 'addit', 'kid', 'round', 'wrong', 'studi']


In [158]:
email_svm.predict(sample1_fea.T)

array([0], dtype=uint8)