In [79]:
%matplotlib inline

import math
import re
import numpy as np
import pandas as pd
from matplotlib import cm
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn import svm
import scipy.io

from stemmer import PorterStemmer

In [202]:
emailSample1 = ''
with open('./ex6/emailSample1.txt') as fp:
    emailSample1 = ''.join(line for line in fp)

vocabulary_by_str = {}
vocabulary_by_index = []
vocabulary_len = 0
with open('./ex6/vocab.txt') as fp:
    for line in fp:
        index, string = re.split('[\t ]', line.strip())
        vocabulary_by_str[string] = int(index) - 1
        vocabulary_len += 1
        vocabulary_by_index.append(string)

print(emailSample1)

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com




In [136]:
def compose(*functions):
    def inner(arg):
        for f in functions:
            arg = f(arg)
        return arg
    return inner

def toLowerCase(string):
    return string.lower()

def normalizeHTML(string):
    return re.sub('<[^<>]+>', '', string)

def normalizeUrls(string):
    return re.sub('(http|https)://[^\s]*', 'httpaddr', string)

def normalizeEmails(string):
    return re.sub('[^\s]+@[^\s]+', 'emailaddr', string)

def normalizeNumbers(string):
    return re.sub('[0-9]+', 'number', string)

def normalizeDollars(string):
    return re.sub('[$]+', 'dollar', string)

def stemWords(string):
    tokens = []
    stemmer = PorterStemmer()
    for token in re.split('[ @$/#.-:&*+=\[\]?!(){},''">_<;%\n\r]', string):
        stripped = re.sub('[^a-zA-Z0-9]', '', token.strip());
        if stripped:
            word = stemmer.stem(stripped)
            tokens.append(word)
    
    return tokens

email_tokens = compose(
    toLowerCase, 
    normalizeHTML, 
    normalizeNumbers,
    normalizeUrls, 
    normalizeEmails, 
    normalizeDollars, 
    stemWords
)(emailSample1)

print(' '.join(email_tokens))

anyon know how much it cost to host a web portal well it depend on how mani visitor your expect thi can be anywher from less than number buck a month to a coupl of dollarnumb you should checkout httpaddr or perhap amazon ecnumb if your run someth big to unsubscrib yourself from thi mail list send an email to emailaddr


In [179]:
def extract_features(email_tokens):
    X = np.zeros(vocabulary_len)
    
    for token in email_tokens:
        if token in vocabulary_by_str:
            X[vocabulary_by_str[token]] = 1
    
    return X

mat = scipy.io.loadmat('./ex6/spamTrain.mat')
X = mat.get('X')
y = mat.get('y').ravel()

mat = scipy.io.loadmat('./ex6/spamTest.mat')
Xtest = mat.get('Xtest')
ytest = mat.get('ytest').ravel()

In [210]:
np.set_printoptions(threshold=np.nan)


clf = svm.SVC(kernel='linear', C=0.1)
clf.fit(X, y)

train_predictions = clf.predict(X)
test_predictions = clf.predict(Xtest)

print('Training Accuracy: %s' % (np.mean(train_predictions == y) * 100))
print('Test Accuracy: %s' % (np.mean(test_predictions == ytest) * 100))

Training Accuracy: 99.825
Test Accuracy: 98.9


In [209]:
coef = clf.coef_.ravel()
indices = np.argsort(coef)[-20:]

print('Top predictors of spam:')
for index in indices:
    print('%-15s %-15s' % (vocabulary_by_index[index], coef[index]))

Top predictors of spam:
dollarnumb      0.22963916284457375
want            0.23194709265954988
se              0.2329549624596594
da              0.2332612152321099
al              0.23731066817215565
hour            0.24640435783158998
ga              0.24829699045568662
lo              0.25346652431419925
nbsp            0.2539414551595324
most            0.2572981979518164
pleas           0.2611688867001495
price           0.2672977146177071
will            0.2697241060374008
dollar          0.3236320357963838
basenumb        0.3450640979461706
visit           0.367710398245535
guarante        0.38362160179406524
remov           0.42286911706104086
click           0.465916390688888
our             0.5006137361746403
