In [124]:
%config IPCompleter.greedy=True
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io
import re
from sklearn import svm

from porterStemmer import porterStemmer

In [142]:
vocab = {}
r_vocab = {}
for line in open('vocab.txt'):
    items = line[:-1].split('\t')
    vocab[items[1]] = int(items[0])
    r_vocab[int(items[0])] = items[1]
    
print("vocab size:", len(vocab))

vocab size: 1899


In [126]:
def processEmail(email, vocab):

    # Init return value
    word_indices = []

    # Lower case
    email = email.lower()

    # Strip all HTML
    rx = re.compile('<[^<>]+>|\n')
    email = rx.sub(' ', email)
    # Handle Numbers
    rx = re.compile('[0-9]+')
    email = rx.sub('number ', email)

    # Handle URLS
    rx = re.compile('(http|https)://[^\s]*')
    email = rx.sub('httpaddr ', email)

    # Handle Email Addresses
    rx = re.compile('[^\s]+@[^\s]+')
    email = rx.sub('emailaddr ', email)

    # Handle $ sign
    rx = re.compile('[$]+')
    email = rx.sub('dollar ', email)

    # Process file
    l = 0

    # Remove any non alphanumeric characters
    rx = re.compile('[^a-zA-Z0-9 ]')
    email = rx.sub('', email).split()

    print(email)
    for word in email:

        # Tokenize and also get rid of any punctuation
        # str = re.split('[' + re.escape(' @$/#.-:&*+=[]?!(){},''">_<#')
        #                                + chr(10) + chr(13) + ']', str)

        # Stem the word
        # (the porterStemmer sometimes has issues, so we use a try catch block)
        try:
            word = porterStemmer(word.strip())
        except:
            word = ''
            continue
            
        if len(word) < 1:
            continue
        
        if word not in vocab:
            continue
            
        word_indices.append(vocab[word])
        # Print to screen, ensuring that the output lines are not too long
        if (l + len(word) + 1) > 78:
            l = 0
        else:
            l = l + len(word) + 1


    return word_indices

In [127]:
email = """> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors youre expecting. This can be anywhere from less than 10 bucks a month to a couple of $100. You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 if youre running something big..
To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com"""
print(processEmail(email, vocab))

['anyone', 'knows', 'how', 'much', 'it', 'costs', 'to', 'host', 'a', 'web', 'portal', 'well', 'it', 'depends', 'on', 'how', 'many', 'visitors', 'youre', 'expecting', 'this', 'can', 'be', 'anywhere', 'from', 'less', 'than', 'number', 'bucks', 'a', 'month', 'to', 'a', 'couple', 'of', 'dollar', 'number', 'you', 'should', 'checkout', 'httpaddr', 'or', 'perhaps', 'amazon', 'ecnumber', 'if', 'youre', 'running', 'something', 'big', 'to', 'unsubscribe', 'yourself', 'from', 'this', 'mailing', 'list', 'send', 'an', 'email', 'to', 'emailaddr']
[86, 916, 794, 1077, 883, 370, 1699, 790, 1822, 1831, 883, 431, 1171, 794, 1002, 1895, 592, 1676, 238, 162, 89, 688, 945, 1663, 1120, 1062, 1699, 375, 1162, 477, 1120, 1893, 1510, 799, 1182, 1237, 810, 1895, 1440, 1547, 181, 1699, 1758, 1896, 688, 1676, 992, 961, 1477, 71, 530, 1699, 531]


In [132]:
train_data = scipy.io.loadmat('spamTrain.mat')
X = train_data['X']; y = train_data['y']
y[y=='y'] = 1
y[y=='n'] = 0
print(X.shape, y.shape)

test_data = scipy.io.loadmat('spamTest.mat')
Xtest = test_data['Xtest']
ytest = test_data['ytest']
ytest[ytest=='y'] = 1
ytest[ytest=='n'] = 0

(4000, 1899) (4000, 1)


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


In [None]:
C = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
sigma = C

best_score = 0.0
optimized = (0,0)
for c in C:
    for s in sigma:
        gamma = 1.0 / (2.0 * s ** 2)
        gauss_svm = svm.SVC(C=c, kernel='rbf', gamma=gamma)
        gauss_model = gauss_svm.fit(X, y.flatten())
        #print("C", c, "sigma", s, "score", gauss_model.score(X, y))
        score = gauss_model.score(Xtest, ytest)
        if score > best_score:
            best_score = score
            optimized = (c, s)
            
print("best_score:", best_score, "C:", optimized[0], "sigma:", optimized[1])

In [136]:
linear_svm = svm.SVC(C=0.1, kernel='linear')
model = linear_svm.fit(X, y)


  y = column_or_1d(y, warn=True)


0.99825

In [137]:
model.score(X, y)

0.99825

In [138]:
model.score(Xtest, ytest)

0.989

In [140]:
linear_svm.coef_.shape

(1, 1899)

In [171]:
top_predictor = np.argsort(linear_svm.coef_)
predictor = [(r_vocab[item], linear_svm.coef_[0][item]) for item in top_predictor.flatten()[-15:]]
predictor.reverse()
print(predictor)

help(np.argsort)

[('otherwis', 0.5006137361746403), ('clearli', 0.46591639068888796), ('remot', 0.4228691170610412), ('gt', 0.3836216017940651), ('visa', 0.367710398245535), ('base', 0.3450640979461706), ('doesn', 0.3236320357963838), ('wife', 0.2697241060373996), ('previous', 0.2672977146177069), ('player', 0.26116888670014904), ('mortgag', 0.2572981979518163), ('natur', 0.2539414551595328), ('ll', 0.25346652431419936), ('futur', 0.2482969904556866), ('hot', 0.24640435783158998)]
Help on function argsort in module numpy.core.fromnumeric:

argsort(a, axis=-1, kind='quicksort', order=None)
    Returns the indices that would sort an array.
    
    Perform an indirect sort along the given axis using the algorithm specified
    by the `kind` keyword. It returns an array of indices of the same shape as
    `a` that index data along the given axis in sorted order.
    
    Parameters
    ----------
    a : array_like
        Array to sort.
    axis : int or None, optional
        Axis along which to sort.  