In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
import re
import os
import nltk, nltk.stem.porter
%matplotlib inline

# Spam classification

### Preprocessing emails

In [2]:
mail1 = open("data/emailSample1.txt", 'r').read()
print mail1

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com




In [21]:
def preprocess(email):
    
    # Lower casing 
    email = email.lower()
    
    # Remove traling characters 
    email = re.sub('[\n\r]', '', email)
        
    # Stripping HTML
    email = re.sub('<[^<>]+>', '', email)
    
    # Normalizing URL
    email = re.sub('(http|https)://[^\s]*', 'httpaddr', email)
    
    # Normalizing email addresses
    email = re.sub('[^\s]+@[^\s]+', 'emailaddr', email)    
    
    # Normalizing numbers
    email = re.sub('[0-9]+', 'number ', email)
    
    # Normalizing dollars
    email = re.sub('[$]+', 'dollar ', email)
    
    # Removal non-word
    email = re.sub('[^a-zA-Z0-9]', ' ', email);

    # Remove white space in excess
    email = re.sub('^\s', '', email)
    
    email = re.sub('\s\s+', ' ', email)

    #print email
    
    # Split
    tokens = re.split(' ', email)
    
    # Word stemming
    stemmer = nltk.stem.porter.PorterStemmer()
    
    token_list = []
    
    for word in tokens:
        
        token_list.append(str(stemmer.stem(word)))
    
    return token_list
    

In [30]:
#preprocess(mail1)

In [23]:
def get_vocab(file_name = "data/vocab.txt"):
    vocab_dict = {}
    with open("data/vocab.txt", 'r') as f:
        for line in f:
            (val, key) = line.split()
            vocab_dict[key] = int(val)
            
    return vocab_dict

In [24]:
vocab_dict = get_vocab()

In [25]:
def mail_to_vocab_idx(mail, vocab):
    idx = []
    token_list = preprocess(mail)
    idx = [ vocab[token] for token in token_list if token in vocab ] 
    return idx

In [26]:
idx_list=mail_to_vocab_idx(mail1,vocab_dict)

### Extracting features from email

In [27]:
def idx_to_features(idx_list, size = len(vocab_dict)):
    
    features = np.zeros(size)
    
    for idx in idx_list:
        features[idx] = 1
    
    return features

def email_to_features(mail, vocab_dict):
    idx_list = mail_to_vocab_idx(mail1,vocab_dict)    
    return idx_to_features(idx_list, len(vocab_dict))

In [31]:
mail1 = open("data/emailSample1.txt", 'r').read()
vocab_dict = get_vocab()
features = email_to_features(mail1,vocab_dict)
unique, counts = np.unique(features, return_counts=True)
print "Length of feature vector: %i" % len(features) 
print "Length of zero entries: %i " % counts[0]
print "Length of non-zero entries: %i " % counts[1]

Length of feature vector: 1899
Length of zero entries: 1854 
Length of non-zero entries: 45 


### Training SVM for Spam Classification

In [35]:
dataTrain=scipy.io.loadmat('data/spamTrain.mat')
X=dataTrain['X']
y=dataTrain['y']

dataTest=scipy.io.loadmat('data/spamTest.mat')
Xtest=dataTest['Xtest']
ytest=dataTest['ytest']

In [41]:
from sklearn import svm

svc = svm.SVC(C=0.1, kernel='linear', verbose=False)
svc.fit(X, y.flatten())

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [47]:
print "Training set accurancy: %.2f %%" % (100* svc.score(X,y))
print "Test set accurancy: %.2f %%" % (100* svc.score(Xtest,ytest))

Training set accurancy: 99.83 %
Test set accurancy: 98.90 %


In [110]:
# [::-1] revert the np array!!

print "The 15 most important words to classify a spam e-mail are:"
print [vocab_dict.keys()[vocab_dict.values().index(idx)] for idx in np.argsort(svc.coef_).flatten()[::-1][:15]]
print ""
print "The 15 least important words to classify a spam e-mail are:"
print [vocab_dict.keys()[vocab_dict.values().index(idx)] for idx in np.argsort(svc.coef_).flatten()[::-1][-15:]]
    

The 15 most important words to classify a spam e-mail are:
['otherwis', 'clearli', 'remot', 'gt', 'visa', 'base', 'doesn', 'wife', 'previous', 'player', 'mortgag', 'natur', 'll', 'futur', 'hot']

The 15 least important words to classify a spam e-mail are:
['http', 'toll', 'xp', 'ratio', 'august', 'unsubscrib', 'useless', 'numberth', 'round', 'linux', 'datapow', 'wrong', 'urgent', 'that', 'spam']
