In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings 
warnings.filterwarnings('ignore')

from scipy import optimize
from scipy.io import loadmat

In [2]:
import os
list(os.listdir())

['.ipynb_checkpoints',
 'dataset3Params.m',
 'emailFeatures.m',
 'emailSample1.txt',
 'emailSample2.txt',
 'ex6.m',
 'ex6data1.mat',
 'ex6data2.mat',
 'ex6data3.mat',
 'ex6_spam.m',
 'gaussianKernel.m',
 'getVocabList.m',
 'lib',
 'linearKernel.m',
 'plotData.m',
 'porterStemmer.m',
 'processEmail.m',
 'readFile.m',
 'Spam Filter.ipynb',
 'spamSample1.txt',
 'spamSample2.txt',
 'spamTest.mat',
 'spamTrain.mat',
 'submit.m',
 'SVM .ipynb',
 'svmPredict.m',
 'svmTrain.m',
 'visualizeBoundary.m',
 'visualizeBoundaryLinear.m',
 'vocab.txt']

In [3]:
with open('emailSample1.txt', 'r') as f:
    samp_email = f.read()

In [4]:
print(samp_email)

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com




In [5]:
def process_email(email):
    
    import re
    import string
    from nltk.stem import PorterStemmer
    
    
    # lower the text
    email = email.lower()
    
    # remove all the numbers
    email = re.sub(r'[0-9]+','number',email)
    
    # remove html tags
    email = re.sub(r'[>|<]+', '', email)
    
    # replace emails with emailaddr
    email = re.sub(r'[^\s]+@[^\s]+','emailaddr',email )
    
    # replace all the links with httpaddr
    email = re.sub(r'(http|https)://[^\s]*','emailaddr',email )

    # replace $ sign with the string dollar
    email = re.sub(r'[$]+', 'dollar ', email)

    email = ''.join([s for s in email if s not in string.punctuation])
    
    
    stemmer = PorterStemmer()
    stemmed = []
    
    for word in email.split():
        stemmed.append(stemmer.stem(re.sub(r'[^a-zA-Z0-9]', '', word)))
    
    
    
    return ' '.join(stemmed)

In [6]:
P_sampemail = process_email(samp_email)

In [7]:
print(P_sampemail)

anyon know how much it cost to host a web portal well it depend on how mani visitor your expect thi can be anywher from less than number buck a month to a coupl of dollar number you should checkout emailaddr or perhap amazon ecnumb if your run someth big to unsubscrib yourself from thi mail list send an email to emailaddr


In [8]:
vocab_df = pd.read_csv('vocab.txt',sep='\t', header=None)
vocab_df.columns = ['index', 'word']


In [9]:
vocab_df.sample(8)

Unnamed: 0,index,word
1826,1827,week
1156,1157,obtain
263,264,central
1023,1024,meet
615,616,far
274,275,charact
1441,1442,safe
1168,1169,ok


In [10]:
def get_indices(email):
    indices = []
    for i in email.split():
        match = vocab_df[vocab_df['word']==i]['index'].values
        if len(match) > 0:
            indices.append(match[0])

    indices = np.array(indices).reshape(1,-1)      
    return indices   

In [11]:
indices = get_indices(P_sampemail)

In [12]:
def email_features(indices):
    n = vocab_df.shape[0]
    x = np.zeros((n,1))
    for i in indices[0]:
        x[i] = 1
    return x

In [13]:
samp_ind_ = email_features(indices)

In [14]:
print('Length of the feature vector is {}'.format(len(samp_ind_)))
print('Number of non-zero entries in sample email is {}'.format(sum(samp_ind_==1)[0]))


Length of the feature vector is 1899
Number of non-zero entries in sample email is 43


# Bulding a model to classify emails

In [15]:
data = loadmat('spamTrain.mat')
data_test = loadmat('spamTest.mat')

In [16]:
X = data['X']
y = data['y']

X_test = data_test['Xtest']
y_test = data_test['ytest']

In [17]:
X.shape, y.shape, X_test.shape, y_test.shape

((4000, 1899), (4000, 1), (1000, 1899), (1000, 1))

In [18]:
from sklearn.svm import SVC

In [19]:
model = SVC(C=0.1, kernel='linear')
model.fit(X,y)

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [20]:
print('Training accuracy is {}'.format(model.score(X,y)))
print('Testing data accuracy is {}'.format(model.score(X_test,y_test)))

Training accuracy is 0.99825
Testing data accuracy is 0.989


In [21]:
vocab_df['weights'] = model.coef_.reshape(-1,1)

In [22]:
vocab_df.sort_values('weights',ascending=False)[:15]

Unnamed: 0,index,word,weights
1190,1191,our,0.500614
297,298,click,0.465916
1397,1398,remov,0.422869
738,739,guarante,0.383622
1795,1796,visit,0.36771
155,156,basenumb,0.345064
476,477,dollar,0.323632
1851,1852,will,0.269724
1298,1299,price,0.267298
1263,1264,pleas,0.261169


# Prediciting whether a sample email is spam or ham

In [23]:
with open('spamSample2.txt', 'r') as f:
    sample = f.read()

In [24]:
processed = process_email(sample)

In [25]:
indices_ = get_indices(processed)

In [26]:
feats = email_features(indices_)

In [27]:
print('Number of non-zero entries in sample email is {}'.format(sum(feats==1)[0]))

Number of non-zero entries in sample email is 18


In [28]:
pred1 = model.predict(feats.reshape(1,-1))

In [48]:
print(processed)
print()
print('Spam Classification')
print(pred1)

best buy viagra gener onlin viagra numbermg x number pill dollar number free pill reorder discount top sell number qualiti satisfact guarante we accept visa master echeck payment number satisfi custom emailaddr

Spam Classification
[1]


# Testing on an another email

In [30]:
samp_email

"> Anyone knows how much it costs to host a web portal ?\n>\nWell, it depends on how many visitors you're expecting.\nThis can be anywhere from less than 10 bucks a month to a couple of $100. \nYou should checkout http://www.rackspace.com/ or perhaps Amazon EC2 \nif youre running something big..\n\nTo unsubscribe yourself from this mailing list, send an email to:\ngroupname-unsubscribe@egroups.com\n\n"

In [31]:
processed1 = process_email(samp_email)
indices1 = get_indices(processed1)
feats = email_features(indices1)

In [32]:
print('Number of non-zero entries in sample email is {}'.format(sum(feats==1)[0]))

Number of non-zero entries in sample email is 43


In [33]:
pred2 = model.predict(feats.reshape(1,-1))

In [47]:
print(processed1)
print()
print('Spam Classification')
print(pred2)

anyon know how much it cost to host a web portal well it depend on how mani visitor your expect thi can be anywher from less than number buck a month to a coupl of dollar number you should checkout emailaddr or perhap amazon ecnumb if your run someth big to unsubscrib yourself from thi mail list send an email to emailaddr

Spam Classification
[0]


# Testing on an another email

In [35]:
with open('spamSample1.txt', 'r') as f:
    sample = f.read()


In [36]:
print(sample)

Do You Want To Make $1000 Or More Per Week?

 

If you are a motivated and qualified individual - I 
will personally demonstrate to you a system that will 
make you $1,000 per week or more! This is NOT mlm.

 

Call our 24 hour pre-recorded number to get the 
details.  

 

000-456-789

 

I need people who want to make serious money.  Make 
the call and get the facts. 

Invest 2 minutes in yourself now!

 

000-456-789

 

Looking forward to your call and I will introduce you 
to people like yourself who
are currently making $10,000 plus per week!

 

000-456-789



3484lJGv6-241lEaN9080lRmS6-271WxHo7524qiyT5-438rjUv5615hQcf0-662eiDB9057dMtVl72




In [37]:
processed2 = process_email(sample)
indices2 = get_indices(processed2)
feats2 = email_features(indices2)



In [38]:
print('Number of non-zero entries in sample email is {}'.format(sum(feats2==1)[0]))

Number of non-zero entries in sample email is 46


In [46]:
pred3 = model.predict(feats2.reshape(1,-1))

print(processed2)
print()
print('Spam Classification')
print(pred3)

do you want to make dollar number or more per week if you are a motiv and qualifi individu i will person demonstr to you a system that will make you dollar numbernumb per week or more thi is not mlm call our number hour prerecord number to get the detail numbernumbernumb i need peopl who want to make seriou money make the call and get the fact invest number minut in yourself now numbernumbernumb look forward to your call and i will introduc you to peopl like yourself who are current make dollar numbernumb plu per week numbernumbernumb numberljgvnumbernumberleannumberlrmsnumbernumberwxhonumberqiytnumbernumberrjuvnumberhqcfnumbernumbereidbnumberdmtvlnumb

Spam Classification
[1]


# Testing on an another email

In [40]:
f = open('emailSample2.txt', 'r')
sample = f.read()

In [41]:
print(sample)

Folks,
 
my first time posting - have a bit of Unix experience, but am new to Linux.

 
Just got a new PC at home - Dell box with Windows XP. Added a second hard disk
for Linux. Partitioned the disk and have installed Suse 7.2 from CD, which went
fine except it didn't pick up my monitor.
 
I have a Dell branded E151FPp 15" LCD flat panel monitor and a nVidia GeForce4
Ti4200 video card, both of which are probably too new to feature in Suse's default
set. I downloaded a driver from the nVidia website and installed it using RPM.
Then I ran Sax2 (as was recommended in some postings I found on the net), but
it still doesn't feature my video card in the available list. What next?
 
Another problem. I have a Dell branded keyboard and if I hit Caps-Lock twice,
the whole machine crashes (in Linux, not Windows) - even the on/off switch is
inactive, leaving me to reach for the power cable instead.
 
If anyone can help me in any way with these probs., I'd be really grateful -
I've searched the 'ne

In [42]:
processed3 = process_email(sample)
indices3 = get_indices(processed3)
feats3 = email_features(indices3)

In [43]:
print('Number of non-zero entries in sample email is {}'.format(sum(feats3==1)[0]))

Number of non-zero entries in sample email is 121


In [45]:
pred4 = model.predict(feats3.reshape(1,-1))

print(processed3)
print()
print('Spam Classification')
print(pred4)

folk my first time post have a bit of unix experi but am new to linux just got a new pc at home dell box with window xp ad a second hard disk for linux partit the disk and have instal suse numbernumb from cd which went fine except it didnt pick up my monitor i have a dell brand enumberfpp number lcd flat panel monitor and a nvidia geforcenumb tinumb video card both of which are probabl too new to featur in suse default set i download a driver from the nvidia websit and instal it use rpm then i ran saxnumb as wa recommend in some post i found on the net but it still doesnt featur my video card in the avail list what next anoth problem i have a dell brand keyboard and if i hit capslock twice the whole machin crash in linux not window even the onoff switch is inact leav me to reach for the power cabl instead if anyon can help me in ani way with these prob id be realli grate ive search the net but have run out of idea or should i be go for a differ version of linux such as redhat opinion w