In [1]:
import re
import numpy as np
import pandas as pd
from scipy.io import loadmat
from sklearn.svm import SVC

from nltk.stem import PorterStemmer

# Reading Data  

In [2]:
spam_train = loadmat('Data/spamTrain.mat')
spam_test = loadmat('Data/spamTest.mat')

X_train, y_train = spam_train['X'], spam_train['y']
X_test, y_test = spam_test['Xtest'], spam_test['ytest']

email_sample = open('Data/emailSample1.txt').read()
spam_sample = open('Data/spamSample1.txt').read()

vocab_list = open('Data/vocab.txt').read()

# Preparing Vocabulary

In [3]:
vocab_list = vocab_list.split('\n')[:-1]

vocab_dic = {}
for i in vocab_list:
    value, key = i.split('\t')
    vocab_dic[key] = value

In [4]:
sample_words = ['my', 'email', 'address', 'is', 'not', 'here']
[{k, vocab_dic[k]} for k in sample_words]

[{'1084', 'my'},
 {'530', 'email'},
 {'25', 'address'},
 {'877', 'is'},
 {'1113', 'not'},
 {'775', 'here'}]

# Preprocessing emails 

In [5]:
stemmer = PorterStemmer()

In [6]:
def process_email(email_sample, vocab_dic):

    # Lower case 
    email_sample = email_sample.lower()
    
    # Replace numbers 
    email_sample = re.sub('[0 - 9] + ', 'number', email_sample)
    
    # Replace urls 
    email_sample = re.sub('[https|http]://[^\s]*', 'httpaddr', email_sample)
    
    # Replace email address 
    email_sample = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_sample)
    
    # Replace dollar sign 
    email_sample = re.sub('[$]+', 'dollar', email_sample)
    
    # Replace everything else 
    email_sample = re.sub('[^a-zA-Z\d\s]', '', email_sample)

    # Stemming  
    email_sample = [stemmer.stem(i) for i in email_sample.split(' ')]

    word_freq = []

    for word in email_sample:
        if len(word) > 1 and word in vocab_dic:
            word_freq.append(int(vocab_dic[word]))

    return word_freq

In [7]:
print(email_sample)
print(process_email(email_sample, vocab_dic))

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com


[86, 916, 794, 1077, 883, 370, 1699, 790, 1822, 883, 431, 1171, 794, 1002, 1895, 238, 162, 89, 688, 945, 1663, 1062, 1699, 375, 1162, 1510, 1182, 1237, 1895, 1440, 1547, 1758, 1896, 688, 1676, 992, 961, 1477, 71, 530]


# Extracting features from email

In [8]:
def email_features(email_sample, vocab_dic):
    
    features = np.zeros((len(vocab_dic), 1))
    word_freq = process_email(email_sample, vocab_dic)
    for i in word_freq:
        features[i] = 1
    return features

In [9]:
feature_samp_1 = email_features(email_sample, vocab_dic)

print('Length of feature vector: \t', len(feature_samp_1))
print('Number of non-zero entries: \t', feature_samp_1.sum())

Length of feature vector: 	 1899
Number of non-zero entries: 	 35.0


In [10]:
feature_samp_2 = email_features(spam_sample, vocab_dic)

print('Length of feature vector: \t', len(feature_samp_2))
print('Number of non-zero entries: \t', feature_samp_2.sum())

Length of feature vector: 	 1899
Number of non-zero entries: 	 42.0


# Training SVM 


In [11]:
model = SVC(C = 0.12, kernel = 'linear')
model.fit(X_train, y_train.ravel())

print('Training Accuracy: \t {} %'.format(model.score(X_train, y_train) * 100))
print('Test Accuracy: \t\t {} %'.format(model.score(X_test, y_test) * 100))

Training Accuracy: 	 99.9 %
Test Accuracy: 		 98.6 %


# Top predictor of spam 

In [12]:
# Switch key, values of vocab_dict and change to data frame. 
vocab_dic_1 = {y : x for x, y in vocab_dic.items()}
vocab_df = pd.DataFrame(vocab_dic_1, index = [0]).transpose()

# Add a new column in data frame with fit coefficients. 
vocab_df[1] = model.coef_.flatten()
vocab_df.columns = ['Word', 'Weight']

# Finding the top predictor 
top_predictor_df = vocab_df.sort_values(by = 'Weight', ascending = False)
top_predictor_df.head(10)

Unnamed: 0,Word,Weight
1191,our,0.519944
298,click,0.487389
1398,remov,0.431957
739,guarante,0.400727
1796,visit,0.386083
156,basenumb,0.351535
477,dollar,0.314385
1299,price,0.291478
1852,will,0.289202
966,lo,0.272873


# Predicting model 

In this section we check if the model is correctly classifying spam emails. We check on four emails, two of which are spam. 

In [13]:
email_sample_2 = open('Data/emailSample2.txt').read()
spam_sample_2 = open('Data/spamSample2.txt').read()

In [14]:
def predict(email_sample, vocab =  vocab_dic):
    feature = email_features(email_sample, vocab_dic)
    if model.predict(feature.T)[0] == 0:
        print('==============================================\n THIS EMAIL IS NOT SPAM.')
    else:
        print('==============================================\n THIS EMAIL IS SPAM.')

In [15]:
print(email_sample)
predict(email_sample)

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com


 THIS EMAIL IS NOT SPAM.


In [16]:
print(spam_sample)
predict(spam_sample)

Do You Want To Make $1000 Or More Per Week?

If you are a motivated and qualified individual - I will personally demonstrate to you a system that will make you $1,000 per week or more! This is NOT mlm. Call our 24 hour pre-recorded number to get the details. 
000-456-789
I need people who want to make serious money.  Make the call and get the facts. Invest 2 minutes in yourself now!
000-456-789
Looking forward to your call and I will introduce you to people like yourself who are currently making $10,000 plus per week!
000-456-789

3484lJGv6-241lEaN9080lRmS6-271WxHo7524qiyT5-438rjUv5615hQcf0-662eiDB9057dMtVl72
 THIS EMAIL IS SPAM.


In [17]:
print(email_sample_2)
predict(email_sample_2)

Folks,
 
my first time posting - have a bit of Unix experience, but am new to Linux.

 
Just got a new PC at home - Dell box with Windows XP. Added a second hard disk
for Linux. Partitioned the disk and have installed Suse 7.2 from CD, which went
fine except it didn't pick up my monitor.
 
I have a Dell branded E151FPp 15" LCD flat panel monitor and a nVidia GeForce4
Ti4200 video card, both of which are probably too new to feature in Suse's default
set. I downloaded a driver from the nVidia website and installed it using RPM.
Then I ran Sax2 (as was recommended in some postings I found on the net), but
it still doesn't feature my video card in the available list. What next?
 
Another problem. I have a Dell branded keyboard and if I hit Caps-Lock twice,
the whole machine crashes (in Linux, not Windows) - even the on/off switch is
inactive, leaving me to reach for the power cable instead.
 
If anyone can help me in any way with these probs., I'd be really grateful -
I've searched the 'ne

In [18]:
print(spam_sample_2)
predict(spam_sample_2)

Best Buy Viagra Generic Online

Viagra 100mg x 60 Pills $125, Free Pills & Reorder Discount, Top Selling 100% Quality & Satisfaction guaranteed!

We accept VISA, Master & E-Check Payments, 90000+ Satisfied Customers!
http://medphysitcstech.ru



 THIS EMAIL IS SPAM.


# Checking with my own email

In [19]:
my_email_1 = open('Data/myEmail1.txt').read()
my_email_2 = open('Data/myEmail2.txt').read()

In [20]:
print(my_email_1)
predict(my_email_1)

Share your views and experiences with us

Dear Researcher, 
Growing emphasis on readership and impact is leading researchers to communicate about their work in new ways, to new audiences, from much earlier in the research process. With the support of AIP Publishing and some other publishers, Kudos is leading a study to understand these changes, with a view to providing better support and services. 
We invite you to take 10 minutes to share your views and experiences with us via an online survey. Survey participants will be entered into a prize drawing for online shopping vouchers. The prize value is £100 (or the equivalent in local currency, where possible). The survey is open until March 31st, 2019. 
To participate in the survey and enter the prize drawing, please click the button below.
Complete the survey today!
Many thanks in advance for your contribution! 

Best wishes, 
AIP Publishing and The Kudos Team.
 THIS EMAIL IS NOT SPAM.


In [21]:
print(my_email_2)
predict(my_email_2)

We have curated special offer just for you!

Dear ,
Our Special Offers let you stay in touch with your loved ones for less! Use ultra-fast data to make HD video calls, call your friends & family abroad at unbeatable rates, send unlimited texts & more! 
Limited time only. For T&C visit www.lycamobile.us?
Refer a friend and earn $5 credit!
 THIS EMAIL IS SPAM.
