In [1]:
#Simple NB Based Lingspam Spam classifier 
import numpy as np
import pandas as pd
import sklearn
import sklearn.datasets as skd
from scipy.sparse import csc_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn import linear_model
from sklearn import naive_bayes

# load_files assumes that every sub-directory of the parent directory holds the documents of one class.
# Part1 (fold1) and part2 (fold2) of the lingspam dataset were pre-processed so that spam emails sit in
# one folder and legit emails in another; do the same for the entire dataset, manually or with a script.
ls_train = skd.load_files('./data/lingspam_public/lemm_stop/train')
ls_test = skd.load_files('./data/lingspam_public/lemm_stop/test')

# Fitting the CountVectorizer builds a vocabulary containing each unique term of the training emails;
# fit_transform additionally returns the sparse matrix of term occurrences.
count_vect = CountVectorizer()
x_train = count_vect.fit_transform(ls_train.data)

# The vocabulary is already learned, so the test emails are only transformed with that same vocabulary.
x_test = count_vect.transform(ls_test.data)

Code for feature selection.

In [2]:
def feature_selection(N, x_train, y=None, vectorizer=None):
    """Rank the features by mutual information with the class labels.

    Parameters
    ----------
    N : int
        Number of top-ranked features to select.
    x_train : sparse matrix of shape (num_email, num_feature)
        Term-occurrence matrix of the training emails.
    y : array-like of length num_email, optional
        Class label of every training email. When omitted, the notebook-level
        ``ls_train.target`` is used.
    vectorizer : fitted vectorizer, optional
        Object that supplies the feature names. When omitted, the
        notebook-level ``count_vect`` is used.

    Returns
    -------
    top_feature : ndarray
        Column indices of the N highest-scoring features, best first.
    drop_feature : ndarray
        Column indices of all remaining features.
    name_feature : list
        Name of every column of ``x_train``, indexed by column.
    """
    if y is None:
        y = ls_train.target
    if vectorizer is None:
        vectorizer = count_vect

    num_feature = x_train.shape[1]

    # CSC storage makes extracting one column cheap, so only a single column
    # is ever held densely instead of the whole email-by-term matrix.
    x_train_csc = csc_matrix(x_train)
    x_train_ig = np.zeros(num_feature)

    for i in range(num_feature):
        # Each column holds the occurrences of one feature in all emails
        feature_vector = x_train_csc[:, i].toarray().ravel()

        # Mutual information between the occurrence counts and the labels
        x_train_ig[i] = mutual_info_score(feature_vector, y)

    # Negate the scores so that argsort orders them from largest to smallest
    x_train_ig_sort = np.argsort(-x_train_ig)

    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer its replacement and fall back for old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        name_feature = list(vectorizer.get_feature_names_out())
    else:
        name_feature = vectorizer.get_feature_names()

    # Split the ranking into the N best features and the rest
    top_feature = np.array(x_train_ig_sort[:N])
    drop_feature = np.array(x_train_ig_sort[N:num_feature])

    return top_feature, drop_feature, name_feature

def print_feature(top_feature, name_feature):
    """Print the name of every selected feature, one per line, in the given order.

    top_feature holds indices into name_feature.
    """
    for feature_index in top_feature:
        print(name_feature[feature_index])

In [3]:
N = [10, 100, 1000]

# feature_selection scores every feature on each call, so rank once for the
# largest N; the selection for a smaller N is a prefix of that ranking.
top_ranked, drop_feature, name_feature = feature_selection(max(N), x_train)

for n in N:
    # top_feature is left at notebook level holding the selection of the last N
    top_feature = top_ranked[:n]
    print_feature(top_feature, name_feature)
    print('\n')

language
free
remove
linguistic
university
money
our
click
business
market


language
free
remove
linguistic
university
money
our
click
business
market
today
internet
over
check
product
order
sell
advertise
company
million
100
day
want
english
easy
best
income
linguistics
save
every
receive
guarantee
thousand
service
bulk
mail
com
buy
cash
purchase
ll
cost
win
edu
start
dollar
address
fax
mailing
offer
yourself
list
papers
month
hour
earn
hundred
linguist
live
success
here
week
20
theory
pay
email
conference
credit
ever
customer
card
send
profit
abstract
yours
financial
need
fun
speaker
watch
home
name
sale
bonus
zip
discussion
toll
instruction
simply
syntax
amaze
online
anywhere
investment
off
site
department
ad
program
friend


language
free
remove
linguistic
university
money
our
click
business
market
today
internet
over
check
product
order
sell
advertise
company
million
100
day
want
english
easy
best
income
linguistics
save
every
receive
guarantee
thousand
service
bulk
mail
com
buy


Code for feature generation.

In [4]:
def binary_feature(ls_train, ls_test, feature_idx=None):
    """Build binary (present / absent) term features for the selected columns.

    Parameters
    ----------
    ls_train, ls_test : objects with a ``data`` attribute
        Training and test documents; the vocabulary is learned from
        ``ls_train.data`` only.
    feature_idx : array-like of int, optional
        Column indices to keep. When omitted, the notebook-level
        ``top_feature`` is used.

    Returns
    -------
    x_train_bf, x_test_bf
        Training and test matrices restricted to the selected columns.
    """
    if feature_idx is None:
        feature_idx = top_feature

    # With binary=True all non zero counts are set to 1
    count_vect_bf = CountVectorizer(binary=True)

    x_train_bf = count_vect_bf.fit_transform(ls_train.data)
    x_test_bf = count_vect_bf.transform(ls_test.data)

    # NOTE(review): the indices are applied to the vocabulary of this freshly
    # fitted vectorizer; this assumes it matches the vocabulary the indices
    # were computed on (same training documents, same tokenisation settings).
    return x_train_bf[:, feature_idx], x_test_bf[:, feature_idx]

def term_frequency(ls_train, ls_test, feature_idx=None):
    """Return the term-count features restricted to the selected columns.

    The count matrices are not recomputed: the notebook-level ``x_train`` and
    ``x_test`` are sliced. ``ls_train`` and ``ls_test`` are not read; they are
    accepted so the call mirrors ``binary_feature``.

    Parameters
    ----------
    ls_train, ls_test
        Unused.
    feature_idx : array-like of int, optional
        Column indices to keep. When omitted, the notebook-level
        ``top_feature`` is used.

    Returns
    -------
    x_train_tf, x_test_tf
        ``x_train`` and ``x_test`` restricted to the selected columns.
    """
    if feature_idx is None:
        feature_idx = top_feature

    # Drop the unwanted features in both the training and the test set
    return x_train[:, feature_idx], x_test[:, feature_idx]

In [5]:
# Load the training and the test split
ls_train, ls_test = (
    skd.load_files('./data/lingspam_public/lemm_stop/train'),
    skd.load_files('./data/lingspam_public/lemm_stop/test'),
)

# Binary and term-count features for the current selection.
# NOTE(review): both helpers read the notebook-level top_feature, which at this
# point is whatever the previously executed cell left behind.
(x_train_bf, x_test_bf), (x_train_tf, x_test_tf) = (
    binary_feature(ls_train, ls_test),
    term_frequency(ls_train, ls_test),
)

Code for applying the classifiers.

In [6]:
def spam_classifiers(x_train_bf, x_train_tf, x_test_tf, x_test_bf, ls_test, y_train=None):
    """Train three Naive Bayes classifiers and predict the test set with each.

    Parameters
    ----------
    x_train_bf, x_test_bf
        Binary feature matrices of the training and the test emails.
    x_train_tf, x_test_tf
        Term-count feature matrices of the training and the test emails.
    ls_test
        Unused; kept so existing calls keep working.
    y_train : array-like, optional
        Training labels. When omitted, the notebook-level ``ls_train.target``
        is used.

    Returns
    -------
    y_predict_B_BF : predictions of the Bernoulli NB on binary features
    y_predict_M_BF : predictions of the multinomial NB on binary features
    y_predict_M_TF : predictions of the multinomial NB on term counts
    mNomTF : the fitted multinomial NB trained on term counts
    """
    if y_train is None:
        y_train = ls_train.target

    # Multinomial NB with TF features
    mNomTF = sklearn.naive_bayes.MultinomialNB()
    mNomTF.fit(x_train_tf, y_train)

    # Multinomial NB with BF features
    mNomBF = sklearn.naive_bayes.MultinomialNB()
    mNomBF.fit(x_train_bf, y_train)

    # Bernoulli NB with BF features
    bNolNB = sklearn.naive_bayes.BernoulliNB()
    bNolNB.fit(x_train_bf, y_train)

    # Predict on the testing data
    y_predict_M_TF = mNomTF.predict(x_test_tf)
    y_predict_M_BF = mNomBF.predict(x_test_bf)
    y_predict_B_BF = bNolNB.predict(x_test_bf)

    return y_predict_B_BF, y_predict_M_BF, y_predict_M_TF, mNomTF

In [7]:
# Train the three classifiers on the current feature matrices and keep their
# test-set predictions together with the fitted multinomial term-count model.
(y_predict_B_BF,
 y_predict_M_BF,
 y_predict_M_TF,
 mNomTF) = spam_classifiers(x_train_bf, x_train_tf, x_test_tf, x_test_bf, ls_test)

Code for calculating precision and recall.

In [8]:
from sklearn.metrics import precision_recall_fscore_support

def pre_rec(yts, yhat):
    """Return (precision, recall) of the predictions yhat against the true labels yts.

    Scores are computed with average='binary'.
    """
    scores = precision_recall_fscore_support(yts, yhat, average='binary')
    # scores is (precision, recall, f-score, support); only the first two are reported
    return scores[0], scores[1]

### Final code for lab results

In [9]:
N = [10, 100, 1000]

# One row per classifier, one column per number of features
pre_matrix = np.zeros([3, len(N)])
rec_matrix = np.zeros([3, len(N)])

# feature_selection scores every feature on each call, so rank once for the
# largest N; the selection for a smaller N is a prefix of that ranking.
top_ranked, drop_feature, name_feature = feature_selection(max(N), x_train)

for i, n in enumerate(N):
    # binary_feature and term_frequency pick up the notebook-level top_feature
    top_feature = top_ranked[:n]

    x_train_bf, x_test_bf = binary_feature(ls_train, ls_test)
    x_train_tf, x_test_tf = term_frequency(ls_train, ls_test)

    y_predict_B_BF, y_predict_M_BF, y_predict_M_TF, mNomTF = spam_classifiers(x_train_bf,
                                                                x_train_tf,
                                                                x_test_tf,
                                                                x_test_bf,
                                                                ls_test)
    # Remember the 1000-feature selection for the eval-set prediction
    if n == 1000:
        eval_feature_1000 = top_feature

    # Multinomial NB with TF
    pre1, rec1 = pre_rec(ls_test.target, y_predict_M_TF)

    # Multinomial NB with BF
    pre2, rec2 = pre_rec(ls_test.target, y_predict_M_BF)

    # Bernoulli NB classifier with BF
    pre3, rec3 = pre_rec(ls_test.target, y_predict_B_BF)

    pre_matrix[:, i] = [pre1, pre2, pre3]
    rec_matrix[:, i] = [rec1, rec2, rec3]

Write precision and recall matrices to dataframes.

In [10]:
# Row labels (one per classifier) and column labels (number of selected features)
classifier = ['Multinomial NB with TF',
              'Multinomial NB with BF',
              'Bernoulli NB with BF']
featurenumber = [10, 100, 1000]

df_pre = pd.DataFrame(data=pre_matrix,
                      index=classifier,
                      columns=featurenumber)
# The header ends up in the exported LaTeX table, so it must be spelled correctly
df_pre.columns.name = 'Number of Features'
df_pre.name = 'Precision Table'

# Export the table for the report
with open('precisiontable.tex', 'w') as tex_file:
    tex_file.write(df_pre.to_latex())

df_pre.head()

Number of Featrues,10,100,1000
Multinomial NB with TF,0.851852,0.959184,1.0
Multinomial NB with BF,0.888889,0.977778,1.0
Bernoulli NB with BF,0.869565,0.939394,1.0


In [11]:
# Recall of every classifier (rows) for every number of features (columns)
df_rec = pd.DataFrame(data=rec_matrix,
                      index=classifier,
                      columns=featurenumber)
# The header ends up in the exported LaTeX table, so it must be spelled correctly
df_rec.columns.name = 'Number of Features'

# Export the table for the report
with open('recalltable.tex', 'w') as tex_file:
    tex_file.write(df_rec.to_latex())

df_rec.head()

Number of Featrues,10,100,1000
Multinomial NB with TF,0.938776,0.959184,0.938776
Multinomial NB with BF,0.816327,0.897959,0.938776
Bernoulli NB with BF,0.816327,0.632653,0.612245


### Prediction on Eval Set

In [12]:
# Import eval set
eval_set = sklearn.datasets.load_files('./eval', shuffle=False)

# Term frequencies (no idf weighting) of the full training matrix.
# NOTE(review): not used by the prediction below, which works on raw counts;
# kept in case later cells rely on x_train_tf.
tf_transformer_new = TfidfTransformer(use_idf=False)
x_train_tf = tf_transformer_new.fit_transform(x_train)

# Count the terms of the eval emails with the vocabulary that count_vect
# already learned from the training set (no re-fit needed)
eval_set_data = count_vect.transform(eval_set.data)

# Keep the 1000 features selected on the training set
eval_set_data = eval_set_data[:, eval_feature_1000]

# Train the multinomial NB on exactly those columns, so the model and the eval
# matrix always agree, whatever selection the experiment loop handled last
eval_model = sklearn.naive_bayes.MultinomialNB()
eval_model.fit(x_train[:, eval_feature_1000], ls_train.target)

# Predict
eval_set_predict = eval_model.predict(eval_set_data)

# Save the result to a text file
np.savetxt("./eval/results_new.txt", eval_set_predict, newline="\r\n", fmt="%d")

print(eval_set_predict)

[0 0 1 1]
