In [1]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.datasets as skd
from scipy.sparse import csc_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import precision_recall_fscore_support

from sklearn import linear_model
from sklearn import naive_bayes

#Each sub-directory in the parent directory is assumed to contain documents from the same class
#I pre-processed the part1 (fold1) and part2 (fold2) of the lingspam dataset to place spam emails in one folder 
#and legit emails in another; you should do the same for the entire dataset, either manually or via a script. 
ls_train = skd.load_files('./data/lingspam_public/lemm_stop/train');
ls_test  = skd.load_files('./data/lingspam_public/lemm_stop/test');

#The count vectorizer classes fit_transform function generates a vocoabulary that contains each unique term in the dataset
#and outputs a sparse matrix tabulating term occurences
count_vect = CountVectorizer()
x_train = count_vect.fit_transform(ls_train.data)

#Since the vocabulary has already been learned, use the transform function to transform the test data using the same vocab
x_test = count_vect.transform(ls_test.data)

In [2]:
def feature_selection(N,x_train):
    # Prepare
    num_email   = x_train.shape[0]
    num_feature = x_train.shape[1]

    # Transport saved data from sparse matrix out
    x_train_data = x_train.toarray()
    x_train_ig   = np.zeros([num_feature])

    for i in range(num_feature):
        # Each colunm show the occurence of one feature in all emails
        feature_vector = x_train_data[:,i]
    
        # Reshape 
        feature_vector = feature_vector.reshape([num_email])
        
        # Calculate ig for features 
        x_train_ig [i] = mutual_info_score(feature_vector, ls_train.target)
        
    x_train_ig_sort = np.argsort(-x_train_ig)

    # Extract feature names
    name_feature = count_vect.get_feature_names()

    # Select N largest features' index
    top_feature  = np.array(x_train_ig_sort[:N])
    drop_feature = np.array(x_train_ig_sort[N:num_feature])
    
    return top_feature,drop_feature,name_feature

In [3]:
def binary_feature(ls_train):
    # Use the count vectorizer classes to get binary featrues
    # Set parmeter 'binary' to True, all non zero counts are set to 1
    count_vect_bf = CountVectorizer(binary=True)
    
    x_train_bf = count_vect_bf.fit_transform(ls_train.data)
    x_test_bf  = count_vect_bf.transform(ls_test.data)
    
    # Still drop the unwanted features in training set
    x_train_bf = x_train_bf[:, top_feature]
    x_test_bf  = x_test_bf[:, top_feature]
    
    return x_train_bf, x_test_bf

def term_frequency(ls_train):
    tf_transformer = TfidfTransformer(use_idf=False)

    x_train_tf = tf_transformer.fit_transform(x_train)
    x_test_tf = tf_transformer.transform(x_test)

    # Still drop the unwanted features in training set
    x_train_tf = x_train_tf[:, top_feature]
    x_test_tf  = x_test_tf[:, top_feature]
    
    return x_train_tf, x_test_tf

In [4]:
def pre_rec(yts, yhat):
    precision, recall, f1,_ = precision_recall_fscore_support(yts,
                                                              yhat,
                                                              average='binary')
    return precision,recall

In [6]:
# K-fold - split into 10 folds
# 100 top features
# Cross Validation
# Using binary feature

from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support
from sklearn.svm import SVC

top_feature, drop_feature, name_feature = feature_selection(100, x_train)

x_train_bf, x_test_bf = binary_feature(ls_train)
x_train_tf, x_test_tf = term_frequency(ls_train)

count_vect = CountVectorizer()
x_train = count_vect.fit_transform(ls_train.data)

nfold = 10
kf = KFold(n_splits=nfold,shuffle=True)

acc = []
pre = []
rec = []
svc = SVC(kernel="sigmoid", C=10, verbose=10)

for train, test in kf.split(x_train_bf):
    Xtr = x_train_bf[train,:]
    Xts = x_train_bf[test,:]
    ytr = ls_train.target[train]
    yts = ls_train.target[test]
    
    svc.fit(Xtr,ytr)
    yhat = svc.predict(Xts)
    
    acci = np.mean(yhat == yts)
    prei,reci = pre_rec(yts,yhat)
    
    acc.append(acci)
    pre.append(prei)
    rec.append(reci)

acc_mean = np.mean(acc)
pre_mean = np.mean(pre)
rec_mean = np.mean(rec)

print('\n')

print('Accuracy  = {0:f}'.format(acc_mean))
print('Precision = {0:f}'.format(pre_mean))
print('Recall    = {0:f}'.format(rec_mean))

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

Accuracy  = 0.979254
Precision = 0.985364
Recall    = 0.888575
