In [1]:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support 

1. Read the data from the file
2. Separate the data into parameters and the corresponding labels
3. Convert the data and the lables into numpy arrays

In [2]:
data_file = open('spambase/spambase.data', 'r')

data = []
label = []

# load data from comma separated text file into lists in python
# while separating the parameters and the label of the data
for i in range(4061):
    data.append(data_file.readline().split(","))
    label.append(data[i][57])
    data[i] = data[i][:57]

# convert python lists to numpy array
data = np.asarray(data)
data = data.astype(float)
    
label = np.asarray(label)
label = label.astype(int)


4. Shuffle the data and split it for k-fold cross validation
    (Shuffling the data is necessary as the data contains all samples with label 1 initially followed by all        samples with label 0)
5. Use a linear SVM model object for training the model to classify samples as spam or not spam
6. Train the svm model on 4 folds keeping 1 fold for testing using svm.fit
7. Test the trained model on the test data
8. Use the confusion matrix to find the rates
    (False positive rate is the fraction of non-spam mails classified as spam,
     False negative rate is the fraction of spam mails classified as non-spam,
     Error rate is the total fraction of mails that are misclassified(either spam or non-spam))
9. Evaluate the f1-score

In [9]:
# Using linear SVM for email classification


k_fold = KFold(n_splits=5, shuffle=True)
k_fold.get_n_splits(data)
svm = []

fpr = []
fnr = []
error = []
f1 = []


for k, (train, test) in enumerate(k_fold.split(data, label)):
    
    svm.append(SVC(kernel='linear', C = 0.075, random_state=101))
    svm[k].fit(data[train], label[train])
    y_test = label[test]
    y_pred = svm[k].predict(data[test])
    
    conf_matrix = confusion_matrix(y_test,y_pred)  
    evaluation = precision_recall_fscore_support(y_test, y_pred)
    
    error.append(float( 1.0*( conf_matrix[0][1]+conf_matrix[1][0] ) / len(y_pred)))
    fpr.append(1.0 * conf_matrix[0][1]/(conf_matrix[0][1]+conf_matrix[1][1]))
    fnr.append(1.0 * conf_matrix[1][0]/(conf_matrix[1][0]+conf_matrix[0][0]))
    f1.append( (evaluation[2][0]+evaluation[2][1])/2 )
    

1. Print the evaluation results per fold

In [11]:
print 'Fold \t False Postive Rate\t False Negative Rate\t ErrorRate \tF1-score'

for i in range(5):
    print i, '\t\t', round(fpr[i], 5), '\t\t', round(fnr[i], 5), '\t', \
    round(error[i], 5), ' \t', round( f1[i], 5 )  

print '------------------------------------------------------------------------------'
print 'avg' '\t\t', round(sum(fpr)/5, 5), '\t\t', round(sum(fnr)/5, 5), ' \t',\
round(sum(error)/5, 5), ' \t', round(sum(f1)/5, 5)

Fold 	 False Postive Rate	 False Negative Rate	 ErrorRate 	F1-score
0 		0.07799 		0.05286 	0.06396  	0.93508
1 		0.06849 		0.06264 	0.06527  	0.93411
2 		0.08986 		0.07066 	0.07882  	0.91942
3 		0.05056 		0.07675 	0.06527  	0.93404
4 		0.07123 		0.06935 	0.0702  	0.92917
------------------------------------------------------------------------------
avg		0.07163 		0.06645  	0.0687  	0.93036
