In [1]:
# importing required packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import tensorflow as tf
from sklearn import preprocessing

In [37]:
#Loading the dataset
def loadData():
    data = []
    file = open("spambase.data.txt","r")
    lines = file.readlines()
    for line in lines:
        line = line.split(',')
        line_list = list(map(lambda x:float(x), line))
        data.append(line_list)
    return data

# function to get different measures like Overall Error, False Positive Rate, False Negative Rate
def getErrorRates(clf,X_test,y_test,y_predict):
    CM = confusion_matrix(y_test, y_predict)
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    error = np.mean(y_test!=y_predict)
    fpr = FP/(FP+TN)
    fnr = FN/(TP+FN)
    acc = clf.score(X_test,y_test)
    return error,fpr,fnr,acc

In [3]:
# loading the dataset
data = loadData()

In [4]:
# Coluns of the dataset
columns = list(range(1,58))+['label']
df = pd.DataFrame.from_records(data, columns=columns)
X = df.iloc[:,0:-1] # total number of features
y = df.iloc[:,-1] # Corresponding Labels

In [5]:
# Standardizing the dataset
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [48]:
# Performing K-Fold Cross Validation
skf = StratifiedKFold(n_splits=10)
skf.get_n_splits(X, y)

# place holders for storing the scores of different metrics over all iterations
LRError, LRFPR, LRFNR, LRaccuracy, LRvalues = [], [], [], [], []
SVCError, SVCFPR, SVCFNR, SVCaccuracy, SVCvalues = [], [], [], [], []
NBError, NBFPR, NBFNR, NBaccuracy, NBvalues = [], [], [], [], []
KNNError, KNNFPR, KNNFNR, KNNaccuracy, KNNvalues = [], [], [], [], []
i = 1
for train_index, test_index in skf.split(X, y):  
    # Splitting the data into Train and Test sets for each Iteration/Fold
    X_train, X_test = X[train_index,:], X[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Tried Logistic Regression with L2 Loss also but L1 Loss gives lower False Positive Rate which is very much desired
    LRclf = LogisticRegression(penalty="l1",max_iter=1000,solver="liblinear").fit(X_train,y_train)
    y_predict_lr = LRclf.predict(X_test)
    
    # SVC model
    svc = SVC(gamma="auto",kernel="rbf").fit(X_train,y_train)
    y_predict_svc = svc.predict(X_test)
    
    # Naive Bayes model
    gnb = GaussianNB().fit(X_train,y_train)
    y_predict_gnb = gnb.predict(X_test)
    
    # K-NN model
    knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
    y_predict_knn = knn.predict(X_test)
    
    # Calling the getErrorRates function to get Overall Error, FPR, FNR and Accuracy scores
    LRerror,LRfpr,LRfnr,LRacc = getErrorRates(LRclf,X_test,y_test,y_predict_lr)
    SVCerror,SVCfpr,SVCfnr,SVCacc = getErrorRates(svc,X_test,y_test,y_predict_svc)
    NBerror, NBfpr,NBfnr,NBacc = getErrorRates(gnb,X_test,y_test,y_predict_gnb)
    KNNerror, KNNfpr,KNNfnr,KNNacc = getErrorRates(knn,X_test,y_test,y_predict_knn)
    
    LRError.append(LRerror*100)
    LRFPR.append(LRfpr*100)
    LRFNR.append(LRfnr*100)
    LRaccuracy.append(LRacc*100)
    LRvalues.append([LRfpr*100,LRfnr*100, LRerror*100,"Fold-"+str(i),LRacc*100])
    
    SVCError.append(SVCerror*100)
    SVCFPR.append(SVCfpr*100)
    SVCFNR.append(SVCfnr*100)
    SVCaccuracy.append(SVCacc*100)
    SVCvalues.append([SVCfpr*100,SVCfnr*100, SVCerror*100,"Fold-"+str(i),SVCacc*100])
    
    NBError.append(NBerror*100)
    NBFPR.append(NBfpr*100)
    NBFNR.append(NBfnr*100)
    NBaccuracy.append(NBacc*100)
    NBvalues.append([NBfpr*100,NBfnr*100, NBerror*100,"Fold-"+str(i),NBacc*100])
    
    KNNError.append(KNNerror*100)
    KNNFPR.append(KNNfpr*100)
    KNNFNR.append(KNNfnr*100)
    KNNaccuracy.append(KNNacc*100)
    KNNvalues.append([KNNfpr*100, KNNfnr*100, KNNerror*100,"Fold-"+str(i), KNNacc*100])
    
    i+=1
    
# Creating a dataframe for all four models showing one row per fold showing false positive, false negative, and overall 
# error rates, and add one final row corresponding to the average error rates across all folds.
LRvalues.append([np.mean(LRFPR),np.mean(LRFNR),np.mean(LRError),"Avg(Folds)",np.mean(LRaccuracy)])
SVCvalues.append([np.mean(SVCFPR),np.mean(SVCFNR),np.mean(SVCError),"Avg(Folds)",np.mean(SVCaccuracy)])
NBvalues.append([np.mean(NBFPR),np.mean(NBFNR),np.mean(NBError),"Avg(Folds)",np.mean(NBaccuracy)])
KNNvalues.append([np.mean(KNNFPR),np.mean(KNNFNR),np.mean(KNNError),"Avg(Folds)",np.mean(KNNaccuracy)])

columns = ["False Positive Rate (%)", "False Negative Rate (%)", "Overall Error Rate (%)","Folds","Accuracy (%)"]
LRscores = pd.DataFrame.from_records(LRvalues, columns=columns)
LRscores = LRscores.set_index("Folds")
SVCscores = pd.DataFrame.from_records(SVCvalues, columns=columns)
SVCscores = SVCscores.set_index("Folds")
NBscores = pd.DataFrame.from_records(NBvalues, columns=columns)
NBscores = NBscores.set_index("Folds")
KNNscores = pd.DataFrame.from_records(KNNvalues, columns=columns)
KNNscores = KNNscores.set_index("Folds")

In [19]:
print("Logistic Regression Results:")
LRscores

Logistic Regression Results:


Unnamed: 0_level_0,False Positive Rate (%),False Negative Rate (%),Overall Error Rate (%),Accuracy (%)
Folds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fold-1,3.225806,13.186813,7.158351,92.841649
Fold-2,4.659498,13.736264,8.24295,91.75705
Fold-3,5.376344,12.087912,8.02603,91.97397
Fold-4,4.301075,8.287293,5.869565,94.130435
Fold-5,4.659498,8.839779,6.304348,93.695652
Fold-6,7.885305,5.524862,6.956522,93.043478
Fold-7,1.433692,9.392265,4.565217,95.434783
Fold-8,3.942652,9.392265,6.086957,93.913043
Fold-9,15.467626,14.917127,15.250545,84.749455
Fold-10,8.273381,20.441989,13.071895,86.928105


In [8]:
print("Support Vector Classifier Results:")
SVCscores

Unnamed: 0_level_0,False Positive Rate (%),False Negative Rate (%),Overall Error Rate (%),Accuracy (%)
Folds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fold-1,5.376344,10.989011,7.592191,92.407809
Fold-2,4.659498,8.241758,6.073753,93.926247
Fold-3,3.584229,10.43956,6.290672,93.709328
Fold-4,3.584229,10.497238,6.304348,93.695652
Fold-5,3.584229,8.287293,5.434783,94.565217
Fold-6,4.301075,4.972376,4.565217,95.434783
Fold-7,0.358423,10.497238,4.347826,95.652174
Fold-8,2.867384,9.944751,5.652174,94.347826
Fold-9,8.633094,14.364641,10.893246,89.106754
Fold-10,6.115108,23.756906,13.071895,86.928105


In [9]:
print("Naive Bayes Results:")
NBscores

Unnamed: 0_level_0,False Positive Rate (%),False Negative Rate (%),Overall Error Rate (%),Accuracy (%)
Folds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fold-1,23.297491,3.846154,15.618221,84.381779
Fold-2,21.146953,2.197802,13.665944,86.334056
Fold-3,17.921147,2.197802,11.713666,88.286334
Fold-4,22.580645,1.657459,14.347826,85.652174
Fold-5,15.770609,4.972376,11.521739,88.478261
Fold-6,27.956989,1.104972,17.391304,82.608696
Fold-7,27.240143,2.762431,17.608696,82.391304
Fold-8,18.637993,6.629834,13.913043,86.086957
Fold-9,57.194245,5.524862,36.819172,63.180828
Fold-10,39.568345,11.60221,28.540305,71.459695


In [49]:
print("K-Nearest Neighbor Results:")
KNNscores

K-Nearest Neighbor Results:


Unnamed: 0_level_0,False Positive Rate (%),False Negative Rate (%),Overall Error Rate (%),Accuracy (%)
Folds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fold-1,5.734767,20.879121,11.713666,88.286334
Fold-2,5.734767,13.186813,8.67679,91.32321
Fold-3,6.09319,14.285714,9.327549,90.672451
Fold-4,5.376344,13.259669,8.478261,91.521739
Fold-5,5.017921,12.707182,8.043478,91.956522
Fold-6,8.960573,7.734807,8.478261,91.521739
Fold-7,1.433692,12.707182,5.869565,94.130435
Fold-8,7.168459,12.707182,9.347826,90.652174
Fold-9,15.107914,16.574586,15.686275,84.313725
Fold-10,10.071942,25.414365,16.122004,83.877996


In [57]:
# Performing K-Fold Cross Validation
skf = StratifiedKFold(n_splits=10)
skf.get_n_splits(X, y)

# place holders for storing the scores of different metrics over all iterations
NNError, NNaccuracy, NNvalues = [], [], []

i = 1
for train_index, test_index in skf.split(X, y): 
    print("Fold"+str(i))
    # Splitting the data into Train and Test sets for each Iteration/Fold
    X_train, X_test = X[train_index,:], X[test_index,:]
    y_train, y_test = y.iloc[train_index], np.array(y.iloc[test_index])
    
    # Defining the architecture of the model
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(64, activation=tf.nn.elu,input_dim=57))
    model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu,input_dim=64))
    model.add(tf.keras.layers.Dense(1, activation=tf.nn.softmax,input_dim=128))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # training the model
    model.fit(X_train, y_train, epochs=5)
    
    # predicting the class for test data
    y_predict_nn = model.predict(X_test)
    
    NNloss, NNacc = model.evaluate(X_test,y_test)
    
    NNError.append(NNerror*100)
    NNaccuracy.append(NNacc*100)
    NNvalues.append([NNerror*100,"Fold-"+str(i),NNacc*100])
    
    i+=1
    
# Creating a dataframe for all four models showing one row per fold showing overall 
# error rates, and add one final row corresponding to the average error rates across all folds.
NNvalues.append([np.mean(NNError),"Avg(Folds)",np.mean(NNaccuracy)])

columns = ["Overall Error Rate (%)","Folds","Accuracy (%)"]
NNscores = pd.DataFrame.from_records(NNvalues, columns=columns)
NNscores = NNscores.set_index("Folds")

Fold1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold2
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold3
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold6
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold7
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold8
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold9
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [58]:
print("Neural Network Results:")
NNscores

Neural Network Results:


Unnamed: 0_level_0,Overall Error Rate (%),Accuracy (%)
Folds,Unnamed: 1_level_1,Unnamed: 2_level_1
Fold-1,60.566449,39.479393
Fold-2,60.566449,39.479393
Fold-3,60.566449,39.479393
Fold-4,60.566449,39.347826
Fold-5,60.566449,39.347826
Fold-6,60.566449,39.347826
Fold-7,60.566449,39.347826
Fold-8,60.566449,39.347826
Fold-9,60.566449,39.433551
Fold-10,60.566449,39.433551


In [55]:
values = []
values.append([np.mean(SVCFPR),np.mean(SVCFNR),np.mean(SVCError),np.mean(SVCaccuracy),"SVC"])
values.append([np.mean(LRFPR),np.mean(LRFNR),np.mean(LRError),np.mean(LRaccuracy),"Logistic Regression"])
values.append([np.mean(KNNFPR),np.mean(KNNFNR),np.mean(KNNError),np.mean(KNNaccuracy),"5-NN"])
values.append([np.mean(NBFPR),np.mean(NBFNR),np.mean(NBError),np.mean(NBaccuracy),"Naive Bayes"])
values.append(["","",np.mean(NNError),np.mean(NNaccuracy),"Neural Network"])
columns = [" Avg False Positive Rate (%)", " Avg False Negative Rate (%)", "Avg Overall Error Rate (%)","Avg Accuracy (%)","Model"]
scores = pd.DataFrame.from_records(values, columns=columns)
scores = scores.set_index("Model")

## Summary

In [56]:
scores

Unnamed: 0_level_0,Avg False Positive Rate (%),Avg False Negative Rate (%),Avg Overall Error Rate (%),Avg Accuracy (%)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SVC,4.30636,11.1991,7.022611,92.977389
Logistic Regression,5.88665,11.5807,8.131499,91.868501
5-NN,7.06996,14.9457,10.174367,89.825633
Naive Bayes,27.1315,4.24959,18.113992,81.886008
Neural Network,,,60.566449,39.404465


## Observations:
### 1. Standardizing the data improves the overall accuracy and reduces the False Positive Rate as well.
### 2. Logistic Regression with L1 penalty is performing better than Logistic Regression with L2 penalty.
### 3. Neural Network is not giving good results when compared to other models. 