### Importing Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing Dataset

In [None]:
# spambase.data is a plain comma-separated file WITHOUT a header row.
# header=None is required: with the default (header='infer') pandas uses the
# first e-mail record as column names and silently drops one sample.
dataset = pd.read_csv('spambase.data', header=None)
X = dataset.iloc[:, :-1].values  # every column except the last: feature matrix
y = dataset.iloc[:, -1].values   # last column: 0/1 class label

### Splitting into Test and Training Dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the samples for testing; the fixed seed makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Sanity check: training features straight from the split, before any scaling.
print(X_train)

[[0.000e+00 6.200e-01 1.240e+00 ... 4.592e+00 1.210e+02 3.490e+02]
 [0.000e+00 5.500e-01 5.500e-01 ... 4.875e+00 1.400e+02 1.950e+02]
 [0.000e+00 0.000e+00 6.200e-01 ... 1.741e+00 1.400e+01 1.550e+02]
 ...
 [0.000e+00 0.000e+00 1.900e-01 ... 3.626e+00 4.400e+01 9.900e+02]
 [0.000e+00 0.000e+00 0.000e+00 ... 2.000e+00 7.000e+00 2.600e+01]
 [0.000e+00 2.000e-01 2.000e-01 ... 2.797e+00 1.270e+02 5.120e+02]]


In [None]:
# Sanity check: training labels (binary 0/1 vector).
print(y_train)

[1 1 1 ... 1 0 0]


In [None]:
# Sanity check: held-out test features, before any scaling.
print(X_test)

[[  0.      0.      0.81  ...  13.5    86.    189.   ]
 [  1.16    0.      0.    ...   1.476   7.     31.   ]
 [  0.      0.      0.    ...   4.     11.     68.   ]
 ...
 [  0.      0.      0.    ...   1.228   5.     43.   ]
 [  0.      0.      0.    ...   2.      7.     12.   ]
 [  0.      0.      0.    ...   1.166   2.      7.   ]]


In [None]:
# Sanity check: held-out test labels (binary 0/1 vector).
print(y_test)

[1 0 0 ... 0 0 0]


### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to both splits so no information
# from the test set leaks into the scaling.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Training features after standardization.
print(X_train)

[[-0.3447511   0.30370474  1.8914156  ... -0.01832036  0.54249063
   0.12664859]
 [-0.3447511   0.25076549  0.52763706 ... -0.01016034  0.68785706
  -0.15485277]
 [-0.3447511  -0.16518583  0.66599141 ... -0.10052613 -0.27615185
  -0.22797001]
 ...
 [-0.3447511  -0.16518583 -0.18389956 ... -0.04617402 -0.04662592
   1.29835231]
 [-0.3447511  -0.16518583 -0.55943278 ... -0.09305812 -0.3297079
  -0.4637731 ]
 [-0.3447511  -0.01393081 -0.16413466 ... -0.07007741  0.58839582
   0.42460133]]


In [None]:
# Test features after applying the scaler that was fitted on the training split.
print(X_test)

[[-0.3447511  -0.16518583  1.04152463 ...  0.23853301  0.27471038
  -0.16582036]
 [ 3.32924005 -0.16518583 -0.55943278 ... -0.10816714 -0.3297079
  -0.45463344]
 [-0.3447511  -0.16518583 -0.55943278 ... -0.0353901  -0.29910444
  -0.387     ]
 ...
 [-0.3447511  -0.16518583 -0.55943278 ... -0.11531798 -0.34500963
  -0.43269827]
 [-0.3447511  -0.16518583 -0.55943278 ... -0.09305812 -0.3297079
  -0.48936413]
 [-0.3447511  -0.16518583 -0.55943278 ... -0.11710569 -0.36796222
  -0.49850378]]


In [None]:
# Results registry: maps a model label to [fpr, fnr, err, acc]; each
# evaluation cell below adds its own entry and the final table reads it.
res = dict()

## Naive Bayes

### Training model on training set

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with default hyper-parameters.
# NOTE: the name `classifier` is reused by every model section below.
classifier = GaussianNB()
# Bare last expression: the notebook displays the fitted estimator's repr.
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

### Predicting Test results

In [None]:
y_pred = classifier.predict(X_test)
# Side-by-side view: column 0 = predicted label, column 1 = true label.
print(np.concatenate((y_pred.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[1 1]
 [0 0]
 [0 0]
 ...
 [1 0]
 [1 0]
 [1 0]]


### Making confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# confusion_matrix(y_true, y_pred): rows are true classes, columns predicted.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix : ")
print(cm)
acc = accuracy_score(y_test, y_pred)
print("\nAccuracy : ", acc)

# Name the four cells once instead of indexing the matrix repeatedly.
tn, fp = cm[0, 0], cm[0, 1]
fn, tp = cm[1, 0], cm[1, 1]

fpr = fp / (fp + tn)                    # false-positive rate
fnr = fn / (fn + tp)                    # false-negative rate
err = (fp + fn) / (tn + fp + fn + tp)   # overall misclassification rate

res["Naive Bayes"] = [fpr, fnr, err, acc]

Confusion Matrix : 
[[502 190]
 [ 12 446]]

Accuracy :  0.8243478260869566


## KNN Model

### Training model on training set

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 5 nearest neighbours; the Minkowski metric with p = 2 is the Euclidean distance.
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
# Bare last expression: the notebook displays the fitted estimator's repr.
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

### Predicting Test results

In [None]:
y_pred = classifier.predict(X_test)
# Side-by-side view: column 0 = predicted label, column 1 = true label.
print(np.concatenate((y_pred.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[1 1]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


### Making confusion Matrix

In [None]:
# confusion_matrix(y_true, y_pred): rows are true classes, columns predicted.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix : ")
print(cm)
acc = accuracy_score(y_test, y_pred)
print("\nAccuracy : ", acc)

# Name the four cells once instead of indexing the matrix repeatedly.
tn, fp = cm[0, 0], cm[0, 1]
fn, tp = cm[1, 0], cm[1, 1]

fpr = fp / (fp + tn)                    # false-positive rate
fnr = fn / (fn + tp)                    # false-negative rate
err = (fp + fn) / (tn + fp + fn + tp)   # overall misclassification rate

res["KNN"] = [fpr, fnr, err, acc]

Confusion Matrix : 
[[655  37]
 [ 64 394]]

Accuracy :  0.9121739130434783


## SVM Kernel

### Training model on training set

In [None]:
from sklearn.svm import SVC
# Support vector classifier with an RBF kernel; other hyper-parameters are left at their defaults.
classifier = SVC(kernel = 'rbf', random_state = 0)
# Bare last expression: the notebook displays the fitted estimator's repr.
classifier.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

### Predicting Test results

In [None]:
y_pred = classifier.predict(X_test)
# Side-by-side view: column 0 = predicted label, column 1 = true label.
print(np.concatenate((y_pred.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[1 1]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


### Making confusion Matrix

In [None]:
# confusion_matrix(y_true, y_pred): rows are true classes, columns predicted.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix : ")
print(cm)
acc = accuracy_score(y_test, y_pred)
print("\nAccuracy : ", acc)

# Name the four cells once instead of indexing the matrix repeatedly.
tn, fp = cm[0, 0], cm[0, 1]
fn, tp = cm[1, 0], cm[1, 1]

fpr = fp / (fp + tn)                    # false-positive rate
fnr = fn / (fn + tp)                    # false-negative rate
err = (fp + fn) / (tn + fp + fn + tp)   # overall misclassification rate

res["SVM Kernel"] = [fpr, fnr, err, acc]

Confusion Matrix : 
[[665  27]
 [ 42 416]]

Accuracy :  0.94


## Decision Tree Classifier

### Training model on training set

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree using the entropy split criterion; the seed fixes tie-breaking.
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
# Bare last expression: the notebook displays the fitted estimator's repr.
classifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

### Predicting Test results

In [None]:
y_pred = classifier.predict(X_test)
# Side-by-side view: column 0 = predicted label, column 1 = true label.
print(np.concatenate((y_pred.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[1 1]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


### Making confusion Matrix

In [None]:
# confusion_matrix(y_true, y_pred): rows are true classes, columns predicted.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix : ")
print(cm)
acc = accuracy_score(y_test, y_pred)
print("\nAccuracy : ", acc)

# Name the four cells once instead of indexing the matrix repeatedly.
tn, fp = cm[0, 0], cm[0, 1]
fn, tp = cm[1, 0], cm[1, 1]

fpr = fp / (fp + tn)                    # false-positive rate
fnr = fn / (fn + tp)                    # false-negative rate
err = (fp + fn) / (tn + fp + fn + tp)   # overall misclassification rate

res["Decision Tree"] = [fpr, fnr, err, acc]

Confusion Matrix : 
[[641  51]
 [ 45 413]]

Accuracy :  0.9165217391304348


## Random Forest Classifier

### Training model on training set

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Ensemble of 20 entropy-criterion trees; the seed makes the forest reproducible.
classifier = RandomForestClassifier(n_estimators = 20, criterion = 'entropy', random_state = 0)
# Bare last expression: the notebook displays the fitted estimator's repr.
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

### Predicting Test results

In [None]:
y_pred = classifier.predict(X_test)
# Side-by-side view: column 0 = predicted label, column 1 = true label.
print(np.concatenate((y_pred.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[1 1]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


### Making confusion Matrix

In [None]:
# confusion_matrix(y_true, y_pred): rows are true classes, columns predicted.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix : ")
print(cm)
acc = accuracy_score(y_test, y_pred)
print("\nAccuracy : ", acc)

# Name the four cells once instead of indexing the matrix repeatedly.
tn, fp = cm[0, 0], cm[0, 1]
fn, tp = cm[1, 0], cm[1, 1]

fpr = fp / (fp + tn)                    # false-positive rate
fnr = fn / (fn + tp)                    # false-negative rate
err = (fp + fn) / (tn + fp + fn + tp)   # overall misclassification rate

res["Random Forest"] = [fpr, fnr, err, acc]

Confusion Matrix : 
[[676  16]
 [ 31 427]]

Accuracy :  0.9591304347826087


## K-fold Cross-Validation

### Creating model on training set

In [None]:
from sklearn.model_selection import KFold 
from sklearn.linear_model import LogisticRegression
# Number of cross-validation folds.
k = 5
# Shuffle the samples before splitting; the fixed seed keeps fold membership reproducible.
kf = KFold(n_splits=k, shuffle = True, random_state=1)
# Logistic regression is the single model that is re-fitted and scored on every fold.
model = LogisticRegression(solver = 'liblinear')

### Training Model and computing Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

# Accuracy of each fold, averaged after the loop.
acc_score = []

# Iterate over the unscaled X/y so that each fold gets its own scaler.
for cnt, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_test = X[train_index, :], X[test_index, :]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler on this fold's training part only, so the validation
    # part does not influence the scaling statistics.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    acc = accuracy_score(y_test, pred_values)
    # Previously commented out, which left acc_score empty and made the
    # average below evaluate to 0.0.
    acc_score.append(acc)

    cm = confusion_matrix(y_test, pred_values)
    print("K = ", cnt, ", Confusion Matrix : ")
    print(cm)

    # Rows of cm are true classes, columns are predicted classes.
    fpr = cm[0][1]/(cm[0][1] + cm[0][0])
    fnr = cm[1][0]/(cm[1][0] + cm[1][1])
    err = (cm[0][1] + cm[1][0])/(cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])

    # One table row per fold; `cnt` is the 1-based fold index.
    res["K-fold (k=" + str(cnt) + ")"] = [fpr, fnr, err, acc]

# Mean accuracy over the folds actually evaluated.
avg_acc_score = sum(acc_score)/len(acc_score)

K =  1 , Confusion Matrix : 
[[539  24]
 [ 46 311]]
K =  2 , Confusion Matrix : 
[[518  17]
 [ 42 343]]
K =  3 , Confusion Matrix : 
[[520  26]
 [ 50 324]]
K =  4 , Confusion Matrix : 
[[535  41]
 [ 35 309]]
K =  5 , Confusion Matrix : 
[[541  27]
 [ 42 310]]


.

.

.

## Tabulation and Analysis

In [None]:
# Fixed-width text table: one header line, a blank line, then one row per
# entry of `res` in insertion order.
header_fmt = "{:<15}   {:<5}    {:<5}    {:<10}    {:<15}"
row_fmt = "{:<15}   {:.3f}    {:.3f}        {:.3f}       {:.3f}"

print(header_fmt.format('Model','FPR','FNR', 'Error Rate', 'Accuracy'))
print()
for label, (fpr, fnr, err, acc) in res.items():
    print(row_fmt.format(label, fpr, fnr, err, acc))

Model             FPR      FNR      Error Rate    Accuracy       

Naive Bayes       0.275    0.026        0.176       0.824
KNN               0.053    0.140        0.088       0.912
SVM Kernel        0.039    0.092        0.060       0.940
Decision Tree     0.074    0.098        0.083       0.917
Random Forest     0.023    0.068        0.041       0.959
K-fold (k=1)      0.043    0.129        0.076       0.924
K-fold (k=2)      0.032    0.109        0.064       0.936
K-fold (k=3)      0.048    0.134        0.083       0.917
K-fold (k=4)      0.071    0.102        0.083       0.917
K-fold (k=5)      0.048    0.119        0.075       0.925


The table above lists the false-positive rate (FPR), false-negative rate (FNR), error rate and accuracy of each model. Among the models compared, the Random Forest classifier gives the most reliable spam classification on the Spambase dataset: it has the lowest error rate and the highest accuracy.

---



The Random Forest classifier has an error rate of 0.041. In the 5-fold cross-validation of the logistic regression model, fold 2 has the lowest error rate, 0.064.