# Module 5 : Performance measurements of Naive Bayes and K-Nearest Neighbor

In [1]:
#import library packages
import pandas as p
import matplotlib.pyplot as plt
import seaborn as s
import numpy as n

In [2]:
#Load given dataset
data = p.read_csv('spam.csv', encoding='latin-1')
del data["Unnamed: 2"]
del data["Unnamed: 3"]
del data["Unnamed: 4"]
data = data.rename(columns={"v1":"label", "v2":"text"})
df=data.dropna()

In [3]:
from sklearn.preprocessing import LabelEncoder
var_mod = ['text']
le = LabelEncoder()
for i in var_mod:
    df[i] = le.fit_transform(df[i]).astype(str)

In [4]:
#According to the cross-validated MCC scores, the random forest is the best-performing model, so now let's evaluate its performance on the test set.
from sklearn.metrics import confusion_matrix, classification_report, matthews_corrcoef, cohen_kappa_score, accuracy_score, average_precision_score, roc_auc_score

In [5]:
X = df.drop(labels='label', axis=1)
#Response variable
y = df.loc[:,'label']    

In [6]:
del df 
#We'll use a test size of 30%. We also stratify the split on the response variable, which is very important to do because there are so few fraudulent transactions.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [7]:
#for our convienient we delete X,y variable for differentiate confusion
del X, y

In [8]:
# Prevent view warnings
X_train.is_copy = False
X_test.is_copy = False

Naive Bayes:

In [9]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train,y_train)

predictR = gnb.predict(X_test)

print("")
print('Classification report of Naive Bayes Results:')
print("")

print(classification_report(y_test,predictR))
x = (accuracy_score(y_test,predictR)*100)

print('Accuracy result of Naive Bayes is:', x)
print("")
cm1=confusion_matrix(y_test,predictR)
print('Confusion Matrix result of Naive Bayes is:\n',cm1)
print("")
sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
print('Sensitivity : ', sensitivity1 )
print("")
specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
print('Specificity : ', specificity1)
print("")



Classification report of Naive Bayes Results:

              precision    recall  f1-score   support

         ham       0.87      1.00      0.93      1448
        spam       0.00      0.00      0.00       224

    accuracy                           0.87      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.87      0.80      1672

Accuracy result of Naive Bayes is: 86.60287081339713

Confusion Matrix result of Naive Bayes is:
 [[1448    0]
 [ 224    0]]

Sensitivity :  1.0

Specificity :  0.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
TN = cm1[0][0]
FN = cm1[1][0]
TP = cm1[1][1]
FP = cm1[0][1]
print("True Positive :",TP)
print("True Negative :",TN)
print("False Positive :",FP)
print("False Negative :",FN)
print("")
TPR = TP/(TP+FN)
TNR = TN/(TN+FP)
FPR = FP/(FP+TN)
FNR = FN/(TP+FN)
print("True Positive Rate :",TPR)
print("True Negative Rate :",TNR)
print("False Positive Rate :",FPR)
print("False Negative Rate :",FNR)
print("")
PPV = TP/(TP+FP)
NPV = TN/(TN+FN)
print("Positive Predictive Value :",PPV)
print("Negative predictive value :",NPV)

True Positive : 0
True Negative : 1448
False Positive : 0
False Negative : 224

True Positive Rate : 0.0
True Negative Rate : 1.0
False Positive Rate : 0.0
False Negative Rate : 1.0

Positive Predictive Value : nan
Negative predictive value : 0.8660287081339713


  PPV = TP/(TP+FP)


K-Nearest Neighbor:

In [11]:
from sklearn.neighbors import KNeighborsClassifier
knnc = KNeighborsClassifier()

knnc.fit(X_train,y_train)

predictR = knnc.predict(X_test)

print("")
print('Classification report of K-Nearest Neighbor Results:')
print("")

print(classification_report(y_test,predictR))
x = (accuracy_score(y_test,predictR)*100)

print('Accuracy result of K-Nearest Neighbor is:', x)
print("")
cm2=confusion_matrix(y_test,predictR)
print('Confusion Matrix result of K-Nearest Neighbor is:\n',cm2)
print("")
sensitivity1 = cm2[0,0]/(cm2[0,0]+cm2[0,1])
print('Sensitivity : ', sensitivity1 )
print("")
specificity1 = cm2[1,1]/(cm2[1,0]+cm2[1,1])
print('Specificity : ', specificity1)
print("")



Classification report of K-Nearest Neighbor Results:

              precision    recall  f1-score   support

         ham       0.93      0.97      0.95      1448
        spam       0.73      0.52      0.61       224

    accuracy                           0.91      1672
   macro avg       0.83      0.74      0.78      1672
weighted avg       0.90      0.91      0.90      1672

Accuracy result of K-Nearest Neighbor is: 91.02870813397129

Confusion Matrix result of K-Nearest Neighbor is:
 [[1406   42]
 [ 108  116]]

Sensitivity :  0.9709944751381215

Specificity :  0.5178571428571429



In [12]:
TN = cm2[0][0]
FN = cm2[1][0]
TP = cm2[1][1]
FP = cm2[0][1]
print("True Positive :",TP)
print("True Negative :",TN)
print("False Positive :",FP)
print("False Negative :",FN)
print("")
TPR = TP/(TP+FN)
TNR = TN/(TN+FP)
FPR = FP/(FP+TN)
FNR = FN/(TP+FN)
print("True Positive Rate :",TPR)
print("True Negative Rate :",TNR)
print("False Positive Rate :",FPR)
print("False Negative Rate :",FNR)
print("")
PPV = TP/(TP+FP)
NPV = TN/(TN+FN)
print("Positive Predictive Value :",PPV)
print("Negative predictive value :",NPV)

True Positive : 116
True Negative : 1406
False Positive : 42
False Negative : 108

True Positive Rate : 0.5178571428571429
True Negative Rate : 0.9709944751381215
False Positive Rate : 0.029005524861878452
False Negative Rate : 0.48214285714285715

Positive Predictive Value : 0.7341772151898734
Negative predictive value : 0.9286657859973579
