## Import Libraries

In [None]:
# Basics
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
import missingno as msno
from sklearn.preprocessing import StandardScaler, MinMaxScaler, binarize

# Model Selection
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier

# Metrics
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, accuracy_score

# Feature Selection
from sklearn.feature_selection import SelectKBest, chi2

# Warnings
import warnings as ws
ws.filterwarnings('ignore')

In [None]:
# Load Dataset
# Read the bank-note authentication CSV from the Kaggle input directory and
# preview its first rows (in a notebook the last expression is displayed).
data = pd.read_csv("/kaggle/input/bank-note-authentication-uci-data/BankNote_Authentication.csv")
data.head()

In [None]:
# Summary
def summary(data):
    """Build a per-column overview of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` (in the original column order) with
        the row count, number and percentage of missing values, number of
        unique values, dtype, and the min / quartiles / mean / max rounded
        to two decimals.  The descriptive statistics are NaN for columns
        that are not numeric.
    """
    n_rows = data.shape[0]
    na_counts = data.isna().sum()

    # Descriptive statistics are only meaningful for numeric columns; running
    # them on text columns raises, so restrict the computation up front.
    numeric = data.select_dtypes(include='number')

    def _stat(values):
        # Round, then align with the full column list so that non-numeric
        # columns show up as NaN instead of being dropped.
        return values.round(2).reindex(data.columns)

    df = {
        'Count': n_rows,
        'NA values': na_counts,
        '% NA': ((na_counts / n_rows) * 100).round(2),
        'Unique': data.nunique(),
        'Dtype': data.dtypes,
        'min': _stat(numeric.min()),
        '25%': _stat(numeric.quantile(.25)),
        '50%': _stat(numeric.quantile(.50)),
        'mean': _stat(numeric.mean()),
        '75%': _stat(numeric.quantile(.75)),
        'max': _stat(numeric.max()),
    }
    # Passing the index explicitly keeps the rows in the frame's column order.
    return pd.DataFrame(df, index=data.columns)

# Print the dataset dimensions, then show the per-column summary table
# (in a notebook the last expression is displayed).
print('Shape is :', data.shape)
summary(data)

There are no missing values in this dataset.

In [None]:
# Draw histograms of the dataset's columns on a 10x10 inch figure.
data.hist(figsize = (10,10))
plt.show()

In [None]:
# Feature columns: every column except the target.
col_names = data.drop('class', axis = 1).columns.tolist()

# Compare the distribution of each feature between the two classes, one
# panel per feature laid out on a single row.
# NOTE(review): the legend text assumes class 0 = fake and class 1 = original;
# confirm against the dataset description.
plt.figure(figsize = (10,3))
for i, col in enumerate(col_names):
    # Size the grid from the number of features instead of a fixed 4 so the
    # cell keeps working if the set of feature columns changes.
    plt.subplot(1, len(col_names), i + 1)
    plt.grid(True, alpha = 0.5)
    sns.kdeplot(data.loc[data['class'] == 0, col], label = 'Fake note')
    sns.kdeplot(data.loc[data['class'] == 1, col], label = 'Original note')
    plt.title('Class vs ' + col)
    # The curves carry labels, but they are only shown once a legend is drawn.
    plt.legend()
# Adjust the layout once, after every panel exists.
plt.tight_layout()
plt.show()

### Split Dataset

In [None]:
# Separate the target column from the feature matrix.
Y = data['class']
X = data.drop(columns = 'class')

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size = 0.3,
    random_state = 7,
)

In [None]:
# Candidate classifiers, each paired with the short label used in the
# comparison table and plots. All estimators use their default settings.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('CART', DecisionTreeClassifier()),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC()),
    ('RF', RandomForestClassifier()),
    ('ADA', AdaBoostClassifier()),
    ('GB', GradientBoostingClassifier()),
]

In [None]:
 def model_selection(x_train, y_train):
     """Cross-validate every entry of the module-level ``models`` list.

     Each ``(name, estimator)`` pair in ``models`` is scored with 10-fold
     cross-validation on accuracy and ROC AUC.  The per-fold scores are shown
     as two side-by-side box plots.

     Parameters
     ----------
     x_train : array-like
         Training features passed to ``cross_val_score``.
     y_train : array-like
         Training labels passed to ``cross_val_score``.

     Returns
     -------
     pandas.DataFrame
         One row per model with the mean and standard deviation of both
         metrics, sorted by mean ROC AUC in descending order.
     """
     # ``random_state`` only has an effect when the folds are shuffled, and
     # current scikit-learn raises ValueError if it is set without
     # ``shuffle=True``.  With an integer seed every call to ``split`` yields
     # the same folds, so both metrics are evaluated on identical splits.
     kfold = KFold(n_splits = 10, shuffle = True, random_state = 7)

     acc_result = {}
     auc_result = {}
     rows = []
     for name, model in models:
         cv_acc_result = cross_val_score(model, x_train, y_train, cv = kfold, scoring = 'accuracy')
         cv_auc_result = cross_val_score(model, x_train, y_train, cv = kfold, scoring = 'roc_auc')

         acc_result[name] = cv_acc_result
         auc_result[name] = cv_auc_result
         rows.append([name,
                      cv_auc_result.mean(),
                      cv_auc_result.std(),
                      cv_acc_result.mean(),
                      cv_acc_result.std()])

     col = ['Model', 'ROC AUC Mean', 'ROC AUC Std', 'ACC Mean', 'ACC Std']
     # Build the table once and sort once, rather than re-sorting per model.
     result = pd.DataFrame(rows, columns = col)
     result = result.sort_values('ROC AUC Mean', ascending = False)

     # Wide-form frames (one column of fold scores per model) give one box
     # per model, labelled with the model name.
     plt.figure(figsize = (10,5))
     plt.subplot(1,2,1)
     sns.boxplot(data = pd.DataFrame(auc_result))
     plt.title('ROC AUC Score')

     plt.subplot(1,2,2)
     sns.boxplot(data = pd.DataFrame(acc_result))
     plt.title('Accuracy Score')
     plt.show()

     return result

In [None]:
# Compare all candidate models on the training split; the returned score
# table is the cell's displayed result.
model_selection(x_train, y_train)

#### KNN performs well on this dataset

In [None]:
def model_validation(model,x_test,y_test,thr = 0.5) :
    """Evaluate a fitted binary classifier on held-out data.

    Draws the confusion matrix and the ROC curve, then prints the
    classification report, the ROC AUC, the accuracy and the threshold that
    maximises ``tpr - fpr``.

    Parameters
    ----------
    model : estimator
        Fitted classifier exposing ``predict_proba``; the second probability
        column is used as the positive-class score.
    x_test : array-like
        Test features.
    y_test : array-like
        True test labels.
    thr : float, default 0.5
        Probabilities strictly greater than this value are predicted as 1,
        all others as 0.

    Returns
    -------
    None
    """
    y_pred_prob = model.predict_proba(x_test)[:,1]
    # ``threshold`` must be passed by keyword on current scikit-learn.
    y_pred = binarize(y_pred_prob.reshape(1,-1), threshold = thr)[0]

    cnf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize = (10,3))
    plt.subplot(1,2,1)
    sns.heatmap(cnf_matrix, annot = True, fmt = 'g')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted label')
    plt.ylabel('Actual label')

    fpr, tpr, threshold = roc_curve(y_test, y_pred_prob)
    plt.subplot(1,2,2)
    # Plot the points exactly as returned: a ROC curve has several tpr values
    # for the same fpr, which an aggregating line plot would average away.
    plt.plot(fpr, tpr)
    plt.plot([0,1],[0,1], 'r--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()

    print('Classification Report :')
    print('===' * 20)
    print(classification_report(y_test, y_pred))

    # Pick the threshold with the largest gap between true and false
    # positive rate.  argmax returns the first maximum, i.e. the highest
    # threshold among ties, since roc_curve returns decreasing thresholds.
    score = tpr - fpr
    opt_threshold = threshold[np.argmax(score)]
    print('='*20)
    # AUC is computed from the probabilities so that it matches the plotted
    # curve; scoring the thresholded labels would only measure a single
    # operating point.
    print('Area Under Curve', roc_auc_score(y_test,y_pred_prob))
    print('Accuracy', accuracy_score(y_test,y_pred))
    print('Optimal Threshold : ',opt_threshold)
    print('='*20)

In [None]:
# Instantiate KNN with default settings; in a notebook the estimator's repr
# is shown as the cell output.
KNeighborsClassifier()

In [None]:
# Hyper-parameter values to try for KNN; every combination is evaluated.
param_grid = dict(
    leaf_size = [2, 5, 7, 9, 11],
    n_neighbors = [2, 5, 7, 9, 11],
    p = [1, 2],
)

# Exhaustive search over the grid using GridSearchCV's default
# cross-validation and scoring, fitted on the training split.
grid = GridSearchCV(estimator = KNeighborsClassifier(), param_grid = param_grid)
grid.fit(x_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
grid.best_params_

In [None]:
# Keep the best estimator found by the grid search as the final model.
final_model = grid.best_estimator_

In [None]:
# Evaluate the tuned model on the held-out test split with the default
# 0.5 decision threshold.
model_validation(final_model, x_test, y_test)