# Compare 14 Algorithms for Bank Note Forgery Detection
## *Using Cross validation*

![bank note](https://i.imgur.com/bB80wXX.png)

# Table of contents

[<h3>1. Load and visualize the data</h3>](#1)

[<h3>2. Data Preprocessing</h3>](#2)

[<h3>3. Comparison of 14 algorithms using cross validation</h3>](#3)

[<h3>4. Prediction metrics of the best model on the test set</h3>](#4)

# Load the libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.linear_model import LogisticRegression,PassiveAggressiveClassifier,RidgeClassifier,SGDClassifier
from sklearn.neighbors import KNeighborsClassifier,RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.svm import LinearSVC, SVC,NuSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from time import perf_counter
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display

def printmd(string):
    """Show ``string`` in the notebook output, rendered as Markdown."""
    rendered = Markdown(string)
    display(rendered)

import warnings
# Ignore every warning raised from here on (presumably to keep the cell
# outputs short).
# NOTE(review): this is a global filter, so deprecation and convergence
# warnings from the libraries used below are hidden as well.
warnings.filterwarnings(action='ignore')

# 1. Load and visualize the data <a class="anchor" id="1"></a>

In [None]:
# Load the data
df = pd.read_csv('../input/banknote-authenticationcsv/BankNote_Authentication.csv')

# Shuffle the data.
# DataFrame.sample returns a new frame instead of shuffling in place, so the
# result has to be assigned back. A fixed seed keeps the notebook reproducible
# and the index is rebuilt so that it follows the new row order.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

# Display the first rows
df.head()

In [None]:
# Print a summary of the DataFrame (columns, non-null counts and dtypes)
df.info()

In [None]:
# Add a written label next to the numeric class:
# class 0 becomes 'Real', every other value becomes 'Fake'
df['Bank Note'] = [
    'Real' if class_value == 0 else 'Fake'
    for class_value in df['class']
]

In [None]:
# Pairwise plots of all remaining columns, coloured by the written label.
# The 'class' column is dropped first so that it is not plotted as a feature.
plot_data = df.drop(columns='class')
sns.pairplot(plot_data, hue='Bank Note')
plt.show()

In [None]:
# Pie chart with the number of rows per written label
label_counts = df['Bank Note'].value_counts()

plt.figure(figsize=(9, 9))
label_counts.plot.pie()
plt.show()

# 2. Data Preprocessing<a class="anchor" id="2"></a>

In [None]:
# Feature matrix: the four measurement columns
feature_names = ['variance', 'skewness', 'curtosis', 'entropy']
X = df[feature_names]

# Target: the written label of each bank note
y = df['Bank Note']

In [None]:
# Rescale every feature with a MinMaxScaler:
# first learn the per-column minimum and maximum, then apply the scaling
scaler = MinMaxScaler()
scaler.fit(X)
X_transf = scaler.transform(X)

# Preview the first rows of the scaled features
pd.DataFrame(X_transf).head()

In [None]:
# Split into train and test set
# The split is done on the unscaled features and the scaler is fitted on the
# training part only. Fitting it on the whole dataset before splitting would
# leak the test set's minimum and maximum values into the training data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Learn the scaling from the training set and apply the same scaling to the
# test set (test values may therefore fall slightly outside [0, 1])
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 3. Comparison of 14 algorithms using cross validation<a class="anchor" id="3"></a>

In [None]:
# Estimators to compare, each created with its default parameters
estimators = [
    GaussianNB(),
    PassiveAggressiveClassifier(),
    RidgeClassifier(),
    SGDClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    LinearSVC(),
    SVC(),
    NuSVC(),
    MLPClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
]

# Dictionary of the models to test, keyed by the class name of the estimator.
# The cross validation results are stored next to each model below.
models = {
    type(estimator).__name__: {"model": estimator}
    for estimator in estimators
}

# Run a 10-fold cross validation for each model and keep
# the mean validation accuracy and the mean training time
for name, entry in models.items():
    cv_result = cross_validate(entry['model'], X_train, y_train, cv=10)

    # Average the per-fold scores and fit times, rounded to 4 decimals
    fold_scores = cv_result['test_score']
    fold_fit_times = cv_result['fit_time']
    mean_val_accuracy = round(sum(fold_scores) / len(fold_scores), 4)
    mean_fit_time = round(sum(fold_fit_times) / len(fold_fit_times), 4)

    # Store the results in the dictionary with the models
    entry['val_accuracy'] = mean_val_accuracy
    entry['Training time (sec)'] = mean_fit_time

    # Display the result
    print(f"{name:27} mean accuracy using 10-fold cross validation: {mean_val_accuracy*100:.2f}% - mean training time {mean_fit_time} sec")

In [None]:
# One row per model: name, mean validation accuracy, mean training time
models_result = [
    [name, scores['val_accuracy'], scores['Training time (sec)']]
    for name, scores in models.items()
]

# Build the results table, ordered from the highest to the lowest accuracy,
# with a fresh 0..n-1 index
result_columns = ['model', 'val_accuracy', 'Training time (sec)']
df_results = (
    pd.DataFrame(models_result, columns=result_columns)
    .sort_values(by='val_accuracy', ascending=False)
    .reset_index(drop=True)
)
df_results

In [None]:
# Bar chart of the mean validation accuracy of every model
plt.figure(figsize=(15, 5))
sns.barplot(data=df_results, x='model', y='val_accuracy')

# Only the range 0.8 - 1.0 is shown on the y-axis (stated in the title)
plt.ylim(0.8, 1.005)

plt.title(
    'Mean Validation Accuracy for each Model\ny-axis between 0.8 and 1.0',
    fontsize=15,
)
plt.xlabel('Model', fontsize=15)
plt.ylabel('Accuracy', fontsize=15)
plt.xticks(rotation=90, fontsize=12)
plt.show()

In [None]:
# Bar chart of the mean training time of every model
plt.figure(figsize=(15, 5))
sns.barplot(data=df_results, x='model', y='Training time (sec)')

plt.title('Training time for each Model in sec', fontsize=15)
plt.xlabel('Model', fontsize=15)
plt.ylabel('Training time (sec)', fontsize=15)
plt.xticks(rotation=90, fontsize=12)
plt.show()

# 4. Prediction metrics of the best model on the test set<a class="anchor" id="4"></a>

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Get the model with the highest mean validation accuracy.
# The row is read by column label: positional access with [] on a Series
# that has string labels is deprecated in pandas.
best_model = df_results.iloc[0]
best_name = best_model['model']
best_val_accuracy = best_model['val_accuracy']
best_fit_time = best_model['Training time (sec)']

# Fit the model on the whole training set
model = models[best_name]['model']
model.fit(X_train, y_train)

# Predict the labels of the test set
pred = model.predict(X_test)

# Accuracy on the held-out test set. The cross validation accuracy is a
# different number and is reported separately, so the two are not confused.
test_accuracy = accuracy_score(y_test, pred)

# Display the results
printmd(f'## Best Model: {best_name} with {test_accuracy*100:.2f}% accuracy on the test set')
printmd(f'## Mean cross validation accuracy: {best_val_accuracy*100:.2f}%')
printmd(f'## Trained in: {best_fit_time} sec')

# Display a confusion matrix, normalized over the true labels (rows).
# The label order is taken from the fitted model and passed to both the
# matrix and the tick labels, so the axes always match the matrix.
class_labels = list(model.classes_)
cf_matrix = confusion_matrix(y_test, pred, labels=class_labels, normalize='true')
plt.figure(figsize = (10,7))
sns.heatmap(cf_matrix, annot=True, xticklabels = class_labels, yticklabels = class_labels, cbar=False)
plt.title('Normalized Confusion Matrix', fontsize = 23)
plt.xlabel('Predicted label', fontsize=20)
plt.ylabel('True label', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()