# Bank Note Authentication UCI data

- Data were extracted from images taken of genuine and forged banknote-like specimens. For digitization, an industrial camera usually used for print inspection was used. The final images have 400 x 400 pixels. Due to the object lens and the distance to the investigated object, gray-scale pictures with a resolution of about 660 dpi were obtained. A Wavelet Transform tool was used to extract features from the images.

- Dataset can be used for Binary Classification sample problems

- Identify if bank note is authentic or not

# 1. Import the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases no longer accept the old name, so use whichever one this
# matplotlib installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# 2. Exploring the Data

In [None]:
# Load the bank-note authentication CSV into a DataFrame
# (the path is relative to the notebook's working directory).
df = pd.read_csv('../input/bank-note-authentication-uci-data/BankNote_Authentication.csv')

In [None]:
# Preview the first rows of the data.
df.head()

In [None]:
# Print a per-column summary of the DataFrame (dtypes and non-null counts).
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

- No missing values seen in the data

In [None]:
# Summary statistics for the columns of the raw data.
df.describe()

In [None]:
# `DataFrame.size` is rows x columns, so it over-counts the duplicates.
# Summing the boolean mask returned by `duplicated()` gives the number of
# duplicated rows instead.
print(f'Number of duplicate entries = {df.duplicated().sum()}')

In [None]:
# Show the rows flagged as duplicates, i.e. rows that repeat an earlier row.
df[df.duplicated()]

In [None]:
# Discard repeated rows, retaining the first occurrence of each,
# then show the resulting (rows, columns) shape.
df = df[~df.duplicated(keep='first')]
df.shape

In [None]:
# Every column except the target counts as a feature.
n_samples, n_columns = df.shape
print(f'Number of data samples: {n_samples}')
print(f'Number of features: {n_columns - 1}')

In [None]:
# Count the rows of each class once and reuse the result for both messages.
class_counts = df["class"].value_counts()
print(f'Number of samples of Class 0: {class_counts[0]}')
# Label corrected: this line reports samples of class 1, not features.
print(f'Number of samples of Class 1: {class_counts[1]}')

In [None]:
# Take the bar positions and heights from the same label-sorted Series so
# every bar is paired with its own class. `unique()` lists classes in order of
# appearance while `value_counts()` lists them by descending count, so the two
# are not guaranteed to line up.
class_counts = df['class'].value_counts().sort_index()
plt.bar(class_counts.index, class_counts.values, width=0.5)
plt.title('Target Value Distribution')
plt.xlabel('Target Class')
plt.ylabel('Counts for each class')
plt.xticks([0, 1])
plt.show()

In [None]:
# Summary statistics again, now computed after the duplicate rows were dropped.
df.describe()

In [None]:
# Pairwise plots of the DataFrame's columns, coloured by the 'class' column.
# The value returned by seaborn is kept in `x`.
x = sns.pairplot(df, hue='class')

In [None]:
# Histograms with 20 bins each, laid out on a 2x3 grid of subplots.
df.hist(bins=20,figsize=(11,9),layout=(2,3))

# 3. Train-Test Splitting

In [None]:
# Feature matrix: every column except the target column 'class'.
X = df.drop(columns=['class'])

In [None]:
# Target vector: the 'class' column.
y = df['class']
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
# NOTE(review): no `stratify=` argument is passed, so the class proportions of the
# two splits may differ from each other — confirm this is intended.
X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [None]:
# Report the shape of each split.
print(f'X_train shape is {X_train.shape}')
print(f'y_train shape is {y_train.shape}')

print(f'X_test shape is {X_test.shape}')
# Label corrected: this line prints the shape of y_test, not X_test.
print(f'y_test shape is {y_test.shape}')

In [None]:
# Side-by-side bar charts of the class counts in the training and test splits.
fig, ax = plt.subplots(1, 2, sharey=True, sharex=True, figsize=(10, 5))

# Sort the counts by class label and take bar positions and heights from the
# same Series. `unique()` (order of appearance after shuffling) and
# `value_counts()` (descending count) can list the classes in different
# orders, which would attach a count to the wrong class.
train_counts = y_train.value_counts().sort_index()
test_counts = y_test.value_counts().sort_index()

ax[0].bar(train_counts.index, train_counts.values, edgecolor='black')
ax[0].set_title('Training Class Labels Distribution')
ax[0].set_xticks(ticks=[0, 1])
ax[0].set_xticklabels(labels=["Class 0", "Class 1"])
ax[0].set_ylabel('Counts')

ax[1].bar(test_counts.index, test_counts.values, edgecolor='black', color='yellow')
ax[1].set_title('Test Class Labels Distribution')
ax[1].set_xticks(ticks=[0, 1])
ax[1].set_xticklabels(labels=["Class 0", "Class 1"])

# 4. Scaling the Data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
# Fit on the training split only, so the scaling parameters are derived from
# the training data and nothing from the test split is used.
scaler.fit(X_train)

In [None]:
# Apply the already-fitted scaler to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Machine Learning Models

## 5.1.1 K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def knn_get_training_testing_scores(neighbour_counts):
    """Fit one k-NN classifier per neighbour count and collect its scores.

    Each classifier is trained on the module-level ``X_train_scaled`` /
    ``y_train`` and scored on both that training data and the module-level
    ``X_test_scaled`` / ``y_test``.

    Args:
        neighbour_counts: iterable of values passed as ``n_neighbors``.

    Returns:
        A ``(training_scores, test_scores)`` tuple of lists, each holding one
        score per entry of ``neighbour_counts``, in the same order.
    """
    score_pairs = []
    for n_neighbors in neighbour_counts:
        classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
        classifier.fit(X_train_scaled, y_train)
        score_pairs.append((
            classifier.score(X_train_scaled, y_train),
            classifier.score(X_test_scaled, y_test),
        ))

    # Split the (train, test) pairs back into two parallel lists.
    training_scores = [train_score for train_score, _ in score_pairs]
    test_scores = [test_score for _, test_score in score_pairs]
    return training_scores, test_scores

In [None]:
# Score the classifier for k = 1 through 10.
neighbour_count_parameter = list(range(1, 11))
training_scores, test_scores = knn_get_training_testing_scores(neighbour_count_parameter)

In [None]:
# Plot accuracy against k for both splits. The legend labels were previously
# swapped (test scores were labelled 'Training Score' and vice versa); each
# curve now carries the label of the data it shows.
plt.plot(neighbour_count_parameter, training_scores, label='Training Score')
plt.plot(neighbour_count_parameter, test_scores, label='Test Score')

plt.xticks(neighbour_count_parameter)
plt.title('Number of Neighbours vs Train-Test Scores')
plt.xlabel('Number of Neighbours')
plt.ylabel('Accuracy Scores')
plt.legend()
plt.show()

###### We find that k=2 has a good generalization

In [None]:
# Train the final k-NN classifier with 2 neighbours on the scaled training data.
knn_clf = KNeighborsClassifier(n_neighbors=2)
knn_clf.fit(X_train_scaled,y_train)

In [None]:
# Predicted class labels for the scaled test split.
knn_y_hat = knn_clf.predict(X_test_scaled)
#knn_y_hat

In [None]:
# Score the fitted k-NN classifier on both splits, then report the results.
knn_train_accuracy = knn_clf.score(X_train_scaled, y_train)
knn_test_accuracy = knn_clf.score(X_test_scaled, y_test)
print(f'Training Set accuracy score : {knn_train_accuracy}')
print(f'Test Set accuracy score : {knn_test_accuracy}')

## 5.1.2 Performance Measures

In [None]:
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import classification_report

In [None]:
# Confusion matrix of the true test labels against the k-NN predictions.
knn_confusion_matrix = confusion_matrix(y_test,knn_y_hat)

In [None]:
# Heatmap of the k-NN confusion matrix with the test accuracy in the title.
fig, ax = plt.subplots()
# fmt='d' prints the integer counts in full; the default format abbreviates
# three-digit counts to scientific notation. ax=ax draws explicitly on the
# axes that are labelled below instead of relying on the current axes.
sns.heatmap(knn_confusion_matrix, annot=True, fmt='d', square=True, ax=ax)

ax.set_xlabel('Predicted Class')
ax.set_ylabel('Actual Class')

labels = ['Class 0', 'Class 1']
ax.set_xticklabels(labels, ha='center', minor=False)
ax.set_yticklabels(labels, ha='center', minor=False)

title_string = f'Accuracy Score: {round(knn_clf.score(X_test_scaled, y_test),3)}'
ax.set_title(title_string, size=13)

In [None]:
# Text classification report for the k-NN predictions, with the classes named "Class 0" / "Class 1".
score_report = classification_report(y_test, knn_y_hat, target_names=["Class 0","Class 1"])
print(score_report)

## 5.2.1 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
def get_logistic_regression_train_test_scores(c_parameters):
    """Fit one logistic regression per ``C`` value and collect its scores.

    Each model is trained on the module-level ``X_train_scaled`` / ``y_train``
    and scored on both that training data and the module-level
    ``X_test_scaled`` / ``y_test``.

    Args:
        c_parameters: iterable of values passed as ``C`` to
            ``LogisticRegression``.

    Returns:
        A ``(training_scores, testing_scores)`` tuple of lists, each holding
        one score per entry of ``c_parameters``, in the same order.
    """
    score_pairs = []
    for c_value in c_parameters:
        classifier = LogisticRegression(C=c_value)
        classifier.fit(X_train_scaled, y_train)
        score_pairs.append((
            classifier.score(X_train_scaled, y_train),
            classifier.score(X_test_scaled, y_test),
        ))

    # Split the (train, test) pairs back into two parallel lists.
    training_scores = [train_score for train_score, _ in score_pairs]
    testing_scores = [test_score for _, test_score in score_pairs]
    return training_scores, testing_scores

In [None]:
# Candidate C values to compare, spanning several orders of magnitude.
c_parameters = [0.01, 0.1, 1, 10, 100, 1000]
lr_train_scores, lr_test_scores = get_logistic_regression_train_test_scores(c_parameters)

# One (training score, test score) tuple per C value, in the order above.
print('Training Score, Test Scores')
for score_pair in zip(lr_train_scores, lr_test_scores):
    print(score_pair)

In [None]:
# Plot training and test accuracy for each candidate C value.
plt.figure(figsize=(8, 6))

plt.plot(lr_train_scores, label='Training Score', c='b', marker='o')
plt.plot(lr_test_scores, label='Test Score', c='g', marker='o')

# The scores are plotted against their position in the list, so label each
# position with its C value. Deriving the labels from `c_parameters` keeps
# them in sync if the candidate list changes.
plt.xticks(ticks=range(len(c_parameters)), labels=[str(c) for c in c_parameters])

# In scikit-learn, C is the inverse of the regularization strength.
plt.xlabel('Inverse of Regularization Strength (C)')
plt.ylabel('Accuracy Score')

plt.legend(loc='lower right')
plt.show()

##### We find that for C = 1, 10 the model has good scores

- Smaller values of C imply stronger regularization, making the model simpler and reducing the chance of overfitting.
- As C increases, the effect of regularization decreases, thereby making the model more complex.
- Very high values of C result in little to no regularization, which can lead to the model overfitting.

In [None]:
# Train the final logistic regression with C=10 on the scaled training data.
logit_reg = LogisticRegression(C=10)
logit_reg.fit(X_train_scaled, y_train)

In [None]:
# Predicted class labels for the scaled test split.
logit_reg_y_hat = logit_reg.predict(X_test_scaled)
#print(logit_reg_y_hat)

In [None]:
# Score the fitted logistic regression on both splits, then report the results.
logit_train_accuracy = logit_reg.score(X_train_scaled, y_train)
logit_test_accuracy = logit_reg.score(X_test_scaled, y_test)
print(f'Training Set accuracy score : {logit_train_accuracy}')
print(f'Test Set accuracy score     : {logit_test_accuracy}')

In [None]:
# Show the learned weight for each feature together with the intercept.
# The feature names are read from X (the frame the model's inputs were built
# from) rather than from df.columns[:-1], which is only correct when the
# target column happens to be the last column of df.
print(f"Features  : {list(X.columns)}" )
print(f"Weights   : {list(np.round(logit_reg.coef_[0],3))}")
print(f"Intercept : {np.round(logit_reg.intercept_,2)}")

## 5.2.2 Performance Measures

In [None]:
# Confusion matrix of the true test labels against the logistic regression predictions.
lr_confusion_matrix = confusion_matrix(y_test,logit_reg_y_hat)

In [None]:
# Heatmap of the logistic regression confusion matrix with the test accuracy in the title.
fig, ax = plt.subplots()
# fmt='d' prints the integer counts in full; the default format abbreviates
# three-digit counts to scientific notation. ax=ax draws explicitly on the
# axes that are labelled below instead of relying on the current axes.
sns.heatmap(lr_confusion_matrix, annot=True, fmt='d', square=True, ax=ax)

ax.set_xlabel('Predicted Class')
ax.set_ylabel('Actual Class')

labels = ['Class 0', 'Class 1']
ax.set_xticklabels(labels, ha='center', minor=False)
ax.set_yticklabels(labels, ha='center', minor=False)

title_string = f'Accuracy Score: {round(logit_reg.score(X_test_scaled, y_test),3)}'
ax.set_title(title_string, size=13)

In [None]:
# Text classification report for the logistic regression predictions, with the classes named "Class 0" / "Class 1".
score_report = classification_report(y_test, logit_reg_y_hat, target_names=["Class 0","Class 1"])
print(score_report)