## Task 1

In [None]:
# Importing pandas, NumPy, seaborn and matplotlib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the banknote measurements into `df`. The text file has no header row,
# so the five column names are supplied explicitly.
df = pd.read_csv(
    'data_banknote_authentication.txt',
    names=["Variance", "Skewness", "Kurtosis", "Entropy", "Class"],
    header=None,
)

In [None]:
# Preview the first rows to check that the columns were parsed as intended.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
# Descriptive statistics per column.
df.describe()

In [None]:
# Number of rows per class label, to check how balanced the two classes are.
df['Class'].value_counts()

In [None]:
# Bar chart of the per-class row counts.
df['Class'].value_counts().plot(kind="bar")

In [None]:
# Pairwise scatter plots of all columns, coloured by the class label.
sns.pairplot(data=df, hue="Class")

In [None]:
import statsmodels.api as sm   

# User-defined function for calculating the variance inflation factor (VIF)
def vif_cal(input_data, dependent_col):
    """Compute the variance inflation factor (VIF) of every predictor column.

    Each predictor is regressed by ordinary least squares on all the other
    predictors *plus an intercept*, and ``VIF = 1 / (1 - R^2)`` is reported,
    rounded to two decimals. The intercept matters: without it R^2 is the
    uncentered one and any feature whose mean is far from zero looks
    collinear with the others.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame holding the predictors and the dependent column; predictor
        columns must be numeric.
    dependent_col : str
        Name of the column to exclude from the VIF computation.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Var'`` (predictor name) and ``'Vif'``, sorted by ``'Vif'``
        in descending order. A perfectly collinear predictor gets ``inf``;
        a constant predictor (zero variance) gets ``NaN``.
    """
    x_vars = input_data.drop(columns=[dependent_col])
    n_rows = len(x_vars)

    records = []
    for name in x_vars.columns:
        target = x_vars[name].to_numpy(dtype=float)
        others = x_vars.drop(columns=[name]).to_numpy(dtype=float)

        # Leading column of ones = intercept term of the auxiliary regression.
        design = np.column_stack([np.ones(n_rows), others])
        coef = np.linalg.lstsq(design, target, rcond=None)[0]

        ss_res = float(np.sum((target - design @ coef) ** 2))
        ss_tot = float(np.sum((target - target.mean()) ** 2))

        if ss_tot == 0.0:
            # Constant column: R^2 (and therefore VIF) is undefined.
            vif = float("nan")
        else:
            rsq = 1.0 - ss_res / ss_tot
            # Guard the division: an exact linear combination gives R^2 == 1.
            vif = float("inf") if rsq >= 1.0 else round(1.0 / (1.0 - rsq), 2)

        records.append((name, vif))

    # Build the frame once instead of appending row by row.
    vif_df = pd.DataFrame(records, columns=['Var', 'Vif'])
    return vif_df.sort_values(by='Vif', ascending=False)

In [None]:
# VIF of every feature column; "Class" is the response and is excluded.
vif_cal(df, "Class")

## Task 2

In [None]:
# Response variable: the class label column
y = df['Class']

# Feature matrix: every column except the class label
X = df.drop(columns=['Class'])

In [None]:
# Feature normalization
from sklearn.preprocessing import MinMaxScaler 

# Min-max scale every feature column.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in the notebook, so the test rows influence the
# scaling (data leakage). Fitting on the training split only would avoid that.
scaler = MinMaxScaler()

# Rebuild the frame with the original column labels and index: the bare
# pd.DataFrame(array) form would rename the features to 0..3 and reset the index.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Inspect the first rows of the scaled feature frame.
X.head()

## Task 4

In [None]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12,
)

In [None]:
# Size of the full dataset, for comparison with the split sizes printed below.
df.shape

In [None]:
# Print the shape of each of the four pieces produced by the split.
for label, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, part.shape)

## Task 5

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
)

# Baseline model: logistic loss trained with SGD; only `loss` and the seed are
# set, every other hyperparameter keeps its library default.
# NOTE(review): newer scikit-learn releases spell this loss "log_loss" -
# confirm against the installed version.
clf_default = SGDClassifier(
    random_state=777,
    loss="log",
)
clf_default.fit(X=X_train, y=y_train)

In [None]:
# Score the baseline model: predict both splits, print accuracy rounded to
# three decimals, then the per-class report for the test split.
y_train_predict_default = clf_default.predict(X=X_train)
y_test_predict_default = clf_default.predict(X=X_test)

print(
    "Train dataset accuracy score using default parameters:",
    round(accuracy_score(y_true=y_train, y_pred=y_train_predict_default), 3),
)
print(
    "Test dataset accuracy score using default parameters:",
    round(accuracy_score(y_true=y_test, y_pred=y_test_predict_default), 3),
)

print(
    "\nClassification report for test dataset:\n",
    classification_report(y_true=y_test, y_pred=y_test_predict_default),
)

In [None]:
# Confusion matrices for the baseline model, one figure per split.

# Training split
cmd_obj_train_default = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_train, y_pred=y_train_predict_default),
    display_labels=['0\n(Genuine)', '1\n(Forged)'],
)
cmd_obj_train_default.plot(cmap="Blues")
cmd_obj_train_default.ax_.set_title("Train dataset confusion_matrix using default parameters\n")
cmd_obj_train_default.ax_.set_xlabel("Predicted class")
cmd_obj_train_default.ax_.set_ylabel("Actual class")

# Test split
cmd_obj_test_default = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_test, y_pred=y_test_predict_default),
    display_labels=['0\n(Genuine)', '1\n(Forged)'],
)
cmd_obj_test_default.plot(cmap="Blues")
cmd_obj_test_default.ax_.set_title("Test dataset confusion_matrix using default parameters\n")
cmd_obj_test_default.ax_.set_xlabel("Predicted class")
cmd_obj_test_default.ax_.set_ylabel("Actual class")

plt.show()

In [None]:
# Grid search over the SGD learning-rate schedule
from sklearn.model_selection import GridSearchCV

# Candidate schedules ('optimal' is the library default)
lr_parameters = {
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
}

# eta0 is the initial learning rate used by the 'constant', 'invscaling' and
# 'adaptive' schedules. Train scores are kept so they can be plotted next to
# the cross-validation scores.
clf_lr = GridSearchCV(
    estimator=SGDClassifier(loss="log", random_state=777, eta0=0.001),
    param_grid=lr_parameters,
    scoring="accuracy",
    return_train_score=True,
)
clf_lr.fit(X_train, y_train)

In [None]:
# Report the best mean cross-validated accuracy found by the learning-rate
# grid search and the schedule that achieved it.
print('We can get accuracy of',clf_lr.best_score_,'using',clf_lr.best_params_)

In [None]:
# Bar chart comparing mean train and mean cross-validation accuracy for each
# learning-rate schedule ("test accuracy" here is GridSearchCV's
# mean_test_score, i.e. the held-out folds, not the X_test split).
lr_scores = clf_lr.cv_results_

plotdata = pd.DataFrame(
    data={
        "training accuracy": lr_scores["mean_train_score"],
        "test accuracy": lr_scores["mean_test_score"],
    },
    index=lr_scores["param_learning_rate"],
)

plotdata.plot(kind="bar", figsize=(8, 5))
plt.xlabel("Learning Rate")
plt.ylabel("Accuracy")
plt.show()

In [None]:
# Grid search over 'alpha', 'max_iter' and 'tol' with learning_rate fixed to 'optimal'
from sklearn.model_selection import GridSearchCV

# Search space (library defaults: alpha = 0.0001, max_iter = 1000, tol = 1e-3)
param_grid = {
    'alpha': np.arange(0.00001, 0.0002, 0.00005),
    'max_iter': range(100, 2000, 100),
    'tol': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
}

# Wrap the base estimator in the (still unfitted) grid search.
# error_score='raise' makes a failing candidate abort the search instead of
# being scored as NaN.
clf_base = GridSearchCV(
    estimator=SGDClassifier(loss="log", learning_rate="optimal", random_state=777),
    param_grid=param_grid,
    scoring="accuracy",
    error_score='raise',
)

In [None]:
# Fit the grid search to the training data.
# The grid holds 4 alpha x 19 max_iter x 7 tol = 532 candidates, each of which
# is cross-validated, so this is the slowest cell of the notebook.
clf_base.fit(X_train, y_train)

In [None]:
# Report the best mean cross-validated accuracy of the alpha/max_iter/tol
# search and the hyperparameter combination that achieved it.
print('We can get accuracy of',clf_base.best_score_,'using',clf_base.best_params_)

In [None]:
# Build the tuned model from the grid-search result.
# alpha, max_iter and tol are read from clf_base.best_params_ rather than
# copied by hand, so this model can never disagree with what the search found
# (for example after the grid or the data split changes). loss, learning_rate
# and random_state repeat the settings of the searched base estimator.
clf_best = SGDClassifier(loss="log",
                         learning_rate = 'optimal',
                         random_state = 777,
                         **clf_base.best_params_)

clf_best.fit(X_train, y_train)

In [None]:
# Score the tuned model: predict both splits and print accuracy rounded to
# three decimals.
y_train_predict_best = clf_best.predict(X=X_train)
y_test_predict_best = clf_best.predict(X=X_test)

print(
    "Train dataset accuracy score using optimized parameters:",
    round(accuracy_score(y_true=y_train, y_pred=y_train_predict_best), 3),
)
print(
    "Test dataset accuracy score using optimized parameters:",
    round(accuracy_score(y_true=y_test, y_pred=y_test_predict_best), 3),
)

In [None]:
# Confusion matrices for the tuned model, one figure per split.

# Training split
cmd_obj_train_best = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_train, y_pred=y_train_predict_best),
    display_labels=['0\n(Genuine)', '1\n(Forged)'],
)
cmd_obj_train_best.plot(cmap="Blues")
cmd_obj_train_best.ax_.set_title("Train dataset confusion_matrix using optimized parameters\n")
cmd_obj_train_best.ax_.set_xlabel("Predicted class")
cmd_obj_train_best.ax_.set_ylabel("Actual class")

# Test split
cmd_obj_test_best = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_test, y_pred=y_test_predict_best),
    display_labels=['0\n(Genuine)', '1\n(Forged)'],
)
cmd_obj_test_best.plot(cmap="Blues")
cmd_obj_test_best.ax_.set_title("Test dataset confusion_matrix using optimized parameters\n")
cmd_obj_test_best.ax_.set_xlabel("Predicted class")
cmd_obj_test_best.ax_.set_ylabel("Actual class")

plt.show()

## Task 6

In [None]:
# Per-class precision, recall and F1 on the test split for the tuned model
# (predictions made by clf_best).

print("Classification report for test dataset:\n", classification_report(y_test,y_test_predict_best))

## Task 7

In [None]:
# Building the model with optimized hyperparameters and penalty = 'l2'
# Note: 'l2' is the default penalty, so this model is configured exactly like
# clf_best and the following results will be the same as above.
from sklearn.base import clone

# Copy clf_best's hyperparameters (clone returns a fresh, unfitted estimator)
# instead of retyping them, so that the penalty is guaranteed to be the only
# setting that differs between the models being compared.
clf_l2 = clone(clf_best).set_params(penalty = 'l2')

clf_l2.fit(X_train, y_train)

In [None]:
# Score the L2-penalised model: predict both splits and print accuracy
# rounded to three decimals.
y_train_predict_l2 = clf_l2.predict(X=X_train)
y_test_predict_l2 = clf_l2.predict(X=X_test)

print(
    "Train dataset accuracy score using penalty = 'l2':",
    round(accuracy_score(y_true=y_train, y_pred=y_train_predict_l2), 3),
)
print(
    "Test dataset accuracy score using penalty = 'l2':",
    round(accuracy_score(y_true=y_test, y_pred=y_test_predict_l2), 3),
)

In [None]:
# Confusion matrices for the L2-penalised model, one figure per split.
# NOTE: rebinds cmd_obj_train_best / cmd_obj_test_best, which the tuned-model
# confusion-matrix cell also assigns.

# Training split
cmd_obj_train_best = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_train, y_pred=y_train_predict_l2),
    display_labels=['0\n(Genuine)', '1\n(Forged)'],
)
cmd_obj_train_best.plot(cmap="Blues")
cmd_obj_train_best.ax_.set_title("Train dataset confusion_matrix using penalty = 'l2'")
cmd_obj_train_best.ax_.set_xlabel("Predicted class")
cmd_obj_train_best.ax_.set_ylabel("Actual class")

# Test split
cmd_obj_test_best = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_test, y_pred=y_test_predict_l2),
    display_labels=['0\n(Genuine)', '1\n(Forged)'],
)
cmd_obj_test_best.plot(cmap="Blues")
cmd_obj_test_best.ax_.set_title("Test dataset confusion_matrix using penalty = 'l2'")
cmd_obj_test_best.ax_.set_xlabel("Predicted class")
cmd_obj_test_best.ax_.set_ylabel("Actual class")

plt.show()

In [None]:
# Building the model with optimized hyperparameters and penalty = 'l1'
from sklearn.base import clone

# Copy clf_best's hyperparameters (clone returns a fresh, unfitted estimator)
# instead of retyping them, so that the penalty is guaranteed to be the only
# setting that differs between the models being compared.
clf_l1 = clone(clf_best).set_params(penalty = 'l1')

clf_l1.fit(X_train, y_train)

In [None]:
# Score the L1-penalised model: predict both splits and print accuracy
# rounded to three decimals.
y_train_predict_l1 = clf_l1.predict(X=X_train)
y_test_predict_l1 = clf_l1.predict(X=X_test)

print(
    "Train dataset accuracy score using penalty = 'l1':",
    round(accuracy_score(y_true=y_train, y_pred=y_train_predict_l1), 3),
)
print(
    "Test dataset accuracy score using penalty = 'l1':",
    round(accuracy_score(y_true=y_test, y_pred=y_test_predict_l1), 3),
)

In [None]:
# Confusion matrices for the L1-penalised model, one figure per split.
# NOTE: rebinds cmd_obj_train_best / cmd_obj_test_best, which the tuned-model
# confusion-matrix cell also assigns.

# Training split
cmd_obj_train_best = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_train, y_pred=y_train_predict_l1),
    display_labels=['0\n(Genuine)', '1\n(Forged)'],
)
cmd_obj_train_best.plot(cmap="Blues")
cmd_obj_train_best.ax_.set_title("Train dataset confusion_matrix using penalty = 'l1'")
cmd_obj_train_best.ax_.set_xlabel("Predicted class")
cmd_obj_train_best.ax_.set_ylabel("Actual class")

# Test split
cmd_obj_test_best = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_test, y_pred=y_test_predict_l1),
    display_labels=['0\n(Genuine)', '1\n(Forged)'],
)
cmd_obj_test_best.plot(cmap="Blues")
cmd_obj_test_best.ax_.set_title("Test dataset confusion_matrix using penalty = 'l1'")
cmd_obj_test_best.ax_.set_xlabel("Predicted class")
cmd_obj_test_best.ax_.set_ylabel("Actual class")

plt.show()

In [None]:
# Building the model with optimized hyperparameters and penalty = 'elasticnet'
from sklearn.base import clone

# Copy clf_best's hyperparameters (clone returns a fresh, unfitted estimator)
# instead of retyping them, so that the penalty is guaranteed to be the only
# setting that differs between the models being compared.
clf_elasticnet = clone(clf_best).set_params(penalty = 'elasticnet')

clf_elasticnet.fit(X_train, y_train)

In [None]:
# Score the elastic-net model: predict both splits and print accuracy
# rounded to three decimals.
y_train_predict_elasticnet = clf_elasticnet.predict(X=X_train)
y_test_predict_elasticnet = clf_elasticnet.predict(X=X_test)

print(
    "Train dataset accuracy score using penalty = 'elasticnet':",
    round(accuracy_score(y_true=y_train, y_pred=y_train_predict_elasticnet), 3),
)
print(
    "Test dataset accuracy score using penalty = 'elasticnet':",
    round(accuracy_score(y_true=y_test, y_pred=y_test_predict_elasticnet), 3),
)

In [None]:
# Confusion matrices for the elastic-net model, one figure per split.
# NOTE: rebinds cmd_obj_train_best / cmd_obj_test_best, which the tuned-model
# confusion-matrix cell also assigns.

# Training split
cmd_obj_train_best = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_train, y_pred=y_train_predict_elasticnet),
    display_labels=['0\n(Genuine)', '1\n(Forged)'],
)
cmd_obj_train_best.plot(cmap="Blues")
cmd_obj_train_best.ax_.set_title("Train dataset confusion_matrix using penalty = 'elasticnet'")
cmd_obj_train_best.ax_.set_xlabel("Predicted class")
cmd_obj_train_best.ax_.set_ylabel("Actual class")

# Test split
cmd_obj_test_best = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_test, y_pred=y_test_predict_elasticnet),
    display_labels=['0\n(Genuine)', '1\n(Forged)'],
)
cmd_obj_test_best.plot(cmap="Blues")
cmd_obj_test_best.ax_.set_title("Test dataset confusion_matrix using penalty = 'elasticnet'")
cmd_obj_test_best.ax_.set_xlabel("Predicted class")
cmd_obj_test_best.ax_.set_ylabel("Actual class")

plt.show()

## Task 8

In [None]:
# Per-class precision, recall and F1 on the test split for the model trained
# with penalty = 'l2' (predictions made by clf_l2).

print("Classification report for test dataset:\n", classification_report(y_test,y_test_predict_l2))

In [None]:
# Per-class precision, recall and F1 on the test split for the model trained
# with penalty = 'l1' (predictions made by clf_l1).

print("Classification report for test dataset:\n", classification_report(y_test,y_test_predict_l1))

In [None]:
# Per-class precision, recall and F1 on the test split for the model trained
# with penalty = 'elasticnet' (predictions made by clf_elasticnet).

print("Classification report for test dataset:\n", classification_report(y_test,y_test_predict_elasticnet))

## Task 9

In [None]:
# Build a k-nearest-neighbours classifier for comparison with the SGD models.
# No arguments are passed, so every hyperparameter keeps its library default.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()  

In [None]:
# Fit KNN on the same scaled training split that the SGD models used.
knn.fit(X_train, y_train)

In [None]:
# Score the KNN model: predict both splits and print accuracy rounded to
# three decimals.
y_train_predict_knn = knn.predict(X=X_train)
y_test_predict_knn = knn.predict(X=X_test)

print(
    "Train dataset accuracy score using KNN model:",
    round(accuracy_score(y_true=y_train, y_pred=y_train_predict_knn), 3),
)
print(
    "Test dataset accuracy score using KNN model:",
    round(accuracy_score(y_true=y_test, y_pred=y_test_predict_knn), 3),
)

In [None]:
# Confusion matrices for the KNN model, one figure per split.
# NOTE: rebinds cmd_obj_train_best / cmd_obj_test_best, which the tuned-model
# confusion-matrix cell also assigns.

# Training split
cmd_obj_train_best = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_train, y_pred=y_train_predict_knn),
    display_labels=['0\n(Genuine)', '1\n(Forged)'],
)
cmd_obj_train_best.plot(cmap="Blues")
cmd_obj_train_best.ax_.set_title("Train dataset confusion_matrix using KNN")
cmd_obj_train_best.ax_.set_xlabel("Predicted class")
cmd_obj_train_best.ax_.set_ylabel("Actual class")

# Test split
cmd_obj_test_best = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_test, y_pred=y_test_predict_knn),
    display_labels=['0\n(Genuine)', '1\n(Forged)'],
)
cmd_obj_test_best.plot(cmap="Blues")
cmd_obj_test_best.ax_.set_title("Test dataset confusion_matrix using KNN")
cmd_obj_test_best.ax_.set_xlabel("Predicted class")
cmd_obj_test_best.ax_.set_ylabel("Actual class")

plt.show()

In [None]:
# Per-class precision, recall and F1 on the test split for the KNN model
# (predictions made by knn).

print("Classification report for test dataset:\n", classification_report(y_test,y_test_predict_knn))