In [19]:
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

from utils import preprocess_data, bayesian_optimisation, write_new_data_file

In [11]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter hides all warning categories raised by
# later cells as well; consider narrowing it to the specific warnings that
# are known to be noise.
warnings.filterwarnings("ignore")

# Load the train/test split from the project helper. The six return values are
# unpacked positionally; with standardise=True the helper presumably scales the
# feature matrices — confirm in utils.preprocess_data.
X_train, X_test, y_train, y_test, train_df, test_df = preprocess_data(standardise=True)

In [12]:
def optimise_knn(n_neighbors, leaf_size, p):
    """Objective for the hyperparameter search: mean 5-fold CV accuracy of a KNN.

    The optimiser proposes continuous values, so each argument is cast to an
    integer with ``int()`` before it is handed to ``KNeighborsClassifier``.

    NOTE(review): ``int()`` truncates towards zero, so e.g. ``p=1.9`` is
    evaluated as ``p=1``. The same truncation is applied when the best model
    is rebuilt from ``best_params``, so any change to the rounding rule has
    to be made in both places.

    Args:
        n_neighbors: Number of neighbours (truncated to int).
        leaf_size: Leaf size of the neighbour-search tree (truncated to int).
        p: Minkowski power parameter (truncated to int).

    Returns:
        Mean accuracy over the 5 cross-validation folds on the module-level
        ``X_train`` / ``y_train``.
    """
    knn_kwargs = {
        "n_neighbors": int(n_neighbors),
        "leaf_size": int(leaf_size),
        "p": int(p),
    }
    candidate = KNeighborsClassifier(**knn_kwargs)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring='accuracy')
    return fold_scores.mean()


In [13]:
# Search bounds handed to the optimiser: one (low, high) row per hyperparameter,
# in the same order as the arguments of optimise_knn.
# NOTE(review): optimise_knn truncates each value with int(), so a row's upper
# bound (e.g. p == 2) is only evaluated when a sample equals it exactly.
param_space = np.array([
    (1, 30),    # 'n_neighbors'
    (20, 40),   # 'leaf_size'
    (1, 2)      # Minkowski metric parameter (1 for Manhattan, 2 for Euclidean)
])

n_iters = 25
initial_samples = 5

# Seed NumPy's global generator so the random initial design below is the same
# on every Restart & Run All. Any helper that also draws from np.random will
# inherit this seed.
np.random.seed(42)

# Initial random samples: one row per sample, one column per hyperparameter
x0 = np.random.uniform(param_space[:, 0], param_space[:, 1], size=(initial_samples, param_space.shape[0]))
# Score each initial sample with the cross-validated objective
y0 = np.array([optimise_knn(*params) for params in x0])

# Extra settings forwarded to bayesian_optimisation (presumably keyword
# arguments for its Gaussian-process surrogate — confirm in utils).
gp_params = {"alpha": 1e-6}

In [14]:
# Run the search, starting from the randomly sampled points and their scores.
# The scores are handed over as a single column.
y0_column = y0.reshape(-1, 1)
X_sample, Y_sample, gpr = bayesian_optimisation(
    n_iters,
    optimise_knn,
    param_space,
    x0,
    y0_column,
    gp_params,
)

# Best parameters: the sampled point whose recorded score is highest
best_idx = np.argmax(Y_sample)
best_params, best_accuracy = X_sample[best_idx], Y_sample[best_idx]

print(f"Best accuracy: {best_accuracy}")


Best accuracy: [0.80239126]


In [15]:
# Baseline for comparison: KNN with scikit-learn's default hyperparameters
model = KNeighborsClassifier()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# This model is untuned, so it is reported as the baseline; the "Optimized"
# label is reserved for the model built from best_params.
print(f"Baseline KNN Accuracy: {accuracy}")

Optimized KNN Accuracy: 0.7560975609756098


In [16]:
# Rebuild the classifier from the best point found by the search. The values
# are truncated with int() exactly as optimise_knn does, so this is the same
# configuration that was scored during the search.
best_model = KNeighborsClassifier(
    n_neighbors=int(best_params[0]),
    leaf_size=int(best_params[1]),
    p=int(best_params[2])
)

best_model.fit(X_train, y_train)
# KNeighborsClassifier.predict already returns class labels (not
# probabilities), so the predictions are used as-is without thresholding.
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized KNN Accuracy: {accuracy}")

Optimized CNN Accuracy: 0.7886178861788617


In [17]:
# Save the tuned model's predictions to a CSV so they can be reused later as
# test data. The helper receives the model, the training data and test_df;
# what it does with them (e.g. whether it refits the model) is defined in
# utils.write_new_data_file — confirm there.
target_filename = "data/loan_sanction_test_with_predictions_knn.csv"
write_new_data_file(best_model, X_train, y_train, test_df, target_filename)

Predictions have been saved to data/loan_sanction_test_with_predictions_knn.csv.


In [20]:
# TODO: hoist this import into the notebook's import cell
from sklearn.preprocessing import StandardScaler

# Prediction files keyed by a short label for the model named in each filename.
# Their 'Loan_Status' column presumably holds that model's predicted labels —
# confirm against the notebooks that write these files.
prediction_files = {
    "CNN": 'data/loan_sanction_test_with_predictions_cnn.csv',
    "DT": 'data/loan_sanction_test_with_predictions_decision_tree.csv',
    "LeNet5": 'data/loan_sanction_test_with_predictions_lenet5.csv',
    "LR": 'data/loan_sanction_test_with_predictions_lr.csv',
}

# For each file, measure how often the tuned KNN agrees with its Loan_Status.
for algo, filename in prediction_files.items():
    test_df_new = pd.read_csv(filename)
    X_new = test_df_new.drop(columns=['Loan_ID', 'Loan_Status'])
    y_new = test_df_new['Loan_Status']

    # Scaling is applied to every file unconditionally.
    # NOTE(review): a fresh scaler is fitted on each file, so the features are
    # standardised with this file's statistics rather than the training set's.
    # Reuse the scaler fitted during preprocessing if the helper can expose it.
    scaler = StandardScaler()
    X_new = scaler.fit_transform(X_new)

    # predict() returns class labels directly, so no thresholding is applied
    y_pred = best_model.predict(X_new)

    knn_accuracy = accuracy_score(y_new, y_pred)
    print(f'KNN Performance for {algo} produced predictions {knn_accuracy}')

KNN Performance for CNN produced predictions 0.16621253405994552
KNN Performance for DT produced predictions 0.9536784741144414
KNN Performance for LeNet5 produced predictions 0.989100817438692
KNN Performance for LR produced predictions 0.989100817438692
