In [1]:
import warnings

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

from utils import preprocess_data, bayesian_optimisation, write_new_data_file

In [2]:
warnings.filterwarnings("ignore")

X_train, X_test, y_train, y_test, train_df, test_df = preprocess_data(standardise=False)

In [3]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)
print(lr_accuracy)

              precision    recall  f1-score   support

           0       0.90      0.42      0.57        43
           1       0.76      0.97      0.85        80

    accuracy                           0.78       123
   macro avg       0.83      0.70      0.71       123
weighted avg       0.81      0.78      0.75       123

0.7804878048780488


Optimise the hyperparameters of LR model by using `cross-validation` to evaluate different configurations

In [4]:
def optimise_logistic_regression(C, max_iter):
    model = LogisticRegression(
        C=C,
        max_iter=int(max_iter),
        random_state=42,
        solver='liblinear'  # You can change the solver if needed
    )
    return cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()


In [5]:
param_space = np.array([
    (0.01, 10),  # Regularization strength
    (100, 1000)  # Number of iterations
])

n_iters = 25
initial_samples = 5

# Initial random samples
x0 = np.random.uniform(param_space[:, 0], param_space[:, 1], size=(initial_samples, param_space.shape[0]))
y0 = np.array([optimise_logistic_regression(*params) for params in x0])

gp_params = {"alpha": 1e-6}


In [6]:
X_sample, Y_sample, gpr = bayesian_optimisation(n_iters, optimise_logistic_regression, param_space, x0, y0.reshape(-1, 1), gp_params)

# Best parameters
best_idx = np.argmax(Y_sample)
best_params = X_sample[best_idx]
best_accuracy = Y_sample[best_idx]

print(f"Best accuracy: {best_accuracy}")

Best accuracy: [0.81255411]


Use the best model to create a validation set to be used for the other models in the project

In [7]:
best_model = LogisticRegression(
    C=best_params[0],
    max_iter=int(best_params[1]),
    random_state=42,
    solver='liblinear'  # You can change the solver if needed
)

target_filename = "data/loan_sanction_test_with_predictions_lr.csv"
write_new_data_file(best_model, X_train, y_train, test_df, target_filename)

Predictions have been saved to data/loan_sanction_test_with_predictions_lr.csv.


Validate this model against the data produced by the other models. 
## It can be seen the the data LeNet5 created fares well with LR

In [9]:
import pandas as pd

for algo, filename in {
    "CNN" : 'data/loan_sanction_test_with_predictions_cnn.csv',
    "DT": 'data/loan_sanction_test_with_predictions_decision_tree.csv',
    "KNN": 'data/loan_sanction_test_with_predictions_knn.csv',
    "LeNet5": 'data/loan_sanction_test_with_predictions_lenet5.csv',
}.items():
    test_df_new = pd.read_csv(filename)
    X_new = test_df_new.drop(columns=['Loan_ID', 'Loan_Status'])
    y_new = test_df_new['Loan_Status']

    y_pred = best_model.predict(X_new)
    y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to class labels

    lr_accuracy = accuracy_score(y_new, y_pred)
    lr_report = classification_report(y_new, y_pred)
    print(f'LR performance for {algo} produced predictions {lr_accuracy}')
    

LR performance for CNN produced predictions 0.16076294277929154
LR performance for DT produced predictions 0.9482288828337875
LR performance for KNN produced predictions 0.4713896457765668
LR performance for LeNet5 produced predictions 1.0
