In [9]:
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

from utils import preprocess_data, bayesian_optimisation, write_new_data_file

warnings.filterwarnings("ignore")

In [2]:
X_train, X_test, y_train, y_test, train_df, test_df = preprocess_data(standardise=False)

Optimise the hyperparameters of `Decision Tree` by using `cross-validation` to evaluate different configurations

In [3]:
def optimise_decision_tree(criterion, splitter, max_depth, min_samples_split, min_samples_leaf):
    """
    Returns the accuracy based and default 5-fold cross validation score of decision trees of different parameters
    See https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html for more details.
    """
    criterion_map = {0: 'gini', 1: 'entropy'}
    splitter_map = {0: 'best', 1: 'random'}

    model = DecisionTreeClassifier(
        criterion=criterion_map[int(criterion)],
        splitter=splitter_map[int(splitter)],
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        random_state=42
    )

    return cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()


In [4]:
param_space = np.array([
    (0, 1),  # criterion: 'gini' or 'entropy'
    (0, 1),  # splitter: 'best' or 'random'
    (1, 20),  # max_depth
    (2, 20),  # min_samples_split
    (1, 20)   # min_samples_leaf
])

n_iters = 25
initial_samples = 5

# Initial random samples
x0 = np.random.uniform(param_space[:, 0], param_space[:, 1], size=(initial_samples, param_space.shape[0]))
y0 = np.array([optimise_decision_tree(*params) for params in x0])

gp_params = {"alpha": 1e-6}

Carry out the actual `Bayesian Optimisation`

In [5]:
X_sample, Y_sample, gpr = bayesian_optimisation(n_iters, optimise_decision_tree, param_space, x0, y0.reshape(-1, 1), gp_params)

# Best parameters
best_idx = np.argmax(Y_sample)
best_params = X_sample[best_idx]
best_accuracy = Y_sample[best_idx]

print(f"Best accuracy: {best_accuracy}")

Best accuracy: [0.82275819]


Create the model with the optimised hyperparameters and save the predictions to the test data to be validated with the rest of the models in the project


In [6]:
best_model = DecisionTreeClassifier(
    criterion=['gini', 'entropy'][int(best_params[0])],
    splitter=['best', 'random'][int(best_params[1])],
    max_depth=int(best_params[2]),
    min_samples_split=int(best_params[3]),
    min_samples_leaf=int(best_params[4]),
    random_state=42
)
# Save the file for any future use as test datax
target_filename = "data/loan_sanction_test_with_predictions_decision_tree.csv"
write_new_data_file(best_model, X_train, y_train, test_df, target_filename)

Predictions have been saved to data/loan_sanction_test_with_predictions_decision_tree.csv.


Cross-evaluate the model against the validation data file created using other models in this project

In [13]:
for algo, filename in {
    "CNN" : 'data/loan_sanction_test_with_predictions_cnn.csv',
    "KNN": 'data/loan_sanction_test_with_predictions_knn.csv',
    "LeNet5": 'data/loan_sanction_test_with_predictions_lenet5.csv',
    "LR": 'data/loan_sanction_test_with_predictions_lr.csv',

}.items():
    test_df_new = pd.read_csv(filename)
    X_new = test_df_new.drop(columns=['Loan_ID', 'Loan_Status'])
    y_new = test_df_new['Loan_Status']
    y_pred = best_model.predict(X_new)
    y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to class labels

    lr_accuracy = accuracy_score(y_new, y_pred)
    lr_report = classification_report(y_new, y_pred)
    print(f'DT Performance for {algo} produced predictions {lr_accuracy}')

DT Performance for CNN produced predictions 0.2125340599455041
DT Performance for KNN produced predictions 0.4904632152588556
DT Performance for LeNet5 produced predictions 0.9482288828337875
DT Performance for LR produced predictions 0.9482288828337875
