In [6]:
import warnings

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

from utils import preprocess_data, bayesian_optimisation

# Suppress every warning category for the rest of the kernel session so the
# repeated cross-validation runs below do not flood the cell output.
# NOTE(review): this also hides deprecation/convergence warnings — confirm that is intended.
warnings.filterwarnings("ignore")

In [7]:
# Load the prepared data via the project helper, with standardisation switched off.
# NOTE(review): presumably scaling is skipped because tree splits are unaffected by
# monotonic feature scaling — confirm. `train_df` / `test_df` look like the full
# frames (test_df still carries 'Loan_ID', which is dropped before prediction below).
X_train, X_test, y_train, y_test, train_df, test_df = preprocess_data(standardise=False)

In [8]:
def optimise_decision_tree(criterion, splitter, max_depth, min_samples_split, min_samples_leaf):
    """
    Objective function for the hyper-parameter search.

    Builds a DecisionTreeClassifier from the given numeric hyper-parameters and returns its
    mean accuracy over 5-fold cross validation on the notebook-level X_train / y_train.

    Every argument is truncated with int() before use. `criterion` and `splitter` are then
    treated as codes: 0 -> 'gini' / 'best', 1 -> 'entropy' / 'random'.
    See https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
    for the meaning of each hyper-parameter.
    """
    criterion_names = {0: 'gini', 1: 'entropy'}
    splitter_names = {0: 'best', 1: 'random'}

    # Decode the numeric search-space point into constructor arguments.
    tree_kwargs = {
        'criterion': criterion_names[int(criterion)],
        'splitter': splitter_names[int(splitter)],
        'max_depth': int(max_depth),
        'min_samples_split': int(min_samples_split),
        'min_samples_leaf': int(min_samples_leaf),
        'random_state': 42,  # fixed so the same point always scores the same
    }
    tree = DecisionTreeClassifier(**tree_kwargs)

    fold_scores = cross_val_score(tree, X_train, y_train, cv=5, scoring='accuracy')
    return fold_scores.mean()


In [9]:
def _truncation_bounds(low, high):
    """Return continuous bounds whose int() truncation yields low..high inclusive, equally weighted."""
    # Every dimension is decoded with int() (in optimise_decision_tree and again when the
    # best model is rebuilt). On a plain [low, high] range the top value is only reached at
    # exactly `high`, so e.g. 'entropy' / 'random' would practically never be tried.
    # Stretching the upper bound to the largest float below high + 1 gives each integer
    # an interval of width one while int() still never exceeds `high`.
    return (low, np.nextafter(high + 1.0, low))


# Seed NumPy's global RNG so the initial design (and any np.random use further down the
# pipeline) is repeatable across Restart & Run All.
np.random.seed(42)

param_space = np.array([
    _truncation_bounds(0, 1),   # criterion: 0 -> 'gini', 1 -> 'entropy'
    _truncation_bounds(0, 1),   # splitter: 0 -> 'best', 1 -> 'random'
    _truncation_bounds(1, 20),  # max_depth
    _truncation_bounds(2, 20),  # min_samples_split (scikit-learn requires >= 2 for ints)
    _truncation_bounds(1, 20),  # min_samples_leaf
])

n_iters = 25
initial_samples = 5

# Initial random samples: one row per sample, one column per hyper-parameter.
x0 = np.random.uniform(param_space[:, 0], param_space[:, 1], size=(initial_samples, param_space.shape[0]))
# Score each initial point with the cross-validated objective.
y0 = np.array([optimise_decision_tree(*params) for params in x0])

# Passed through to bayesian_optimisation.
# NOTE(review): presumably forwarded to the Gaussian-process regressor (alpha = noise/jitter term) — confirm in utils.
gp_params = {"alpha": 1e-6}

In [10]:
# Run the search, starting from the initial design; targets are passed as a column vector.
X_sample, Y_sample, gpr = bayesian_optimisation(
    n_iters,
    optimise_decision_tree,
    param_space,
    x0,
    y0.reshape(-1, 1),
    gp_params,
)

# Pick the evaluated point with the highest cross-validated accuracy.
best_idx = np.argmax(Y_sample)
best_params, best_accuracy = X_sample[best_idx], Y_sample[best_idx]

print(f"Best accuracy: {best_accuracy}")

Best accuracy: [0.82275819]


In [11]:
# Rebuild the winning tree. The decoding (int() truncation, 0/1 codes) must mirror
# optimise_decision_tree so the fitted model is the configuration that was scored.
best_model = DecisionTreeClassifier(
    criterion=['gini', 'entropy'][int(best_params[0])],
    splitter=['best', 'random'][int(best_params[1])],
    max_depth=int(best_params[2]),
    min_samples_split=int(best_params[3]),
    min_samples_leaf=int(best_params[4]),
    random_state=42
)
best_model.fit(X_train, y_train)

# Features for prediction: everything except the identifier. 'Loan_Status' is also
# dropped when present because this cell writes it into test_df below; without this,
# re-running the cell would feed the previous 'Y'/'N' predictions to predict().
X_test_final = test_df.drop(columns=['Loan_ID']).drop(columns=['Loan_Status'], errors='ignore')

# Predict loan status using the trained Decision Tree model
y_test_pred = best_model.predict(X_test_final)
# Map the numeric class back to the 'Y' / 'N' labels.
test_df['Loan_Status'] = np.where(y_test_pred == 1, 'Y', 'N')
# Save the file for any future use as test data
target_filename = "data/loan_sanction_test_with_predictions_decision_tree.csv"
test_df.to_csv(target_filename, index=False)
print(f"Predictions have been saved to {target_filename}.")

Predictions have been saved to data/loan_sanction_test_with_predictions_decision_tree.csv.


In [72]:
# Repeats the prediction/export step of the model-fitting cell using the already
# fitted `best_model`. By the time this runs, test_df may already carry a
# 'Loan_Status' column of 'Y'/'N' strings from a previous prediction, so it is
# dropped together with the identifier to keep the feature matrix valid.
X_test_final = test_df.drop(columns=['Loan_ID']).drop(columns=['Loan_Status'], errors='ignore')
# Predict loan status using the trained Decision Tree model
y_test_pred = best_model.predict(X_test_final)
# Map the numeric class back to the 'Y' / 'N' labels.
test_df['Loan_Status'] = np.where(y_test_pred == 1, 'Y', 'N')
# Save the file
target_filename = "data/loan_sanction_test_with_predictions_decision_tree.csv"
test_df.to_csv(target_filename, index=False)
print(f"Predictions have been saved to {target_filename}.")

Best accuracy: [0.82073799]


Predictions have been saved to data/loan_sanction_test_with_predictions_decision_tree.csv.
