In [2]:
import warnings

import numpy as np
import pandas as pd
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

from utils import preprocess_data, bayesian_optimisation, write_new_data_file

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries imported above; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

2024-08-05 15:09:35.346219: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# Load the train/test splits through the project helper, requesting
# standardised features. The helper returns six values: feature and label
# arrays for both splits plus two frames (train_df, test_df).
# NOTE(review): preprocess_data lives in utils and is not visible here; the
# scaler it fits (if any) is not returned, so it cannot be reused later on.
X_train, X_test, y_train, y_test, train_df, test_df = preprocess_data(standardise=True)

In [11]:
def create_cnn_model(learning_rate, dropout_rate, num_filters, kernel_size):
    """Build and compile a single-convolution 1D-CNN binary classifier.

    The network is Input -> Conv1D(relu) -> Flatten -> Dropout ->
    Dense(1, sigmoid), compiled with Adam, binary cross-entropy loss and the
    accuracy metric.

    Args:
        learning_rate: Learning rate handed to the Adam optimiser.
        dropout_rate: Rate passed to the Dropout layer.
        num_filters: Number of convolution filters. Truncated with ``int()``
            so continuous proposals from the optimiser can be passed as-is.
        kernel_size: Convolution window length. Truncated with ``int()``.

    Returns:
        The compiled (untrained) ``Sequential`` model.

    Note:
        The input width is read from the notebook-level ``X_train``, so that
        variable must be defined before this function is called.
    """
    n_features = X_train.shape[1]

    # Declare the input with an explicit Input layer instead of passing
    # ``input_shape`` to the first layer, which Keras 3 warns against for
    # Sequential models. Each feature is treated as one step with one channel.
    model = Sequential([
        Input(shape=(n_features, 1)),
        Conv1D(filters=int(num_filters), kernel_size=int(kernel_size), activation='relu'),
        Flatten(),
        Dropout(rate=dropout_rate),
        Dense(1, activation='sigmoid'),
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model


In [12]:
def optimise_cnn(learning_rate, dropout_rate, num_filters, kernel_size):
    """Score one hyperparameter setting by 3-fold cross-validated accuracy.

    Wraps ``create_cnn_model`` in a scikeras ``KerasClassifier`` (10 epochs,
    batch size 10, silent) and evaluates it on the notebook-level
    ``X_train`` / ``y_train`` with ``cross_val_score``.

    Args:
        learning_rate: Forwarded to ``create_cnn_model``.
        dropout_rate: Forwarded to ``create_cnn_model``.
        num_filters: Forwarded to ``create_cnn_model``.
        kernel_size: Forwarded to ``create_cnn_model``.

    Returns:
        Mean accuracy over the three folds.
    """
    # ``model=`` is the supported scikeras argument; ``build_fn=`` is its
    # deprecated alias. The extra keyword arguments are routed by scikeras to
    # the model-building function.
    classifier = KerasClassifier(
        model=create_cnn_model,
        learning_rate=learning_rate,
        dropout_rate=dropout_rate,
        num_filters=num_filters,
        kernel_size=kernel_size,
        epochs=10,
        batch_size=10,
        verbose=0,
    )

    fold_scores = cross_val_score(classifier, X_train, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()

In [13]:
# Seed NumPy's global RNG so the initial random design below is the same after
# every kernel restart. TensorFlow's weight initialisation and batch shuffling
# are not seeded here, so the cross-validation scores can still vary run to run.
SEED = 42
np.random.seed(SEED)

# Search bounds, one (low, high) row per hyperparameter, in the positional
# order expected by optimise_cnn.
param_space = np.array([
    (0.0001, 0.01),     # learning_rate
    (0.1, 0.5),         # dropout_rate
    (10, 50),           # num_filters
    (2, 5)              # kernel_size
])
n_iters = 25            # passed to bayesian_optimisation as its first argument
initial_samples = 5     # number of random points evaluated up front

# Initial random samples: draw uniformly inside the bounds, then score each row.
x0 = np.random.uniform(param_space[:, 0], param_space[:, 1], size=(initial_samples, param_space.shape[0]))
y0 = np.array([optimise_cnn(*params) for params in x0])

# Handed to bayesian_optimisation; presumably forwarded to its Gaussian-process
# regressor -- confirm against utils.
gp_params = {"alpha": 1e-6}

In [14]:
# Run the search, passing the initial random evaluations (x0, y0) along with
# the bounds and the objective. y0 is reshaped into a single column.
X_sample, Y_sample, gpr = bayesian_optimisation(
    n_iters,
    optimise_cnn,
    param_space,
    x0,
    y0.reshape(-1, 1),
    gp_params,
)

# Best parameters: the row whose entry in Y_sample is largest.
best_idx = np.argmax(Y_sample)
best_params, best_accuracy = X_sample[best_idx], Y_sample[best_idx]

print(f"Best accuracy: {best_accuracy}")

Best accuracy: [0.82076413]


In [15]:
# Rebuild the network from the best row found by the search. The names are
# paired with best_params by position: index 0 is learning_rate, 1 is
# dropout_rate, 2 is num_filters and 3 is kernel_size.
hyperparameter_names = ("learning_rate", "dropout_rate", "num_filters", "kernel_size")
best_model = create_cnn_model(**dict(zip(hyperparameter_names, best_params)))

# Train on the full training split (50 epochs here versus 10 during the search).
best_model.fit(X_train, y_train, epochs=50, batch_size=10, verbose=0)

# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class labels.
y_pred = (best_model.predict(X_test) > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized CNN Accuracy: {accuracy}")
X_test.shape

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
Optimized CNN Accuracy: 0.7642276422764228


(123, 13)

In [16]:
# Hand the fitted model, the training split and test_df to the project helper,
# which writes its result to target_filename.
# NOTE(review): write_new_data_file is defined in utils and not visible here;
# the recorded output suggests it evaluates the model, predicts, and saves a CSV.
target_filename = "data/loan_sanction_test_with_predictions_cnn.csv"
write_new_data_file(best_model, X_train, y_train, test_df, target_filename)

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8465 - loss: 0.3862 
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 711us/step
Predictions have been saved to data/loan_sanction_test_with_predictions_cnn.csv.


In [28]:
# Compare the CNN's labels against the Loan_Status column stored in each
# prediction file. Going by the file names, that column presumably holds the
# labels produced by the named model, so the score below measures agreement
# with that model rather than accuracy against ground truth -- confirm.
prediction_files = {
    "DT": 'data/loan_sanction_test_with_predictions_decision_tree.csv',
    "KNN": 'data/loan_sanction_test_with_predictions_knn.csv',
    "LeNet5": 'data/loan_sanction_test_with_predictions_lenet5.csv',
    "LR": 'data/loan_sanction_test_with_predictions_lr.csv',
}

for algo, filename in prediction_files.items():
    test_df_new = pd.read_csv(filename)
    X_new = test_df_new.drop(columns=['Loan_ID', 'Loan_Status'])
    y_new = test_df_new['Loan_Status']

    # NOTE(review): a brand-new scaler is fitted on each file, so these
    # features are not scaled with the statistics used for X_train. The
    # scaler used inside preprocess_data is not exposed to this notebook;
    # ideally that fitted scaler would be reused here.
    scaler = StandardScaler()
    X_new = scaler.fit_transform(X_new)

    # Threshold the sigmoid outputs and flatten the (n, 1) column to 1-D.
    # A separate name keeps the held-out y_pred from the evaluation cell intact.
    cnn_labels = (best_model.predict(X_new) > 0.5).astype(int).ravel()

    agreement = accuracy_score(y_new, cnn_labels)
    print(f'CNN Performance for {algo} produced predictions {agreement}')

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 690us/step
CNN Performance for DT produced predictions 0.9182561307901907
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 689us/step
CNN Performance for KNN produced predictions 0.44686648501362397
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 666us/step
CNN Performance for LeNet5 produced predictions 0.9536784741144414
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 682us/step
CNN Performance for LR produced predictions 0.9536784741144414
