In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from Preprocessing import preprocess_data

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss
from sklearn.model_selection import RandomizedSearchCV
import warnings

In [22]:
# Getting the train and test set
file_path = 'data\loan.csv'
target_column = 'LoanApproved'
X_train, X_test, y_train, y_test = preprocess_data(file_path, target_column, test_size=0.33,
                                                    random_state=42, apply_pca=True, pca_variance=0.95)

In [31]:
# Defining our performance metrics
def performance_metrics(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1 = f1_score(true, predicted)
    roc_auc = roc_auc_score(true, predicted)
    log = log_loss(true, predicted)

    return accuracy, precision, recall, f1, roc_auc, log

In [None]:
# listing all the models we are going to use
models = {
    'LogisticRegression':LogisticRegression(),
    'DecisionTreeClassifier':DecisionTreeClassifier(),
    'RandomForestClassifier':RandomForestClassifier(),
    'KNeighborsClassifier':KNeighborsClassifier(),
    'SVC':SVC(),
    'CatBoostClassifier':CatBoostClassifier(),
    'XGBClassifier':XGBClassifier(),
    'LGBMClassifier':LGBMClassifier()
}


def model_trainer(models, X_train, X_test, y_train, y_test):
    result = []
    for name, model in models.items():
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        accuracy_train, precision_train, recall_train, f1_train, roc_auc_train, log_loss_train = performance_metrics(y_train, y_train_pred)
        accuracy_test, precision_test, recall_test, f1_test, roc_auc_test, log_loss_test = performance_metrics(y_test, y_test_pred)

        result.append({
            "model":name,
            "accuracy_train":accuracy_train,
            "accuracy_test":accuracy_test,
            "precision_train":precision_train,
            "precision_test":precision_test,
            "recall_train":recall_train,
            "recall_test":recall_test,
            "f1_train":f1_train,
            "f1_test":f1_test,
            "roc_auc_train":roc_auc_train,
            "roc_auc_test":roc_auc_test,
            "log_loss_train":log_loss_train,
            "log_loss_test":log_loss_test,
        })

    result_df = pd.DataFrame(result)
    return result_df

In [33]:
model_trainer(models, X_train, X_test, y_train, y_test)

Learning rate set to 0.031204
0:	learn: 0.6393814	total: 461ms	remaining: 7m 40s
1:	learn: 0.5959185	total: 524ms	remaining: 4m 21s
2:	learn: 0.5515492	total: 578ms	remaining: 3m 12s
3:	learn: 0.5129607	total: 637ms	remaining: 2m 38s
4:	learn: 0.4759605	total: 713ms	remaining: 2m 21s
5:	learn: 0.4523956	total: 839ms	remaining: 2m 18s
6:	learn: 0.4289849	total: 874ms	remaining: 2m 4s
7:	learn: 0.4007523	total: 911ms	remaining: 1m 53s
8:	learn: 0.3797692	total: 1.23s	remaining: 2m 15s
9:	learn: 0.3585237	total: 1.57s	remaining: 2m 35s
10:	learn: 0.3403799	total: 1.61s	remaining: 2m 24s
11:	learn: 0.3236854	total: 1.65s	remaining: 2m 15s
12:	learn: 0.3082585	total: 1.73s	remaining: 2m 11s
13:	learn: 0.2977382	total: 1.77s	remaining: 2m 4s
14:	learn: 0.2842719	total: 1.82s	remaining: 1m 59s
15:	learn: 0.2761892	total: 1.85s	remaining: 1m 54s
16:	learn: 0.2649208	total: 1.89s	remaining: 1m 49s
17:	learn: 0.2554826	total: 1.93s	remaining: 1m 45s
18:	learn: 0.2460881	total: 1.96s	remaining: 1

Unnamed: 0,model,accuracy_train,accuracy_test,precision_train,precision_test,recall_train,recall_test,f1_train,f1_test,roc_auc_train,roc_auc_test,log_loss_train,log_loss_test
0,LogisticRegression,0.991269,0.992879,0.983164,0.989428,0.97974,0.981493,0.981449,0.985444,0.987283,0.989039,0.3147095,0.256675
1,DecisionTreeClassifier,1.0,0.935606,1.0,0.866871,1.0,0.871684,1.0,0.869271,1.0,0.914051,2.220446e-16,2.320993
2,RandomForestClassifier,1.0,0.964242,1.0,0.975945,1.0,0.876002,1.0,0.923277,1.0,0.934486,2.220446e-16,1.288834
3,KNeighborsClassifier,0.969627,0.946515,0.964238,0.947109,0.904717,0.828501,0.933529,0.883843,0.947183,0.906719,1.094759,1.927789
4,SVC,0.997463,0.993788,0.997136,0.992519,0.992086,0.98211,0.994605,0.987287,0.995604,0.98985,0.09145405,0.223908
5,CatBoostClassifier,1.0,0.991515,1.0,0.991829,1.0,0.973473,1.0,0.982565,1.0,0.985431,2.220446e-16,0.305825
6,XGBClassifier,1.0,0.987576,1.0,0.983658,1.0,0.965453,1.0,0.974471,1.0,0.980116,2.220446e-16,0.447815
7,LGBMClassifier,1.0,0.983636,1.0,0.97489,1.0,0.958051,1.0,0.966397,1.0,0.975008,2.220446e-16,0.589805


### Observations: 
- SVC is the best model without the need for hyperparameter tuning, it has a high accuracy(0.993788), precision(0.992519), recall(0.982110), f1(0.987287) and a low log loss on the test set
- DecisionTreeClassifier is our worst model
- Some models show signs of overfitting, having a very high accuracy(1.000000) on the train set which shows lack of generalization.
### Next Step:
- Perform hyperparameter tuning for overfitted models (e.g., RandomForest, Gradient Boosting).
- Reassess KNeighborsClassifier with tuned parameters to improve generalization.
- Use ensemble methods like stacking for further performance boosts.
- Analyze the results