In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder 
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier 

In [6]:
df = pd.read_csv("Loan_Default_Cleaned.csv")
df.head()

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


In [11]:
x = df.drop( ["ID"], axis=1)

In [12]:
# numeric and categorical separation
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# fill missing numeric values
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# fill missing categorical values
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

In [13]:
le = LabelEncoder()

for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))

In [16]:
x = df.drop("Status", axis=1)
y = df["Status"]

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [18]:
xgb = XGBClassifier(eval_metric="logloss")

xgb.fit(x_train, y_train)
y_pred_xgb = xgb.predict_proba(x_test)[:,1]

auc_xgb = roc_auc_score(y_test, y_pred_xgb)
print("XGBoost AUC-ROC:", auc_xgb)

XGBoost AUC-ROC: 0.99998748714669


In [20]:
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1]
}

grid_xgb = GridSearchCV(
    estimator=XGBClassifier(eval_metric="logloss"),
    param_grid=xgb_param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1
)

grid_xgb.fit(x_train, y_train)

print("Best XGBoost Params:", grid_xgb.best_params_)
print("Best XGBoost AUC:", grid_xgb.best_score_)

Best XGBoost Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Best XGBoost AUC: 0.9999928071063083


In [21]:
lgbm = LGBMClassifier()

lgbm.fit(x_train, y_train)
y_pred_lgbm = lgbm.predict_proba(x_test)[:,1]

auc_lgbm = roc_auc_score(y_test, y_pred_lgbm)
print("LightGBM AUC-ROC:", auc_lgbm)

[LightGBM] [Info] Number of positive: 29311, number of negative: 89625
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007655 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2189
[LightGBM] [Info] Number of data points in the train set: 118936, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.246443 -> initscore=-1.117671
[LightGBM] [Info] Start training from score -1.117671
LightGBM AUC-ROC: 1.0


In [22]:
lgbm_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [-1, 5, 10],
    'learning_rate': [0.01, 0.1]
}

grid_lgbm = GridSearchCV(
    estimator=LGBMClassifier(),
    param_grid=lgbm_param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1
)

grid_lgbm.fit(x_train, y_train)

print("Best LightGBM Params:", grid_lgbm.best_params_)
print("Best LightGBM AUC:", grid_lgbm.best_score_)

[LightGBM] [Info] Number of positive: 29311, number of negative: 89625
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006494 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2189
[LightGBM] [Info] Number of data points in the train set: 118936, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.246443 -> initscore=-1.117671
[LightGBM] [Info] Start training from score -1.117671
Best LightGBM Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
Best LightGBM AUC: 0.9999927795369582


In [23]:
print("Baseline Logistic Regression AUC: <PUT YOUR VALUE HERE>")
print("XGBoost AUC:", auc_xgb)
print("LightGBM AUC:", auc_lgbm)

print("\nBest Tuned XGBoost AUC:", grid_xgb.best_score_)
print("Best Tuned LightGBM AUC:", grid_lgbm.best_score_)

Baseline Logistic Regression AUC: <PUT YOUR VALUE HERE>
XGBoost AUC: 0.99998748714669
LightGBM AUC: 1.0

Best Tuned XGBoost AUC: 0.9999928071063083
Best Tuned LightGBM AUC: 0.9999927795369582
