In [1]:
import pandas as pd

pd.set_option('display.max_columns', None)
df = pd.read_csv("../dataset/Loan_Default.csv")
df.head()


Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,Neg_ammortization,interest_only,lump_sum_payment,property_value,construction_type,occupancy_type,Secured_by,total_units,income,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,116500,,,,360.0,not_neg,not_int,not_lpsm,118000.0,sb,pr,home,1U,1740.0,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,206500,,,,360.0,not_neg,not_int,lpsm,,sb,pr,home,1U,4980.0,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,406500,4.56,0.2,595.0,360.0,neg_amm,not_int,not_lpsm,508000.0,sb,pr,home,1U,9480.0,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,456500,4.25,0.681,,360.0,not_neg,not_int,not_lpsm,658000.0,sb,pr,home,1U,11880.0,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,696500,4.0,0.3042,0.0,360.0,not_neg,not_int,not_lpsm,758000.0,sb,pr,home,1U,10440.0,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


In [2]:
def calculate_credit_worthiness_business(
    credit_score, income_usd, dtir1, open_credit, loan_amount_usd,
    business_or_commercial, LTV, term, rate_of_interest,
    neg_ammortization, interest_only, lump_sum_payment
):
    score = 0

    # Credit score
    if credit_score >= 700:
        score += 2
    elif credit_score >= 600:
        score += 1

    # Income (in USD)
    if income_usd > (50000/325):
        score += 2
    elif income_usd > (30000/325):
        score += 1

    # DTI
    if dtir1 < 35:
        score += 2
    elif dtir1 < 45:
        score += 1

    # Open credit lines
    if open_credit <= 3:
        score += 1
    elif open_credit > 7:
        score -= 1

    # Business or commercial loan
    if business_or_commercial == "YES":
        score += 1

    # LTV
    if LTV < 80:
        score += 1
    elif LTV > 90:
        score -= 1

    # Term
    if term < 240:
        score += 1

    # Rate of interest
    if rate_of_interest < 10:
        score += 1
    elif rate_of_interest > 15:
        score -= 1

    # Loan features
    if neg_ammortization == "No":
        score += 1
    if interest_only == "No":
        score += 1
    if lump_sum_payment == "Yes":
        score += 1

    # Final decision
    if score >= 10:
        return "Good"
    elif score >= 6:
        return "Average"
    else:
        return "Bad"

In [3]:
df=df.drop(columns=['Credit_Worthiness'])

In [4]:
df=df[(df['loan_amount'] >= 500000)]
df.shape

(23007, 33)

In [5]:
# Convert columns to appropriate types
df['open_credit'] = pd.to_numeric(df['open_credit'], errors='coerce')


df['Credit_Worthiness'] = df.apply(
    lambda row: calculate_credit_worthiness_business(
        row['Credit_Score'], row['income'], row['dtir1'], row['open_credit'],
        row['loan_amount'], row['business_or_commercial'], row['LTV'], row['term'], row['rate_of_interest'],
        row['Neg_ammortization'], row['interest_only'], row['lump_sum_payment']
    ),
    axis=1
)

In [6]:
df = df.drop(columns=[
    'Credit_Score', 'income', 'open_credit', 'age', 'term',
    'Neg_ammortization', 'interest_only', 'lump_sum_payment'
])


In [7]:
df

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,business_or_commercial,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,property_value,construction_type,occupancy_type,Secured_by,total_units,credit_type,co-applicant_credit_type,submission_of_application,LTV,Region,Security_Type,Status,dtir1,Credit_Worthiness
4,24894,2019,cf,Joint,pre,type1,p1,nob/c,696500,4.000,0.3042,0.00,758000.0,sb,pr,home,1U,CRIF,EXP,not_inst,91.886544,North,direct,0,39.0,Bad
5,24895,2019,cf,Joint,pre,type1,p1,nob/c,706500,3.990,0.1523,370.00,1008000.0,sb,pr,home,1U,EXP,EXP,not_inst,70.089286,North,direct,0,40.0,Average
19,24909,2019,cf,Sex Not Available,nopre,type1,p4,nob/c,586500,3.175,-0.3446,650.00,748000.0,sb,sr,home,1U,CIB,EXP,to_inst,78.409091,south,direct,0,31.0,Average
34,24924,2019,cf,Male,pre,type1,p1,nob/c,506500,4.250,0.1470,0.00,538000.0,sb,pr,home,1U,CIB,CIB,not_inst,94.144981,North,direct,0,36.0,Bad
37,24927,2019,cf,Joint,nopre,type1,p4,nob/c,656500,3.375,-0.1555,13016.46,1108000.0,sb,pr,home,1U,EXP,EXP,not_inst,59.250903,North,direct,0,42.0,Average
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148635,173525,2019,cf,Sex Not Available,nopre,type1,p3,nob/c,506500,3.750,0.1448,6305.00,758000.0,sb,pr,home,1U,CRIF,EXP,to_inst,66.820580,south,direct,0,31.0,Average
148639,173529,2019,cf,Sex Not Available,nopre,type2,p3,b/c,586500,3.625,0.7046,0.00,1788000.0,sb,pr,home,1U,CRIF,CIB,not_inst,32.802013,south,direct,0,50.0,Average
148645,173535,2019,cf,Joint,nopre,type3,p1,nob/c,506500,4.375,-0.0339,265.00,508000.0,sb,pr,home,1U,EXP,EXP,not_inst,99.704724,south,direct,0,42.0,Bad
148662,173552,2019,cf,Joint,nopre,type1,p4,nob/c,646500,3.625,0.0743,7639.80,828000.0,sb,pr,home,1U,CIB,EXP,not_inst,78.079710,North,direct,0,31.0,Average


In [8]:
df=df.drop(columns=['LTV'])

In [9]:
df['LTV'] = ( df['loan_amount']/df['property_value'])

In [10]:
df=df.drop(columns=['Interest_rate_spread'])

In [11]:
benchmark_rate = 6.5 
df['Interest_rate_spread'] = df['rate_of_interest'] - benchmark_rate

In [12]:
df=df.drop(columns=['Upfront_charges'])

In [13]:
df['Upfront_charges'] = df['loan_amount'] * 0.01


In [14]:
df

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,business_or_commercial,loan_amount,rate_of_interest,property_value,construction_type,occupancy_type,Secured_by,total_units,credit_type,co-applicant_credit_type,submission_of_application,Region,Security_Type,Status,dtir1,Credit_Worthiness,LTV,Interest_rate_spread,Upfront_charges
4,24894,2019,cf,Joint,pre,type1,p1,nob/c,696500,4.000,758000.0,sb,pr,home,1U,CRIF,EXP,not_inst,North,direct,0,39.0,Bad,0.918865,-2.500,6965.0
5,24895,2019,cf,Joint,pre,type1,p1,nob/c,706500,3.990,1008000.0,sb,pr,home,1U,EXP,EXP,not_inst,North,direct,0,40.0,Average,0.700893,-2.510,7065.0
19,24909,2019,cf,Sex Not Available,nopre,type1,p4,nob/c,586500,3.175,748000.0,sb,sr,home,1U,CIB,EXP,to_inst,south,direct,0,31.0,Average,0.784091,-3.325,5865.0
34,24924,2019,cf,Male,pre,type1,p1,nob/c,506500,4.250,538000.0,sb,pr,home,1U,CIB,CIB,not_inst,North,direct,0,36.0,Bad,0.941450,-2.250,5065.0
37,24927,2019,cf,Joint,nopre,type1,p4,nob/c,656500,3.375,1108000.0,sb,pr,home,1U,EXP,EXP,not_inst,North,direct,0,42.0,Average,0.592509,-3.125,6565.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148635,173525,2019,cf,Sex Not Available,nopre,type1,p3,nob/c,506500,3.750,758000.0,sb,pr,home,1U,CRIF,EXP,to_inst,south,direct,0,31.0,Average,0.668206,-2.750,5065.0
148639,173529,2019,cf,Sex Not Available,nopre,type2,p3,b/c,586500,3.625,1788000.0,sb,pr,home,1U,CRIF,CIB,not_inst,south,direct,0,50.0,Average,0.328020,-2.875,5865.0
148645,173535,2019,cf,Joint,nopre,type3,p1,nob/c,506500,4.375,508000.0,sb,pr,home,1U,EXP,EXP,not_inst,south,direct,0,42.0,Bad,0.997047,-2.125,5065.0
148662,173552,2019,cf,Joint,nopre,type1,p4,nob/c,646500,3.625,828000.0,sb,pr,home,1U,CIB,EXP,not_inst,North,direct,0,31.0,Average,0.780797,-2.875,6465.0


In [15]:
df=df.drop(columns=['loan_amount','property_value','loan_type','loan_purpose','rate_of_interest','construction_type'])

In [16]:
df

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,business_or_commercial,occupancy_type,Secured_by,total_units,credit_type,co-applicant_credit_type,submission_of_application,Region,Security_Type,Status,dtir1,Credit_Worthiness,LTV,Interest_rate_spread,Upfront_charges
4,24894,2019,cf,Joint,pre,nob/c,pr,home,1U,CRIF,EXP,not_inst,North,direct,0,39.0,Bad,0.918865,-2.500,6965.0
5,24895,2019,cf,Joint,pre,nob/c,pr,home,1U,EXP,EXP,not_inst,North,direct,0,40.0,Average,0.700893,-2.510,7065.0
19,24909,2019,cf,Sex Not Available,nopre,nob/c,sr,home,1U,CIB,EXP,to_inst,south,direct,0,31.0,Average,0.784091,-3.325,5865.0
34,24924,2019,cf,Male,pre,nob/c,pr,home,1U,CIB,CIB,not_inst,North,direct,0,36.0,Bad,0.941450,-2.250,5065.0
37,24927,2019,cf,Joint,nopre,nob/c,pr,home,1U,EXP,EXP,not_inst,North,direct,0,42.0,Average,0.592509,-3.125,6565.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148635,173525,2019,cf,Sex Not Available,nopre,nob/c,pr,home,1U,CRIF,EXP,to_inst,south,direct,0,31.0,Average,0.668206,-2.750,5065.0
148639,173529,2019,cf,Sex Not Available,nopre,b/c,pr,home,1U,CRIF,CIB,not_inst,south,direct,0,50.0,Average,0.328020,-2.875,5865.0
148645,173535,2019,cf,Joint,nopre,nob/c,pr,home,1U,EXP,EXP,not_inst,south,direct,0,42.0,Bad,0.997047,-2.125,5065.0
148662,173552,2019,cf,Joint,nopre,nob/c,pr,home,1U,CIB,EXP,not_inst,North,direct,0,31.0,Average,0.780797,-2.875,6465.0


In [17]:
df['occupancy_type'].value_counts()

occupancy_type
pr    21862
ir      903
sr      242
Name: count, dtype: int64

In [18]:
df=df.drop(columns=['occupancy_type'])

In [19]:
df

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,business_or_commercial,Secured_by,total_units,credit_type,co-applicant_credit_type,submission_of_application,Region,Security_Type,Status,dtir1,Credit_Worthiness,LTV,Interest_rate_spread,Upfront_charges
4,24894,2019,cf,Joint,pre,nob/c,home,1U,CRIF,EXP,not_inst,North,direct,0,39.0,Bad,0.918865,-2.500,6965.0
5,24895,2019,cf,Joint,pre,nob/c,home,1U,EXP,EXP,not_inst,North,direct,0,40.0,Average,0.700893,-2.510,7065.0
19,24909,2019,cf,Sex Not Available,nopre,nob/c,home,1U,CIB,EXP,to_inst,south,direct,0,31.0,Average,0.784091,-3.325,5865.0
34,24924,2019,cf,Male,pre,nob/c,home,1U,CIB,CIB,not_inst,North,direct,0,36.0,Bad,0.941450,-2.250,5065.0
37,24927,2019,cf,Joint,nopre,nob/c,home,1U,EXP,EXP,not_inst,North,direct,0,42.0,Average,0.592509,-3.125,6565.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148635,173525,2019,cf,Sex Not Available,nopre,nob/c,home,1U,CRIF,EXP,to_inst,south,direct,0,31.0,Average,0.668206,-2.750,5065.0
148639,173529,2019,cf,Sex Not Available,nopre,b/c,home,1U,CRIF,CIB,not_inst,south,direct,0,50.0,Average,0.328020,-2.875,5865.0
148645,173535,2019,cf,Joint,nopre,nob/c,home,1U,EXP,EXP,not_inst,south,direct,0,42.0,Bad,0.997047,-2.125,5065.0
148662,173552,2019,cf,Joint,nopre,nob/c,home,1U,CIB,EXP,not_inst,North,direct,0,31.0,Average,0.780797,-2.875,6465.0


In [20]:
df['Secured_by'].value_counts()

Secured_by
home    23007
Name: count, dtype: int64

In [21]:
df['total_units'].value_counts()

total_units
1U    22188
2U      475
4U      178
3U      166
Name: count, dtype: int64

In [22]:
df['credit_type'].value_counts()

credit_type
CIB     7647
CRIF    6833
EXP     6232
EQUI    2295
Name: count, dtype: int64

In [23]:
df=df.drop(columns=['Security_Type'])

In [24]:
df

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,business_or_commercial,Secured_by,total_units,credit_type,co-applicant_credit_type,submission_of_application,Region,Status,dtir1,Credit_Worthiness,LTV,Interest_rate_spread,Upfront_charges
4,24894,2019,cf,Joint,pre,nob/c,home,1U,CRIF,EXP,not_inst,North,0,39.0,Bad,0.918865,-2.500,6965.0
5,24895,2019,cf,Joint,pre,nob/c,home,1U,EXP,EXP,not_inst,North,0,40.0,Average,0.700893,-2.510,7065.0
19,24909,2019,cf,Sex Not Available,nopre,nob/c,home,1U,CIB,EXP,to_inst,south,0,31.0,Average,0.784091,-3.325,5865.0
34,24924,2019,cf,Male,pre,nob/c,home,1U,CIB,CIB,not_inst,North,0,36.0,Bad,0.941450,-2.250,5065.0
37,24927,2019,cf,Joint,nopre,nob/c,home,1U,EXP,EXP,not_inst,North,0,42.0,Average,0.592509,-3.125,6565.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148635,173525,2019,cf,Sex Not Available,nopre,nob/c,home,1U,CRIF,EXP,to_inst,south,0,31.0,Average,0.668206,-2.750,5065.0
148639,173529,2019,cf,Sex Not Available,nopre,b/c,home,1U,CRIF,CIB,not_inst,south,0,50.0,Average,0.328020,-2.875,5865.0
148645,173535,2019,cf,Joint,nopre,nob/c,home,1U,EXP,EXP,not_inst,south,0,42.0,Bad,0.997047,-2.125,5065.0
148662,173552,2019,cf,Joint,nopre,nob/c,home,1U,CIB,EXP,not_inst,North,0,31.0,Average,0.780797,-2.875,6465.0


In [25]:
y = df['Status']
X = df[['loan_limit', 'LTV', 'dtir1','Credit_Worthiness','credit_type','Interest_rate_spread','Region','business_or_commercial']]
X.columns

Index(['loan_limit', 'LTV', 'dtir1', 'Credit_Worthiness', 'credit_type',
       'Interest_rate_spread', 'Region', 'business_or_commercial'],
      dtype='object')

In [26]:
X = X.fillna(X.median(numeric_only=True))
X = pd.get_dummies(X, drop_first=True)
X.isna().sum()

LTV                             0
dtir1                           0
Interest_rate_spread            0
loan_limit_ncf                  0
Credit_Worthiness_Bad           0
credit_type_CRIF                0
credit_type_EQUI                0
credit_type_EXP                 0
Region_North-East               0
Region_central                  0
Region_south                    0
business_or_commercial_nob/c    0
dtype: int64

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [28]:
# %pip install imbalanced-learn

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, Y_resampled = ros.fit_resample(X_train, y_train)

In [29]:
Y_resampled.value_counts()

Status
0    14237
1    14237
Name: count, dtype: int64

In [30]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,   # limit tree depth
    random_state=42
)


# Train
rf.fit(X_resampled, Y_resampled)


In [31]:
y_pred= rf.predict(X_test)
y_pred

array([0, 0, 1, ..., 0, 1, 0])

In [32]:
rf.score(X_test, y_test)

0.9404606692742286

In [33]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names=[str(c) for c in rf.classes_]))

              precision    recall  f1-score   support

           0       0.99      0.94      0.96      3560
           1       0.81      0.95      0.88      1042

    accuracy                           0.94      4602
   macro avg       0.90      0.95      0.92      4602
weighted avg       0.95      0.94      0.94      4602



In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate(model, X_test, y_test):
    preds = model.predict(X_test)
    return {
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds),
        "Recall": recall_score(y_test, preds),
        "F1": f1_score(y_test, preds),
        "ROC-AUC": roc_auc_score(y_test, preds)
    }

print("Random Forest:", evaluate(rf, X_test, y_test))


Random Forest: {'Accuracy': 0.9404606692742286, 'Precision': 0.8142389525368249, 'Recall': 0.95489443378119, 'F1': 0.8789752650176678, 'ROC-AUC': 0.9455651944186848}


In [35]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf, X, y, cv=5, scoring="accuracy")
print("Cross-validation Accuracy:", scores.mean())


Cross-validation Accuracy: 0.9158948402370062


In [36]:
import joblib
joblib.dump(rf, 'business_loan_default_model.pkl')

['business_loan_default_model.pkl']

In [37]:
import joblib
joblib.dump(list(X.columns), 'business_X_columns.pkl')

['business_X_columns.pkl']