# Process data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# Load dataset
df = pd.read_csv("loan_approval_dataset.csv")
# Normalise the header *before* any column lookup so that stray whitespace
# around a column name in the CSV cannot break the drop below.
df.columns = df.columns.str.strip()
df = df.drop(columns=["loan_id"])

# Encode the target as 0/1. `map` turns any unexpected label into NaN, so fail
# loudly here instead of letting NaN labels reach the split or the model.
label_map = {'Rejected': 0, 'Approved': 1}
df["loan_status"] = df["loan_status"].str.strip().map(label_map)
if df["loan_status"].isna().any():
    raise ValueError("loan_status contains values other than 'Rejected'/'Approved'")

X = df.drop(columns=["loan_status"])
y = df["loan_status"]

# Identify numeric and categorical columns
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X.select_dtypes(include=["object", "category"]).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Split before scaling so the scaler statistics come from the training rows only
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42, stratify=y)

# Work on explicit copies: the split frames are derived from X_encoded and are
# modified in place below.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale only numerical columns (fit on train, reuse the fitted scaler on test)
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Train the best model (fixed hyperparameters)

In [2]:
# Fixed hyperparameters for the gradient-boosted classifier
xgb_params = {
    "max_depth": 7,
    "learning_rate": 0.1,
    "n_estimators": 100,
    "eval_metric": 'logloss',
    "random_state": 42,
}
xgb = XGBClassifier(**xgb_params)

# Fit on the training split
xgb.fit(X_train, y_train)

# Predicted labels for the held-out split
y_pred = xgb.predict(X_test)

## Demo pt.1

In [3]:
def predict_from_similar_scores(cibil_score, loan_amount, income_annum, loan_term, atol=5):
    """Predict approval for a partially specified applicant and print the verdict.

    Only four features are supplied by the caller. The remaining features are
    filled in from training rows whose CIBIL score lies within ``atol`` of
    ``cibil_score`` (column mean for numerical features, most frequent value
    for the one-hot columns). If no training row is that close, the whole
    training set is used instead.

    Parameters
    ----------
    cibil_score, loan_amount, income_annum, loan_term : number
        Applicant values in raw (unscaled) units.
    atol : number, default 5
        Absolute tolerance, in raw CIBIL points, used to select similar rows.

    Returns
    -------
    None
        Prints "Loan Approved!" or "Loan Rejected.".
    """
    numeric = list(numerical_cols)
    numeric_set = set(numeric)

    # X_train holds standardized values, while the arguments are in raw units.
    # Undo the scaling so the similarity search and the prototype are both
    # expressed in raw units.
    train_raw = X_train.copy()
    train_raw[numeric] = scaler.inverse_transform(X_train[numeric])

    similar_rows = train_raw[np.isclose(train_raw['cibil_score'], cibil_score, atol=atol)]
    if similar_rows.empty:
        similar_rows = train_raw

    # Prototype applicant in raw units
    prototype = {
        col: similar_rows[col].mean() if col in numeric_set else similar_rows[col].mode()[0]
        for col in X_train.columns
    }
    prototype['cibil_score'] = cibil_score
    prototype['loan_amount'] = loan_amount
    prototype['income_annum'] = income_annum
    prototype['loan_term'] = loan_term

    # One-row frame with the training column order; scale exactly once.
    sample_df = pd.DataFrame([prototype], columns=X_train.columns)
    sample_df[numeric] = scaler.transform(sample_df[numeric].astype('float64'))
    # Give the non-numerical columns the same dtypes the model was trained on.
    sample_df = sample_df.astype(
        {col: X_train[col].dtype for col in X_train.columns if col not in numeric_set}
    )

    pred = xgb.predict(sample_df)[0]

    print("Loan Approved!" if pred == 1 else "Loan Rejected.")

### Tung Tung Tung Tung Tung Sahur

In [4]:
# First demo applicant, passed to the predictor as keyword arguments
applicant = {
    "cibil_score": 700,
    "loan_amount": 750000,
    "income_annum": 200000,
    "loan_term": 15,
}
predict_from_similar_scores(**applicant)

Loan Approved!


### Captain America

In [5]:
# Second demo applicant, passed to the predictor as keyword arguments
applicant = {
    "cibil_score": 10,
    "loan_amount": 100000,
    "income_annum": 1200000000,
    "loan_term": 25,
}
predict_from_similar_scores(**applicant)

Loan Rejected.


## Demo pt.2

In [6]:
# Score the held-out set. The four diagnostic columns are appended in this
# exact order because later cells strip them positionally with `.iloc[:-4]`.
results = X_test.assign(
    true_label=y_test.values,
    pred_label=xgb.predict(X_test),
    proba=xgb.predict_proba(X_test)[:, 1],
    correct=lambda frame: frame['true_label'] == frame['pred_label'],
)

# Same rows, with the numerical features mapped back to their original units
raw_numeric = scaler.inverse_transform(results[numerical_cols])
results_original = results.copy()
results_original[numerical_cols] = raw_numeric

truth = results['true_label']
guess = results['pred_label']
p_approve = results['proba']

# Correct approvals the model was very sure about
obvious_accepts = results_original[(truth == 1) & (guess == 1) & (p_approve > 0.9)]

# Correct rejections the model was very sure about
obvious_rejects = results_original[(truth == 0) & (guess == 0) & (p_approve < 0.1)]

# Misclassifications made with a probability far from 0.5
is_confident = (p_approve > 0.75) | (p_approve < 0.25)
strange_errors = results_original[(truth != guess) & is_confident]

# Order the sure rejections: lowest score, then lowest income, then largest loan
extreme_rejects = obvious_rejects.sort_values(
    ['cibil_score', 'income_annum', 'loan_amount'],
    ascending=[True, True, False],
)

### What will it predict?

In [7]:
# Third high-confidence approval (row 2064); hide the four trailing diagnostic columns
accepted_row = obvious_accepts.iloc[2]
print(accepted_row.iloc[:-4])

no_of_dependents                   0.0
income_annum                 6100000.0
loan_amount                 23600000.0
loan_term                         20.0
cibil_score                      856.0
residential_assets_value     3300000.0
commercial_assets_value      5200000.0
luxury_assets_value         17700000.0
bank_asset_value             6400000.0
education_ Not Graduate          False
self_employed_ Yes               False
Name: 2064, dtype: object


In [8]:
# Re-score one test row straight from the standardized test set.
row_id = 2064
# Double brackets keep a one-row DataFrame, so the column dtypes the model was
# trained on are preserved.
row_std = X_test.loc[[row_id]]
pred = xgb.predict(row_std)[0]
prob = xgb.predict_proba(row_std)[0][1]  # probability of class 1 (Approved)
# Confidence is the probability of whichever class was predicted, so the
# figure stays within 0-100% for both approvals and rejections.
confidence = prob if pred == 1 else 1 - prob
print("Prediction:", "Approved" if pred == 1 else "Rejected")
print("Confidence:", f"{confidence:.2%}")
print("Actual: ", "Approved" if y_test.loc[row_id] == 1 else "Rejected")

Prediction: Approved
Confidence: 99.97%
Actual:  Approved


### What will it predict?

In [9]:
# Second entry of the sorted rejections (row 878); hide the four trailing diagnostic columns
rejected_row = extreme_rejects.iloc[1]
print(rejected_row.iloc[:-4])

no_of_dependents                   4.0
income_annum                 4400000.0
loan_amount                 17000000.0
loan_term                         18.0
cibil_score                      302.0
residential_assets_value     1000000.0
commercial_assets_value      2100000.0
luxury_assets_value         14000000.0
bank_asset_value             5700000.0
education_ Not Graduate           True
self_employed_ Yes               False
Name: 878, dtype: object


In [10]:
# Re-score one test row straight from the standardized test set.
row_id = 878
# Double brackets keep a one-row DataFrame, so the column dtypes the model was
# trained on are preserved.
row_std = X_test.loc[[row_id]]
pred = xgb.predict(row_std)[0]
prob = xgb.predict_proba(row_std)[0][1]  # probability of class 1 (Approved)
# Confidence is the probability of whichever class was predicted, so the
# figure stays within 0-100% for both approvals and rejections.
confidence = prob if pred == 1 else 1 - prob
print("Prediction:", "Approved" if pred == 1 else "Rejected")
print("Confidence:", f"{confidence:.2%}")
print("Actual: ", "Approved" if y_test.loc[row_id] == 1 else "Rejected")

Prediction: Rejected
Confidence: 99.91%
Actual:  Rejected


### What will it predict?

In [11]:
# First confident misclassification (row 2856); hide the four trailing diagnostic columns
misjudged_row = strange_errors.iloc[0]
print(misjudged_row.iloc[:-4])

no_of_dependents                   3.0
income_annum                 8300000.0
loan_amount                 31400000.0
loan_term                          6.0
cibil_score                      674.0
residential_assets_value     1000000.0
commercial_assets_value      1600000.0
luxury_assets_value         17200000.0
bank_asset_value             6100000.0
education_ Not Graduate           True
self_employed_ Yes                True
Name: 2856, dtype: object


In [12]:
# Re-score one test row straight from the standardized test set.
row_id = 2856
# Double brackets keep a one-row DataFrame, so the column dtypes the model was
# trained on are preserved.
row_std = X_test.loc[[row_id]]
pred = xgb.predict(row_std)[0]
prob = xgb.predict_proba(row_std)[0][1]  # probability of class 1 (Approved)
# Confidence is the probability of whichever class was predicted, so the
# figure stays within 0-100% for both approvals and rejections.
confidence = prob if pred == 1 else 1 - prob
print("Prediction:", "Approved" if pred == 1 else "Rejected")
print("Confidence:", f"{confidence:.2%}")
print("Actual: ", "Approved" if y_test.loc[row_id] == 1 else "Rejected")

Prediction: Approved
Confidence: 99.94%
Actual:  Rejected


In [13]:
# Display every misclassified test row whose predicted approval probability
# was above 0.75 or below 0.25, with numerical features in original units.
strange_errors

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,education_ Not Graduate,self_employed_ Yes,true_label,pred_label,proba,correct
2856,3.0,8300000.0,31400000.0,6.0,674.0,1000000.0,1600000.0,17200000.0,6100000.0,True,True,0,1,0.999369,False
2142,0.0,2000000.0,6900000.0,2.0,448.0,2700000.0,3800000.0,7100000.0,1600000.0,True,False,1,0,0.175799,False
1174,4.0,9800000.0,27000000.0,2.0,395.0,18700000.0,3000000.0,23000000.0,9700000.0,False,True,0,1,0.750705,False
1475,5.0,9700000.0,28400000.0,4.0,494.0,5300000.0,12000000.0,24400000.0,10000000.0,True,False,0,1,0.941917,False
1950,0.0,200000.0,700000.0,10.0,587.0,-100000.0,100000.0,500000.0,100000.0,False,True,0,1,0.832182,False
2753,0.0,3800000.0,11700000.0,2.0,351.0,100000.0,6700000.0,12500000.0,3100000.0,False,True,1,0,0.121217,False
348,0.0,2000000.0,7000000.0,6.0,666.0,0.0,1200000.0,4300000.0,1400000.0,True,False,0,1,0.960929,False
2962,1.0,900000.0,2600000.0,4.0,346.0,400000.0,700000.0,2900000.0,900000.0,False,False,0,1,0.968203,False
3474,3.0,5400000.0,16800000.0,16.0,550.0,10700000.0,6400000.0,13400000.0,7000000.0,True,True,1,0,0.182846,False
