In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from imblearn.over_sampling import SMOTE

In [None]:
# Load the raw credit-risk table; the path is relative to the notebook's working directory.
df = pd.read_csv('credit_risk_dataset.csv')

In [None]:
# Fill the gaps in each of these two columns with that column's own median.
for col in ('person_emp_length', 'loan_int_rate'):
    df[col] = df[col].fillna(df[col].median())

In [None]:
# Upper limits for outlier trimming: a row above either limit is dropped.
age_threshold = 85
employment_threshold = 40

# Count the offenders per rule on the untrimmed frame
# (a single row may be counted under both rules).
count_age_exceeding = df['person_age'].gt(age_threshold).sum()
count_employment_exceeding = df['person_emp_length'].gt(employment_threshold).sum()

# Keep only the rows that respect both limits.
df = df.loc[
    df['person_age'].le(age_threshold)
    & df['person_emp_length'].le(employment_threshold)
]

# Report how many rows each rule flagged.
print(f"Trimming applied for Person Age > {age_threshold}: {count_age_exceeding} samples removed.")
print(f"Trimming applied for Employment Length > {employment_threshold}: {count_employment_exceeding} samples removed.")

Trimming applied for Person Age > 85: 6 samples removed.
Trimming applied for Employment Length > 40: 3 samples removed.


In [None]:
# Percentile of person_income used as the upper cap. The cap has always been
# computed at the 95th percentile; the comment and the printed message used to
# say "99th", so the label is now derived from this single value.
income_cap_percentile = 95

cap_value = np.percentile(df['person_income'], income_cap_percentile)
count_exceeding = (df['person_income'] > cap_value).sum()

# Cap incomes above the chosen percentile at the cap value (rows are kept).
df['person_income'] = np.where(df['person_income'] > cap_value, cap_value, df['person_income'])

# Recalculate loan_percent_income so it stays consistent with the capped income.
df['loan_percent_income'] = df['loan_amnt'] / df['person_income']

print(f"Capping applied at: {cap_value}")
print(
    f"Number of samples exceeding the {income_cap_percentile}th percentile "
    f"({cap_value}): {count_exceeding}"
)

Capping applied at: 138000.0
Number of samples exceeding the 99th percentile (138000.0): 1613


In [None]:
# One-hot encode the nominal columns.
# dtype=int makes get_dummies emit 0/1 integer indicator columns directly, so
# no boolean-to-int cast through df.loc is needed afterwards; that cast wrote
# ints into bool columns and was echoed as a warning in the recorded output.
ohe_cols = ['person_home_ownership', 'loan_intent']
df = pd.get_dummies(df, columns=ohe_cols, dtype=int)

  df.loc[:, df.columns.str.startswith(tuple(ohe_cols))] = df.loc[:, df.columns.str.startswith(tuple(ohe_cols))].astype(int) # Boolean to int(0,1)
  df.loc[:, df.columns.str.startswith(tuple(ohe_cols))] = df.loc[:, df.columns.str.startswith(tuple(ohe_cols))].astype(int) # Boolean to int(0,1)
  df.loc[:, df.columns.str.startswith(tuple(ohe_cols))] = df.loc[:, df.columns.str.startswith(tuple(ohe_cols))].astype(int) # Boolean to int(0,1)
  df.loc[:, df.columns.str.startswith(tuple(ohe_cols))] = df.loc[:, df.columns.str.startswith(tuple(ohe_cols))].astype(int) # Boolean to int(0,1)
  df.loc[:, df.columns.str.startswith(tuple(ohe_cols))] = df.loc[:, df.columns.str.startswith(tuple(ohe_cols))].astype(int) # Boolean to int(0,1)
  df.loc[:, df.columns.str.startswith(tuple(ohe_cols))] = df.loc[:, df.columns.str.startswith(tuple(ohe_cols))].astype(int) # Boolean to int(0,1)
  df.loc[:, df.columns.str.startswith(tuple(ohe_cols))] = df.loc[:, df.columns.str.startswith(tuple(ohe_cols))].astype(int) 

In [None]:
# Integer codes for the remaining categorical columns: loan grades A..G become
# 0..6 in alphabetical order, and the default-on-file flag becomes N -> 0, Y -> 1.
label_mappings = {
    'loan_grade': {grade: code for code, grade in enumerate('ABCDEFG')},
    'cb_person_default_on_file': dict(N=0, Y=1),
}

# Overwrite each column with its codes; a value absent from its mapping becomes NaN.
for col in label_mappings:
    df[col] = df[col].map(label_mappings[col])

# Finding the optimised loan amount

In [None]:
import pickle

# Restore the pickled model that the search below queries through model.predict
# (presumably the XGBoost credit-risk classifier, going by the file name).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this artefact when it comes from a trusted source.
with open("xgb_creditRisk.pkl", mode="rb") as model_file:
    model = pickle.load(model_file)

In [None]:
def binary_search_approved(user_input, model, min_amt=200, max_amt=500):
    """Binary-search for the largest loan amount the model predicts as class 0.

    Every probe overwrites ``loan_amnt`` with the candidate amount, recomputes
    ``loan_percent_income`` as ``amount / person_income`` and asks ``model``
    for a prediction on that single row. A prediction of 0 moves the search
    towards larger amounts, anything else towards smaller ones, so the result
    is only the true maximum if predictions are monotone in the amount.

    Args:
        user_input: Dict of one applicant's column values; must contain
            ``person_income``. It is copied, never modified. An ``Index`` key
            is discarded, and ``loan_amnt_optimised`` / ``loan_status`` are
            removed from the row handed to the model when present.
        model: Object whose ``predict`` accepts a one-row DataFrame and
            returns a sequence whose first element is the predicted class.
        min_amt: Smallest amount to consider (inclusive).
        max_amt: Largest amount to consider (inclusive).

    Returns:
        The largest probed amount that was predicted as 0, or ``None`` when no
        probe was predicted as 0 (including when ``min_amt > max_amt``).
    """
    applicant = user_input.copy()
    applicant.pop("Index", None)

    # Columns that may ride along with the row but are not model inputs.
    non_features = ("loan_amnt_optimised", "loan_status")

    lo, hi = min_amt, max_amt
    largest_ok = None

    while lo <= hi:
        amount = (lo + hi) // 2

        # Build the what-if row for this candidate amount.
        trial = applicant.copy()
        trial["loan_amnt"] = amount
        trial["loan_percent_income"] = amount / trial["person_income"]

        row = pd.DataFrame([trial])
        extras = [name for name in non_features if name in row.columns]
        if extras:
            row = row.drop(columns=extras)

        if model.predict(row)[0] == 0:
            # Predicted 0: remember this amount and look for a larger one.
            largest_ok = amount
            lo = amount + 1
        else:
            hi = amount - 1

    return largest_ok

In [None]:
from tqdm import tqdm

# Fill loan_amnt_optimised row by row:
#   - loan_status == 0: keep the requested amount unchanged;
#   - otherwise: search between 200 and the requested amount for the largest
#     amount the model predicts as 0, storing 0 when the search finds none.
for record in tqdm(df.itertuples(index=True), total=len(df)):
    if record.loan_status == 0:
        optimised = record.loan_amnt
    else:
        # The dict carries loan_status (and loan_amnt_optimised once that
        # column exists); binary_search_approved drops both before predicting.
        found = binary_search_approved(
            record._asdict(), model, min_amt=200, max_amt=int(record.loan_amnt)
        )
        optimised = 0 if found is None else found

    df.at[record.Index, "loan_amnt_optimised"] = optimised



100%|██████████| 32572/32572 [10:42<00:00, 50.67it/s]


In [None]:
# Preview the first five rows, which now include the loan_amnt_optimised column.
df.head(5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,...,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_amnt_optimised
1,21,9600.0,5.0,1,1000,11.14,0,0.104167,0,2,...,0,1,0,0,1,0,0,0,0,1000.0
2,25,9600.0,1.0,2,5500,12.87,1,0.572917,0,3,...,0,0,0,0,0,0,1,0,0,1476.0
3,23,65500.0,4.0,2,35000,15.23,1,0.534351,0,2,...,0,0,1,0,0,0,1,0,0,19934.0
4,24,54400.0,8.0,2,35000,14.27,1,0.643382,1,4,...,0,0,1,0,0,0,1,0,0,16556.0
5,21,9900.0,2.0,0,2500,7.14,1,0.252525,0,2,...,0,1,0,0,0,0,0,0,1,1523.0


In [None]:
from google.colab import files

# Save the DataFrame to a CSV file in the working directory.
# index=False: the row index is not written, so a later read_csv gets a fresh 0..n-1 index.
df.to_csv("optimized_loans.csv", index=False)

In [None]:
# Count rows whose optimised amount is 0; 0 is the value the labelling loop
# stores when the search returns None.
(df['loan_amnt_optimised'] == 0).sum()

np.int64(0)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the optimised amounts.
df['loan_amnt_optimised'].describe()

Unnamed: 0,loan_amnt_optimised
count,32572.0
mean,8422.907743
std,6316.419184
min,0.0
25%,4000.0
50%,7200.0
75%,12000.0
max,35000.0


In [None]:
# Replace the "no amount found" marker (0) with 200, the lower bound (min_amt)
# used by the search in the labelling loop.
df['loan_amnt_optimised'] = df['loan_amnt_optimised'].replace(0, 200)

# Re-export after the replacement: optimized_loans.csv was first written before
# this cell, and the training section reloads df from that file, so without
# this second write the replacement would never reach the training data.
df.to_csv("optimized_loans.csv", index=False)

# Training model for the loan amount prediction

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

In [None]:
# Reload the exported table; this rebinds df to the CSV contents and discards
# whatever the in-memory frame held before.
df = pd.read_csv('optimized_loans.csv')

In [None]:
# Column names, non-null counts and dtypes of the reloaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32572 entries, 0 to 32571
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      32572 non-null  int64  
 1   person_income                   32572 non-null  float64
 2   person_emp_length               32572 non-null  float64
 3   loan_grade                      32572 non-null  int64  
 4   loan_amnt                       32572 non-null  int64  
 5   loan_int_rate                   32572 non-null  float64
 6   loan_status                     32572 non-null  int64  
 7   loan_percent_income             32572 non-null  float64
 8   cb_person_default_on_file       32572 non-null  int64  
 9   cb_person_cred_hist_length      32572 non-null  int64  
 10  person_home_ownership_MORTGAGE  32572 non-null  int64  
 11  person_home_ownership_OTHER     32572 non-null  int64  
 12  person_home_ownership_OWN       

In [None]:
# Preview the first rows of the reloaded frame.
df.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,...,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_amnt_optimised
0,21,9600.0,5.0,1,1000,11.14,0,0.104167,0,2,...,0,1,0,0,1,0,0,0,0,1000.0
1,25,9600.0,1.0,2,5500,12.87,1,0.572917,0,3,...,0,0,0,0,0,0,1,0,0,1476.0
2,23,65500.0,4.0,2,35000,15.23,1,0.534351,0,2,...,0,0,1,0,0,0,1,0,0,19934.0
3,24,54400.0,8.0,2,35000,14.27,1,0.643382,1,4,...,0,0,1,0,0,0,1,0,0,16556.0
4,21,9900.0,2.0,0,2500,7.14,1,0.252525,0,2,...,0,1,0,0,0,0,0,0,1,1523.0


In [None]:
# X = df.drop(columns=["loan_amnt", "loan_status","loan_percent_income","loan_amnt_optimised"])
# Target: the optimised amount. Features: every other column except loan_status.
# NOTE(review): loan_amnt (and loan_percent_income derived from it) stay in X,
# while the labelling loop copies loan_amnt into the target for every row with
# loan_status == 0, so confirm this is intended and not target leakage.
y = df["loan_amnt_optimised"]
X = df.drop(columns=["loan_status", y.name])

In [None]:
# Hold out 20% of the rows for evaluation; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# loan_amnt_train = df.loc[X_train.index, "loan_amnt"].values
# loan_amnt_test = df.loc[X_test.index, "loan_amnt"].values

In [None]:
# Train a Random Forest regressor (100 trees, fixed seed) on the training split.
# The bare fit(...) call is the cell's last expression, so the fitted estimator
# is also what the cell displays.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train, y_train)

In [None]:
# Random Forest predictions for the held-out rows.
y_pred = regressor.predict(X_test)

In [None]:
# Evaluate the Random Forest on the held-out split.
# RMSE is reported next to MAE so the error is directly comparable with the
# RMSE / R² figures printed for the linear-regression and XGBoost models.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

MAE: 226.74
R² Score: 0.9788


In [None]:
# Linear-regression baseline: fit on the training split, score on the held-out split.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# RMSE is the square root of the mean squared error.
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))
lr_r2 = r2_score(y_test, y_pred_lr)

print("Linear Regression:")
print("RMSE:", lr_rmse)
print("R² Score:", lr_r2)

Linear Regression:
RMSE: 2967.2832179499055
R² Score: 0.7751233635720334


In [None]:
# Gradient-boosted regressor: 100 rounds, learning rate 0.1, trees up to depth 5, fixed seed.
xgb_params = dict(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Score on the held-out split with the same metrics as the linear baseline.
xgb_rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
xgb_r2 = r2_score(y_test, y_pred_xgb)

print("\nXGBoost Regressor:")
print("RMSE:", xgb_rmse)
print("R² Score:", xgb_r2)


XGBoost Regressor:
RMSE: 1063.2454850106942
R² Score: 0.9711268989761297


In [None]:
# (rows, columns) of the current df.
df.shape

(32572, 20)

In [None]:
# Preview the first rows of the current df.
df.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_grade,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_amnt_optimised
0,21,9600.0,5.0,1,11.14,0.104167,0,2,0,0,1,0,0,1,0,0,0,0,1000.0
1,25,9600.0,1.0,2,12.87,0.572917,0,3,1,0,0,0,0,0,0,1,0,0,1476.0
2,23,65500.0,4.0,2,15.23,0.534351,0,2,0,0,0,1,0,0,0,1,0,0,19934.0
3,24,54400.0,8.0,2,14.27,0.643382,1,4,0,0,0,1,0,0,0,1,0,0,16556.0
4,21,9900.0,2.0,0,7.14,0.252525,0,2,0,0,1,0,0,0,0,0,0,1,1523.0


In [None]:
# Persist the fitted XGBoost regressor. The file name is held once so the
# confirmation message cannot drift from the file actually written.
model_path = "xgb_loanAmountOptimiser.pkl"
with open(model_path, mode="wb") as out_file:
    pickle.dump(xgb_model, out_file)

print(f"Model saved as {model_path}")

Model saved as xgb_loanAmountModel.pkl
