In [19]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [20]:
# Load data
# The dataset location can be overridden with the LOAN_DATA_PATH environment
# variable so the notebook also runs outside the original checkout; the
# default is the original hard-coded absolute path.
DATA_PATH = os.environ.get(
    'LOAN_DATA_PATH',
    '/workspaces/Credit-Risk-Prediction-and-Automated-Loan-Approval-System/dataset/Loan.csv',
)
df = pd.read_csv(DATA_PATH)

In [21]:
# Preview the first rows of the loaded frame as a quick sanity check
df.head()

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
0,2018-01-01,45,39948,617,Employed,Master,22,13152,48,Married,...,3329.0,0.724972,11,126928,0.199652,0.22759,419.805992,0.181077,0,49.0
1,2018-01-02,38,39709,628,Employed,Associate,15,26045,48,Single,...,3309.083333,0.935132,3,43609,0.207045,0.201077,794.054238,0.389852,0,52.0
2,2018-01-03,47,40724,570,Employed,Bachelor,26,17627,36,Married,...,3393.666667,0.872241,6,5205,0.217627,0.212548,666.406688,0.462157,0,52.0
3,2018-01-04,58,69084,545,Employed,High School,34,37898,96,Single,...,5757.0,0.896155,5,99452,0.300398,0.300911,1047.50698,0.313098,0,54.0
4,2018-01-05,37,103264,594,Employed,Associate,17,9184,36,Married,...,8605.333333,0.941369,5,227019,0.197184,0.17599,330.17914,0.07021,1,36.0


# Split Data

In [22]:
# Separate the regression target from the predictors.
# 'LoanApproved' is removed together with the target so that it is not
# available to the model as an input.
y = df['RiskScore']
X = df.drop(columns=['RiskScore', 'LoanApproved'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

Train shape: (16000, 34)
Test shape: (4000, 34)


# Data Preprocessing

In [23]:
# Feature engineering
def add_loan_duration_years(features: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with a ``LoanDurationYears`` column appended.

    The new column is ``LoanDuration`` divided by 12. ``DataFrame.assign``
    builds a new frame instead of writing into the object returned by
    ``train_test_split``, which avoids pandas' SettingWithCopyWarning on
    pandas/scikit-learn combinations that flag such writes.
    """
    return features.assign(LoanDurationYears=features['LoanDuration'] / 12)


# Apply the identical transformation to both splits
X_train = add_loan_duration_years(X_train)
X_test = add_loan_duration_years(X_test)

In [24]:
# Encode categorical features
# Every object-dtype column of the training frame is treated as categorical.
# NOTE(review): the recorded output lists 'ApplicationDate' among these
# columns; it looks close to unique per row, so dense one-hot encoding it
# likely produces a very wide matrix -- confirm whether it should be dropped
# or parsed as a date before encoding.
cat_cols = list(X_train.select_dtypes(include='object').columns)
print("Categorical Columns:", cat_cols)

ohe = OneHotEncoder(drop=None, sparse_output=False, handle_unknown='ignore')

# Learn the categories from the training split only, then reuse the fitted
# encoder (and its output column names) for both splits.
ohe.fit(X_train[cat_cols])
ohe_columns = ohe.get_feature_names_out(cat_cols)

X_train_ohe = pd.DataFrame(
    ohe.transform(X_train[cat_cols]), columns=ohe_columns, index=X_train.index
)
X_test_ohe = pd.DataFrame(
    ohe.transform(X_test[cat_cols]), columns=ohe_columns, index=X_test.index
)

# Swap the raw categorical columns for their encoded counterparts
X_train = pd.concat([X_train.drop(columns=cat_cols), X_train_ohe], axis=1)
X_test = pd.concat([X_test.drop(columns=cat_cols), X_test_ohe], axis=1)

Categorical Columns: ['ApplicationDate', 'EmploymentStatus', 'EducationLevel', 'MaritalStatus', 'HomeOwnershipStatus', 'LoanPurpose']


## Selected features

In [25]:
# Columns fed to the scaler and the model. The order is significant: the
# scaler and the regressor are both fitted on the columns in this order.
selected_features = [
    # Employment-status indicator columns
    'EmploymentStatus_Unemployed',
    'EmploymentStatus_Self-Employed',
    'EmploymentStatus_Employed',
    # Income and wealth
    'MonthlyIncome',
    'NetWorth',
    'DebtToIncomeRatio',
    # Credit profile
    'CreditScore',
    'CreditCardUtilizationRate',
    'PreviousLoanDefaults',
    'BankruptcyHistory',
    'LengthOfCreditHistory',
    # Loan terms
    'LoanAmount',
    'LoanDurationYears',
    'InterestRate',
]

## Scaling

In [26]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so the test data does not influence the statistics.
scaler = RobustScaler()
scaler.fit(X_train[selected_features])

train_scaled_values = scaler.transform(X_train[selected_features])
test_scaled_values = scaler.transform(X_test[selected_features])

# Re-wrap the scaled arrays as frames, restoring column names and row index
X_train_scaled = pd.DataFrame(
    train_scaled_values, columns=selected_features, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    test_scaled_values, columns=selected_features, index=X_test.index
)

# Modeling

In [27]:
# Train model
# Hyperparameters are collected in one mapping so they can be read (and
# tuned) in a single place before the estimator is built.
xgb_params = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror',
    'random_state': 42,  # fixed seed for reproducible training
    'n_jobs': -1,
}
model = XGBRegressor(**xgb_params)

# Fit on the scaled training features; the fitted estimator is the cell output
model.fit(X_train_scaled, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [28]:
# Check model's performance on the held-out test split
y_pred = model.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # root of the MSE
r2 = r2_score(y_test, y_pred)

# Report the actual number of model inputs instead of a hard-coded count
n_features = X_test_scaled.shape[1]

# Unicode escapes (bar-chart emoji, superscript two) keep the symbols intact
# even if the notebook file is re-encoded.
print(f"\n\U0001F4CA Model Performance (Top {n_features} Features):")
print(f"MAE  : {mae:.3f}")
print(f"RMSE : {rmse:.3f}")
print(f"R\u00b2   : {r2:.3f}")


ðŸ“Š Model Performance (Top 20 Features):
MAE  : 1.576
RMSE : 2.485
RÂ²   : 0.901


# Save trained model and preprocessors

In [None]:
# Directory the artifacts are written to. The directory that is created must
# be the same one the dumps target; otherwise joblib.dump raises
# FileNotFoundError when it does not exist yet.
MODEL_DIR = '../models'
os.makedirs(MODEL_DIR, exist_ok=True)

# Save trained components: the regressor plus the fitted scaler and encoder
joblib.dump(model, os.path.join(MODEL_DIR, 'xgb_regressor.pkl'))
joblib.dump(scaler, os.path.join(MODEL_DIR, 'reg_scaler.pkl'))
joblib.dump(ohe, os.path.join(MODEL_DIR, 'reg_encoder.pkl'))

print("Model, scaler, and encoder saved successfully!")

Model, scaler, and encoder saved successfully!


: 