In [1]:
# Import libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Read the loan dataset from CSV into a DataFrame.
# NOTE(review): absolute path — presumably a Colab upload location; confirm before running elsewhere.
loan_csv_path = '/content/Loan.csv'
df = pd.read_csv(loan_csv_path)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
0,2018-01-01,45,39948,617,Employed,Master,22,13152,48,Married,...,3329.0,0.724972,11,126928,0.199652,0.22759,419.805992,0.181077,0,49.0
1,2018-01-02,38,39709,628,Employed,Associate,15,26045,48,Single,...,3309.083333,0.935132,3,43609,0.207045,0.201077,794.054238,0.389852,0,52.0
2,2018-01-03,47,40724,570,Employed,Bachelor,26,17627,36,Married,...,3393.666667,0.872241,6,5205,0.217627,0.212548,666.406688,0.462157,0,52.0
3,2018-01-04,58,69084,545,Employed,High School,34,37898,96,Single,...,5757.0,0.896155,5,99452,0.300398,0.300911,1047.50698,0.313098,0,54.0
4,2018-01-05,37,103264,594,Employed,Associate,17,9184,36,Married,...,8605.333333,0.941369,5,227019,0.197184,0.17599,330.17914,0.07021,1,36.0


# Split Data

In [4]:
# Separate the regression target from the predictors.
# 'LoanApproved' is removed together with the target, so it is never a model input.
target_col = 'RiskScore'
y = df[target_col]
X = df.drop(columns=[target_col, 'LoanApproved'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

Train shape: (16000, 34)
Test shape: (4000, 34)


# Data Preprocessing

In [5]:
# Drop columns that are not used as model inputs.
# The list is written once so the train and test frames cannot drift apart.
# The frames are rebound to the result of a non-mutating drop() instead of
# using inplace=True: mutating the objects returned by train_test_split in
# place can trigger pandas' SettingWithCopyWarning.
cols_to_drop = ['ApplicationDate', 'Age', 'AnnualIncome', 'TotalAssets', 'BaseInterestRate']

X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)

In [6]:
# Feature engineering: replace 'LoanDuration' with 'LoanDurationYears'.
def add_loan_duration_years(frame):
    """Return a new frame with 'LoanDuration' replaced by 'LoanDurationYears'.

    'LoanDurationYears' is 'LoanDuration' divided by 12 (presumably months to
    years — confirm the source unit) and is appended as the last column; the
    'LoanDuration' column is then removed. The input frame is not modified, so
    no column is ever assigned onto the objects returned by train_test_split
    (which can trigger pandas' SettingWithCopyWarning).

    Raises:
        KeyError: if `frame` has no 'LoanDuration' column.
    """
    return (
        frame
        .assign(LoanDurationYears=frame['LoanDuration'] / 12)
        .drop(columns=['LoanDuration'])
    )


# Apply the same transformation to both splits through the single helper.
X_train = add_loan_duration_years(X_train)
X_test = add_loan_duration_years(X_test)

In [7]:
# One-hot encode every object-dtype column.
cat_cols = list(X_train.select_dtypes(include="object").columns)

# drop="first" removes one indicator per categorical feature;
# handle_unknown="ignore" lets transform() accept categories not seen during fit.
ohe = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")

# Fit on the training rows only; the test rows reuse the fitted categories.
train_encoded = ohe.fit_transform(X_train[cat_cols])
test_encoded = ohe.transform(X_test[cat_cols])
encoded_names = ohe.get_feature_names_out(cat_cols)

# Wrap the dense arrays in frames that keep each split's original row index.
X_train_ohe = pd.DataFrame(train_encoded, index=X_train.index, columns=encoded_names)
X_test_ohe = pd.DataFrame(test_encoded, index=X_test.index, columns=encoded_names)

# Swap the raw categorical columns for their encoded counterparts.
train_without_cats = X_train.drop(columns=cat_cols)
test_without_cats = X_test.drop(columns=cat_cols)
X_train = pd.concat([train_without_cats, X_train_ohe], axis=1)
X_test = pd.concat([test_without_cats, X_test_ohe], axis=1)

In [8]:
# Scaling
# Columns passed through the scaler; every other column is carried over unchanged.
num_cols = [
    'CreditScore',
    'Experience',
    'LoanAmount',
    'LoanDurationYears',
    'MonthlyDebtPayments',
    'CreditCardUtilizationRate',
    'NumberOfOpenCreditLines',
    'NumberOfCreditInquiries',
    'DebtToIncomeRatio',
    'PaymentHistory',
    'LengthOfCreditHistory',
    'SavingsAccountBalance',
    'CheckingAccountBalance',
    'TotalLiabilities',
    'MonthlyIncome',
    'UtilityBillsPaymentHistory',
    'JobTenure',
    'NetWorth',
    'InterestRate',
    'MonthlyLoanPayment',
    'TotalDebtToIncomeRatio',
]

scaler = RobustScaler()

# Fit the scaler on the training rows only, then apply it to the test rows.
train_scaled_values = scaler.fit_transform(X_train[num_cols])
test_scaled_values = scaler.transform(X_test[num_cols])

# Re-label the scaled arrays with the column names and each split's row index.
X_train_scaled = pd.DataFrame(train_scaled_values, index=X_train.index, columns=num_cols)
X_test_scaled = pd.DataFrame(test_scaled_values, index=X_test.index, columns=num_cols)

# Final layout: scaled columns first, followed by the remaining unscaled columns.
train_unscaled = X_train.drop(columns=num_cols, errors="ignore")
test_unscaled = X_test.drop(columns=num_cols, errors="ignore")
X_train_final = pd.concat([X_train_scaled, train_unscaled], axis=1)
X_test_final = pd.concat([X_test_scaled, test_unscaled], axis=1)

print(f"Train shape: {X_train_final.shape}")
print(f"Test shape: {X_test_final.shape}")

Train shape: (16000, 40)
Test shape: (4000, 40)


## Selected features

In [9]:
# Feature subset fed to the model; the list order defines the column order.
selected_features = [
    'BankruptcyHistory',
    'NetWorth',
    'DebtToIncomeRatio',
    'MonthlyIncome',
    'TotalDebtToIncomeRatio',
    'PreviousLoanDefaults',
    'InterestRate',
    'LengthOfCreditHistory',
    'CreditScore',
    'EmploymentStatus_Unemployed',
    'CreditCardUtilizationRate',
    'EmploymentStatus_Self-Employed',
    'LoanDurationYears',
    'Experience',
    'EducationLevel_Master',
    'EducationLevel_High School',
    'LoanAmount',
    'EducationLevel_Doctorate',
    'PaymentHistory',
    'HomeOwnershipStatus_Rent',
]

# Restrict both splits to the same columns, in the same order.
X_train_sel = X_train_final.loc[:, selected_features]
X_test_sel = X_test_final.loc[:, selected_features]

print(f"Final features used for model: {len(selected_features)}")

Final features used for model: 20


# Modeling

In [10]:
# Train model
# Hyperparameters are collected in one mapping so they can be read at a glance.
xgb_params = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror',
    'random_state': 42,  # fixed seed for the row/column subsampling
    'n_jobs': -1,
}
model = XGBRegressor(**xgb_params)

# Fit on the selected training features only.
model.fit(X_train_sel, y_train)

In [11]:
# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test_sel)

# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

report_lines = [
    "\n📊 Model Performance (Top 20 Features):",
    f"MAE  : {mae:.3f}",
    f"RMSE : {rmse:.3f}",
    f"R²   : {r2:.3f}",
]
for report_line in report_lines:
    print(report_line)


📊 Model Performance (Top 20 Features):
MAE  : 1.481
RMSE : 2.313
R²   : 0.914


# Save trained model and preprocessors

In [12]:
# Persist the fitted model and preprocessors to the working directory.
# Insertion order of the mapping fixes the order in which files are written.
artifacts = {
    'xgb_regressor.pkl': model,
    'reg_scaler.pkl': scaler,
    'encoder.pkl': ohe,
}
for artifact_path, fitted_object in artifacts.items():
    joblib.dump(fitted_object, artifact_path)

print("Model, scaler, and encoder saved successfully!")

Model, scaler, and encoder saved successfully!
