In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [2]:
# Seed for reproducibility
np.random.seed(42)

# Generate synthetic dataset
n_samples = 1000

In [7]:
# Applicant's Age (between 20 and 60 years)
age = np.random.randint(20, 61, size=n_samples)

# Applicant's Income (monthly income between $1000 and $10000)
income = np.random.randint(1000, 10001, size=n_samples)

# Credit Score (between 300 and 850)
credit_score = np.random.randint(300, 851, size=n_samples)

# Loan Amount (between $5000 and $500,000)
loan_amount = np.random.randint(5000, 500001, size=n_samples)

# Loan Tenure (between 6 and 360 months)
loan_tenure = np.random.randint(6, 361, size=n_samples)

# Current EMIs sum (random between 0 and $50,000)
current_emis_sum = np.random.randint(0, 50001, size=n_samples)

# Employment status (randomly assigned as 'Employed', 'Unemployed', 'Business Owner', 'Student')
employment_status = np.random.choice(['Employed', 'Unemployed', 'Business Owner', 'Student'], size=n_samples)

# Debt-to-income ratio = current_emis_sum / income (ensuring income != 0)
dti = current_emis_sum / income

# Loan Approval Status (random binary label based on conditions)
loan_approval = []
for i in range(n_samples):
    # Simple rule to simulate loan approval: credit score > 600, DTI < 0.4, and income > 3000
    if (credit_score[i] > 600) and (dti[i] < 0.4) and (income[i] > 3000):
        loan_approval.append(1)  # Approved
    else:
        loan_approval.append(0)  # Rejected

# Create DataFrame
df = pd.DataFrame({
    'age': age,
    'income': income,
    'credit_score': credit_score,
    'loan_amount': loan_amount,
    'loan_tenure': loan_tenure,
    'current_emis_sum': current_emis_sum,
    'dti': dti,
    'employment_status': employment_status,
    'loan_approval': loan_approval
})
df

Unnamed: 0,age,income,credit_score,loan_amount,loan_tenure,current_emis_sum,dti,employment_status,loan_approval
0,38,7985,429,343264,144,3679,0.460739,Employed,0
1,37,2782,592,19780,130,29489,10.599928,Business Owner,0
2,56,5541,431,42905,209,40308,7.274499,Employed,0
3,49,4257,655,419789,66,9488,2.228800,Employed,0
4,38,7213,353,121049,255,39842,5.523638,Employed,0
...,...,...,...,...,...,...,...,...,...
995,37,7853,716,135066,16,17601,2.241309,Unemployed,0
996,46,8709,469,143982,292,27796,3.191641,Student,0
997,60,2936,616,269837,261,6007,2.045981,Student,0
998,27,4091,473,120313,17,20514,5.014422,Unemployed,0


In [8]:
# Encode employment status
le = LabelEncoder()
df['employment_status_encoded'] = le.fit_transform(df['employment_status'])

# Drop original employment status column
df.drop('employment_status', axis=1, inplace=True)

# Separate features and target variable
X = df.drop('loan_approval', axis=1)
y = df['loan_approval']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Train an XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X, y)

Parameters: { "use_label_encoder" } are not used.



In [13]:
# Save the model to a pickle file for later use
import pickle
with open('./pickles/loan_approval_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)