In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

In [4]:
chunks = pd.read_csv("data/Loan_status_2007-2020Q3.csv", chunksize=100000, low_memory=False)

In [3]:
# data source: https://www.kaggle.com/datasets/ethon0426/lending-club-20072020q1
# 2.9M loan entries with 142 attributes
cols_to_use = ['loan_amnt',
 'term',
 'int_rate',
 'installment',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'loan_status',
 'purpose',
 'dti',
 'fico_range_low',
 'fico_range_high',
 'inq_last_6mths',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_d',
 'last_pymnt_amnt',
 'avg_cur_bal'
 ]

df = pd.read_csv("data/Loan_status_2007-2020Q3.csv", usecols=cols_to_use, low_memory=False)

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
# Manually sample each group and concatenate
sampled_fully_paid = df[df['loan_status'] == 'Fully Paid'].sample(n=20000, random_state=42)
sampled_charged_off = df[df['loan_status'] == 'Charged Off'].sample(n=20000, random_state=42)

df_sampled = pd.concat([sampled_fully_paid, sampled_charged_off], ignore_index=True)

In [6]:
df_sampled.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,dti,fico_range_low,fico_range_high,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,avg_cur_bal
0,19950.0,60 months,19.52%,523.24,E,E3,Senior Business Analyst,8 years,MORTGAGE,111000.0,Source Verified,Fully Paid,house,8.06,715.0,719.0,0.0,8.0,0.0,25092.0,82.3%,0.0,0.0,Sep-2016,16558.1,66703.0
1,7000.0,36 months,16.29%,247.11,D,D2,Program Manager,10+ years,MORTGAGE,155500.0,Source Verified,Fully Paid,other,9.3,670.0,674.0,1.0,11.0,0.0,98455.0,77.8%,0.0,0.0,Nov-2015,4377.82,57119.0
2,7550.0,36 months,12.12%,251.21,B,B3,,,MORTGAGE,28000.0,Verified,Fully Paid,credit_card,5.87,710.0,714.0,0.0,7.0,0.0,4742.0,46%,0.0,0.0,Jan-2016,250.82,4112.0
3,16000.0,36 months,14.03%,547.08,C,C2,Safet Director,3 years,RENT,115000.0,Source Verified,Fully Paid,debt_consolidation,15.08,680.0,684.0,1.0,11.0,2.0,20593.0,64%,0.0,0.0,Jun-2019,12040.1,5257.0
4,10950.0,36 months,14.99%,379.54,C,C5,,,MORTGAGE,68000.0,Verified,Fully Paid,debt_consolidation,34.73,680.0,684.0,0.0,13.0,0.0,22528.0,78.2%,0.0,0.0,Sep-2017,379.18,22100.0


In [7]:
df_sampled = df_sampled.dropna()

In [8]:
df_sampled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35412 entries, 0 to 39999
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   loan_amnt                35412 non-null  float64
 1   term                     35412 non-null  object 
 2   int_rate                 35412 non-null  object 
 3   installment              35412 non-null  float64
 4   grade                    35412 non-null  object 
 5   sub_grade                35412 non-null  object 
 6   emp_title                35412 non-null  object 
 7   emp_length               35412 non-null  object 
 8   home_ownership           35412 non-null  object 
 9   annual_inc               35412 non-null  float64
 10  verification_status      35412 non-null  object 
 11  loan_status              35412 non-null  object 
 12  purpose                  35412 non-null  object 
 13  dti                      35412 non-null  float64
 14  fico_range_low           35

In [34]:
df.term.unique()


array([' 36 months', ' 60 months', nan], dtype=object)

In [27]:
category_orders = {
    'loan_status': ['Charged Off', 'Fully Paid'],
    'grade': ['A', 'B', 'C', 'D', 'E', 'F', 'G'],
    'term': ['36 months', '60 months'],
    'emp_length' : ['< 1 year', '1 year','2 years','3 years','4 years', '5 years',  
                         '6 years', '7 years', '8 years', '9 years','10+ years'],
    'sub_grade':['A1','A2','A3', 'A4','A5','B1','B2','B3','B4','B5',
                 'C1','C2','C3', 'C4','C5','D1','D2','D3','D4','D5',
                 'E1','E2','E3', 'E4','E5','F1','F2','F3','F4','F5',
                 'G1','G2','G3', 'G4','G5'],
    'home_ownership': ['MORTGAGE', 'RENT', 'OWN', 'ANY'],
    'verification_status': ['Not Verified', 'Source Verified', 'Verified'],
    'purpose': [ 'other', 'moving','credit_card','major_purchase','vacation', 'medical', 'debt_consolidation',          
       'car','wedding', 'small_business',  'renewable_energy', 'home_improvement', 'house']
    
}

In [28]:

# Apply encoding using pd.Categorical
for col, order in category_orders.items():
    df_sampled[col] = pd.Categorical(df_sampled[col], categories=order, ordered=True)
    df_sampled[f'{col}_encoded'] = df_sampled[col].cat.codes

In [37]:
cols_to_convert = ['int_rate', 'dti', 'revol_util']  

for col in cols_to_convert:
    df_sampled[col] = (
        df[col]
        .astype(str)
        .str.replace('[%,,$,]', '', regex=True)  # Remove % or $ if present
        .str.replace(',', '')                   # Remove commas
        .replace('n/a', None)                   # Handle non-numeric strings
    )
    df_sampled[col] = pd.to_numeric(df[col], errors='coerce') 

In [38]:
df_sampled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35412 entries, 0 to 39999
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   loan_amnt                    35412 non-null  float64 
 1   term                         0 non-null      category
 2   int_rate                     35412 non-null  float64 
 3   installment                  35412 non-null  float64 
 4   grade                        35412 non-null  category
 5   sub_grade                    35412 non-null  category
 6   emp_title                    35412 non-null  object  
 7   emp_length                   35412 non-null  category
 8   home_ownership               35412 non-null  category
 9   annual_inc                   35412 non-null  float64 
 10  verification_status          35412 non-null  category
 11  loan_status                  35412 non-null  category
 12  purpose                      35412 non-null  category
 13  dti   

In [41]:
df_norm = df_sampled.select_dtypes(include=['float64', 'int8'])

In [42]:
df_norm.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,fico_range_low,fico_range_high,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,recoveries,collection_recovery_fee,last_pymnt_amnt,avg_cur_bal,loan_status_encoded,grade_encoded,term_encoded,emp_length_encoded,sub_grade_encoded,home_ownership_encoded,verification_status_encoded,purpose_encoded
0,19950.0,10.65,523.24,111000.0,27.65,715.0,719.0,0.0,8.0,0.0,25092.0,83.7,0.0,0.0,16558.1,66703.0,1,4,-1,8,22,0,1,12
1,7000.0,15.27,247.11,155500.0,1.0,670.0,674.0,1.0,11.0,0.0,98455.0,9.4,0.0,0.0,4377.82,57119.0,1,3,-1,10,16,0,1,0
3,16000.0,13.49,547.08,115000.0,20.0,680.0,684.0,1.0,11.0,2.0,20593.0,21.0,0.0,0.0,12040.1,5257.0,1,2,-1,3,11,1,1,6
5,6000.0,7.9,202.49,32884.8,11.2,675.0,679.0,0.0,7.0,0.0,6465.0,28.3,0.0,0.0,4916.63,1482.0,1,1,-1,2,8,1,0,6
6,13000.0,15.96,402.66,80000.0,23.51,685.0,689.0,0.0,7.0,0.0,14131.0,85.6,0.0,0.0,8363.87,2366.0,1,0,-1,0,2,1,0,6


In [44]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df_norm), columns=df_norm.columns)

In [46]:
df_normalized.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,fico_range_low,fico_range_high,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,recoveries,collection_recovery_fee,last_pymnt_amnt,avg_cur_bal,loan_status_encoded,grade_encoded,term_encoded,emp_length_encoded,sub_grade_encoded,home_ownership_encoded,verification_status_encoded,purpose_encoded
0,0.485897,0.272822,0.292218,0.019787,0.921974,0.297297,0.295699,0.0,0.127273,0.0,0.031295,0.837838,0.0,0.0,0.405605,0.187461,1.0,0.666667,0.0,0.8,0.647059,0.0,0.5,1.0
1,0.153846,0.513824,0.128586,0.027727,0.033344,0.054054,0.053763,0.166667,0.181818,0.0,0.122795,0.094094,0.0,0.0,0.107238,0.160526,1.0,0.5,0.0,1.0,0.470588,0.0,0.5,0.0
2,0.384615,0.42097,0.306345,0.020501,0.666889,0.108108,0.107527,0.166667,0.181818,0.105263,0.025684,0.21021,0.0,0.0,0.294933,0.014774,1.0,0.333333,0.0,0.3,0.323529,0.333333,0.5,0.5
3,0.128205,0.129369,0.102145,0.005849,0.373458,0.081081,0.080645,0.0,0.109091,0.0,0.008063,0.283283,0.0,0.0,0.120437,0.004165,1.0,0.166667,0.0,0.2,0.235294,0.333333,0.0,0.5
4,0.307692,0.549817,0.220763,0.014256,0.783928,0.135135,0.134409,0.0,0.109091,0.0,0.017624,0.856857,0.0,0.0,0.20488,0.006649,1.0,0.0,0.0,0.0,0.058824,0.333333,0.0,0.5


In [6]:
X = df_sampled.drop(columns=['loan_status'])
y = df_sampled['loan_status']

X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator=rf, n_features_to_select=10)

rfe.fit(X_train, y_train)
selected_features = X_train.columns[rfe.support_].tolist()

In [7]:
 selected_features = ['loan_amnt',
 'int_rate',
 'installment',
 'grade',
 'annual_inc',
 'dti',
 'revol_bal',
 'revol_util',
 'last_pymnt_amnt',
 'avg_cur_bal'
 ]

### Reference
- https://www.sciencedirect.com/science/article/pii/S2667305325000407
- Explainable AI based LightGBM prediction model to predict default borrower in social lending platform, Intelligent Systems with Applications
Volume 26, June 2025, 200514