This notebook contains the final model selected after a detailed exploratory data analysis and testing 
of multiple machine learning methods as documented in [EDA.ipynb](./EDA.ipynb). The chosen model is 
based on performance metrics and optimization techniques.


# Load Data

In [1]:
import pandas as pd
loan_df = pd.read_csv("Loan.csv")
loan_df

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
0,2018-01-01,45,39948,617,Employed,Master,22,13152,48,Married,...,3329.000000,0.724972,11,126928,0.199652,0.227590,419.805992,0.181077,0,49.0
1,2018-01-02,38,39709,628,Employed,Associate,15,26045,48,Single,...,3309.083333,0.935132,3,43609,0.207045,0.201077,794.054238,0.389852,0,52.0
2,2018-01-03,47,40724,570,Employed,Bachelor,26,17627,36,Married,...,3393.666667,0.872241,6,5205,0.217627,0.212548,666.406688,0.462157,0,52.0
3,2018-01-04,58,69084,545,Employed,High School,34,37898,96,Single,...,5757.000000,0.896155,5,99452,0.300398,0.300911,1047.506980,0.313098,0,54.0
4,2018-01-05,37,103264,594,Employed,Associate,17,9184,36,Married,...,8605.333333,0.941369,5,227019,0.197184,0.175990,330.179140,0.070210,1,36.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,2072-09-29,44,30180,587,Employed,High School,19,24521,36,Married,...,2515.000000,0.826217,1,55327,0.216021,0.195574,905.767712,0.627741,0,55.0
19996,2072-09-30,56,49246,567,Employed,Associate,33,25818,36,Married,...,4103.833333,0.816618,3,64002,0.227318,0.199168,958.395633,0.334418,0,54.0
19997,2072-10-01,44,48958,645,Employed,Bachelor,20,37033,72,Married,...,4079.833333,0.887216,3,103663,0.229533,0.226766,945.427454,0.357227,0,45.0
19998,2072-10-02,60,41025,560,Employed,High School,36,14760,72,Married,...,3418.750000,0.843787,5,10600,0.249760,0.264873,411.168284,0.408678,0,59.0


# Selecting relevent coloumns based on EDA

In [2]:
df = loan_df.copy()
selected_columns = [
    'Age', 'CreditScore', 'EmploymentStatus', 'EducationLevel',
    'LoanAmount', 'LoanDuration', 'CreditCardUtilizationRate',
    'BankruptcyHistory', 'PreviousLoanDefaults', 'LengthOfCreditHistory',
    'MonthlyIncome', 'NetWorth', 'InterestRate', 'RiskScore','LoanApproved'
]

df = df[selected_columns]
df.sample(10)

Unnamed: 0,Age,CreditScore,EmploymentStatus,EducationLevel,LoanAmount,LoanDuration,CreditCardUtilizationRate,BankruptcyHistory,PreviousLoanDefaults,LengthOfCreditHistory,MonthlyIncome,NetWorth,InterestRate,RiskScore,LoanApproved
18102,56,607,Employed,Associate,9710,36,0.483539,0,0,1,2662.083333,195818,0.187093,49.0,0
15974,40,549,Employed,High School,37711,24,0.124114,0,0,20,3323.666667,57127,0.198726,56.0,0
6836,51,593,Employed,Associate,21460,12,0.238972,0,0,11,5839.0,285284,0.188128,41.6,1
2342,45,580,Employed,Master,25845,72,0.179233,0,0,4,1558.333333,100676,0.243054,56.0,0
1389,30,584,Employed,Associate,24818,12,0.223024,0,0,14,2499.25,78114,0.21865,50.0,0
10996,43,519,Self-Employed,Bachelor,16894,60,0.075662,0,0,29,1535.333333,1197,0.266175,54.0,0
19478,36,588,Employed,Bachelor,81554,60,0.16337,0,1,9,6421.583333,1186,0.32329,56.0,0
15089,18,477,Employed,Doctorate,27362,24,0.342972,0,0,2,4298.166667,47941,0.222863,56.0,0
9905,24,562,Employed,Associate,19192,48,0.362235,0,0,19,3715.75,108759,0.243424,47.0,0
4916,52,594,Employed,Associate,25007,84,0.107881,0,0,20,2200.083333,77279,0.245751,49.0,0


# Ordinal Encoding

In [3]:
df['EmploymentStatus'] = df['EmploymentStatus'].map({'Unemployed': 0,'Self-Employed':1,'Employed': 2})
df['EducationLevel'] = df['EducationLevel'].map({'High School': 0, 'Associate':1,'Bachelor': 2,'Master':3, 'Doctorate': 4})

df

Unnamed: 0,Age,CreditScore,EmploymentStatus,EducationLevel,LoanAmount,LoanDuration,CreditCardUtilizationRate,BankruptcyHistory,PreviousLoanDefaults,LengthOfCreditHistory,MonthlyIncome,NetWorth,InterestRate,RiskScore,LoanApproved
0,45,617,2,3,13152,48,0.354418,0,0,9,3329.000000,126928,0.227590,49.0,0
1,38,628,2,1,26045,48,0.087827,0,0,9,3309.083333,43609,0.201077,52.0,0
2,47,570,2,2,17627,36,0.137414,0,0,22,3393.666667,5205,0.212548,52.0,0
3,58,545,2,0,37898,96,0.267587,0,0,10,5757.000000,99452,0.300911,54.0,0
4,37,594,2,1,9184,36,0.320535,0,0,27,8605.333333,227019,0.175990,36.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,44,587,2,0,24521,36,0.101288,0,0,7,2515.000000,55327,0.195574,55.0,0
19996,56,567,2,1,25818,36,0.471818,0,0,28,4103.833333,64002,0.199168,54.0,0
19997,44,645,2,2,37033,72,0.216596,0,0,8,4079.833333,103663,0.226766,45.0,0
19998,60,560,2,0,14760,72,0.364758,0,0,13,3418.750000,10600,0.264873,59.0,0


# Handling Skewness using log

In [4]:
# Calculate skewness for each column
skewness = df.skew()

# Sort the skewness values in ascending order
sorted_skewness = skewness.sort_values(ascending=True)

# Display the sorted skewness values
print(sorted_skewness)

EmploymentStatus            -2.455244
CreditScore                 -0.596727
RiskScore                   -0.162062
LengthOfCreditHistory        0.002989
Age                          0.185964
EducationLevel               0.273155
InterestRate                 0.485491
CreditCardUtilizationRate    0.601131
LoanDuration                 0.651330
LoanApproved                 1.224086
LoanAmount                   1.833688
MonthlyIncome                1.992929
PreviousLoanDefaults         2.665941
BankruptcyHistory            4.017672
NetWorth                     5.599844
dtype: float64


In [5]:
import numpy as np

# List of features to transform (Continues numeric coloumns with more than 1 skewness value)
features_to_log1p = ['LoanAmount', 'MonthlyIncome', 'NetWorth']

# Apply log1p transformation
df[features_to_log1p] = df[features_to_log1p].apply(np.log1p)
df

Unnamed: 0,Age,CreditScore,EmploymentStatus,EducationLevel,LoanAmount,LoanDuration,CreditCardUtilizationRate,BankruptcyHistory,PreviousLoanDefaults,LengthOfCreditHistory,MonthlyIncome,NetWorth,InterestRate,RiskScore,LoanApproved
0,45,617,2,3,9.484405,48,0.354418,0,0,9,8.110728,11.751383,0.227590,49.0,0
1,38,628,2,1,10.167619,48,0.087827,0,0,9,8.104729,10.683042,0.201077,52.0,0
2,47,570,2,2,9.777244,36,0.137414,0,0,22,8.129961,8.557567,0.212548,52.0,0
3,58,545,2,0,10.542680,96,0.267587,0,0,10,8.658345,11.507440,0.300911,54.0,0
4,37,594,2,1,9.125327,36,0.320535,0,0,27,9.060254,12.332793,0.175990,36.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,44,587,2,0,10.107326,36,0.101288,0,0,7,7.830426,10.921034,0.195574,55.0,0
19996,56,567,2,1,10.158866,36,0.471818,0,0,28,8.319920,11.066685,0.199168,54.0,0
19997,44,645,2,2,10.519592,72,0.216596,0,0,8,8.314056,11.548910,0.226766,45.0,0
19998,60,560,2,0,9.599744,72,0.364758,0,0,13,8.137323,9.268704,0.264873,59.0,0


# Handling Outliers

In [7]:
# Define the columns to standardize (Continues numeric coloumns)
num_cols = [
    'Age', 'CreditScore', 'LoanAmount', 'LoanDuration',
    'CreditCardUtilizationRate', 'LengthOfCreditHistory',
    'MonthlyIncome', 'NetWorth', 'InterestRate'
]

In [8]:
def remove_outliers_iqr(df, exclude_columns):
    # Loop through numeric columns
    for column in df.select_dtypes(include=['number']).columns:
        if column in exclude_columns:
            continue  # Skip excluded columns

        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1

        # Define outlier bounds
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Filter out the outliers
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

    return df

# Remove outliers

df_iqr = remove_outliers_iqr(df, num_cols)

# Print summary
print("Shape before removing outliers:", df.shape)
print("Shape after removing outliers:", df_iqr.shape)
print("Number of rows removed:", df.shape[0] - df_iqr.shape[0])


Shape before removing outliers: (20000, 15)
Shape after removing outliers: (14485, 15)
Number of rows removed: 5515


# Scaling

In [11]:
from sklearn.preprocessing import StandardScaler

# Make a copy of your DataFrame
df_standardized = df.copy()

# Initialize the scaler
scaler = StandardScaler()

# Apply standardization only to selected numeric columns
df_standardized[num_cols] = scaler.fit_transform(df[num_cols])

# df_standardized is now ready, with only selected columns standardized
df_standardized

Unnamed: 0,Age,CreditScore,EmploymentStatus,EducationLevel,LoanAmount,LoanDuration,CreditCardUtilizationRate,BankruptcyHistory,PreviousLoanDefaults,LengthOfCreditHistory,MonthlyIncome,NetWorth,InterestRate,RiskScore,LoanApproved
0,0.451489,0.890021,2,3,-1.014008,-0.245578,0.425792,0,0,-0.711630,-0.324366,1.020941,-0.272968,49.0,0
1,-0.150795,1.105724,2,1,0.343015,-0.245578,-1.242602,0,0,-0.711630,-0.334247,0.287206,-0.901165,52.0,0
2,0.623571,-0.031618,2,2,-0.432362,-0.732113,-0.932273,0,0,0.841287,-0.292687,-1.172566,-0.629370,52.0,0
3,1.570017,-0.521852,2,0,1.087972,1.700559,-0.117617,0,0,-0.592175,0.577602,0.853401,1.464331,54.0,0
4,-0.236835,0.439006,2,1,-1.727221,-0.732113,0.213746,0,0,1.438563,1.239575,1.420252,-1.495584,36.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.365449,0.301741,2,0,0.223258,-0.732113,-1.158361,0,0,-0.950540,-0.786044,0.450659,-1.031559,55.0,0
19996,1.397936,-0.090446,2,1,0.325628,-0.732113,1.160510,0,0,1.558018,0.020190,0.550691,-0.946394,54.0,0
19997,0.365449,1.439083,2,2,1.042114,0.727490,-0.436731,0,0,-0.831085,0.010532,0.881882,-0.292475,45.0,0
19998,1.742098,-0.227712,2,0,-0.784919,0.727490,0.490501,0,0,-0.233810,-0.280562,-0.684159,0.610430,59.0,0


# Prediction with best Regressor


In [13]:
from sklearn.model_selection import train_test_split

x=df_standardized.drop(['LoanApproved', 'RiskScore'],axis=1)
y=df_standardized[['RiskScore']]
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.2)


In [14]:
from lightgbm import LGBMRegressor
reg_model = LGBMRegressor()
reg_model.fit(x_train, y_train.values.ravel())


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000854 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1616
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 13
[LightGBM] [Info] Start training from score 50.843600


In [20]:
from sklearn.metrics import r2_score

y_pred = reg_model.predict(x_test)
print(r2_score(y_pred,y_test))

0.7266635451016251


In [15]:
# to get the order of features trained
x_train.columns

Index(['Age', 'CreditScore', 'EmploymentStatus', 'EducationLevel',
       'LoanAmount', 'LoanDuration', 'CreditCardUtilizationRate',
       'BankruptcyHistory', 'PreviousLoanDefaults', 'LengthOfCreditHistory',
       'MonthlyIncome', 'NetWorth', 'InterestRate'],
      dtype='object')

In [16]:
new_input = {
    'Age': 30,
    'CreditScore': 700,
    'EmploymentStatus': 'Employed',
    'EducationLevel': 'Master',
    'LoanAmount': 15000,
    'LoanDuration': 24,
    'CreditCardUtilizationRate': 0.3,
    'BankruptcyHistory': 0,
    'PreviousLoanDefaults': 0,
    'LengthOfCreditHistory': 5,
    'MonthlyIncome': 4500,
    'NetWorth': 20000,
    'InterestRate': 10.5
}

input_df = pd.DataFrame([new_input])
status_map = {'Unemployed': 0, 'Self-Employed': 1, 'Employed': 2}
edu_map = {'High School': 0, 'Associate': 1, 'Bachelor': 2, 'Master': 3, 'Doctorate': 4}
input_df['EmploymentStatus'] = input_df['EmploymentStatus'].map(status_map)
input_df['EducationLevel'] = input_df['EducationLevel'].map(edu_map)

input_df

Unnamed: 0,Age,CreditScore,EmploymentStatus,EducationLevel,LoanAmount,LoanDuration,CreditCardUtilizationRate,BankruptcyHistory,PreviousLoanDefaults,LengthOfCreditHistory,MonthlyIncome,NetWorth,InterestRate
0,30,700,2,3,15000,24,0.3,0,0,5,4500,20000,10.5


In [17]:
# List of features to transform (Continues numeric coloumns with more than 1 skewness value)
features_to_log1p = ['LoanAmount', 'MonthlyIncome', 'NetWorth']

# Apply log1p transformation
input_df[features_to_log1p] = input_df[features_to_log1p].apply(np.log1p)
input_df

Unnamed: 0,Age,CreditScore,EmploymentStatus,EducationLevel,LoanAmount,LoanDuration,CreditCardUtilizationRate,BankruptcyHistory,PreviousLoanDefaults,LengthOfCreditHistory,MonthlyIncome,NetWorth,InterestRate
0,30,700,2,3,9.615872,24,0.3,0,0,5,8.412055,9.903538,10.5


In [18]:
# Define the columns to standardize (Continues numeric coloumns)
num_cols_to_standardize = [
    'Age', 'CreditScore', 'LoanAmount', 'LoanDuration',
    'CreditCardUtilizationRate', 'LengthOfCreditHistory',
    'MonthlyIncome', 'NetWorth', 'InterestRate'
]

# Apply standardization only to selected numeric columns
input_df[num_cols_to_standardize] = scaler.transform(input_df[num_cols_to_standardize])

# df_standardized is now ready, with only selected columns standardized
input_df

Unnamed: 0,Age,CreditScore,EmploymentStatus,EducationLevel,LoanAmount,LoanDuration,CreditCardUtilizationRate,BankruptcyHistory,PreviousLoanDefaults,LengthOfCreditHistory,MonthlyIncome,NetWorth,InterestRate
0,-0.839119,2.517597,2,3,-0.752884,-1.218647,0.085231,0,0,-1.189451,0.171943,-0.248156,243.125239


In [19]:
Predicted_Risk_Score = reg_model.predict(input_df)
Predicted_Risk_Score

array([49.28164676])

# Prediction with best Classifier

In [23]:
from sklearn.model_selection import train_test_split

x=df_standardized[['RiskScore']]
y=df_standardized[['LoanApproved']]
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.2)


In [24]:
from sklearn.ensemble import RandomForestClassifier

clf_model = RandomForestClassifier()
clf_model.fit(x_train, y_train.values.ravel())

In [None]:
Predicted_Risk_Score

array([50.1704826])

In [26]:
Predicted_LoanApproval_Status = clf_model.predict([Predicted_Risk_Score])
if Predicted_LoanApproval_Status == 0:
  print("Rejected")
else:
  print("Approved")

Rejected




# Save the model and scaler

In [27]:
import pickle

# Save regression model
with open('models/reg_model.pkl', 'wb') as f:
    pickle.dump(reg_model, f)

# Save scaler
with open('models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Save classification model
with open('models/clf_model.pkl','wb') as f:
  pickle.dump(clf_model,f)