In [1]:
# Dependencies — if any package is missing, run this once in the kernel's environment:
#   %pip install pandas numpy matplotlib seaborn scikit-learn imbalanced-learn

In [2]:
#  Import Libraries
import pandas as pd
import numpy as np


In [3]:
#  Scikit-learn modules
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [4]:
# Read the raw loan records into `df`.
# The path is relative, so it is resolved against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='Loan Default Prediction Dataset.csv')

In [5]:
# First look at the raw frame: its (rows, columns) shape, then the first five records.
print(f" Dataset Shape: {df.shape}")
print("\n First 5 Rows:", df.head(5), sep="\n")


 Dataset Shape: (255347, 18)

 First 5 Rows:
       LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  I38PQUQS96   56   85994       50587          520              80   
1  HPSK72WA7R   69   50432      124440          458              15   
2  C1OZ6DPJ8Y   46   84208      129188          451              26   
3  V2KKSFM3UN   32   31713       44799          743               0   
4  EY08JDHTZP   60   20437        9139          633               8   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               4         15.23        36      0.44   Bachelor's   
1               1          4.81        60      0.68     Master's   
2               3         21.17        24      0.31     Master's   
3               3          7.07        24      0.23  High School   
4               4          6.51        48      0.73   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0      Full-time      Divorced         Yes     

In [6]:
# Column names, non-null counts and dtypes of the raw frame.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
print("\n Dataset Info:")
df.info()


 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255347 entries, 0 to 255346
Data columns (total 18 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   LoanID          255347 non-null  object 
 1   Age             255347 non-null  int64  
 2   Income          255347 non-null  int64  
 3   LoanAmount      255347 non-null  int64  
 4   CreditScore     255347 non-null  int64  
 5   MonthsEmployed  255347 non-null  int64  
 6   NumCreditLines  255347 non-null  int64  
 7   InterestRate    255347 non-null  float64
 8   LoanTerm        255347 non-null  int64  
 9   DTIRatio        255347 non-null  float64
 10  Education       255347 non-null  object 
 11  EmploymentType  255347 non-null  object 
 12  MaritalStatus   255347 non-null  object 
 13  HasMortgage     255347 non-null  object 
 14  HasDependents   255347 non-null  object 
 15  LoanPurpose     255347 non-null  object 
 16  HasCoSigner     255347 non-null  object 

In [7]:
# Number of missing entries in each column of the raw frame
# (isna() is the same check as isnull()).
print("\n Missing Values:", df.isna().sum(), sep="\n")


 Missing Values:
LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64


In [8]:
# Summary statistics for every column; include='all' also reports
# count/unique/top/freq for the non-numeric columns.
print(f"\n Data Description:\n{df.describe(include='all')}")


 Data Description:
            LoanID            Age         Income     LoanAmount  \
count       255347  255347.000000  255347.000000  255347.000000   
unique      255347            NaN            NaN            NaN   
top     ZTH91CGL0B            NaN            NaN            NaN   
freq             1            NaN            NaN            NaN   
mean           NaN      43.498306   82499.304597  127578.865512   
std            NaN      14.990258   38963.013729   70840.706142   
min            NaN      18.000000   15000.000000    5000.000000   
25%            NaN      31.000000   48825.500000   66156.000000   
50%            NaN      43.000000   82466.000000  127556.000000   
75%            NaN      56.000000  116219.000000  188985.000000   
max            NaN      69.000000  149999.000000  249999.000000   

          CreditScore  MonthsEmployed  NumCreditLines   InterestRate  \
count   255347.000000   255347.000000   255347.000000  255347.000000   
unique            NaN          

In [9]:
 # LoanID is a distinct value on every row, so it is dropped as non-informative.
# errors="ignore" turns the drop into a no-op when the column is already gone,
# which keeps this cell safe to re-run.
df.drop(columns=["LoanID"], errors="ignore", inplace=True)

In [10]:
# Encode the string-valued columns as integer codes.
categorical_cols = [
    'Education',
    'EmploymentType',
    'MaritalStatus',
    'LoanPurpose',
    'HasMortgage',
    'HasDependents',
    'HasCoSigner',
]

# One fitted encoder per column, kept so the integer codes can be mapped back
# to the original labels later on.
# NOTE(review): LabelEncoder numbers classes in sorted order, so the codes for
# Education look alphabetical rather than ordered by attainment -- confirm this
# is acceptable for the downstream model.
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}

# Replace each column in place with its integer codes.
for col, le in label_encoders.items():
    df[col] = le.transform(df[col])

In [11]:
# Report the number of missing entries per column ahead of the cleaning step.
print("\nMissing values before cleaning:", df.isna().sum(), sep="\n")


Missing values before cleaning:
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64


In [12]:
# Drop (rather than impute) every row that contains at least one missing value.
# axis=0 / how="any" are dropna's defaults, spelled out for the reader.
df.dropna(axis=0, how="any", inplace=True)

In [13]:
# Standardise the numeric feature columns with StandardScaler.
numeric_cols = [
    'Age',
    'Income',
    'LoanAmount',
    'CreditScore',
    'MonthsEmployed',
    'NumCreditLines',
    'InterestRate',
    'LoanTerm',
    'DTIRatio',
]

# The fitted scaler is kept in `scaler` so the same transformation can be reused.
# NOTE(review): the scaler is fitted on the whole frame; if a train/test split
# happens later, test-set statistics leak into the scaling -- confirm intent.
scaler = StandardScaler().fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

In [14]:
# Preview the first five rows of the frame after encoding and scaling.
print("\n Cleaned and Preprocessed Data Sample:", df.head(5), sep="\n")


 Cleaned and Preprocessed Data Sample:
        Age    Income  LoanAmount  CreditScore  MonthsEmployed  \
0  0.833990  0.089693   -1.086833    -0.341492        0.590533   
1  1.701221 -0.823021   -0.044309    -0.731666       -1.285731   
2  0.166888  0.043854    0.022715    -0.775718       -0.968209   
3 -0.767053 -1.303452   -1.168538     1.061875       -1.718715   
4  1.100830 -1.592855   -1.671921     0.369631       -1.487790   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio  Education  \
0        1.341937      0.261771 -0.001526 -0.260753          0   
1       -1.343791     -1.308350  1.412793  0.778585          2   
2        0.446694      1.156831 -0.708685 -0.823728          2   
3        0.446694     -0.967805 -0.708685 -1.170174          1   
4        1.341937     -1.052188  0.705634  0.995114          0   

   EmploymentType  MaritalStatus  HasMortgage  HasDependents  LoanPurpose  \
0               0              0            1              1            4   
1           