In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
file_path = 'loan_data.csv'  
loan_data = pd.read_csv(file_path)
loan_data.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [3]:
loan_data.info()
loan_data.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            381 non-null    object 
 1   Gender             376 non-null    object 
 2   Married            381 non-null    object 
 3   Dependents         373 non-null    object 
 4   Education          381 non-null    object 
 5   Self_Employed      360 non-null    object 
 6   ApplicantIncome    381 non-null    int64  
 7   CoapplicantIncome  381 non-null    float64
 8   LoanAmount         381 non-null    float64
 9   Loan_Amount_Term   370 non-null    float64
 10  Credit_History     351 non-null    float64
 11  Property_Area      381 non-null    object 
 12  Loan_Status        381 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 38.8+ KB


Loan_ID               0
Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [4]:
loan_data['Gender'].fillna(loan_data['Gender'].mode()[0], inplace=True)
loan_data['Dependents'].fillna(loan_data['Dependents'].mode()[0], inplace=True)
loan_data['Self_Employed'].fillna(loan_data['Self_Employed'].mode()[0], inplace=True)
loan_data['Loan_Amount_Term'].fillna(loan_data['Loan_Amount_Term'].mode()[0], inplace=True)
loan_data['Credit_History'].fillna(loan_data['Credit_History'].mode()[0], inplace=True)
loan_data.isnull().sum()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  loan_data['Gender'].fillna(loan_data['Gender'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  loan_data['Dependents'].fillna(loan_data['Dependents'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work beca

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [5]:
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
loan_data_encoded = pd.get_dummies(loan_data, columns=categorical_cols, drop_first=True)
loan_data_encoded.head()


Unnamed: 0,Loan_ID,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,LP001003,4583,1508.0,128.0,360.0,1.0,True,True,True,False,False,False,False,False,False,False
1,LP001005,3000,0.0,66.0,360.0,1.0,True,True,False,False,False,False,True,False,True,True
2,LP001006,2583,2358.0,120.0,360.0,1.0,True,True,False,False,False,True,False,False,True,True
3,LP001008,6000,0.0,141.0,360.0,1.0,True,False,False,False,False,False,False,False,True,True
4,LP001013,2333,1516.0,95.0,360.0,1.0,True,True,False,False,False,True,False,False,True,True


In [6]:
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
scaler = StandardScaler()
loan_data_encoded[numerical_cols] = scaler.fit_transform(loan_data_encoded[numerical_cols])
loan_data_encoded.head()


Unnamed: 0,Loan_ID,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,LP001003,0.707469,0.098695,0.812575,0.275147,0.419435,True,True,True,False,False,False,False,False,False,False
1,LP001005,-0.408932,-0.546371,-1.376596,0.275147,0.419435,True,True,False,False,False,False,True,False,True,True
2,LP001006,-0.703019,0.462294,0.530102,0.275147,0.419435,True,True,False,False,False,True,False,False,True,True
3,LP001008,1.706799,-0.546371,1.271595,0.275147,0.419435,True,False,False,False,False,False,False,False,True,True
4,LP001013,-0.87933,0.102118,-0.352629,0.275147,0.419435,True,True,False,False,False,True,False,False,True,True


In [7]:
X = loan_data_encoded.drop(columns=['Loan_ID', 'Loan_Status_Y']) 
y = loan_data_encoded['Loan_Status_Y']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((304, 14), (77, 14), (304,), (77,))

In [8]:
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)
model


In [9]:
y_pred = model.predict(X_test)
y_pred[:10]  


array([ True, False,  True,  True,  True, False,  True,  True, False,
        True])

In [10]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_report_output)


Model Accuracy: 0.8831168831168831

Confusion Matrix:
 [[13  9]
 [ 0 55]]

Classification Report:
               precision    recall  f1-score   support

       False       1.00      0.59      0.74        22
        True       0.86      1.00      0.92        55

    accuracy                           0.88        77
   macro avg       0.93      0.80      0.83        77
weighted avg       0.90      0.88      0.87        77



In [11]:
import joblib
joblib.dump(model, 'loan_status_model.pkl')

['loan_status_model.pkl']