In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report


In [2]:

# Load the raw loan table. Column labels are kept exactly as they appear in the
# CSV header; later cells address them with a leading space (e.g. ' education').
DATA_PATH = "loan_data.csv"
df = pd.read_csv(DATA_PATH)



##### Data preprocessing

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [4]:
# Show the exact column labels. Later cells depend on these literal strings,
# including the leading space most of them carry.
column_labels = df.columns
print(column_labels)

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')


In [5]:

# Remove 'loan_id' and three of the asset-value columns before modelling.
#
# The frame is reassigned instead of mutated with inplace=True, and
# errors='ignore' is passed, so this cell is idempotent: running it a second
# time no longer raises KeyError for columns that were already dropped.
columns_to_drop = [
    'loan_id',
    ' commercial_assets_value',
    ' luxury_assets_value',
    ' bank_asset_value',
]
df = df.drop(columns=columns_to_drop, errors='ignore')


In [6]:

# Columns that are label-encoded in the next cell (the target is included
# there on purpose, so the list keeps all three names).
categorical_columns = [' education', ' self_employed', ' loan_status']
target_column = ' loan_status'

# A missing target must not be imputed: filling it with the most frequent class
# would invent labels. Rows without a label are dropped instead.
df = df.dropna(subset=[target_column])

# Categorical features: fill gaps with the most frequent value of the column.
for col in categorical_columns:
    if col == target_column:
        continue
    df[col] = df[col].fillna(df[col].mode()[0])

# Numeric features: fill gaps with the column median.
# NOTE(review): these statistics are computed on the full frame, before the
# train/test split further down, so test rows influence the fill values.
# Consider fitting the imputation on the training split only.
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    df[col] = df[col].fillna(df[col].median())


In [7]:

# Turn each categorical column into integer codes, keeping the fitted encoder
# for every column so the codes can be mapped back to the original labels.
label_encoders = {}
for column_name in categorical_columns:
    encoder = LabelEncoder()
    df[column_name] = encoder.fit_transform(df[column_name])
    label_encoders[column_name] = encoder


##### Train/test split and baseline model

In [8]:
# Separate the encoded target from the feature matrix.
Y = df[' loan_status']
X = df.drop(columns=' loan_status')

In [9]:
# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Baseline: a random forest with library-default hyperparameters and a fixed seed.
# fit() returns the estimator itself, so the cell still displays the fitted model.
RFmodel = RandomForestClassifier(random_state=42).fit(X_train, Y_train)
RFmodel

In [11]:
# Class predictions of the baseline forest on the held-out rows.
Y_pred_RFmodel = RFmodel.predict(X=X_test)

In [12]:

# Score the baseline predictions against the held-out labels.
RFmodel_accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred_RFmodel)
baseline_report = classification_report(Y_test, Y_pred_RFmodel)

print("RFmodel Accuracy:", RFmodel_accuracy)
print("Classification Report:\n", baseline_report)


RFmodel Accuracy: 0.9742388758782201
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       536
           1       0.97      0.96      0.97       318

    accuracy                           0.97       854
   macro avg       0.97      0.97      0.97       854
weighted avg       0.97      0.97      0.97       854



######  Hyperparameter tuning using GridSearchCV

In [13]:
# Exhaustive search space for the grid search: 3 values for each of the four
# hyperparameters, i.e. 3 * 3 * 3 * 3 = 81 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


In [14]:
# Exhaustive 5-fold cross-validated search over param_grid, scored by accuracy.
# With 81 candidates and 5 folds this is 405 forest fits, so n_jobs=-1 spreads
# them over all available cores (matching the RandomizedSearchCV cell below).
# Every candidate forest is seeded with random_state=42, so running the fits
# in parallel does not change the selected parameters.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

In [15]:
# Evaluate the best estimator found by the grid search on the held-out rows
# and report the hyperparameters that produced it.
best_grid_model = grid_search.best_estimator_
y_pred_grid = best_grid_model.predict(X_test)
grid_accuracy = accuracy_score(Y_test, y_pred_grid)

print("Best Hyperparameters (GridSearchCV):", grid_search.best_params_)
print("GridSearchCV Accuracy:", grid_accuracy)


Best Hyperparameters (GridSearchCV): {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
GridSearchCV Accuracy: 0.9707259953161592


###### Hyperparameter tuning using RandomizedSearchCV

In [16]:
# Search space sampled by the randomized search.
# n_estimators covers 50, 60, ..., 190. .tolist() converts the NumPy integer
# scalars produced by np.arange into plain Python ints, so the values that end
# up in best_params_ print cleanly (no np.int64(...) repr under NumPy 2) and
# can be serialized, e.g. to JSON. The candidate values themselves are unchanged.
param_dist = {
    'n_estimators': np.arange(50, 200, 10).tolist(),
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [17]:

# Randomized search: 10 sampled candidates from param_dist, each scored by
# 5-fold cross-validated accuracy, with fits run on all available cores.
# Both the sampler and every candidate forest are seeded for repeatability.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, Y_train)

In [18]:
# Evaluate the best estimator found by the randomized search on the held-out
# rows and report the hyperparameters that produced it.
best_random_model = random_search.best_estimator_
y_pred_random = best_random_model.predict(X_test)
random_accuracy = accuracy_score(Y_test, y_pred_random)

print("Best Hyperparameters (RandomizedSearchCV):", random_search.best_params_)
print("RandomizedSearchCV Accuracy:", random_accuracy)

Best Hyperparameters (RandomizedSearchCV): {'n_estimators': 170, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 30}
RandomizedSearchCV Accuracy: 0.9742388758782201


###### Comparing all three models

In [19]:
# Side-by-side held-out accuracy of the baseline and the two tuned forests.
for label, score in (
    ("RFmodel Accuracy:", RFmodel_accuracy),
    ("GridSearchCV Accuracy:", grid_accuracy),
    ("RandomizedSearchCV Accuracy:", random_accuracy),
):
    print(label, score)

RFmodel Accuracy: 0.9742388758782201
GridSearchCV Accuracy: 0.9707259953161592
RandomizedSearchCV Accuracy: 0.9742388758782201
