In [12]:
## ======================================================================
#            Importing the necessary modules and tools
## ======================================================================

import pandas as pd; import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# from sklearn.metrics import necessary metrics
from sklearn.metrics import accuracy_score


# Notebook display options
# ------------------------
# Show floats in pandas output with thousands separators and three decimals.
pd.set_option('display.float_format', '{:,.3f}'.format)
%matplotlib inline 

In [13]:
# Location of the raw loan CSV (served from raw.githubusercontent.com)
url = 'https://raw.githubusercontent.com/DrSaadLa/PythonTuts/main/TreeBasedModels/loan_data.csv'

# Fetch the CSV and parse it into the working DataFrame
df = pd.read_csv(filepath_or_buffer=url)

In [14]:
# Data preprocessing
from sklearn.preprocessing import LabelEncoder

# Swap the 'purpose' column for the integer codes LabelEncoder assigns to it
purpose_codes = LabelEncoder().fit_transform(df['purpose'])
df['purpose'] = purpose_codes

In [15]:
# Target: the 'credit.policy' column
y = df['credit.policy']
# Features: every column except the target
X = df.drop(columns='credit.policy')

In [16]:
# Hold out 30% of the rows as a test set. The split is seeded and stratified
# on the target so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [17]:

from sklearn.ensemble import RandomForestClassifier
# Instantiate a random forest classifier 'rf_clf' (seeded so runs are repeatable)
rf_clf = RandomForestClassifier(random_state= 10)

# Import the evaluation metrics
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import roc_auc_score

# Import the search algorithms GridSearchCV and RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter grid 'params_rf_clf': 3 * 3 * 2 * 2 = 36 candidate settings.
# A float 'min_samples_leaf' is read by scikit-learn as a fraction of the
# training samples, so 0.1 / 0.2 require every leaf to hold at least 10% / 20%
# of the rows.
# NOTE(review): that is a very restrictive setting and looks carried over from
# a regression example; confirm it is intended, and check that the tuned forest
# does more than predict the majority class.
params_rf_clf = {
              'n_estimators': [300, 400, 500],
              'max_depth': [4, 6, 8],
              'min_samples_leaf': [0.1, 0.2],
              'max_features': ['log2', 'sqrt']
}


# Instantiate 'grid_rf_clf': exhaustive 5-fold cross-validated search over
# 'params_rf_clf', using every available core (n_jobs=-1).
# The estimator is a classifier, so candidates are ranked with a classification
# scorer ('accuracy') rather than the regression scorer 'neg_mean_squared_error'.
# Assuming 'credit.policy' is coded 0/1 (TODO confirm), the MSE of predicted
# labels equals the misclassification rate, so this ranking matches the
# previous one while reporting a metric that is meaningful for a classifier.
grid_rf_clf= GridSearchCV(estimator=rf_clf,
                       param_grid=params_rf_clf,
                       cv=5,
                       scoring='accuracy',
                       verbose=1,
                       n_jobs=-1)


# Fit 'grid_rf_clf' to the training set
grid_rf_clf.fit(X_train, y_train)

# Pull the refitted winning estimator and its hyperparameters from the search
best_model = grid_rf_clf.best_estimator_
best_hyperparams = grid_rf_clf.best_params_
print('Best hyperparameters:\n', best_hyperparams)

# Predict class labels for the held-out test set
y_pred = best_model.predict(X_test)

# Root mean squared error between the true and predicted test labels.
# NOTE(review): on hard class labels this is presumably just the square root
# of the misclassification rate (if the target is coded 0/1 -- confirm);
# a classification metric would be more informative here.
mse_test = MSE(y_test, y_pred)
rmse_test = np.sqrt(mse_test)

# Report the test set RMSE
print('Test set RMSE of rf: {:.2f}'.format(rmse_test))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best hyperparameters:
 {'max_depth': 4, 'max_features': 'log2', 'min_samples_leaf': 0.1, 'n_estimators': 300}
Test set RMSE of rf: 0.44


In [18]:
# Evaluate test set accuracy
# --------------------------
# Fraction of test rows whose predicted label matches the true label.
# Arguments follow the documented (y_true, y_pred) order, consistent with the
# MSE(y_test, y_pred) call used for the RMSE. Accuracy is symmetric, so the
# value is unchanged, but the correct order matters if this line is ever
# adapted to an asymmetric metric (precision, recall, ...).
test_acc = accuracy_score(y_test, y_pred)

In [19]:
# Report the test set accuracy rounded to three decimals
accuracy_template = 'Test set accuracy: {:.3f}'
print(accuracy_template.format(test_acc))

Test set accuracy: 0.805
