In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
df= pd.read_csv(r'c:\Users\sanju\Desktop\heart.csv')
df.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


***1. Random Forest Classifier***

In [3]:
X= df.iloc[:, 0:-1]
Y = df.iloc[:, -1]

In [4]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size= 0.20, random_state=42 )

In [5]:
print(x_test.shape)
print(x_train.shape)

(61, 13)
(242, 13)


In [6]:
# rf obbject
rf = RandomForestClassifier()  

In [7]:
#fitting the data
rf.fit(x_train, y_train)

In [8]:
# predict 
y_predict = rf.predict(x_test)
accuracy_score(y_test, y_predict) # accuracy score

0.8688524590163934

In [9]:
rf = RandomForestClassifier(max_samples=0.65)  # max_Samples for hyperparameters tuning

In [10]:
rf.fit(x_train, y_train)
rf.predict(x_test)

array([0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [11]:
rf.score(x_test, y_test) # after hyperparameters tuning, accuracy has improved
# 50-75% gives good results. otherwise overfitting/underfitting problem will be there

0.8360655737704918

In [12]:
# cross validation score
# doing this will decrease the above accuracy score
# basically it runs or fit data on the same model k number of times

**Hyperparametes Tuning**

***1. GridSearch CV***
>For finding out important parameters to train the RF model

In [13]:
# Number of trees in random forest
n_estimators = [20,40,60,80,100,120]

# Number of features to consider at every split
max_features = [0.2,0.4,0.6,0.8,1.0]

# Maximum number of levels in tree
max_depth = [2,6,8,None]

# Number of samples
max_samples = [0.35,0.5,0.75,1.0]

# 480 diff random forest will be trained. as for each combination a RF model will be trained

In [14]:
# dictionary
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              'max_samples':max_samples
             }
print(param_grid)

{'n_estimators': [20, 40, 60, 80, 100, 120], 'max_features': [0.2, 0.4, 0.6, 0.8, 1.0], 'max_depth': [2, 6, 8, None], 'max_samples': [0.35, 0.5, 0.75, 1.0]}


In [15]:
# rf object
rf =RandomForestClassifier()

In [16]:
from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV( estimator= rf, # which model or ML object to train
                       param_grid= param_grid,
                       cv = 8, # how many times CV will be run or how many times each RF model will be trained
                       verbose=2, 
                       n_jobs=-1 # all cores
                       )                     

In [17]:
rf_grid.fit(x_train, y_train)

Fitting 8 folds for each of 480 candidates, totalling 3840 fits


In [18]:
# best params
rf_grid.best_params_

{'max_depth': 2, 'max_features': 0.2, 'max_samples': 0.75, 'n_estimators': 80}

In [19]:
# best score
rf_grid.best_score_

0.8469086021505376

***2. RandomSearchCV***
> In case of large dataset or trying large number of hyperparameters , `GridSearchCV` becomes slow. 

> In such case, `RandomSearchCV` randomly picks 15-20 parameters out of 480 and use them to train. 

In [20]:
# Number of trees in random forest
n_estimators = [20,40,60,80,100,120]

# Number of features to consider at every split
max_features = [0.2,0.4,0.6,0.8,1.0]

# Maximum number of levels in tree
max_depth = [2,6,8,None]

# Number of samples
max_samples = [0.35,0.5,0.75,1.0]

# Bootstrap samples
bootstrap = [True,False]

# Minimum number of samples required to split a node
min_samples_split = [2,5,7,9]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

In [21]:
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              'max_samples':max_samples,
              'bootstrap':bootstrap,
              'min_samples_split':min_samples_split,
              'min_samples_leaf':min_samples_leaf
             }
print(param_grid)

{'n_estimators': [20, 40, 60, 80, 100, 120], 'max_features': [0.2, 0.4, 0.6, 0.8, 1.0], 'max_depth': [2, 6, 8, None], 'max_samples': [0.35, 0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5, 7, 9], 'min_samples_leaf': [1, 2]}


In [22]:
from sklearn.model_selection import RandomizedSearchCV

rf_grid = RandomizedSearchCV(estimator = rf, 
                       param_distributions = param_grid, 
                       cv = 8, 
                       verbose=2, 
                       n_jobs = -1)

In [23]:
rf_grid.fit(x_train, y_train) # picked 10 parameters all total

Fitting 8 folds for each of 10 candidates, totalling 80 fits


32 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
32 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sanju\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sanju\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\sanju\AppData\Lo

In [25]:
rf_grid.best_params_

{'n_estimators': 60,
 'min_samples_split': 9,
 'min_samples_leaf': 2,
 'max_samples': 1.0,
 'max_features': 0.2,
 'max_depth': 6,
 'bootstrap': True}

In [26]:
rf_grid.best_score_ # this doesn't give good result bt gives faster result

0.817741935483871