In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv('../../../DATASET/heart.csv')

In [4]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [5]:
df.shape

(297, 14)

In [17]:
x=df.iloc[:,0:-1]
y=df.iloc[:,-1]

In [18]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [19]:
print(x_train.shape)
print(x_test.shape)

(237, 13)
(60, 13)


In [20]:
rf = RandomForestClassifier()

In [21]:
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
accuracy_score(y_test,y_pred)

0.7666666666666667

In [22]:
# Lets tune hyper-parameters
rf = RandomForestClassifier(max_samples=0.75,random_state=42)
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
accuracy_score(y_test,y_pred)

0.7333333333333333

In [23]:
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(RandomForestClassifier(),x,y,cv=10,scoring='accuracy'))

np.float64(0.7910344827586206)

### GridSearchCV

In [None]:
# Numbers of tress in random forest 
n_estimators = [20,60,100,120]

# Numbers of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum numbers of levels in tree
max_depth = [2,8,None] 

# Number of samples
max_samples = [0.5,0.75,1.0]

# we'll train 108 different random forest

In [25]:
param_grid = {
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'max_samples':max_samples
}
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [26]:
rf = RandomForestClassifier()

In [27]:
from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV(estimator=rf,
                       param_grid=param_grid,
                       cv=5,
                       verbose=2,
                       n_jobs=-1)

In [28]:
rf_grid.fit(x_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
rf_grid.best_params_

{'max_depth': 2, 'max_features': 0.2, 'max_samples': 0.5, 'n_estimators': 120}

In [30]:
rf_grid.best_score_

np.float64(0.8773936170212766)

### RandomSearchCV

In [None]:
# GridSearchCV falls slow for larger dataset 
# so RandomSearchCV is used which randomly choose options from total combination(like 108 combination in gridsearchcv)

In [37]:
# Numbers of tress in random forest 
n_estimators = [20,60,100,120]

# Numbers of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum numbers of levels in tree
max_depth = [2,8,None] 

# Number of samples
max_samples = [0.5,0.75,1.0]

# Bootstrap samples
bootstrap = [True,False]

# Minimum number of samples required to split a node
min_samples_split = [2,5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2]

In [38]:
param_grid = {
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'max_samples':max_samples,
    'bootstrap':bootstrap,
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf
}
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [39]:
from sklearn.model_selection import RandomizedSearchCV

rf_grid = RandomizedSearchCV(estimator=rf,
                       param_distributions=param_grid,
                       cv=5,
                       verbose=2,
                       n_jobs=-1)

In [40]:
rf_grid.fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\godsc\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\godsc\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\godsc\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\ensemble\_forest.py", line 431, in fit
    raise ValueError(
    ...<3 lines

In [41]:
rf_grid.best_params_

{'n_estimators': 20,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_samples': 0.75,
 'max_features': 0.2,
 'max_depth': 8,
 'bootstrap': True}

In [42]:
rf_grid.best_score_

np.float64(0.8603723404255319)

In [43]:
# Randomsearchcv did not give best results but get faster results
# why too use it ?
# because working with big dataset it gives result faster with nearr to good result