In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV,GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer

In [None]:
# Load the breast-cancer dataset bundled with scikit-learn; the returned
# dictionary-like object exposes the feature matrix and labels as
# 'data' and 'target'.
data = load_breast_cancer()
# NOTE(review): displaying the whole object prints the full arrays; something
# like data.keys() would be a more compact overview.
data

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [None]:
# Feature matrix and label vector, pulled out of the dataset object in one step.
x, y = data.data, data.target

In [None]:
# Sanity check: dimensions of the full feature matrix (rows, feature columns).
x.shape

(569, 30)

In [None]:
# Sanity check: the label vector should have one entry per row of x.
y.shape

(569,)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Sanity check: dimensions of the training feature matrix.
x_train.shape

(455, 30)

In [None]:
# Sanity check: number of training labels (should match the rows of x_train).
y_train.shape

(455,)

In [None]:
# Sanity check: dimensions of the held-out test feature matrix.
x_test.shape

(114, 30)

In [None]:
# Baseline classifiers, all with default hyper-parameters.
# The tree ensembles are randomized, so their seed is pinned: without it the
# accuracies below (and any search that clones `rf`) change on every
# Restart & Run All.
# NOTE(review): `lr` is instantiated here but is not fitted in the cells shown.
lr = LogisticRegression()
gb = GradientBoostingClassifier(random_state=0)
rf = RandomForestClassifier(random_state=0)
svm = SVC()

In [None]:
# Random forest baseline: train on the training split, then report accuracy
# on the held-out test split.
y_pred = rf.fit(x_train, y_train).predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.956140350877193

In [None]:
# Gradient boosting baseline: same train / predict / score routine as the
# other models, evaluated on the held-out test split.
gb.fit(X=x_train, y=y_train)
y_pred = gb.predict(X=x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9736842105263158

In [None]:
# Support Vector Classifier (SVC) baseline
# NOTE(review): the features are passed in unscaled; SVC is generally sensitive
# to feature scale, so standardizing first may be worth trying.
svm.fit(x_train,y_train)
y_pred = svm.predict(x_test)
accuracy_score(y_test,y_pred)

0.9298245614035088

In [None]:
# 10-fold cross-validated accuracy of a random forest on the full dataset.
# max_samples=0.75 makes each tree train on 75% of the rows it is given;
# random_state pins the forest's randomness so the fold scores are repeatable.
cvs = cross_val_score(
    RandomForestClassifier(max_samples=0.75, random_state=0),
    x,
    y,
    cv=10,
    scoring='accuracy',
)
cvs

array([0.98245614, 0.89473684, 0.9122807 , 0.96491228, 1.        ,
       0.98245614, 0.9122807 , 0.98245614, 0.94736842, 0.98214286])

In [None]:
# Average accuracy across the cross-validation folds.
cvs.mean()

0.9561090225563911

**GridSearchCV**


In [None]:
# number of trees in the random forest
n_estimators = [20,60,100,120]

# fraction of the features to consider at every split (floats are fractions, not counts)
max_features = [0.2,0.6,1.0]

# maximum depth of each tree (None lets trees grow until the leaves are pure)
max_depth =[2,8,None]

# fraction of the training rows drawn to build each tree
max_samples = [0.5,0.75,1.0]


In [None]:

# Collect the candidate values into the search space, keyed by the
# RandomForestClassifier parameter names, and echo it for the record.
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    max_samples=max_samples,
)
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [None]:
# Exhaustive search: every combination in param_grid is scored with 5-fold
# cross-validation, running on all available cores with per-fit logging.
rf_grid = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1, verbose=2)

In [None]:
# Run the grid search on the training split only, so the test split stays unseen.
rf_grid.fit(x_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [2, 8, None],
                         'max_features': [0.2, 0.6, 1.0],
                         'max_samples': [0.5, 0.75, 1.0],
                         'n_estimators': [20, 60, 100, 120]},
             verbose=2)

In [None]:
# Parameter combination that achieved the best cross-validated score.
rf_grid.best_params_

{'max_depth': 8, 'max_features': 0.6, 'max_samples': 0.75, 'n_estimators': 120}

In [None]:
# Mean cross-validated score of that best combination (measured on the training split's folds).
rf_grid.best_score_

0.9604395604395604

**RandomizedSearchCV**

In [None]:
# number of trees in the random forest
n_estimators = [20,60,100,120]

# fraction of the features to consider at every split (floats are fractions, not counts)
max_features = [0.2,0.6,1.0]

# maximum depth of each tree (None lets trees grow until the leaves are pure)
max_depth =[2,8,None]

# fraction of the training rows drawn to build each tree
max_samples = [0.5,0.75,1.0]

# whether each tree is built on a bootstrap sample of the training rows
bootstrap =[True,False]

# minimum number of samples required to split an internal node
min_samples_split = [2,5]

# minimum number of samples required at each leaf node
min_samples_leaf = [1,2]


In [None]:
# Hyper-parameters that are valid whatever the bootstrap setting is.
shared_params = {'n_estimators' : n_estimators,
                 'max_features' : max_features,
                 'max_depth' : max_depth,
                 'min_samples_split' : min_samples_split,
                 'min_samples_leaf' : min_samples_leaf}

# RandomForestClassifier raises a ValueError when `max_samples` is set while
# bootstrap=False, so a single flat dict makes every bootstrap=False candidate
# fail and score NaN. Build one sub-grid per bootstrap value instead, and only
# vary `max_samples` where bootstrapping is enabled (None otherwise).
param_grid = [{**shared_params,
               'bootstrap' : [use_bootstrap],
               'max_samples' : max_samples if use_bootstrap else [None]}
              for use_bootstrap in bootstrap]
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [None]:
# Randomized search: scores a sample of candidates from param_grid with
# 5-fold cross-validation instead of trying every combination. The seed
# fixes which candidates are drawn.
# NOTE: this rebinds `rf_grid`, replacing the grid-search object created earlier.
rf_grid = RandomizedSearchCV(
    rf,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

In [None]:
# Run the randomized search on the training split only, so the test split stays unseen.
rf_grid.fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 380, in fit
    "`max_sample` cannot be set if `bootstrap=False`. "
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=None`.

        nan 0.94945055        nan        nan]


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [2, 8, None],
                                        'max_features': [0.2, 0.6, 1.0],
                                        'max_samples': [0.5, 0.75, 1.0],
                                        'min_samples_leaf': [1, 2],
                                        'min_samples_split': [2, 5],
                                        'n_estimators': [20, 60, 100, 120]},
                   random_state=42, verbose=2)

In [None]:
# Best parameter combination among the sampled candidates.
rf_grid.best_params_

{'n_estimators': 60,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_samples': 0.75,
 'max_features': 0.2,
 'max_depth': None,
 'bootstrap': True}

In [None]:
# Mean cross-validated score of that best sampled combination.
rf_grid.best_score_

0.9538461538461538