### All Techniques Of Hyper Parameter Optimization

- **GridSearchCV**
- **RandomizedSearchCV**
- **Bayesian Optimization** - Automate Hyperparameter Tuning (Hyperopt)
- **Sequential Model Based Optimization**(Tuning a scikit-learn estimator with skopt)
- **Optuna** - Automate Hyperparameter Tuning
- **Genetic Algorithms** (TPOT Classifier)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [2]:
df = pd.read_csv("diabetes.csv")

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [5]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
df["Glucose"] = np.where(df["Glucose"]==0,df["Glucose"].median(),df["Glucose"])
df["Insulin"] = np.where(df["Insulin"]==0,df["Insulin"].median(),df["Insulin"])
df["SkinThickness"] = np.where(df["SkinThickness"]==0,df["SkinThickness"].median(),df["SkinThickness"])

In [7]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.65625,69.105469,27.334635,94.652344,31.992578,0.471876,33.240885,0.348958
std,3.369578,30.438286,19.355807,9.229014,105.547598,7.88416,0.331329,11.760232,0.476951
min,0.0,44.0,0.0,7.0,14.0,0.0,0.078,21.0,0.0
25%,1.0,99.75,62.0,23.0,30.5,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,31.25,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [8]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [9]:
# Independent and dependent features
X = df.drop("Outcome",axis=1)
y = df["Outcome"]

In [10]:
y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [11]:
# train,test and split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [12]:
rf = RandomForestClassifier(n_estimators=10)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

In [13]:
print("Accuracy_score:\n",accuracy_score(y_test,y_pred))
print("confusion matrix:\n",confusion_matrix(y_test,y_pred))
print("classification report:\n",classification_report(y_test,y_pred))

Accuracy_score:
 0.8116883116883117
confusion matrix:
 [[99  8]
 [21 26]]
classification report:
               precision    recall  f1-score   support

           0       0.82      0.93      0.87       107
           1       0.76      0.55      0.64        47

    accuracy                           0.81       154
   macro avg       0.79      0.74      0.76       154
weighted avg       0.81      0.81      0.80       154



The main parameters used by a Random Forest Classifier are:

- **criterion** = the function used to evaluate the quality of a split.
- **max_depth** = maximum number of levels allowed in each tree.
- **max_features** = maximum number of features considered when splitting a node.
- **min_samples_leaf** = minimum number of samples which can be stored in a tree leaf.
- **min_samples_split** = minimum number of samples necessary in a node to cause node splitting.
- **n_estimators** = number of trees in the ensamble.

In [14]:
### Manual HyperParameter Tuning
model = RandomForestClassifier(n_estimators=300,criterion="entropy",
                              max_features="sqrt",min_samples_leaf=10,random_state=100)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [15]:
print("Accuracy_score:\n",accuracy_score(y_test,y_pred))
print("confusion matrix:\n",confusion_matrix(y_test,y_pred))
print("classification report:\n",classification_report(y_test,y_pred))

Accuracy_score:
 0.8246753246753247
confusion matrix:
 [[97 10]
 [17 30]]
classification report:
               precision    recall  f1-score   support

           0       0.85      0.91      0.88       107
           1       0.75      0.64      0.69        47

    accuracy                           0.82       154
   macro avg       0.80      0.77      0.78       154
weighted avg       0.82      0.82      0.82       154



In [16]:
### Manual HyperParameter Tuning
model = RandomForestClassifier(n_estimators=500,criterion="gini",
                              max_features="sqrt",min_samples_leaf=10,random_state=100)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [17]:
print("Accuracy_score:\n",accuracy_score(y_test,y_pred))
print("confusion matrix:\n",confusion_matrix(y_test,y_pred))
print("classification report:\n",classification_report(y_test,y_pred))

Accuracy_score:
 0.8116883116883117
confusion matrix:
 [[97 10]
 [19 28]]
classification report:
               precision    recall  f1-score   support

           0       0.84      0.91      0.87       107
           1       0.74      0.60      0.66        47

    accuracy                           0.81       154
   macro avg       0.79      0.75      0.76       154
weighted avg       0.81      0.81      0.81       154



We need to use randomized Search CV first and then use Grid Search CV

### Randomized Search CV

In [18]:
from sklearn.model_selection import RandomizedSearchCV

In [19]:
params = {
 # Number of trees in random forest
"n_estimators": [int(x) for x in np.linspace(200,2000,10)],
 # Number of features to consider at every split
"max_features" : ["auto","sqrt","log2"],
 # Maximum number of levels in tree
"max_depth" : [int(x) for x in np.linspace(10,1000,10)],
 # Minimum number of samples required to split a node
"min_samples_split": [1,3,4,5,7,9],
 # Minimum number of samples required at each leaf node
"min_samples_leaf" : [1,2,4,6,8],
"criterion":["entropy","gini"]
    }

In [20]:
print(params)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [1, 3, 4, 5, 7, 9], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [21]:
rf = RandomForestClassifier()
rand = RandomizedSearchCV(rf,params,n_iter=100,cv=3,random_state=100,verbose=2)
rand.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END criterion=entropy, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=1, n_estimators=200; total time=   0.0s
[CV] END criterion=entropy, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=1, n_estimators=200; total time=   0.0s
[CV] END criterion=entropy, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=1, n_estimators=200; total time=   0.0s
[CV] END criterion=gini, max_depth=560, max_features=log2, min_samples_leaf=1, min_samples_split=7, n_estimators=1600; total time=   4.2s
[CV] END criterion=gini, max_depth=560, max_features=log2, min_samples_leaf=1, min_samples_split=7, n_estimators=1600; total time=   3.7s
[CV] END criterion=gini, max_depth=560, max_features=log2, min_samples_leaf=1, min_samples_split=7, n_estimators=1600; total time=   3.7s
[CV] END criterion=gini, max_depth=340, max_features=sqrt, min_samples_leaf=2, min_samples_split=9, n_e

[CV] END criterion=entropy, max_depth=670, max_features=sqrt, min_samples_leaf=2, min_samples_split=1, n_estimators=2000; total time=   1.2s
[CV] END criterion=gini, max_depth=780, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.8s
[CV] END criterion=gini, max_depth=780, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.9s
[CV] END criterion=gini, max_depth=780, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.8s
[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=6, min_samples_split=9, n_estimators=200; total time=   0.3s
[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=6, min_samples_split=9, n_estimators=200; total time=   0.5s
[CV] END criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=6, min_samples_split=9, n_estimators=200; total time=   0.7s
[CV] END criterion=entropy, 

[CV] END criterion=entropy, max_depth=450, max_features=sqrt, min_samples_leaf=1, min_samples_split=7, n_estimators=2000; total time=   4.4s
[CV] END criterion=entropy, max_depth=10, max_features=log2, min_samples_leaf=6, min_samples_split=4, n_estimators=1400; total time=   2.9s
[CV] END criterion=entropy, max_depth=10, max_features=log2, min_samples_leaf=6, min_samples_split=4, n_estimators=1400; total time=   3.0s
[CV] END criterion=entropy, max_depth=10, max_features=log2, min_samples_leaf=6, min_samples_split=4, n_estimators=1400; total time=   3.0s
[CV] END criterion=gini, max_depth=120, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=600; total time=   1.1s
[CV] END criterion=gini, max_depth=120, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=600; total time=   1.1s
[CV] END criterion=gini, max_depth=120, max_features=sqrt, min_samples_leaf=6, min_samples_split=5, n_estimators=600; total time=   1.0s
[CV] END criterion=entropy, 

[CV] END criterion=gini, max_depth=560, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=1600; total time=   5.0s
[CV] END criterion=gini, max_depth=560, max_features=auto, min_samples_leaf=1, min_samples_split=4, n_estimators=1600; total time=   5.5s
[CV] END criterion=gini, max_depth=450, max_features=log2, min_samples_leaf=8, min_samples_split=7, n_estimators=1800; total time=   5.8s
[CV] END criterion=gini, max_depth=450, max_features=log2, min_samples_leaf=8, min_samples_split=7, n_estimators=1800; total time=   5.6s
[CV] END criterion=gini, max_depth=450, max_features=log2, min_samples_leaf=8, min_samples_split=7, n_estimators=1800; total time=   5.2s
[CV] END criterion=gini, max_depth=230, max_features=log2, min_samples_leaf=1, min_samples_split=7, n_estimators=1000; total time=   3.3s
[CV] END criterion=gini, max_depth=230, max_features=log2, min_samples_leaf=1, min_samples_split=7, n_estimators=1000; total time=   3.0s
[CV] END criterion=gini, max_depth

[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=4, min_samples_split=7, n_estimators=200; total time=   0.5s
[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=4, min_samples_split=7, n_estimators=200; total time=   0.4s
[CV] END criterion=gini, max_depth=1000, max_features=log2, min_samples_leaf=4, min_samples_split=7, n_estimators=200; total time=   0.5s
[CV] END criterion=entropy, max_depth=560, max_features=sqrt, min_samples_leaf=2, min_samples_split=1, n_estimators=400; total time=   0.1s
[CV] END criterion=entropy, max_depth=560, max_features=sqrt, min_samples_leaf=2, min_samples_split=1, n_estimators=400; total time=   0.1s
[CV] END criterion=entropy, max_depth=560, max_features=sqrt, min_samples_leaf=2, min_samples_split=1, n_estimators=400; total time=   0.2s
[CV] END criterion=gini, max_depth=670, max_features=log2, min_samples_leaf=8, min_samples_split=7, n_estimators=200; total time=   0.3s
[CV] END criterion=gini, max_

[CV] END criterion=gini, max_depth=670, max_features=sqrt, min_samples_leaf=8, min_samples_split=3, n_estimators=1600; total time=   3.5s
[CV] END criterion=gini, max_depth=230, max_features=log2, min_samples_leaf=2, min_samples_split=4, n_estimators=1600; total time=   4.2s
[CV] END criterion=gini, max_depth=230, max_features=log2, min_samples_leaf=2, min_samples_split=4, n_estimators=1600; total time=   4.6s
[CV] END criterion=gini, max_depth=230, max_features=log2, min_samples_leaf=2, min_samples_split=4, n_estimators=1600; total time=   4.1s


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [1, 3, 4, 5, 7, 9],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [22]:
rand.best_params_

{'n_estimators': 200,
 'min_samples_split': 3,
 'min_samples_leaf': 8,
 'max_features': 'auto',
 'max_depth': 670,
 'criterion': 'gini'}

In [23]:
rand.best_score_

0.7556910569105691

In [24]:
rand.best_estimator_

RandomForestClassifier(max_depth=670, min_samples_leaf=8, min_samples_split=3,
                       n_estimators=200)

In [25]:
y_pred = rand.predict(X_test)
print("Accuracy_score:\n",accuracy_score(y_test,y_pred))
print("confusion matrix:\n",confusion_matrix(y_test,y_pred))
print("classification report:\n",classification_report(y_test,y_pred))

Accuracy_score:
 0.8051948051948052
confusion matrix:
 [[97 10]
 [20 27]]
classification report:
               precision    recall  f1-score   support

           0       0.83      0.91      0.87       107
           1       0.73      0.57      0.64        47

    accuracy                           0.81       154
   macro avg       0.78      0.74      0.75       154
weighted avg       0.80      0.81      0.80       154



### Grid Search CV

In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
param_grid = {
    'criterion': [rand.best_params_['criterion']],
    'max_depth': [rand.best_params_['max_depth']],
    'max_features': [rand.best_params_['max_features']],
    'min_samples_leaf': [rand.best_params_['min_samples_leaf'], 
                         rand.best_params_['min_samples_leaf']+2, 
                         rand.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rand.best_params_['min_samples_split'] - 2,
                          rand.best_params_['min_samples_split'] - 1,
                          rand.best_params_['min_samples_split'], 
                          rand.best_params_['min_samples_split'] +1,
                          rand.best_params_['min_samples_split'] + 2],
    'n_estimators': [rand.best_params_['n_estimators'] - 200, rand.best_params_['n_estimators'] - 100, 
                     rand.best_params_['n_estimators'], 
                     rand.best_params_['n_estimators'] + 100, rand.best_params_['n_estimators'] + 200]
}

print(param_grid)

{'criterion': ['gini'], 'max_depth': [670], 'max_features': ['auto'], 'min_samples_leaf': [8, 10, 12], 'min_samples_split': [1, 2, 3, 4, 5], 'n_estimators': [0, 100, 200, 300, 400]}


In [33]:
rf = RandomForestClassifier()
grid = GridSearchCV(rf,param_grid,cv=10,verbose=2)
grid.fit(X_train,y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=1, n_estimators=0; total time=   0.0s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=1, n_estimators=0; total time=   0.0s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=1, n_estimators=0; total time=   0.0s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=1, n_estimators=0; total time=   0.0s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=1, n_estimators=0; total time=   0.0s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=1, n_estimators=0; total time=   0.0s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=1, n_estimators=0; total time=

[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=670, m

[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=3, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=3, n_estimators=200; total time=   0.5s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=3, n_estimators=200; total time=   0.3s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=3, n_estimators=200; total time=   0.5s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=3, n_estimators=200; total time=   0.3s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=3, n_estimators=200; total time=   0.4s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=3, n_estimators=200; total time=   0.3s
[CV] END criterion=gini, max_depth=670, m

[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=4, n_estimators=200; total time=   0.8s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=4, n_estimators=300; total time=   0.6s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=4, n_estimators=300; total time=   0.7s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=4, n_estimators=300; total time=   0.7s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=4, n_estimators=300; total time=   0.5s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=4, n_estimators=300; total time=   0.8s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=4, n_estimators=300; total time=   0.9s
[CV] END criterion=gini, max_depth=670, m

[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=300; total time=   0.6s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=400; total time=   0.8s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=400; total time=   0.8s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=400; total time=   0.6s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END criterion=gini, max_depth=670, m

[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=1, n_estimators=400; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=0; total time=   0.0s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=0; total time=   0.0s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=0; total time=   0.0s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=0; total time=   0.0s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=0; total time=   0.0s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=0; total time=   0.0s
[CV] END criterion=gini, max_depth=670, max_fe

[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=3, n_estimators=100; total time=   0.2s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=3, n_estimators=100; total time=   0.3s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=3, n_estimators=100; total time=   0.2s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=3, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=3, n_estimators=100; total time=   0.2s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=3, n_estimators=100; total time=   0.3s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=3, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth

[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=4, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=4, n_estimators=200; total time=   0.3s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=4, n_estimators=200; total time=   0.4s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=4, n_estimators=200; total time=   0.5s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=4, n_estimators=200; total time=   0.3s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=4, n_estimators=200; total time=   0.4s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=4, n_estimators=200; total time=   0.2s
[CV] END criterion=gini, max_depth

[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=300; total time=   0.7s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=300; total time=   0.8s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=300; total time=   0.6s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=300; total time=   0.5s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=300; total time=   0.5s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=10, min_samples_split=5, n_estimators=300; total time=   0.8s
[CV] END criterion=gini, max_depth

[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=1, n_estimators=400; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=1, n_estimators=400; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=1, n_estimators=400; total time=   0.2s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=1, n_estimators=400; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=1, n_estimators=400; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=1, n_estimators=400; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=1, n_estimators=400; total time=   0.1s
[CV] END criterion=gini, max_depth

[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=3, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=3, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=3, n_estimators=100; total time=   0.2s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=3, n_estimators=100; total time=   0.2s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=3, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=3, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=3, n_estimators=100; total time=   0.1s
[CV] END criterion=gini, max_depth

[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=4, n_estimators=200; total time=   0.4s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=4, n_estimators=200; total time=   0.3s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=4, n_estimators=200; total time=   0.3s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=4, n_estimators=200; total time=   0.3s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=4, n_estimators=200; total time=   0.5s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=4, n_estimators=200; total time=   0.3s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=4, n_estimators=200; total time=   0.4s
[CV] END criterion=gini, max_depth

[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=5, n_estimators=300; total time=   0.6s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=5, n_estimators=300; total time=   0.7s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=5, n_estimators=300; total time=   0.7s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=5, n_estimators=300; total time=   0.7s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=5, n_estimators=300; total time=   0.6s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=5, n_estimators=300; total time=   0.6s
[CV] END criterion=gini, max_depth=670, max_features=auto, min_samples_leaf=12, min_samples_split=5, n_estimators=300; total time=   0.6s
[CV] END criterion=gini, max_depth

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini'], 'max_depth': [670],
                         'max_features': ['auto'],
                         'min_samples_leaf': [8, 10, 12],
                         'min_samples_split': [1, 2, 3, 4, 5],
                         'n_estimators': [0, 100, 200, 300, 400]},
             verbose=2)

In [34]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 670,
 'max_features': 'auto',
 'min_samples_leaf': 12,
 'min_samples_split': 2,
 'n_estimators': 100}

In [35]:
grid.best_score_

0.7653622421998942

In [36]:
grid.best_estimator_

RandomForestClassifier(max_depth=670, min_samples_leaf=12)

In [37]:
y_pred = grid.predict(X_test)
print("Accuracy_score:\n",accuracy_score(y_test,y_pred))
print("confusion matrix:\n",confusion_matrix(y_test,y_pred))
print("classification report:\n",classification_report(y_test,y_pred))

Accuracy_score:
 0.8311688311688312
confusion matrix:
 [[98  9]
 [17 30]]
classification report:
               precision    recall  f1-score   support

           0       0.85      0.92      0.88       107
           1       0.77      0.64      0.70        47

    accuracy                           0.83       154
   macro avg       0.81      0.78      0.79       154
weighted avg       0.83      0.83      0.83       154



### Automated Hyperparameter Tuning

Automated Hyperparameter Tuning can be done by using techniques such as

- Bayesian Optimization
- Gradient Descent
- Evolutionary Algorithms

### Bayesian Optimization

Bayesian optimization uses probability to find the minimum of a function. The final aim is to find the input value to a function which can gives us the lowest possible output value.It usually performs better than random,grid and manual search providing better performance in the testing phase and reduced optimization time. In Hyperopt, Bayesian Optimization can be implemented giving 3 three main parameters to the function fmin.

- Objective Function = defines the loss function to minimize.
- Domain Space = defines the range of input values to test (in Bayesian Optimization this space creates a probability distribution for each of the used Hyperparameters).
- Optimization Algorithm = defines the search algorithm to use to select the best input values to use in each new iteration.

In [38]:
! pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Installing collected packages: py4j, hyperopt
Successfully installed hyperopt-0.2.7 py4j-0.10.9.7


In [39]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [40]:
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

In [41]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x262cf9d4340>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x262cf9d44f0>,
 'max_features': <hyperopt.pyll.base.Apply at 0x262cf9d4610>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x262cf9d47f0>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x262cf9d4910>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x262cf9d4a00>}

In [44]:
from sklearn.model_selection import cross_val_score

In [45]:
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = cross_val_score(model, X_train, y_train, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [46]:
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

100%|███████████████████████████████████████████████| 80/80 [10:07<00:00,  7.60s/trial, best loss: -0.7638544582167134]


{'criterion': 1,
 'max_depth': 1100.0,
 'max_features': 0,
 'min_samples_leaf': 0.057595545016498564,
 'min_samples_split': 0.048516623331539804,
 'n_estimators': 1}

In [None]:
best["criterion"]

In [47]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200,5:1300,6:1500}


print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

gini
auto
50


In [48]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = best['min_samples_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(X_train,y_train)
predictionforest = trainedforest.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(accuracy_score(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc5 = accuracy_score(y_test,predictionforest)

[[99  8]
 [24 23]]
0.7922077922077922
              precision    recall  f1-score   support

           0       0.80      0.93      0.86       107
           1       0.74      0.49      0.59        47

    accuracy                           0.79       154
   macro avg       0.77      0.71      0.73       154
weighted avg       0.79      0.79      0.78       154



### Genetic Algorithms

Genetic Algorithms tries to apply natural selection mechanisms to Machine Learning contexts.

Let's imagine we create a population of N Machine Learning models with some predifined Hyperparameters. We can then calculate the accuracy of each model and decide to keep just half of the models (the ones that performs best). We can now generate some offsprings having similar Hyperparameters to the ones of the best models so that go get again a population of N models. At this point we can again calculate the accuracy of each model and repeat the cycle for a defined number of generations. In this way, just the best models will survive at the end of the process.

In [49]:
params = {
 # Number of trees in random forest
"n_estimators": [int(x) for x in np.linspace(200,2000,10)],
 # Number of features to consider at every split
"max_features" : ["auto","sqrt","log2"],
 # Maximum number of levels in tree
"max_depth" : [int(x) for x in np.linspace(10,1000,10)],
 # Minimum number of samples required to split a node
"min_samples_split": [1,3,4,5,7,9],
 # Minimum number of samples required at each leaf node
"min_samples_leaf" : [1,2,4,6,8],
"criterion":["entropy","gini"]
    }

In [50]:
params

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000],
 'min_samples_split': [1, 3, 4, 5, 7, 9],
 'min_samples_leaf': [1, 2, 4, 6, 8],
 'criterion': ['entropy', 'gini']}

In [51]:
! pip install tpot

Collecting tpot
  Downloading TPOT-0.11.7-py3-none-any.whl (87 kB)
Collecting stopit>=1.1.1
  Downloading stopit-1.1.2.tar.gz (18 kB)
Collecting deap>=1.2
  Downloading deap-1.3.3-cp38-cp38-win_amd64.whl (109 kB)
Collecting update-checker>=0.16
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Building wheels for collected packages: stopit
  Building wheel for stopit (setup.py): started
  Building wheel for stopit (setup.py): finished with status 'done'
  Created wheel for stopit: filename=stopit-1.1.2-py3-none-any.whl size=11959 sha256=438d30e909428e3e49cc8b81c2781f2f670c8ae0f7aa6c234f6a3fa5f596283c
  Stored in directory: c:\users\saura\appdata\local\pip\cache\wheels\a8\bb\8f\6b9328d23c2dcedbfeb8498b9f650d55d463089e3b8fc0bfb2
Successfully built stopit
Installing collected packages: stopit, deap, update-checker, tpot
Successfully installed deap-1.3.3 stopit-1.1.2 tpot-0.11.7 update-checker-0.18.0


In [53]:
from tpot import TPOTClassifier


tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': params}, 
                                 cv = 4, scoring = 'accuracy')
tpot_classifier.fit(X_train,y_train)

HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=84.0), HTML(value='')))


Generation 1 - Current best internal CV score: 0.7670613700025465

Generation 2 - Current best internal CV score: 0.7670613700025465

Generation 3 - Current best internal CV score: 0.7670613700025465

Generation 4 - Current best internal CV score: 0.7670613700025465

Generation 5 - Current best internal CV score: 0.7670613700025465

Best pipeline: RandomForestClassifier(RandomForestClassifier(input_matrix, criterion=entropy, max_depth=560, max_features=sqrt, min_samples_leaf=8, min_samples_split=7, n_estimators=200), criterion=entropy, max_depth=670, max_features=log2, min_samples_leaf=4, min_samples_split=3, n_estimators=1600)


TPOTClassifier(config_dict={'sklearn.ensemble.RandomForestClassifier': {'criterion': ['entropy',
                                                                                      'gini'],
                                                                        'max_depth': [10,
                                                                                      120,
                                                                                      230,
                                                                                      340,
                                                                                      450,
                                                                                      560,
                                                                                      670,
                                                                                      780,
                                                                                 

In [54]:
accuracy = tpot_classifier.score(X_test, y_test)
print(accuracy)

0.8376623376623377


### Optimize hyperparameters of the model using Optuna

The hyperparameters of the above algorithm are n_estimators and max_depth for which we can try different values to see if the model accuracy can be improved. The objective function is modified to accept a trial object. This trial has several methods for sampling hyperparameters. We create a study to run the hyperparameter optimization and finally read the best hyperparameters.

In [55]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
Collecting importlib-resources; python_version < "3.9"
  Using cached importlib_resources-5.12.0-py3-none-any.whl (36 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
Collecting typing-extensions>=4
  Using cached typing_extensions-4.5.0-py3-none-any.whl (27 kB)
Installing collected packages: cmaes, colorlog, importlib-resources, Mako, typing-extensions, alembic, optuna
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 3.7.4.3
    Uninstalling typing-extensions-3.7.4.3:
      Successfully uninstalled typing-extensions-3.7.4.3
Successfully installed Mako-1.2.4 alembic-1.10.2 cmaes-0.9.1 colorlog-6.7.0 importlib-resources-5.1

In [56]:
import optuna
import sklearn.svm
def objective(trial):

    classifier = trial.suggest_categorical('classifier', ['RandomForest', 'SVC'])
    
    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 200, 2000,10)
        max_depth = int(trial.suggest_float('max_depth', 10, 100, log=True))

        clf = sklearn.ensemble.RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth)
    else:
        c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
        
        clf = sklearn.svm.SVC(C=c, gamma='auto')

    return sklearn.model_selection.cross_val_score(
        clf,X_train,y_train, n_jobs=-1, cv=3).mean()

In [57]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2023-03-24 18:30:28,578][0m A new study created in memory with name: no-name-ce306800-fdfa-46e6-a404-5a9423a8fbfe[0m
[32m[I 2023-03-24 18:30:37,490][0m Trial 0 finished with value: 0.7475450342738722 and parameters: {'classifier': 'RandomForest', 'n_estimators': 810, 'max_depth': 69.66562030071431}. Best is trial 0 with value: 0.7475450342738722.[0m
[32m[I 2023-03-24 18:30:44,884][0m Trial 1 finished with value: 0.7491790212019768 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1100, 'max_depth': 93.92960304318582}. Best is trial 1 with value: 0.7491790212019768.[0m
[32m[I 2023-03-24 18:30:49,823][0m Trial 2 finished with value: 0.7556830862426271 and parameters: {'classifier': 'RandomForest', 'n_estimators': 570, 'max_depth': 21.17646430467971}. Best is trial 2 with value: 0.7556830862426271.[0m
[32m[I 2023-03-24 18:30:53,870][0m Trial 3 finished with value: 0.7475370636059302 and parameters: {'classifier': 'RandomForest', 'n_estimators': 840, 'max_

[32m[I 2023-03-24 18:32:42,417][0m Trial 35 finished with value: 0.7540650406504065 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1920, 'max_depth': 50.511503302987784}. Best is trial 15 with value: 0.7556910569105691.[0m
[32m[I 2023-03-24 18:32:42,551][0m Trial 36 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', 'svc_c': 116205.19094245274}. Best is trial 15 with value: 0.7556910569105691.[0m
[32m[I 2023-03-24 18:32:46,909][0m Trial 37 finished with value: 0.7491790212019768 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1080, 'max_depth': 64.68770810696059}. Best is trial 15 with value: 0.7556910569105691.[0m
[32m[I 2023-03-24 18:32:47,066][0m Trial 38 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', 'svc_c': 1045.9902926488496}. Best is trial 15 with value: 0.7556910569105691.[0m
[32m[I 2023-03-24 18:32:52,395][0m Trial 39 finished with value: 0.7475450342738722 and parameters: {'c

[32m[I 2023-03-24 18:35:53,255][0m Trial 70 finished with value: 0.7459269886816515 and parameters: {'classifier': 'RandomForest', 'n_estimators': 540, 'max_depth': 55.61555539999491}. Best is trial 66 with value: 0.7573250438386737.[0m
[32m[I 2023-03-24 18:35:56,306][0m Trial 71 finished with value: 0.7524469950581859 and parameters: {'classifier': 'RandomForest', 'n_estimators': 850, 'max_depth': 50.60068795937696}. Best is trial 66 with value: 0.7573250438386737.[0m
[32m[I 2023-03-24 18:35:59,357][0m Trial 72 finished with value: 0.7491790212019768 and parameters: {'classifier': 'RandomForest', 'n_estimators': 780, 'max_depth': 44.87849204807528}. Best is trial 66 with value: 0.7573250438386737.[0m
[32m[I 2023-03-24 18:36:03,682][0m Trial 73 finished with value: 0.7524310537223019 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1160, 'max_depth': 64.83194082173222}. Best is trial 66 with value: 0.7573250438386737.[0m
[32m[I 2023-03-24 18:36:08,772][0m Tr

Accuracy: 0.7573250438386737
Best hyperparameters: {'classifier': 'RandomForest', 'n_estimators': 620, 'max_depth': 43.5998033941483}


In [58]:
trial

FrozenTrial(number=66, state=TrialState.COMPLETE, values=[0.7573250438386737], datetime_start=datetime.datetime(2023, 3, 24, 18, 35, 44, 547324), datetime_complete=datetime.datetime(2023, 3, 24, 18, 35, 46, 904922), params={'classifier': 'RandomForest', 'n_estimators': 620, 'max_depth': 43.5998033941483}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('RandomForest', 'SVC')), 'n_estimators': IntDistribution(high=2000, log=False, low=200, step=10), 'max_depth': FloatDistribution(high=100.0, log=True, low=10.0, step=None)}, trial_id=66, value=None)

In [59]:
study.best_params

{'classifier': 'RandomForest',
 'n_estimators': 620,
 'max_depth': 43.5998033941483}

In [60]:
rf=RandomForestClassifier(n_estimators=330,max_depth=30)
rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=30, n_estimators=330)

In [61]:
y_pred=rf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[95 12]
 [17 30]]
0.8116883116883117
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       107
           1       0.71      0.64      0.67        47

    accuracy                           0.81       154
   macro avg       0.78      0.76      0.77       154
weighted avg       0.81      0.81      0.81       154

