In [42]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import mlflow

optuna.logging.set_verbosity(optuna.logging.WARNING)

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("experiment-001")

<Experiment: artifact_location='./mlruns/1', creation_time=1667238424038, experiment_id='1', last_update_time=1667238424038, lifecycle_stage='active', name='experiment-001', tags={}>

In [25]:
X, y = make_classification(
    n_samples=250, 
    n_features=10,
    n_informative=5, 
    n_redundant=3,
    random_state=42, 
    shuffle=True
)

X.shape, y.shape

((250, 10), (250,))

In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.20, 
    random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((200, 10), (50, 10), (200,), (50,))

In [29]:
%%time

with mlflow.start_run():
    mlflow.sklearn.autolog(exclusive=False)
    
    n_estimators = 50
    max_depth = 5
    
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("n_estimators", n_estimators)
    
    model = RandomForestClassifier(
        random_state=42, 
        max_depth=max_depth,
        n_estimators=n_estimators
    )
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    # y_proba = model.predict_proba(X_test)
    
    mlflow.log_dict(
        {
            "y_test": [int(x) for x in y_test],
            "y_pred": [int(x) for x in y_pred]
        }, 
        "ytest-ypred.json"
    )
    
    test_acc = accuracy_score(y_test, y_pred)
    
    test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
        y_test, 
        y_pred, 
        average='binary'
    )
    
    
    mlflow.log_metric("test_accuracy", test_acc)
    mlflow.log_metric("test_precision", test_precision)
    mlflow.log_metric("test_recall", test_recall)
    mlflow.log_metric("test_f1_score", test_f1)
    
    print("test_accuracy:", test_acc)
    print("test_precision:", test_precision)
    print("test_recall:", test_recall)
    print("test_f1_score:", test_f1)
    
    mlflow.sklearn.autolog(disable=True)

test_accuracy: 0.84
test_precision: 1.0
test_recall: 0.7037037037037037
test_f1_score: 0.8260869565217391
CPU times: total: 1.31 s
Wall time: 5.48 s


## RandomizedSearchCV

In [50]:
%%time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
SEED=42

# Number of trees in random forest
n_estimators = [int(x) for x in range(100,505,100)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator=RandomForestClassifier(), 
    param_distributions=random_grid,
    scoring="average_precision",
    random_state=SEED,
    n_iter=50,
    verbose=2,
    n_jobs=4,
    cv=5,
)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
CPU times: total: 734 ms
Wall time: 1min 4s


In [51]:
y_pred = rf_random.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)

test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    y_test, 
    y_pred, 
    average='binary'
)

print("test_accuracy:", test_acc)
print("test_precision:", test_precision)
print("test_recall:", test_recall)
print("test_f1_score:", test_f1)

test_accuracy: 0.82
test_precision: 0.875
test_recall: 0.7777777777777778
test_f1_score: 0.823529411764706


In [57]:
rf_random.cv_results_["mean_test_score"].mean()

0.9169154567638319

## Optuna

In [None]:
from sklearn import model_selection


#Step 1. Define an objective function to be maximized.
def objective(trial):   
    # Step 2. Setup values for the hyperparameters:
    rf_n_estimators = trial.suggest_int("rf_n_estimators", 10, 1000)
    rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
    classifier_obj = RandomForestClassifier(
        max_depth=rf_max_depth, 
        n_estimators=rf_n_estimators
    )

    # Step 3: Scoring method:
    score = model_selection.cross_val_score(classifier_obj, X, y, n_jobs=-1, cv=3)
    accuracy = score.mean()
    return accuracy

In [44]:
%%time

def objective(trial):
    # Number of trees in random forest
    n_estimators = trial.suggest_int(name="n_estimators", low=100, high=500, step=100)

    # Number of features to consider at every split
    max_features = trial.suggest_categorical(name="max_features", choices=['auto', 'sqrt']) 

    # Maximum number of levels in tree
    max_depth = trial.suggest_int(name="max_depth", low=10, high=110, step=20)

    # Minimum number of samples required to split a node
    min_samples_split = trial.suggest_int(name="min_samples_split", low=2, high=10, step=2)

    # Minimum number of samples required at each leaf node
    min_samples_leaf = trial.suggest_int(name="min_samples_leaf", low=1, high=4, step=1)
    
    params = {
        "n_estimators": n_estimators,
        "max_features": max_features,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf
    }
    
    model = RandomForestClassifier(random_state=SEED, **params)
    
    cv_score = cross_val_score(model, X_train, y_train, n_jobs=4, cv=5)
    mean_cv_accuracy = cv_score.mean()

    return mean_cv_accuracy

optuna.logging.set_verbosity(optuna.logging.WARNING)

study = optuna.create_study()
study.optimize(objective, n_trials=5)

CPU times: total: 15.6 ms
Wall time: 9.18 s


In [48]:
study.best_params

{'n_estimators': 500,
 'max_features': 'auto',
 'max_depth': 30,
 'min_samples_split': 2,
 'min_samples_leaf': 3}

In [49]:
best_model = RandomForestClassifier(random_state=SEED, **study.best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)

test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    y_test, 
    y_pred, 
    average='binary'
)

print("test_accuracy:", test_acc)
print("test_precision:", test_precision)
print("test_recall:", test_recall)
print("test_f1_score:", test_f1)

  warn(


test_accuracy: 0.82
test_precision: 0.9090909090909091
test_recall: 0.7407407407407407
test_f1_score: 0.8163265306122449
