In [1]:
import os
import numpy as np
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score
from sklearn import metrics
import joblib
from tqdm import tqdm
from catboost import CatBoostClassifier
import optuna

In [2]:
data = np.load('feature_dataset.npy', allow_pickle='TRUE').item()

## OPTUNA

In [3]:
#Step 1. Define an objective function to be maximized.
def objective(trial):    
    # Step 2. Setup values for the hyperparameters:
    rf_n_estimators = trial.suggest_int("rf_n_estimators", 10, 1000)
    rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
    classifier_obj = RandomForestClassifier(
                                            max_depth=rf_max_depth,
                                            n_estimators=rf_n_estimators,
                                            random_state=42
                                            )

    # Step 3: Scoring method:
    score = cross_val_score(classifier_obj, data['features'], data['labels'], n_jobs=-1, cv=KFold(shuffle=True, random_state=42))
    accuracy = score.mean()
    return accuracy

# Step 4: Running it
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)

[I 2023-07-04 13:29:45,263] A new study created in memory with name: no-name-ed76451b-fe47-449d-8123-b4255c96c933
[I 2023-07-04 13:29:51,496] Trial 0 finished with value: 0.843553981789276 and parameters: {'rf_n_estimators': 976, 'rf_max_depth': 6}. Best is trial 0 with value: 0.843553981789276.
[I 2023-07-04 13:29:53,715] Trial 1 finished with value: 0.8376644023702848 and parameters: {'rf_n_estimators': 367, 'rf_max_depth': 5}. Best is trial 0 with value: 0.843553981789276.
[I 2023-07-04 13:29:58,602] Trial 2 finished with value: 0.8528701161054102 and parameters: {'rf_n_estimators': 776, 'rf_max_depth': 7}. Best is trial 2 with value: 0.8528701161054102.
[I 2023-07-04 13:30:02,126] Trial 3 finished with value: 0.8528701161054102 and parameters: {'rf_n_estimators': 542, 'rf_max_depth': 7}. Best is trial 2 with value: 0.8528701161054102.
[I 2023-07-04 13:30:03,155] Trial 4 finished with value: 0.8666028327793034 and parameters: {'rf_n_estimators': 86, 'rf_max_depth': 29}. Best is tria

In [4]:
print(f"The best value is : \n{study.best_value}")
print(f"The best parameters are : \n{study.best_params}")

The best value is : 
0.8705244014067544
The best parameters are : 
{'rf_n_estimators': 831, 'rf_max_depth': 18}


## Making rf model

In [5]:
model = RandomForestClassifier(max_depth=18, n_estimators=831, random_state=42)
score_types = ('accuracy', 'roc_auc', 'f1')

In [6]:
result = cross_validate(model, data['features'], data['labels'],
                        cv=KFold(shuffle=True, random_state=42),
                        scoring=score_types,
                        return_estimator=True,
                        return_train_score=True,
                       )

In [7]:
for score in score_types:
    mean = np.mean(result[f'test_{score}'])
    std = np.std(result[f'test_{score}'])
    print(f'{score} = {mean:.3f} +- {std:.3f}')

accuracy = 0.871 +- 0.010
roc_auc = 0.943 +- 0.012
f1 = 0.871 +- 0.013


In [1]:
np.mean(result['test_accuracy']) < 0.7

NameError: name 'np' is not defined

## Convert a model to ONNX

In [11]:
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn

In [16]:
initial_type = [('float_input', FloatTensorType([None, 42]))]
onx = convert_sklearn(result['estimator'][0], initial_types=initial_type)

with open("rf_RBclf.onnx", "wb") as f:
    f.write(onx.SerializeToString())

## Run ONNX session

In [18]:
import onnxruntime as rt

In [22]:
sess = rt.InferenceSession("rf_RBclf.onnx", providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
pred_onx = sess.run(
    [label_name], {input_name: np.array(data['features']).astype(np.float32)})[0]
print(pred_onx)

[1 1 0 ... 0 0 0]


In [23]:
metrics.accuracy_score(data['labels'], pred_onx)

0.9730259931338892

## Probabilities

In [25]:
prob_name = sess.get_outputs()[1].name
prob_rt = sess.run([prob_name], {input_name: np.array(data['features']).astype(np.float32)})[0]


In [26]:
prob_rt[0]

{0: 0.12515044212341309, 1: 0.8748495578765869}