In [19]:
import ray
from ray import tune
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import fetch_covtype
import numpy as np

In [20]:
# Initialize Ray
ray.init()

2024-05-10 19:31:41,819	INFO worker.py:1567 -- Connecting to existing Ray cluster at address: 192.168.2.209:6379...
2024-05-10 19:31:41,832	INFO worker.py:1743 -- Connected to Ray cluster. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.8.10
Ray version:,2.10.0
Dashboard:,http://127.0.0.1:8265


In [21]:
covtype = fetch_covtype()
X, y = covtype.data, covtype.target

# select the first 4000 data points from the dataset
X_ray_put=X[:40000]
y_ray_put=y[:40000]

def train_rf(config):
    # Load dataset
    # Initialize and train RandomForestClassifier
    rf_classifier= RandomForestClassifier(**config)
    scores = cross_val_score(rf_classifier, X_ray_put, y_ray_put , cv=5)
   
    # Compute mean accuracy
    mean_accuracy = np.mean(scores)
    return {"mean_accuracy": mean_accuracy}

# Define search space
search_space = {
    "max_depth": tune.grid_search([10,50]),
    "n_estimators": tune.grid_search([200,300]),
    "ccp_alpha": tune.grid_search([0,0.2])
}

In [25]:
tuner = tune.Tuner(train_rf, param_space=search_space)  # ③
results = tuner.fit()

0,1
Current time:,2024-05-10 19:44:50
Running for:,00:02:17.29
Memory:,6.2/7.8 GiB

Trial name,status,loc,ccp_alpha,max_depth,n_estimators,acc,iter,total time (s)
train_rf_71b0f_00000,TERMINATED,192.168.2.136:60286,0.0,10,200,0.763725,1,25.4846
train_rf_71b0f_00001,TERMINATED,192.168.2.122:57879,0.2,10,200,0.54095,1,25.0622
train_rf_71b0f_00002,TERMINATED,192.168.2.122:57880,0.0,50,200,0.79645,1,44.1916
train_rf_71b0f_00003,TERMINATED,192.168.2.209:92490,0.2,50,200,0.54095,1,129.462
train_rf_71b0f_00004,TERMINATED,192.168.2.122:57881,0.0,10,300,0.7637,1,38.6575
train_rf_71b0f_00005,TERMINATED,192.168.2.209:92491,0.2,10,300,0.54095,1,68.0253
train_rf_71b0f_00006,TERMINATED,192.168.2.136:60287,0.0,50,300,0.796975,1,65.6788
train_rf_71b0f_00007,TERMINATED,192.168.2.136:60288,0.2,50,300,0.54095,1,111.778




You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
2024-05-10 19:44:50,144	INFO tune.py:1016 -- Wrote the latest version of all result files and experiment state to '/home/ubuntu/ray_results/train_rf_2024-05-10_19-42-32' in 0.0132s.
2024-05-10 19:44:50,159	INFO tune.py:1048 -- Total run time: 137.45 seconds (137.27 seconds for the tuning loop).


In [28]:
best_result = results.get_best_result(metric="mean_accuracy", mode="max")
print("Best Accuracy:", round(best_result.metrics["mean_accuracy"]*100,3),"%")
print(best_result.config)

Best Accuracy: 79.698 %
{'max_depth': 50, 'n_estimators': 300, 'ccp_alpha': 0}


In [24]:
ray.shutdown()

# default

In [40]:
from sklearn.ensemble import RandomForestClassifier
# Get the default parameters of the RandomForestClassifier
default_params = rf_default_classifier= RandomForestClassifier().get_params()

# Print out the default parameters
print("Default parameters of RandomForestClassifier:")
for param, value in default_params.items():
    print(param, ":", value)

#max_depth : None
#n_estimators : 100
#ccp_alpha:0

Default parameters of RandomForestClassifier:
bootstrap : True
ccp_alpha : 0.0
class_weight : None
criterion : gini
max_depth : None
max_features : sqrt
max_leaf_nodes : None
max_samples : None
min_impurity_decrease : 0.0
min_samples_leaf : 1
min_samples_split : 2
min_weight_fraction_leaf : 0.0
n_estimators : 100
n_jobs : None
oob_score : False
random_state : None
verbose : 0
warm_start : False


In [43]:
def train_default_rf(config):
    # Load dataset
    # Initialize and train RandomForestClassifier
    rf_classifier= RandomForestClassifier(**config)
    scores = cross_val_score(rf_classifier, X_ray_put, y_ray_put , cv=5)
   
    # Compute mean accuracy
    mean_accuracy = np.mean(scores)
    return {"mean_accuracy": mean_accuracy}


#max_depth : None
#n_estimators : 100
#ccp_alpha:0

# Define search space
default_search_space = {
    "max_depth": tune.grid_search([None]),
    "n_estimators": tune.grid_search([100]),
    "ccp_alpha": tune.grid_search([0])
}

In [42]:
tuner = tune.Tuner(train_default_rf, param_space=default_search_space)  # ③
default_results = tuner.fit()

0,1
Current time:,2024-05-10 20:10:53
Running for:,00:00:24.23
Memory:,7.0/7.8 GiB

Trial name,status,loc,ccp_alpha,max_depth,n_estimators,acc,iter,total time (s)
train_default_rf_58bd2_00000,TERMINATED,192.168.2.136:61125,0,,100,0.793725,1,21.9071


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
2024-05-10 20:10:53,217	INFO tune.py:1016 -- Wrote the latest version of all result files and experiment state to '/home/ubuntu/ray_results/train_default_rf_2024-05-10_20-10-28' in 0.0068s.
2024-05-10 20:10:53,224	INFO tune.py:1048 -- Total run time: 24.39 seconds (24.23 seconds for the tuning loop).


In [44]:
best_result = default_results.get_best_result(metric="mean_accuracy", mode="max")
print("Best Accuracy:", round(best_result.metrics["mean_accuracy"]*100,3),"%")
print(best_result.config)

Best Accuracy: 79.372 %
{'max_depth': None, 'n_estimators': 100, 'ccp_alpha': 0}
