# Distributed hyperparameter optimization (HPO) with Ray

##### Training a simple XGBoost classifier without Ray

In [1]:
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb


def train_breast_cancer(config):
    """Train an XGBoost classifier on scikit-learn's breast cancer dataset.

    Args:
        config: Parameter dict handed unchanged to ``xgb.train``. Callers
            that read ``results["eval"]["error"]`` must include ``"error"``
            in its ``eval_metric`` entry.

    Returns:
        The ``evals_result`` dict populated by ``xgb.train``. Metrics for the
        held-out split are stored under the ``"eval"`` key.
    """
    # Load dataset
    data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
    # Split into train and test set (25% held out).
    # NOTE: no random_state is passed, so the split -- and therefore the
    # reported metrics -- can differ from one call to the next.
    train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25)
    # Build input matrices for XGBoost
    train_set = xgb.DMatrix(train_x, label=train_y)
    test_set = xgb.DMatrix(test_x, label=test_y)
    # Train the classifier. Only the recorded metrics are needed, so the
    # returned booster is intentionally not kept.
    results = {}
    xgb.train(
        config,
        train_set,
        evals=[(test_set, "eval")],
        evals_result=results,
        verbose_eval=False,
    )
    return results


# Baseline run: only the objective and the tracked metrics are specified.
results = train_breast_cancer(
    dict(objective="binary:logistic", eval_metric=["logloss", "error"])
)
# Accuracy is one minus the last value in the recorded error history.
final_error = results["eval"]["error"][-1]
accuracy = 1.0 - final_error
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9301


##### Training with hand-picked hyperparameters, still without Ray

In [2]:
# Hand-picked hyperparameters on top of the objective and tracked metrics.
config = dict(
    objective="binary:logistic",
    eval_metric=["logloss", "error"],
    max_depth=2,
    min_child_weight=0,
    subsample=0.8,
    eta=0.2,
)
results = train_breast_cancer(config)
# Accuracy is one minus the last value in the recorded error history.
final_error = results["eval"]["error"][-1]
accuracy = 1.0 - final_error
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9301


### What distributed HPO looks like with Ray

In [3]:
import sklearn.datasets
import sklearn.metrics

from ray import train, tune

In [4]:
def train_breast_cancer(config):
    """Trainable for Ray Tune: fit XGBoost once and report the final accuracy.

    Args:
        config: Parameter dict passed unchanged to ``xgb.train``. It must
            request the ``"error"`` eval metric, which is read back below.

    The function returns nothing. Its result is delivered through
    ``train.report`` as ``mean_accuracy`` together with ``done=True``.
    """
    features, targets = sklearn.datasets.load_breast_cancer(return_X_y=True)

    # Hold out a quarter of the samples for evaluation.
    x_train, x_eval, y_train, y_eval = train_test_split(
        features, targets, test_size=0.25
    )

    dtrain = xgb.DMatrix(x_train, label=y_train)
    deval = xgb.DMatrix(x_eval, label=y_eval)

    # xgb.train fills this dict with the metric history of the "eval" split.
    history = {}
    xgb.train(
        config,
        dtrain,
        evals=[(deval, "eval")],
        evals_result=history,
        verbose_eval=False,
    )

    # Turn the last recorded error value into an accuracy and report it.
    final_error = history["eval"]["error"][-1]
    train.report({"mean_accuracy": 1.0 - final_error, "done": True})

In [5]:
# Search space: fixed XGBoost settings mixed with Tune sampling domains.
config = dict(
    # Constants shared by every trial.
    objective="binary:logistic",
    eval_metric=["logloss", "error"],
    # Domains sampled once per trial.
    max_depth=tune.randint(1, 9),
    min_child_weight=tune.choice([1, 2, 3]),
    subsample=tune.uniform(0.5, 1.0),
    eta=tune.loguniform(1e-4, 1e-1),
)

In [6]:
# Ten trials of `train_breast_cancer`, each drawing its parameters from `config`.
tuner = tune.Tuner(
    train_breast_cancer,
    param_space=config,
    tune_config=tune.TuneConfig(num_samples=10),
)

In [7]:
import warnings

import ray

# Drop any connection left over from an earlier run of this cell so that
# re-running it reconnects cleanly. This is best-effort: a failure here is
# reported but must not prevent the connection attempt below.
try:
    ray.shutdown()
except Exception as exc:  # narrowed from a bare `except` so Ctrl-C still works
    warnings.warn(f"ray.shutdown() failed, continuing anyway: {exc!r}")

# Attach to the already-running Ray cluster instead of starting a local one.
ray.init(address="auto")

2024-04-24 03:15:01,053	INFO worker.py:1540 -- Connecting to existing Ray cluster at address: 10.244.21.11:6379...
2024-04-24 03:15:03,161	INFO worker.py:1715 -- Connected to Ray cluster. View the dashboard at [1m[32mhttp://10.244.21.11:8265 [39m[22m
[2024-04-24 03:15:03,164 I 8619 8619] logging.cc:230: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to -1


0,1
Python version:,3.8.10
Ray version:,2.9.3
Dashboard:,http://10.244.21.11:8265


[33m(raylet, ip=10.244.22.11)[0m [2024-04-24 03:15:16,685 I 2942 2942] logging.cc:230: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to -1
[33m(raylet, ip=10.244.24.11)[0m [2024-04-24 03:15:46,010 I 2996 2996] logging.cc:230: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to -1
[36m(train_breast_cancer pid=3029, ip=10.244.23.11)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/artifacts/earlystopping/train_breast_cancer_2024-04-24_03-15-45/train_breast_cancer_f0d16_00003_3_eta=0.0930,max_depth=6,min_child_weight=3,subsample=0.7415_2024-04-24_03-15-45/checkpoint_000000)
[33m(raylet)[0m [2024-04-24 03:15:50,802 I 8919 8919] logging.cc:230: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to -1[32m [repeated 13x across cluster][0m
[33m(raylet, ip=10.244.24.11)[0m [2024-04-24 03:16:12,529 I 3143 3143] logging.cc:230: Set ray log level from environment variable RAY_BACKEND_LOG_LEVEL to -1
[36m

In [8]:
# Run the trials of the `tuner` configured earlier and keep the returned results.
results = tuner.fit()

0,1
Current time:,2024-04-24 03:15:20
Running for:,00:00:04.70
Memory:,5.1/31.0 GiB

Trial name,status,loc,eta,max_depth,min_child_weight,subsample,acc,iter,total time (s)
train_breast_cancer_de22c_00000,TERMINATED,10.244.23.11:2945,0.000278238,4,2,0.776999,0.713287,1,0.0205131
train_breast_cancer_de22c_00001,TERMINATED,10.244.23.11:2945,0.00020188,8,2,0.848902,0.608392,1,0.0217369
train_breast_cancer_de22c_00002,TERMINATED,10.244.23.11:2945,0.0041078,3,3,0.544379,0.615385,1,0.0171645
train_breast_cancer_de22c_00003,TERMINATED,10.244.23.11:2945,0.000183119,1,3,0.578556,0.615385,1,0.013093
train_breast_cancer_de22c_00004,TERMINATED,10.244.23.11:2945,0.00445188,7,3,0.621606,0.629371,1,0.0188844
train_breast_cancer_de22c_00005,TERMINATED,10.244.23.11:2945,0.000303566,7,2,0.961211,0.594406,1,0.0645039
train_breast_cancer_de22c_00006,TERMINATED,10.244.23.11:2945,0.00225683,7,1,0.889144,0.622378,1,0.0280683
train_breast_cancer_de22c_00007,TERMINATED,10.244.23.11:2945,0.000106273,8,1,0.776135,0.594406,1,0.0245171
train_breast_cancer_de22c_00008,TERMINATED,10.244.23.11:2945,0.00501698,6,1,0.619681,0.608392,1,0.0225742
train_breast_cancer_de22c_00009,TERMINATED,10.244.23.11:2945,0.000588589,2,2,0.972956,0.594406,1,0.015151


2024-04-24 03:15:20,381	ERROR worker.py:405 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2024-04-24 03:15:20,407	INFO tune.py:1042 -- Total run time: 6.15 seconds (4.69 seconds for the tuning loop).


### Early stopping in HPO

In [9]:
import sklearn.datasets
import sklearn.metrics
from ray.tune.schedulers import ASHAScheduler
from sklearn.model_selection import train_test_split
import xgboost as xgb

from ray import tune
from ray.tune.integration.xgboost import TuneReportCheckpointCallback

In [10]:
def train_breast_cancer(config: dict):
    """Trainable for Ray Tune that reports through an XGBoost callback.

    Args:
        config: Parameter dict passed unchanged to ``xgb.train``.

    Nothing is returned. Reporting and checkpointing are delegated to the
    ``TuneReportCheckpointCallback`` attached to the training call.
    """
    features, targets = sklearn.datasets.load_breast_cancer(return_X_y=True)

    # Hold out a quarter of the samples for evaluation.
    x_train, x_eval, y_train, y_eval = train_test_split(
        features, targets, test_size=0.25
    )

    dtrain = xgb.DMatrix(x_train, label=y_train)
    deval = xgb.DMatrix(x_eval, label=y_eval)

    # The callback is configured with frequency=1. The evaluation split is
    # registered as "eval", which is the prefix of the metric names used
    # elsewhere in this notebook ("eval-logloss", "eval-error").
    tune_callback = TuneReportCheckpointCallback(frequency=1)
    xgb.train(
        config,
        dtrain,
        evals=[(deval, "eval")],
        verbose_eval=False,
        callbacks=[tune_callback],
    )

In [11]:
def get_best_model_checkpoint(results):
    """Load the model of the best trial and print its config and accuracy.

    Args:
        results: Object returned by ``tuner.fit()``. ``get_best_result()`` is
            called without arguments, so the metric and mode must already be
            known to it (presumably via ``TuneConfig`` -- verify for tuners
            created without ``metric``/``mode``).

    Returns:
        The model restored from the best trial's checkpoint.
    """
    winner = results.get_best_result()

    # Restore the model through the callback's own helper.
    booster = TuneReportCheckpointCallback.get_model(winner.checkpoint)

    # "eval-error" is an error rate, so its complement is the accuracy.
    best_accuracy = 1.0 - winner.metrics["eval-error"]
    print(f"Best model parameters: {winner.config}")
    print(f"Best model total accuracy: {best_accuracy:.4f}")
    return booster


In [12]:
def tune_xgboost(smoke_test=False):
    """Run the early-stopping hyperparameter search and return its results.

    Args:
        smoke_test: When true, only a single trial is sampled instead of ten.

    Returns:
        Whatever ``tuner.fit()`` returns for the configured search.
    """
    # Constants and sampling domains can live side by side in the space.
    param_space = dict(
        objective="binary:logistic",
        eval_metric=["logloss", "error"],
        max_depth=tune.randint(1, 9),
        min_child_weight=tune.choice([1, 2, 3]),
        subsample=tune.uniform(0.5, 1.0),
        eta=tune.loguniform(1e-4, 1e-1),
    )

    # ASHA stops poorly performing trials early. max_t=10 caps a trial at
    # 10 training iterations.
    asha = ASHAScheduler(max_t=10, grace_period=1, reduction_factor=2)

    if smoke_test:
        trial_count = 1
    else:
        trial_count = 10

    search_settings = tune.TuneConfig(
        metric="eval-logloss",
        mode="min",
        scheduler=asha,
        num_samples=trial_count,
    )
    # Trial artifacts are written below this directory.
    run_settings = ray.train.RunConfig(storage_path="/home/artifacts/earlystopping/")

    tuner = tune.Tuner(
        train_breast_cancer,
        tune_config=search_settings,
        run_config=run_settings,
        param_space=param_space,
    )
    return tuner.fit()

In [13]:
# Full search: smoke_test=False makes tune_xgboost sample 10 trials instead of 1.
results = tune_xgboost(smoke_test=False)

0,1
Current time:,2024-04-24 03:15:56
Running for:,00:00:10.59
Memory:,5.4/31.0 GiB

Trial name,status,loc,eta,max_depth,min_child_weight,subsample,iter,total time (s),eval-logloss,eval-error
train_breast_cancer_f0d16_00000,TERMINATED,10.244.24.11:3058,0.0166545,3,3,0.679804,1,0.479793,0.65388,0.384615
train_breast_cancer_f0d16_00001,TERMINATED,10.244.22.11:3085,0.00338162,2,1,0.712189,1,0.421662,0.667979,0.391608
train_breast_cancer_f0d16_00002,TERMINATED,10.244.22.11:3086,0.000129086,2,1,0.885581,1,0.499266,0.67863,0.405594
train_breast_cancer_f0d16_00003,TERMINATED,10.244.23.11:3029,0.0929771,6,3,0.74152,10,3.52023,0.295469,0.0629371
train_breast_cancer_f0d16_00004,TERMINATED,10.244.20.11:3103,0.0783609,4,2,0.74078,2,0.800395,0.55575,0.384615
train_breast_cancer_f0d16_00005,TERMINATED,10.244.22.11:3085,0.0276513,1,2,0.895466,1,0.338289,0.663348,0.412587
train_breast_cancer_f0d16_00006,TERMINATED,10.244.24.11:3058,0.025981,2,1,0.577579,2,0.731913,0.615517,0.356643
train_breast_cancer_f0d16_00007,TERMINATED,10.244.22.11:3086,0.00723173,2,3,0.756581,2,0.699378,0.62837,0.321678
train_breast_cancer_f0d16_00008,TERMINATED,10.244.24.11:3093,0.00034739,8,1,0.627257,1,0.404846,0.696648,0.433566
train_breast_cancer_f0d16_00009,TERMINATED,10.244.22.11:3085,0.0197735,6,3,0.525728,2,0.72703,0.617642,0.335664


2024-04-24 03:15:56,222	INFO tune.py:1042 -- Total run time: 11.07 seconds (7.47 seconds for the tuning loop).


### Using fractional GPUs

In [16]:
# Same search space as before, with tree construction moved to the GPU.
# NOTE(review): "gpu_hist" is reported as deprecated in XGBoost 2.0+ in favour
# of tree_method="hist" plus device="cuda" -- confirm against the installed
# XGBoost version before changing it.
config = dict(
    objective="binary:logistic",
    eval_metric=["logloss", "error"],
    tree_method="gpu_hist",
    max_depth=tune.randint(1, 9),
    min_child_weight=tune.choice([1, 2, 3]),
    subsample=tune.uniform(0.5, 1.0),
    eta=tune.loguniform(1e-4, 1e-1),
)

# Each trial requests one CPU and a tenth of a GPU.
tuner = tune.Tuner(
    tune.with_resources(train_breast_cancer, resources={"cpu": 1, "gpu": 0.1}),
    param_space=config,
    tune_config=tune.TuneConfig(num_samples=10),
    run_config=ray.train.RunConfig(storage_path="/home/artifacts/gpu"),
)
results = tuner.fit()

0,1
Current time:,2024-04-24 03:17:28
Running for:,00:00:18.70
Memory:,5.3/31.0 GiB

Trial name,status,loc,eta,max_depth,min_child_weight,subsample,iter,total time (s),eval-logloss,eval-error
train_breast_cancer_232f5_00000,TERMINATED,10.244.23.11:3343,0.0772753,4,1,0.733794,10,4.21903,0.314298,0.0559441
train_breast_cancer_232f5_00001,TERMINATED,10.244.24.11:3348,0.0161289,1,3,0.706106,10,4.79754,0.553304,0.34965
train_breast_cancer_232f5_00002,TERMINATED,10.244.20.11:3319,0.0428752,6,1,0.671875,10,4.4543,0.423445,0.0699301
train_breast_cancer_232f5_00003,TERMINATED,10.244.20.11:3320,0.000303146,3,2,0.674874,10,4.34583,0.647216,0.34965
train_breast_cancer_232f5_00004,TERMINATED,10.244.22.11:3340,0.000415614,2,1,0.922785,10,4.4214,0.663412,0.384615
train_breast_cancer_232f5_00005,TERMINATED,10.244.23.11:3374,0.0692545,7,1,0.832347,10,4.02456,0.354355,0.048951
train_breast_cancer_232f5_00006,TERMINATED,10.244.22.11:3341,0.00235593,4,3,0.982298,10,4.28943,0.639456,0.363636
train_breast_cancer_232f5_00007,TERMINATED,10.244.24.11:3380,0.0519934,4,3,0.876504,10,4.67109,0.404876,0.0909091
train_breast_cancer_232f5_00008,TERMINATED,10.244.23.11:3343,0.000477425,3,3,0.866936,10,3.55776,0.652177,0.363636
train_breast_cancer_232f5_00009,TERMINATED,10.244.23.11:3374,0.0211406,3,1,0.519995,10,3.54654,0.527726,0.384615


2024-04-24 03:17:28,828	INFO tune.py:1042 -- Total run time: 19.31 seconds (15.80 seconds for the tuning loop).


### Using full GPUs

In [14]:
# Print the resources Ray reports for the connected cluster.
print(ray.cluster_resources())

{'CPU': 40.0, 'custom_llm_serving_label': 4.0, 'GPU': 5.0, 'memory': 112660925646.0, 'object_store_memory': 49694317362.0, 'accelerator_type:A10G': 5.0, 'node:10.244.24.11': 1.0, 'node:10.244.21.11': 1.0, 'node:__internal_head__': 1.0, 'node:10.244.23.11': 1.0, 'node:10.244.22.11': 1.0, 'node:10.244.20.11': 1.0}


In [15]:
# Same search space as before, with tree construction moved to the GPU.
# NOTE(review): "gpu_hist" is reported as deprecated in XGBoost 2.0+ in favour
# of tree_method="hist" plus device="cuda" -- confirm against the installed
# XGBoost version before changing it.
config = dict(
    objective="binary:logistic",
    eval_metric=["logloss", "error"],
    tree_method="gpu_hist",
    max_depth=tune.randint(1, 9),
    min_child_weight=tune.choice([1, 2, 3]),
    subsample=tune.uniform(0.5, 1.0),
    eta=tune.loguniform(1e-4, 1e-1),
)

# Each trial requests eight CPUs and one whole GPU.
tuner = tune.Tuner(
    tune.with_resources(train_breast_cancer, resources={"cpu": 8, "gpu": 1}),
    param_space=config,
    tune_config=tune.TuneConfig(num_samples=10),
    run_config=ray.train.RunConfig(storage_path="/home/artifacts/gpufull"),
)
results = tuner.fit()

0,1
Current time:,2024-04-24 03:16:28
Running for:,00:00:16.11
Memory:,5.6/31.0 GiB

Trial name,status,loc,eta,max_depth,min_child_weight,subsample,iter,total time (s),eval-logloss,eval-error
train_breast_cancer_00a1d_00000,TERMINATED,10.244.22.11:3166,0.0179594,3,1,0.984443,10,4.0411,0.549803,0.41958
train_breast_cancer_00a1d_00001,TERMINATED,10.244.20.11:3178,0.00421678,5,1,0.729926,10,4.24851,0.6206,0.356643
train_breast_cancer_00a1d_00002,TERMINATED,10.244.23.11:3171,0.00939542,2,2,0.78684,10,4.48187,0.597924,0.377622
train_breast_cancer_00a1d_00003,TERMINATED,10.244.21.11:9064,0.00159404,3,1,0.894546,10,3.88173,0.629469,0.328671
train_breast_cancer_00a1d_00004,TERMINATED,10.244.24.11:3177,0.0299336,2,1,0.501463,10,4.36485,0.489318,0.111888
train_breast_cancer_00a1d_00005,TERMINATED,10.244.22.11:3166,0.000822765,2,3,0.995724,10,3.37216,0.672394,0.405594
train_breast_cancer_00a1d_00006,TERMINATED,10.244.20.11:3178,0.0553259,1,2,0.927009,10,3.78218,0.408695,0.0839161
train_breast_cancer_00a1d_00007,TERMINATED,10.244.23.11:3171,0.000513916,2,1,0.886468,10,3.84492,0.687948,0.426573
train_breast_cancer_00a1d_00008,TERMINATED,10.244.24.11:3177,0.0099043,7,2,0.607801,10,3.66883,0.627885,0.447552
train_breast_cancer_00a1d_00009,TERMINATED,10.244.21.11:9064,0.000425053,1,2,0.732263,10,3.38077,0.656703,0.370629


2024-04-24 03:16:28,266	INFO tune.py:1042 -- Total run time: 16.57 seconds (13.40 seconds for the tuning loop).
