# Scaling up hyperparameter optimization with multi-GPU workload on Kubernetes

In [1]:
!pip install dask_kubernetes optuna

Collecting dask_kubernetes
  Downloading dask_kubernetes-2024.5.0-py3-none-any.whl.metadata (4.2 kB)
Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting kopf>=1.35.3 (from dask_kubernetes)
  Downloading kopf-1.37.2-py3-none-any.whl.metadata (9.7 kB)
Collecting kr8s==0.14.* (from dask_kubernetes)
  Downloading kr8s-0.14.4-py3-none-any.whl.metadata (6.7 kB)
Collecting kubernetes-asyncio>=12.0.1 (from dask_kubernetes)
  Downloading kubernetes_asyncio-29.0.0-py3-none-any.whl.metadata (1.3 kB)
Collecting kubernetes>=12.0.1 (from dask_kubernetes)
  Downloading kubernetes-29.0.0-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting pykube-ng>=22.9.0 (from dask_kubernetes)
  Downloading pykube_ng-23.6.0-py3-none-any.whl.metadata (8.0 kB)
Collecting asyncache>=0.3.1 (from kr8s==0.14.*->dask_kubernetes)
  Downloading asyncache-0.3.1-py3-none-any.whl.metadata (2.0 kB)
Collecting cryptography>=35 (from kr8s==0.14.*->dask_kubernetes)
  Downloading cryptography-42.0

In [29]:
import time
import threading
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

import cupy as cp
import cuspatial
import dask_cudf
import numpy as np
import optuna
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
from dask_ml.metrics import mean_squared_error
from dask_ml.model_selection import KFold
from xgboost import dask as dxgb
from cuml.dask.common import utils as dask_utils
from dask_kubernetes.operator import KubeCluster

In [2]:
# Choose the same RAPIDS image you used for launching the notebook session
rapids_image = "rapidsai/notebooks:24.04-cuda12.2-py3.11"

In [4]:
clusters = []
clusters.append(
    KubeCluster(
        name="rapids-dask",
        image=rapids_image,
        worker_command="dask-cuda-worker",
        n_workers=2,
        resources={"limits": {"nvidia.com/gpu": "1"}},
        env={"EXTRA_PIP_PACKAGES": "optuna"},
    )
)

clusters.append(
    KubeCluster(
        name="rapids-dask2",
        image=rapids_image,
        worker_command="dask-cuda-worker",
        n_workers=2,
        resources={"limits": {"nvidia.com/gpu": "1"}},
        env={"EXTRA_PIP_PACKAGES": "optuna"},
    )
)

Output()

Output()

In [9]:
clusters[0]

0,1
Dashboard: http://rapids-dask-scheduler.kubeflow-user-example-com:8787/status,Workers: 2
Total threads: 2,Total memory: 235.86 GiB

0,1
Comm: tcp://10.84.4.24:8786,Workers: 2
Dashboard: http://10.84.4.24:8787/status,Total threads: 2
Started: 5 minutes ago,Total memory: 235.86 GiB

0,1
Comm: tcp://10.84.2.16:41913,Total threads: 1
Dashboard: http://10.84.2.16:8788/status,Memory: 117.93 GiB
Nanny: tcp://10.84.2.16:36995,
Local directory: /tmp/dask-scratch-space/worker-e81qmxvv,Local directory: /tmp/dask-scratch-space/worker-e81qmxvv

0,1
Comm: tcp://10.84.0.19:42947,Total threads: 1
Dashboard: http://10.84.0.19:8788/status,Memory: 117.93 GiB
Nanny: tcp://10.84.0.19:46359,
Local directory: /tmp/dask-scratch-space/worker-6kfod7ex,Local directory: /tmp/dask-scratch-space/worker-6kfod7ex


In [12]:
clusters[1]

0,1
Dashboard: http://rapids-dask2-scheduler.kubeflow-user-example-com:8787/status,Workers: 1
Total threads: 1,Total memory: 117.93 GiB

0,1
Comm: tcp://10.84.1.16:8786,Workers: 1
Dashboard: http://10.84.1.16:8787/status,Total threads: 1
Started: 13 minutes ago,Total memory: 117.93 GiB

0,1
Comm: tcp://10.84.6.5:43917,Total threads: 1
Dashboard: http://10.84.6.5:8788/status,Memory: 117.93 GiB
Nanny: tcp://10.84.6.5:35899,
Local directory: /tmp/dask-scratch-space/worker-x31rxz7s,Local directory: /tmp/dask-scratch-space/worker-x31rxz7s


In [14]:
col_dtype = {
    "VendorID": "int32",
    "tpep_pickup_datetime": "datetime64[ms]",
    "tpep_dropoff_datetime": "datetime64[ms]",
    "passenger_count": "int32",
    "trip_distance": "float32",
    "pickup_longitude": "float32",
    "pickup_latitude": "float32",
    "RatecodeID": "int32",
    "store_and_fwd_flag": "int32",
    "dropoff_longitude": "float32",
    "dropoff_latitude": "float32",
    "payment_type": "int32",
    "fare_amount": "float32",
    "extra": "float32",
    "mta_tax": "float32",
    "tip_amount": "float32",
    "total_amount": "float32",
    "tolls_amount": "float32",
    "improvement_surcharge": "float32",
}


must_haves = {
    "pickup_datetime": "datetime64[ms]",
    "dropoff_datetime": "datetime64[ms]",
    "passenger_count": "int32",
    "trip_distance": "float32",
    "pickup_longitude": "float32",
    "pickup_latitude": "float32",
    "rate_code": "int32",
    "dropoff_longitude": "float32",
    "dropoff_latitude": "float32",
    "fare_amount": "float32",
}


def compute_haversine_distance(df):
    pickup = cuspatial.GeoSeries.from_points_xy(
        df[["pickup_longitude", "pickup_latitude"]].interleave_columns()
    )
    dropoff = cuspatial.GeoSeries.from_points_xy(
        df[["dropoff_longitude", "dropoff_latitude"]].interleave_columns()
    )
    df["haversine_distance"] = cuspatial.haversine_distance(pickup, dropoff)
    df["haversine_distance"] = df["haversine_distance"].astype("float32")
    return df


def clean(ddf, must_haves):
    # replace the extraneous spaces in column names and lower the font type
    tmp = {col: col.strip().lower() for col in list(ddf.columns)}
    ddf = ddf.rename(columns=tmp)

    ddf = ddf.rename(
        columns={
            "tpep_pickup_datetime": "pickup_datetime",
            "tpep_dropoff_datetime": "dropoff_datetime",
            "ratecodeid": "rate_code",
        }
    )

    ddf["pickup_datetime"] = ddf["pickup_datetime"].astype("datetime64[ms]")
    ddf["dropoff_datetime"] = ddf["dropoff_datetime"].astype("datetime64[ms]")

    for col in ddf.columns:
        if col not in must_haves:
            ddf = ddf.drop(columns=col)
            continue
        if ddf[col].dtype == "object":
            # Fixing error: could not convert arg to str
            ddf = ddf.drop(columns=col)
        else:
            # downcast from 64bit to 32bit types
            # Tesla T4 are faster on 32bit ops
            if "int" in str(ddf[col].dtype):
                ddf[col] = ddf[col].astype("int32")
            if "float" in str(ddf[col].dtype):
                ddf[col] = ddf[col].astype("float32")
            ddf[col] = ddf[col].fillna(-1)

    return ddf


def prepare_data(client):
    taxi_df = dask_cudf.read_csv(
        "https://storage.googleapis.com/anaconda-public-data/nyc-taxi/csv/2016/yellow_tripdata_2016-02.csv",
        dtype=col_dtype,
    )
    taxi_df = taxi_df.map_partitions(clean, must_haves, meta=must_haves)

    ## add features
    taxi_df["hour"] = taxi_df["pickup_datetime"].dt.hour.astype("int32")
    taxi_df["year"] = taxi_df["pickup_datetime"].dt.year.astype("int32")
    taxi_df["month"] = taxi_df["pickup_datetime"].dt.month.astype("int32")
    taxi_df["day"] = taxi_df["pickup_datetime"].dt.day.astype("int32")
    taxi_df["day_of_week"] = taxi_df["pickup_datetime"].dt.weekday.astype("int32")
    taxi_df["is_weekend"] = (taxi_df["day_of_week"] >= 5).astype("int32")

    # calculate the time difference between dropoff and pickup.
    taxi_df["diff"] = taxi_df["dropoff_datetime"].astype("int32") - taxi_df[
        "pickup_datetime"
    ].astype("int32")
    taxi_df["diff"] = (taxi_df["diff"] / 1000).astype("int32")

    taxi_df["pickup_latitude_r"] = taxi_df["pickup_latitude"] // 0.01 * 0.01
    taxi_df["pickup_longitude_r"] = taxi_df["pickup_longitude"] // 0.01 * 0.01
    taxi_df["dropoff_latitude_r"] = taxi_df["dropoff_latitude"] // 0.01 * 0.01
    taxi_df["dropoff_longitude_r"] = taxi_df["dropoff_longitude"] // 0.01 * 0.01

    taxi_df = taxi_df.drop("pickup_datetime", axis=1)
    taxi_df = taxi_df.drop("dropoff_datetime", axis=1)

    taxi_df = taxi_df.map_partitions(compute_haversine_distance)

    X = (
        taxi_df.drop(["fare_amount"], axis=1)
        .astype("float32")
        .to_dask_array(lengths=True)
    )
    y = taxi_df["fare_amount"].astype("float32").to_dask_array(lengths=True)

    X._meta = cp.asarray(X._meta)
    y._meta = cp.asarray(y._meta)

    X, y = dask_utils.persist_across_workers(client, [X, y])
    return X, y

In [24]:
# Map each thread's integer ID to a sequential number (0, 1, 2 ...)
thread_id_map: dict[int, int] = {}
thread_id_map_lock = threading.Lock()

def get_seq_thread_id(thread_id: int) -> int:
    with thread_id_map_lock:
        try:
            return thread_id_map[thread_id]
        except KeyError:
            seq_id = len(thread_id_map)
            thread_id_map[thread_id] = seq_id
            return seq_id

In [37]:
def train_model(params):
    seq_thread_id = get_seq_thread_id(threading.get_ident())
    cluster = clusters[seq_thread_id]
    
    default_params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "verbosity": 0,
        "tree_method": "hist",
        "device": "cuda",
    }
    params = dict(default_params, **params)

    with Client(cluster) as client:
        X, y = prepare_data(client)
        wait([X, y])

        scores = []
        kfold = KFold(n_splits=5, shuffle=False)
        for train_index, test_index in kfold.split(X, y):
            dtrain = dxgb.DaskQuantileDMatrix(client, X[train_index, :], y[train_index])
            dtest = dxgb.DaskQuantileDMatrix(client, X[test_index, :], y[test_index])
            model = dxgb.train(
                client,
                params,
                dtrain,
                num_boost_round=10,
                verbose_eval=False,
            )
            y_test_pred = dxgb.predict(client, model, dtest).to_backend("cupy")
            rmse_score = mean_squared_error(y[test_index], y_test_pred, squared=False)
            scores.append(rmse_score)
        return sum(scores) / len(scores)

In [38]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 2, 4),
        "learning_rate": trial.suggest_float("learning_rate", 0.5, 0.7),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.5, 1),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1),
        "max_depth": trial.suggest_int("max_depth", 1, 6),
        "max_leaves": trial.suggest_int("max_leaves", 0, 2),
        "max_cat_to_onehot": trial.suggest_int("max_cat_to_onehot", 1, 10),
    }
    return train_model(params)

In [39]:
n_trials = 2
study = optuna.create_study(direction="minimize")

[I 2024-05-07 23:43:52,834] A new study created in memory with name: no-name-47f883d7-0e24-443f-850a-952c493ca8dd


In [40]:
study.optimize(objective, n_trials=n_trials, n_jobs=2)

[I 2024-05-07 23:44:41,419] Trial 0 finished with value: 54.916236877441406 and parameters: {'n_estimators': 3, 'learning_rate': 0.5415371133618251, 'colsample_bytree': 0.5913700906568089, 'colsample_bynode': 0.9339048054526411, 'colsample_bylevel': 0.9206873465790988, 'reg_lambda': 0.9481989574296305, 'max_depth': 5, 'max_leaves': 2, 'max_cat_to_onehot': 2}. Best is trial 0 with value: 54.916236877441406.
[I 2024-05-07 23:44:44,176] Trial 1 finished with value: 57.77985763549805 and parameters: {'n_estimators': 2, 'learning_rate': 0.6706371267894993, 'colsample_bytree': 0.8559021072260791, 'colsample_bynode': 0.9010703455290314, 'colsample_bylevel': 0.9731391560581295, 'reg_lambda': 0.4355000922792336, 'max_depth': 1, 'max_leaves': 1, 'max_cat_to_onehot': 8}. Best is trial 0 with value: 54.916236877441406.
