In [None]:
from dask.distributed import Client

# Create the Dask distributed client used by the rest of the notebook.
# No scheduler address is passed here, so dask.distributed is expected to
# spin up a local cluster on this machine (see the Client docs to confirm
# defaults for your installed version).
client = Client()
# Bare last expression: lets the notebook render the client's summary.
client

In [None]:
from sklearn.datasets import make_regression
import dask.array as da
import pandas as pd
import numpy as np

# --- Synthetic regression data --------------------------------------------
# Tunable knobs for the generated problem.
n_features = 100
random_state = 123

# 10,000 samples; half of the features carry signal for the target.
X, y = make_regression(
    random_state=random_state,
    n_samples=10_000,
    n_features=n_features,
    n_informative=n_features // 2,
)

# Pandas view of the same data: one "var<i>" column per feature plus "target".
df = pd.DataFrame(X, columns=[f"var{i}" for i in range(n_features)])
df["target"] = y

# Seed NumPy's legacy global RNG for anything downstream that relies on it.
np.random.seed(random_state)

# Dask copies of the data, split into four row-wise partitions.
# The chunk shape for X is spelled out as (rows, all columns): a bare integer
# would be applied to *every* dimension and could split the feature axis if
# n_features ever exceeded the row chunk size. XGBoost's Dask interface
# expects arrays partitioned along rows only.
rows_per_chunk = X.shape[0] // 4
dX = da.from_array(X, chunks=(rows_per_chunk, X.shape[1]))
dy = da.from_array(y, chunks=rows_per_chunk)

In [None]:
from sklearn.model_selection import GridSearchCV

import xgboost as xgb
import joblib

# Single-machine XGBoost regressor with library-default hyper-parameters;
# it is the base estimator handed to the grid search below.
reg = xgb.XGBRegressor()

# Hyper-parameter candidates explored by the grid searches:
# 2 depths x 2 child-weight values x 1 learning rate = 4 combinations.
param_grid = dict(
    max_depth=[5, 10],
    min_child_weight=[10, 20],
    learning_rate=[0.05],
)

def neg_rmse_score(est, X, y):
    """Score a fitted estimator by negated RMSE on the data it is given.

    GridSearchCV calls scorers as ``scorer(estimator, X, y)`` with the
    held-out fold and treats *larger* values as better, so the RMSE is
    negated: the candidate with the smallest error gets the highest score
    and is the one selected by ``refit``.

    Parameters
    ----------
    est : fitted estimator exposing ``predict(X)``.
    X, y : features and targets of the held-out fold. NumPy or Dask arrays
        are accepted; ``np.asarray`` materialises lazy arrays before the
        error is computed.

    Returns
    -------
    float
        ``-sqrt(mean((y - est.predict(X)) ** 2))``.
    """
    predicted = np.asarray(est.predict(X), dtype=float)
    actual = np.asarray(y, dtype=float)
    return -float(np.sqrt(np.mean((actual - predicted) ** 2)))


# The key stays "rmse" so that `refit="rmse"` keeps working; the stored
# values are negated RMSE (closer to zero is better).
scoring = {
    "rmse": neg_rmse_score
}

# Grid search over `param_grid` for the single-machine regressor:
# 2-fold CV, every available worker (n_jobs=-1), failures re-raised instead
# of being recorded as a score, and the final model refit using the "rmse"
# entry of `scoring`.
grid = GridSearchCV(
    estimator=reg,
    param_grid=param_grid,
    scoring=scoring,
    n_jobs=-1,
    refit="rmse",
    cv=2,
    verbose=1,
    error_score='raise',
)

In [None]:
%%time
# Route joblib's parallel tasks through the "dask" backend for the duration
# of the search. The full (X, y) data is forwarded to every underlying fit
# as the evaluation set, tracked with the "rmse" metric.
with joblib.parallel_backend("dask"):
    grid.fit(
        X,
        y,
        eval_set=[(X, y)],
        eval_metric="rmse",
    )

In [None]:
# Dask-native XGBoost regressor, tracking RMSE on its evaluation sets and
# bound to the notebook's distributed client.
dreg = xgb.dask.DaskXGBRegressor(eval_metric="rmse")
dreg.client = client

# Same search configuration as the single-machine grid, except n_jobs=1
# (presumably because parallelism is left to Dask inside each fit).
dgrid = GridSearchCV(
    estimator=dreg,
    param_grid=param_grid,
    scoring=scoring,
    n_jobs=1,
    refit="rmse",
    cv=2,
    verbose=1,
    error_score='raise',
)

In [None]:
%%time
# Run the search on the Dask arrays; the full Dask dataset is also forwarded
# to each underlying fit as its evaluation set.
dgrid.fit(
    dX,
    dy,
    eval_set=[(dX, dy)],
)