In [None]:
import orchest
import coiled
from dask.distributed import Client

In [None]:
# Read the inputs Orchest passes to this step and keep the "cluster_name" entry.
step_inputs = orchest.get_inputs()
cluster_name = step_inputs["cluster_name"]

In [None]:
# Echo the cluster name read from the Orchest inputs so it shows in the cell output.
cluster_name

In [None]:
# Get a Coiled cluster handle for the name received above.
# NOTE(review): presumably this attaches to a cluster that already exists under
# that name rather than creating a new one — confirm against the Coiled docs.
cluster = coiled.Cluster(name=cluster_name)

In [None]:
# Connect a Dask client to the cluster, then print where its dashboard lives.
client = Client(cluster)
dashboard_url = client.dashboard_link
print('Dashboard:', dashboard_url)

In [None]:
# Display what the client reports for ncores().
# NOTE(review): presumably a per-worker core/thread count, useful as a check
# that workers are connected — confirm.
client.ncores()

In [None]:
import dask.dataframe as dd

# Point Dask at every HIGGS CSV shard matched by the glob, reading from S3
# with anonymous access, and show the resulting Dask DataFrame.
higgs_csv_glob = "s3://coiled-data/higgs/higgs-*.csv"
s3_options = {"anon": True}
ddf = dd.read_csv(higgs_csv_glob, storage_options=s3_options)
ddf

In [None]:
from dask_ml.model_selection import train_test_split

# The target is the "labels" column; every other column is a feature.
# Drop the target by name instead of slicing off column 0 by position, so the
# label can never end up inside X if the CSV column order differs from what
# the positional slice assumed. When "labels" is the first column the result
# is identical to the previous `ddf.iloc[:, 1:]`.
y = ddf["labels"]
X = ddf.drop(columns=["labels"])

# Hold out 20% of the rows for evaluation; shuffle with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=2
)

In [None]:
from sklearn.metrics import auc, roc_curve

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import dask

# Persist all four splits in a single call and rebind the names to the
# persisted collections, so later cells reuse them.
splits = (X_train, X_test, y_train, y_test)
X_train, X_test, y_train, y_test = dask.persist(*splits)

# Show the persisted training features.
X_train

In [None]:
import dask_xgboost

# XGBoost settings: binary logistic objective, trees limited to depth 3.
params = dict(
    objective='binary:logistic',
    max_depth=3,
    min_child_weight=0.5,
)

# Train through the Dask client on the training split, for 3 boosting rounds.
bst = dask_xgboost.train(
    client,
    params,
    X_train,
    y_train,
    num_boost_round=3,
)

In [None]:
# Score the held-out features with the trained booster.
# NOTE(review): presumably this returns a lazy Dask collection — confirm.
lazy_scores = dask_xgboost.predict(client, bst, X_test)

# Evaluate the true labels and the scores together in one compute call.
y_test, y_pred = dask.compute(y_test, lazy_scores)

In [None]:
# roc_curve returns (fpr, tpr, thresholds). Bind the third value to a real
# name rather than `_`: assigning `_` in a notebook overrides IPython's
# last-output variable, after which IPython stops updating `_`/`__`/`___`.
fpr, tpr, thresholds = roc_curve(y_test, y_pred)

In [None]:
# Area under the ROC curve, embedded in the legend label.
roc_auc = auc(fpr, tpr)
curve_label = 'ROC Curve (area = {:.2f})'.format(roc_auc)

fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr, tpr, lw=3, label=curve_label)
# Black dashed diagonal from (0, 0) to (1, 1) as a reference line.
ax.plot([0, 1], [0, 1], "k--", lw=2)

# Clamp both axes to [0, 1] and label the figure.
ax.set_xlim((0, 1))
ax.set_ylim((0, 1))
ax.set_title("ROC Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")

ax.legend()
plt.show()