In [1]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.utils.testing import assert_array_equal
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import joblib

from dask_ml.linear_model import LinearRegression as DLinearRegression
from dask_ml.model_selection import GridSearchCV as DGridSearchCV
import dask.dataframe as dd
from dask.distributed import Client

In [2]:
# Load the iris dataset bundled with scikit-learn and wrap its feature
# matrix / target vector in pandas containers for the cells below.
iris = datasets.load_iris()
X, y = pd.DataFrame(data=iris.data), pd.Series(data=iris.target)

In [3]:
# no parallelism
# model details
model = RandomForestClassifier(random_state=42, n_estimators=100)
# give (deep) parameter tuning details
parameters = {'max_depth': (2,5)}
# fit GridSearchCV
clf = GridSearchCV(model, parameters, cv=5)
%timeit clf.fit(X, y)

827 ms ± 23.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
# parallel random forest (too much overhead)
# model details
model = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1)
# give (deep) parameter tuning details
parameters = {'max_depth': (2,5)}
# fit GridSearchCV
clf = GridSearchCV(model, parameters, cv=5)
%timeit clf.fit(X, y)

3.51 s ± 72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# parallel gridsearch CV (works meaningfully fast)
# model details
model = RandomForestClassifier(random_state=42, n_estimators=100)
# give (deep) parameter tuning details
parameters = {'max_depth': (2,5)}
# fit GridSearchCV
clf = GridSearchCV(model, parameters, cv=5, n_jobs=-1)
%timeit clf.fit(X, y)

531 ms ± 22.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# nested parallelism (works as expected)
# model details
model = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1)
# give (deep) parameter tuning details
parameters = {'max_depth': (2,5)}
# fit GridSearchCV
clf = GridSearchCV(model, parameters, cv=5, n_jobs=-1)
%timeit clf.fit(X, y)

1.49 s ± 54.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# with purely dask datastructures and estimator wrappers
# custom data structure
dX = dd.from_pandas(X, npartitions=2)
dy = dd.from_pandas(y, npartitions=2)
# model details
model = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1)
# give (deep) parameter tuning details
parameters = {'max_depth': (2,5)}
# fit GridSearchCV
clf = DGridSearchCV(model, parameters, cv=5, n_jobs=-1)
%timeit clf.fit(X, y)
# highlights the problems with dask and the WIP at sklearn. not all estimators 
# are dask-ready. But, this still gives the same nested parallelism time.

  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs =

  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs =

  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs =

  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs =

  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


1.53 s ± 92.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
