In order to run this notebook, you will first need to run a dask scheduler and a number of dask workers:
- Run a dask scheduler with:  ```dask-scheduler --scheduler-file=cluster.json```
- Run N dask workers with:  ```mpirun -np N dask-mpi --no-nanny --nthreads 10 --no-scheduler --scheduler-file cluster.json```

In [1]:
from dask.distributed import Client


In [2]:
# Uncomment and run this cell (instead of the MPI cell below) if you want to start a local CUDA cluster from within the notebook

# from dask_cuda import LocalCUDACluster
# cluster = LocalCUDACluster(threads_per_worker = 1)

# client = Client(cluster)
# client

In [3]:
# Run this if you are using an MPI-based cluster.
# Connects to the already-running scheduler through "cluster.json", the
# scheduler file named in the dask-scheduler / dask-mpi commands at the top
# of this notebook.
client = Client(scheduler_file="cluster.json")

In [4]:

# Device ids to hand out, one per selected worker.
devs = [0, 1]
# Worker addresses as reported by the scheduler.
workers = list(client.has_what())
# Keep at most len(devs) workers; slicing already clamps to the list length.
worker_devs = workers[:len(devs)]

In [5]:
def set_visible(i, n):
    """Put device ``i`` first in this process's ``CUDA_VISIBLE_DEVICES``.

    The ids ``0 .. n-1`` are rotated so the list starts at position ``i``,
    joined with commas and written to the environment of the process that
    executes this function.

    Parameters
    ----------
    i : int
        Position in ``range(n)`` at which the rotated id list starts.
    n : int
        Number of device ids to list.

    Returns
    -------
    None
        Progress is reported through ``print`` only.
    """
    import os

    device_ids = [str(d) for d in range(n)]
    rotated = device_ids[i:] + device_ids[:i]
    visible = ",".join(rotated)

    print(visible)
    print("Selecting Device : "  + str(i))
    os.environ["CUDA_VISIBLE_DEVICES"] = visible

    # numba.cuda is imported only after the variable has been written --
    # presumably so the CUDA runtime sees the new value; TODO confirm.
    import numba.cuda
    current_device = numba.cuda.get_current_device()
    print("Cur device: " + str(current_device.id))
    
# Submit one set_visible call per (device, worker) pair, each restricted to
# run on that specific worker.
dev_assigned = [
    client.submit(set_visible, device_id, len(devs), workers=[address])
    for device_id, address in zip(devs, worker_devs)
]

In [6]:
import cudf
import dask_cudf
import numba.cuda
from dask_cuml.linear_regression import LinearRegression

In [7]:
# Show the status of the set_visible futures.
print(dev_assigned)

[<Future: status: finished, type: NoneType, key: set_visible-8e4386533916266bf607bb3518c945d4>, <Future: status: finished, type: NoneType, key: set_visible-2202c4a38e62dd9f3d1ca93dd67fd403>]


In [8]:
import pandas as pd

# Toy data: a single feature column 'a' and a target holding the same values.
X = cudf.DataFrame([('a', list(range(5)))])
y = cudf.Series(list(range(5)))


In [9]:
# Turn the cudf objects into dask_cudf collections (chunksize=1) and persist
# them so their partitions are held on the cluster.
X_df = dask_cudf.from_cudf(X, chunksize=1).persist()
y_df = dask_cudf.from_cudf(y, chunksize=1).persist()

In [10]:
# Display, for every key on the cluster, the worker address(es) holding it.
client.who_has()

{"('from_cudf-bdff18dec26c49c8ba3174bf4754d1e9', 3)": ('tcp://10.2.166.167:37438',),
 "('from_cudf-bdff18dec26c49c8ba3174bf4754d1e9', 0)": ('tcp://10.2.166.167:37438',
  'tcp://10.2.166.167:34338'),
 "('from_cudf-bdff18dec26c49c8ba3174bf4754d1e9', 1)": ('tcp://10.2.166.167:34338',),
 "('from_cudf-bdff18dec26c49c8ba3174bf4754d1e9', 2)": ('tcp://10.2.166.167:37438',
  'tcp://10.2.166.167:34338'),
 'set_visible-8e4386533916266bf607bb3518c945d4': ('tcp://10.2.166.167:34338',),
 'set_visible-2202c4a38e62dd9f3d1ca93dd67fd403': ('tcp://10.2.166.167:37438',),
 "('from_cudf-751bbd7c13a7491ea1843aaec30aada5', 3)": ('tcp://10.2.166.167:37438',),
 "('from_cudf-751bbd7c13a7491ea1843aaec30aada5', 0)": ('tcp://10.2.166.167:34338',),
 "('from_cudf-751bbd7c13a7491ea1843aaec30aada5', 2)": ('tcp://10.2.166.167:34338',),
 "('from_cudf-751bbd7c13a7491ea1843aaec30aada5', 1)": ('tcp://10.2.166.167:37438',),
 "('from_cudf-5e3cf33ba2814744b81659876e8733d4', 0)": (),
 "('from_cudf-5e3cf33ba2814744b81659876e8733

In [11]:
import numba.cuda
import cuml

def print_device(arr):
    """Print device information for one partition in the executing process.

    ``arr`` is materialised with ``compute()`` and converted with
    ``as_gpu_matrix(order="F")``. The function then prints, in order: the
    ``CUDA_VISIBLE_DEVICES`` environment variable, the converted matrix, the
    id of numba's current device, and ``cuml.device_of_ptr`` for the matrix.

    Parameters
    ----------
    arr
        Object exposing ``compute()`` whose result offers ``as_gpu_matrix``.

    Raises
    ------
    KeyError
        If ``CUDA_VISIBLE_DEVICES`` is not set in the environment.
    """
    import os

    gpu_matrix = arr.compute().as_gpu_matrix(order="F")

    visible = os.environ["CUDA_VISIBLE_DEVICES"]
    print("CUDA_VISIBLE_DEVICES: " + str(visible))
    print("ARRAY: " + str(gpu_matrix))

    current_device = numba.cuda.get_current_device()
    print("CUR DEVICE: " + str(current_device.id))

    owner = cuml.device_of_ptr(gpu_matrix)
    print("Pointer Device: " + str(owner))
    
# Run print_device once per partition of X_df; the resulting list of futures
# is the cell's displayed value.
[
    client.submit(print_device, partition)
    for partition in X_df.to_delayed()
]

[<Future: status: pending, key: print_device-8b5a7dcb5465332af4a16d6856dfea63>,
 <Future: status: pending, key: print_device-b220cbbf33113d7480cd677025796438>,
 <Future: status: pending, key: print_device-ab1d7384a28fa063d207db424ee22a27>,
 <Future: status: pending, key: print_device-fb64a22a131560684e3b7cfd435436b2>]

Set each worker to host its DataFrames on a different GPU device (this is what the `set_visible` cell above does).

__Note__: You can ignore this if you started your workers with `CUDA_VISIBLE_DEVICES` already set.

In [12]:
# Display the key -> worker address mapping again, before fitting the model.
client.who_has()

{"('from_cudf-bdff18dec26c49c8ba3174bf4754d1e9', 3)": ('tcp://10.2.166.167:37438',),
 "('from_cudf-bdff18dec26c49c8ba3174bf4754d1e9', 0)": ('tcp://10.2.166.167:37438',
  'tcp://10.2.166.167:34338'),
 "('from_cudf-bdff18dec26c49c8ba3174bf4754d1e9', 1)": ('tcp://10.2.166.167:34338',),
 "('from_cudf-bdff18dec26c49c8ba3174bf4754d1e9', 2)": ('tcp://10.2.166.167:37438',
  'tcp://10.2.166.167:34338'),
 'set_visible-8e4386533916266bf607bb3518c945d4': ('tcp://10.2.166.167:34338',),
 'set_visible-2202c4a38e62dd9f3d1ca93dd67fd403': ('tcp://10.2.166.167:37438',),
 "('from_cudf-751bbd7c13a7491ea1843aaec30aada5', 3)": ('tcp://10.2.166.167:37438',),
 "('from_cudf-751bbd7c13a7491ea1843aaec30aada5', 0)": ('tcp://10.2.166.167:34338',),
 "('from_cudf-751bbd7c13a7491ea1843aaec30aada5', 2)": ('tcp://10.2.166.167:34338',),
 "('from_cudf-751bbd7c13a7491ea1843aaec30aada5', 1)": ('tcp://10.2.166.167:37438',),
 "('from_cudf-5e3cf33ba2814744b81659876e8733d4', 0)": ('tcp://10.2.166.167:34338',),
 "('from_cudf-5e3

In [13]:
# Create the dask_cuml LinearRegression estimator with its default arguments.
lr = LinearRegression()

In [14]:
# Fit the estimator on the persisted feature and target collections; the
# value returned by fit() is kept in `res`.
res = lr.fit(X_df, y_df)

input_devarrays: [(('10.2.166.167', 37438), <Future: status: finished, type: tuple, key: inputs_to_device_arrays-21892f99bd54160d77ffdd9a9c0d3a37>), (('10.2.166.167', 34338), <Future: status: finished, type: tuple, key: inputs_to_device_arrays-18642eed9eb3ae3f19d18b73cf62e915>)]
exec_node: ('10.2.166.167', 37438)
ipc_handles: [<Future: status: pending, key: get_input_ipc_handles-f271d9e7753aae5871793f1c3ca19651>]
raw_arrays: [<Future: status: finished, type: tuple, key: inputs_to_device_arrays-21892f99bd54160d77ffdd9a9c0d3a37>]
COEFS: (('10.2.166.167', 37438), <Future: status: pending, key: extract_part-38d55b7c20d766af6a21c55b987097ca>)
INTER: <Future: status: pending, key: extract_part-70391139f059fd10cc685d5b7f4ed172>
RES: <Future: status: pending, key: extract_part-70391139f059fd10cc685d5b7f4ed172>


res

In [15]:
# intercept_ is a future, so .result() is used to fetch its value.
# NOTE(review): y equals column 'a' in the data defined in this notebook, so
# an intercept near 0 would be expected; the recorded output of 5 may be
# stale -- confirm by re-running.
lr.intercept_.result()

5

In [16]:
# Display the key -> worker address mapping once more, after the fit.
client.who_has()

{"('from_cudf-22fc16f999e94fb0963f7d2862829bea', 0)": ('tcp://10.2.166.167:35885',),
 "('from_cudf-22fc16f999e94fb0963f7d2862829bea', 1)": ('tcp://10.2.166.167:35885',),
 "('from_cudf-22fc16f999e94fb0963f7d2862829bea', 2)": ('tcp://10.2.166.167:35885',),
 "('from_cudf-22fc16f999e94fb0963f7d2862829bea', 3)": ('tcp://10.2.166.167:35885',),
 'set_visible-8e4386533916266bf607bb3518c945d4': ('tcp://10.2.166.167:35885',),
 'set_visible-2202c4a38e62dd9f3d1ca93dd67fd403': ('tcp://10.2.166.167:39866',),
 "('from_cudf-e0beaddeee4f4d4cb846ea01674b80c7', 0)": ('tcp://10.2.166.167:35885',),
 "('from_cudf-e0beaddeee4f4d4cb846ea01674b80c7', 2)": ('tcp://10.2.166.167:35885',),
 "('from_cudf-e0beaddeee4f4d4cb846ea01674b80c7', 1)": ('tcp://10.2.166.167:39866',),
 "('from_cudf-e0beaddeee4f4d4cb846ea01674b80c7', 3)": ('tcp://10.2.166.167:39866',),
 "('from_cudf-c28c2f1d82444d8bba560c39f8be3bef', 3)": ('tcp://10.2.166.167:39866',),
 "('from_cudf-c28c2f1d82444d8bba560c39f8be3bef', 2)": ('tcp://10.2.166.167:

In [17]:
# Predict on the same collection used for fitting; `g` is a future whose
# value is fetched with .result() in the next cell.
g = lr.predict(X_df)

WORKER PARTS: [(('10.2.166.167', 35885), <Future: status: finished, type: DataFrame, key: ('from_cudf-e0beaddeee4f4d4cb846ea01674b80c7', 0)>), (('10.2.166.167', 39866), <Future: status: finished, type: DataFrame, key: ('from_cudf-e0beaddeee4f4d4cb846ea01674b80c7', 3)>), (('10.2.166.167', 39866), <Future: status: finished, type: DataFrame, key: ('from_cudf-e0beaddeee4f4d4cb846ea01674b80c7', 1)>), (('10.2.166.167', 35885), <Future: status: finished, type: DataFrame, key: ('from_cudf-e0beaddeee4f4d4cb846ea01674b80c7', 2)>)]
ON WORKER: 2
NOT ON WORKER: 2
IPCHANDLES = [<Future: status: pending, key: get_ipc_handles-a288dc3638b6e9696be564cf0f6931f4>, <Future: status: pending, key: get_ipc_handles-1c3a3ed10dd420ffb8abcf6d9146ce8e>]
RAW_ARRAYS=[<Future: status: pending, key: as_gpu_matrix-dd5400ec92937f042fc946605823e45e>, <Future: status: pending, key: as_gpu_matrix-02df0c8a01ff23c4ba639e1a134b01cc>]
f=<Future: status: finished, type: tuple, key: _predict_on_worker-e6d346f2546c817050614970f9f

In [18]:
# Block until the prediction is available and print it.
print(g.result())

      
0    1
1    2
2    3
3    4
4    5
