## Load libraries

In [1]:
import cudf
import dask.dataframe as dd
from dask.delayed import delayed
from dask.distributed import Client, wait
import dask_xgboost as dxgb_gpu
import numpy as np
import os
import pandas as pd
from pandas.util.testing import assert_frame_equal
import subprocess
import time
import xgboost as xgb

## Set up Dask

In [2]:
# worker settings
n_workers = 4  # number of dask-worker processes to launch (one per GPU)

# Ask the OS for this host's IP addresses and use the first one for the
# scheduler. subprocess is used instead of IPython's `!` capture so the cell is
# plain Python and `scheduler_ip` is only ever bound to a string.
host_addresses = subprocess.check_output(['hostname', '--all-ip-addresses']).decode().split()
if not host_addresses:
    raise RuntimeError('`hostname --all-ip-addresses` returned no addresses')
scheduler_ip = host_addresses[0]

# dask-scheduler is started without a port option, so this must match the port
# it listens on by default.
scheduler_port = '8786'
scheduler_uri = scheduler_ip + ':' + scheduler_port
print(scheduler_uri)

192.168.99.2:8786


In [3]:
# Environment handed to the scheduler and worker subprocesses: a copy of the
# notebook's own environment with NCCL and Dask scheduler overrides applied.
dask_env = os.environ.copy()
dask_env.update({
    'NCCL_P2P_DISABLE': '1',
    'DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING': 'False',
    'DASK_DISTRIBUTED__SCHEDULER__BANDWIDTH': '1',
})

In [4]:
# Launch the Dask scheduler as a background process, using the environment
# prepared in dask_env.
# NOTE(review): the Popen handle is not assigned to a name, so there is no
# obvious way to stop the scheduler from this notebook - confirm that is intended.
subprocess.Popen('dask-scheduler', env = dask_env)

<subprocess.Popen at 0x7f0e6c3939e8>

In [5]:
# Connect a client to the scheduler, then ask the scheduler to retire workers
# left over from a previous run. The displayed value describes the workers
# that were retired.
client = Client(scheduler_uri)
client.retire_workers()

{'tcp://192.168.99.2:35744': {'host': '192.168.99.2',
  'id': 'tcp://192.168.99.2:35744',
  'last_seen': 1544825425.9600325,
  'local_directory': '/rapids/notebooks/dask-worker-space/worker-_ouq8x32',
  'memory_limit': 0,
  'metrics': {'cpu': 2.0,
   'executing': 0,
   'in_flight': 0,
   'in_memory': 0,
   'memory': 602980352,
   'num_fds': 39,
   'read_bytes': 19424.6757326651,
   'ready': 0,
   'time': 1544825425.4596932,
   'write_bytes': 25170.566512850288},
  'name': 'tcp://192.168.99.2:35744',
  'ncores': 1,
  'resources': {},
  'services': {'bokeh': 32970},
  'type': 'Worker'},
 'tcp://192.168.99.2:36940': {'host': '192.168.99.2',
  'id': 'tcp://192.168.99.2:36940',
  'last_seen': 1544825425.9610445,
  'local_directory': '/rapids/notebooks/dask-worker-space/worker-l05ue99f',
  'memory_limit': 0,
  'metrics': {'cpu': 2.0,
   'executing': 0,
   'in_flight': 0,
   'in_memory': 0,
   'memory': 602898432,
   'num_fds': 39,
   'read_bytes': 19774.105826528783,
   'ready': 0,
   'time'

In [6]:
# Display the client summary (scheduler address, dashboard link, cluster size);
# no workers should be listed right after the retire call.
client

0,1
Client  Scheduler: tcp://192.168.99.2:8786  Dashboard: http://192.168.99.2:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [7]:
# Command-line options shared by every dask-worker process: a single process
# with a single thread, no nanny, no memory limit, bound to the scheduler host.
argument_list = [
    '--no-nanny',
    '--nprocs=1',
    '--nthreads=1',
    '--memory-limit=0',
    '--host={}'.format(scheduler_ip),
]

In [8]:
# Launch one dask-worker process per GPU; CUDA_VISIBLE_DEVICES pins each worker
# to a single device. The environment is read when the process is spawned, so
# updating dask_env inside the loop gives every worker its own device id.
worker_processes = []  # keep the handles so the workers can be inspected or terminated later
for worker_id in range(n_workers):
    dask_env['CUDA_VISIBLE_DEVICES'] = str(worker_id)
    worker_processes.append(
        subprocess.Popen(['dask-worker', scheduler_uri] + argument_list, env=dask_env)
    )

# Wait until every worker has registered with the scheduler. Polling replaces a
# fixed sleep, which is too short on a slow node and wasted time on a fast one.
worker_startup_timeout = 60  # seconds
startup_deadline = time.time() + worker_startup_timeout
while len(client.ncores()) < n_workers:
    if time.time() > startup_deadline:
        raise RuntimeError(
            'only {} of {} Dask workers registered within {} s'.format(
                len(client.ncores()), n_workers, worker_startup_timeout
            )
        )
    time.sleep(0.5)

In [9]:
# Display the client summary again; it should now list n_workers workers.
client

0,1
Client  Scheduler: tcp://192.168.99.2:8786  Dashboard: http://192.168.99.2:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 0 B


## Hello World

In [10]:
def initialize_rmm_pool():
    """Initialise RMM with the pool allocator switched on.

    Written to be shipped to the Dask workers with ``client.run``, which is why
    the imports live inside the function body. The allocator flag is set before
    ``rmm_initialize`` is called.

    Returns:
        Whatever ``cudf._gdf.rmm_initialize()`` returns.
    """
    from librmm_cffi import librmm_config

    # Optional: librmm_config.initial_pool_size = 2 << 30  (2 GiB).
    # Default is 1/2 total GPU memory.
    librmm_config.use_pool_allocator = True

    import cudf
    return cudf._gdf.rmm_initialize()

def initialize_rmm_no_pool():
    """Initialise RMM with the pool allocator switched off.

    Written to be shipped to the Dask workers with ``client.run``, which is why
    the imports live inside the function body. The allocator flag is set before
    ``rmm_initialize`` is called.

    Returns:
        Whatever ``cudf._gdf.rmm_initialize()`` returns.
    """
    from librmm_cffi import librmm_config

    librmm_config.use_pool_allocator = False

    import cudf
    return cudf._gdf.rmm_initialize()

In [11]:
# Run initialize_rmm_pool on every worker; the result maps each worker address
# to that function's return value.
client.run(initialize_rmm_pool)

{'tcp://192.168.99.2:35684': True,
 'tcp://192.168.99.2:38336': True,
 'tcp://192.168.99.2:40793': True,
 'tcp://192.168.99.2:42230': True}

In [12]:
def make_pandas(n_rows=10000, n_cols=101, seed=None):
    """Build a pandas DataFrame of random 0/1 values stored as float32.

    Columns are named ``fea0`` .. ``fea<n_cols - 1>`` in that order. The column
    order is set explicitly rather than left to dict ordering, because the
    downstream helpers split the frame by column position.

    Args:
        n_rows: Number of rows to generate.
        n_cols: Number of columns to generate.
        seed: Optional seed for a private ``RandomState``. ``None`` (the
            default) draws fresh, non-reproducible data on every call.

    Returns:
        A ``pandas.DataFrame`` of shape ``(n_rows, n_cols)`` with float32 columns.
    """
    rng = np.random.RandomState(seed)
    X = rng.randint(2, size=(n_rows, n_cols)).astype(np.float32)
    column_names = ['fea%d' % i for i in range(n_cols)]
    return pd.DataFrame(X, columns=column_names)

In [13]:
def make_cudf(df):
    """Convert a pandas DataFrame into a cuDF DataFrame.

    Args:
        df: The pandas DataFrame to convert.

    Returns:
        The result of ``cudf.DataFrame.from_pandas(df)``.
    """
    return cudf.DataFrame.from_pandas(df)

In [14]:
def make_features(gdf):
    """Return every column of ``gdf`` except the first one.

    Args:
        gdf: A DataFrame exposing ``.columns`` and ``.loc``.

    Returns:
        A frame holding all rows and the columns from position 1 onwards.
    """
    feature_columns = gdf.columns[1:]
    return gdf.loc[:, feature_columns]


def make_labels(gdf):
    """Return only the first column of ``gdf``, still as a frame.

    Args:
        gdf: A DataFrame exposing ``.columns`` and ``.loc``.

    Returns:
        A one-column frame holding all rows of the column at position 0.
    """
    label_columns = gdf.columns[:1]
    return gdf.loc[:, label_columns]

In [15]:
# Build one lazy make_pandas task per worker; nothing is computed yet.
dfs = [delayed(make_pandas)() for _ in range(n_workers)]

In [16]:
# Chain a lazy pandas -> cuDF conversion onto each of those tasks.
gdfs = [delayed(make_cudf)(df) for df in dfs]

In [17]:
# Finalize RMM on every worker so it can be initialised again with different
# settings.
client.run(cudf._gdf.rmm_finalize)

{'tcp://192.168.99.2:35684': True,
 'tcp://192.168.99.2:38336': True,
 'tcp://192.168.99.2:40793': True,
 'tcp://192.168.99.2:42230': True}

In [18]:
# Re-initialise RMM on every worker with the pool allocator disabled.
# NOTE(review): the delayed frames defined so far are lazy and have not been
# computed at this point, so the earlier pooled configuration does not appear
# to be exercised by any computation - confirm intent.
client.run(initialize_rmm_no_pool)

{'tcp://192.168.99.2:35684': True,
 'tcp://192.168.99.2:38336': True,
 'tcp://192.168.99.2:40793': True,
 'tcp://192.168.99.2:42230': True}

In [19]:
# For each lazy cuDF frame, build a [features, labels] pair of lazy tasks.
gpu_dfs = [[delayed(make_features)(gdf), delayed(make_labels)(gdf)] for gdf in gdfs]

In [20]:
# Wrap each [features, labels] pair in a lazy xgb.DMatrix and persist it on the
# cluster; gpu_dfs is rebound from the list of pairs to the persisted objects.
# NOTE(review): because the name is reused, re-running this cell on its own
# fails - gpu_dfs no longer holds [features, labels] pairs after the first run.
gpu_dfs = [
    delayed(xgb.DMatrix)(features, labels).persist()
    for features, labels in gpu_dfs
]

In [21]:
# Block until every persisted DMatrix has finished computing on the cluster.
wait(gpu_dfs)

DoneAndNotDoneFutures(done={<Future: status: finished, type: DMatrix, key: DMatrix-9c109ebb-59a8-46e8-b926-f135b23673b4>, <Future: status: finished, type: DMatrix, key: DMatrix-24012372-8531-41f3-984b-fe4547c7cdb4>, <Future: status: finished, type: DMatrix, key: DMatrix-190283b7-9b3e-459c-aad8-5d36ae734cc7>, <Future: status: finished, type: DMatrix, key: DMatrix-2e53450b-966c-4532-b4b9-5e2868d43234>}, not_done=set())

In [22]:
# Training parameters passed to dxgb_gpu.train; 'nround' is also read back by
# the training cell as the number of boosting rounds.
# NOTE(review): XGBoost documents 'learning_rate' as an alias of 'eta'; both are
# set to 0.1 here, so one of them is redundant.
# NOTE(review): 'loss', 'max_features' and 'criterion' look like scikit-learn
# gradient-boosting names rather than XGBoost parameters - confirm they are used.
dxgb_gpu_params = dict(
    nround=1000,
    max_depth=8,
    max_leaves=2**8,
    alpha=0.9,
    eta=0.1,
    gamma=0.1,
    learning_rate=0.1,
    subsample=1,
    reg_lambda=1,
    scale_pos_weight=2,
    min_child_weight=30,
    tree_method='gpu_hist',
    n_gpus=1,
    distributed_dask=True,
    loss='ls',
    objective='gpu:reg:linear',
    max_features='auto',
    criterion='friedman_mse',
    grow_policy='lossguide',
    verbose=True,
)

In [23]:
%%time
# Train the distributed GPU booster and time the whole cell.
# labels is None because each entry of gpu_dfs is a DMatrix that was built from
# a (features, labels) pair, so no separate label collection is passed.
labels = None
bst = dxgb_gpu.train(client, dxgb_gpu_params, gpu_dfs, labels, num_boost_round=dxgb_gpu_params['nround'])

CPU times: user 162 ms, sys: 382 ms, total: 544 ms
Wall time: 53.6 s
