## Load libraries

In [None]:
# Run nvidia-smi through the IPython shell escape to report the GPU status of this machine.
!nvidia-smi

In [None]:
import cudf
import dask.dataframe as dd
from dask.delayed import delayed
from dask.distributed import Client, wait
import dask_xgboost as dxgb_gpu
import numpy as np
import os
import pandas as pd
from pandas.util.testing import assert_frame_equal
import subprocess
import time
import xgboost as xgb

## Set up Dask

In [None]:
# worker settings
n_workers = 8
scheduler_ip = !hostname --all-ip-addresses
scheduler_ip = scheduler_ip[0].split()[0]
scheduler_port = '8786'
scheduler_uri = scheduler_ip + ':' +  scheduler_port
print(scheduler_uri)

In [None]:
# Environment for the Dask scheduler/worker subprocesses: a copy of the
# current process environment plus the three overrides below.
dask_env = os.environ.copy()
dask_env.update({
    'NCCL_P2P_DISABLE': '1',
    # DASK_* variables are presumably picked up as Dask config overrides
    # (distributed.scheduler.work-stealing / bandwidth) -- confirm.
    'DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING': 'False',
    'DASK_DISTRIBUTED__SCHEDULER__BANDWIDTH': '1',
})

In [None]:
# Launch `dask-scheduler` as a background child process, giving it the
# environment stored in dask_env.
subprocess.Popen(['dask-scheduler'], env=dask_env)

In [None]:
# Connect a client to the scheduler at scheduler_uri, then ask the scheduler
# to retire workers (presumably ones left over from an earlier run -- confirm
# which workers retire_workers() selects when called without arguments).
client = Client(scheduler_uri)
client.retire_workers()

In [None]:
# Display the client as the cell output to inspect the current Dask cluster status.
client

In [None]:
# Command-line options passed to every dask-worker process.
argument_list = [
    '--no-nanny',
    '--nprocs=1',
    '--nthreads=1',
    '--memory-limit=0',
    '--host=' + scheduler_ip,
]

In [None]:
# Start one dask-worker per index in range(n_workers). CUDA_VISIBLE_DEVICES is
# rewritten in dask_env before each launch so every child sees its own index.
for worker_id in range(n_workers):
    dask_env.update(CUDA_VISIBLE_DEVICES=str(worker_id))
    subprocess.Popen(['dask-worker', scheduler_uri, *argument_list], env=dask_env)
time.sleep(3)  # fixed pause that gives Dask time to set up each worker

In [None]:
# Display the client again as the cell output; run after the worker launch
# cell to check the cluster status once the workers have been started.
client

## Hello World

In [None]:
def initialize_rmm_pool():
    """Initialize RMM in the calling process with the pool allocator enabled.

    The imports are local to the function, so they are resolved in whichever
    process executes it. The allocator flag is set before ``cudf`` is
    imported; keep that order.

    Returns:
        Whatever ``cudf._gdf.rmm_initialize()`` returns.
    """
    from librmm_cffi import librmm_config

    librmm_config.use_pool_allocator = True
    # Optional (disabled): librmm_config.initial_pool_size = 2<<30 sets the
    # pool to 2GiB. Default is 1/2 total GPU memory.
    import cudf
    return cudf._gdf.rmm_initialize()

def initialize_rmm_no_pool():
    """Initialize RMM in the calling process with the pool allocator disabled.

    The imports are local to the function, so they are resolved in whichever
    process executes it. The allocator flag is cleared before ``cudf`` is
    imported; keep that order.

    Returns:
        Whatever ``cudf._gdf.rmm_initialize()`` returns.
    """
    from librmm_cffi import librmm_config

    librmm_config.use_pool_allocator = False
    import cudf
    return cudf._gdf.rmm_initialize()

In [None]:
# Execute initialize_rmm_pool on the workers through the client, so RMM uses
# the pool allocator there.
client.run(initialize_rmm_pool)

In [None]:
def read_csv(filepath, n_columns=101):
    """Read one CSV file into a cuDF DataFrame with generated column names.

    The first row of the file is skipped (presumably a header -- confirm) and
    the columns are named ``fea0`` .. ``fea<n_columns - 1>``, all read with
    dtype ``'float'``.

    Args:
        filepath: Path of the CSV file to read.
        n_columns: Number of columns expected in the file. Defaults to 101,
            the value that used to be hard-coded here, so existing callers
            are unaffected.

    Returns:
        The DataFrame returned by ``cudf.io.csv.read_csv``.
    """
    names = ['fea{}'.format(i) for i in range(n_columns)]
    dtypes = ['float'] * n_columns
    # Explicit names are supplied, so the file's own first line is dropped.
    return cudf.io.csv.read_csv(filepath, names=names, dtype=dtypes, skiprows=1)


def make_features(gdf):
    """Return all columns of ``gdf`` except the first one.

    Args:
        gdf: A DataFrame exposing ``.columns`` and label-based ``.loc``.

    Returns:
        The ``.loc`` selection of every row and every column after the first.
    """
    feature_columns = gdf.columns[1:]
    return gdf.loc[:, feature_columns]


def make_labels(gdf):
    """Return only the first column of ``gdf``, kept as a DataFrame.

    Args:
        gdf: A DataFrame exposing ``.columns`` and label-based ``.loc``.

    Returns:
        The ``.loc`` selection of every row and just the first column.
    """
    label_columns = gdf.columns[:1]
    return gdf.loc[:, label_columns]

In [None]:
# Input files: one dataset-<i>.csv under base_path for each i in range(n_workers).
base_path = '/tmp/datasets'
filepaths = [
    os.path.join(base_path, 'dataset-{}.csv'.format(index))
    for index in range(n_workers)
]

In [None]:
# Build one delayed read_csv call per input file; the list holds lazy tasks.
gdfs = [
    delayed(read_csv)(csv_path)
    for csv_path in filepaths
]

In [None]:
# Optional sanity check, left disabled: compute the delayed frames through the
# client and preview the first result.
# results = client.compute(gdfs, optimize_graph=False, fifo_timeout="0ms")
# results
# final = [result.result() for result in results]
# final[0]
# print(final[0].head())

In [None]:
# Execute cudf._gdf.rmm_finalize on the workers through the client; the next
# cell re-initializes RMM without the pool allocator.
client.run(cudf._gdf.rmm_finalize)

In [None]:
# Execute initialize_rmm_no_pool on the workers through the client, so RMM is
# initialized there with the pool allocator turned off.
client.run(initialize_rmm_no_pool)

In [None]:
# For every delayed frame, create a [features, labels] pair of delayed tasks.
gpu_dfs = [
    [delayed(make_features)(frame), delayed(make_labels)(frame)]
    for frame in gdfs
]

In [None]:
# Wrap each [features, labels] pair in a delayed xgb.DMatrix, then persist the
# resulting tasks so the cluster starts computing them.
gpu_dfs = [delayed(xgb.DMatrix)(features, target) for features, target in gpu_dfs]
gpu_dfs = [dmatrix.persist() for dmatrix in gpu_dfs]

In [None]:
# Block until the persisted tasks in gpu_dfs have finished.
wait(gpu_dfs)

## Execute XGBoost

In [None]:
# Parameter dictionary handed to dxgb_gpu.train. Key order is kept unchanged.
dxgb_gpu_params = {
    # Number of rounds; the training cell also reads this for num_boost_round.
    'nround': 1000,
    # Tree shape.
    'max_depth': 8,
    'max_leaves': 2 ** 8,
    # Regularisation and step size.
    # NOTE(review): 'eta' and 'learning_rate' both carry 0.1 -- presumably
    # aliases of the same setting; confirm and keep them in sync.
    'alpha': 0.9,
    'eta': 0.1,
    'gamma': 0.1,
    'learning_rate': 0.1,
    'subsample': 1,
    'reg_lambda': 1,
    'scale_pos_weight': 2,
    'min_child_weight': 30,
    # GPU / distributed execution.
    'tree_method': 'gpu_hist',
    'n_gpus': 1,
    'distributed_dask': True,
    # NOTE(review): 'loss', 'max_features' and 'criterion' look like
    # scikit-learn style names -- confirm that XGBoost actually consumes them.
    'loss': 'ls',
    'objective': 'gpu:reg:linear',
    'max_features': 'auto',
    'criterion': 'friedman_mse',
    'grow_policy': 'lossguide',
    'verbose': True,
}

In [None]:
%%time
labels = None
bst = dxgb_gpu.train(client, dxgb_gpu_params, gpu_dfs, labels, num_boost_round=dxgb_gpu_params['nround'])

In [None]:
# Display the object returned by dxgb_gpu.train as the cell output.
bst