In [1]:
# Shell escape: report the driver/CUDA versions and per-GPU status of the machine running this notebook.
!nvidia-smi

Mon Mar 25 16:27:48 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.25       Driver Version: 418.25       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   32C    P0    44W / 300W |      0MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   34C    P0    44W / 300W |      0MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   

In [2]:
import numpy as np; print('NumPy Version:', np.__version__)
import os
import pandas as pd; print('Pandas Version:', pd.__version__)

NumPy Version: 1.15.4
Pandas Version: 0.23.4


In [3]:
# Data source toggle: True generates a synthetic frame, False reads a CSV from disk.
SIMULATE = True

# Problem size: number of chunk files, number of rows, number of categorical features.
n_partitions, n_rows, n_features = 16, 10000, 10

# True division, so this is a float (625.0 with the values above).
rows_per_partition = n_rows / n_partitions

In [4]:
# Build the working frame `df`: synthetic data when SIMULATE is set, otherwise a CSV from disk.
if SIMULATE:
    # A private, seeded generator makes the synthetic data identical on every
    # Restart & Run All and leaves NumPy's global random state untouched.
    # (RandomState is used because the recorded NumPy 1.15.4 predates default_rng.)
    rng = np.random.RandomState(0)

    # Cardinality per feature: x1 has 8000 levels, the remaining features have 2, 3, ..., n_features.
    categories = [8000] + [i + 2 for i in range(n_features - 1)]
    column_names = ['x' + str(i + 1) for i in range(n_features)]

    df = pd.DataFrame()
    # Target: standard-normal noise, drawn independently of the features.
    df['y'] = rng.normal(size=(n_rows))
    # Features: integer codes drawn uniformly from [0, n_categories).
    for n_categories, column_name in zip(categories, column_names):
        df[column_name] = rng.randint(0, n_categories, size=(n_rows))
else:
    filename = 'foo.csv'
    data_path = '/data'
    df = pd.read_csv(os.path.join(data_path, filename))

# Directory that receives the chunk files (the current working directory).
prepped_path = '.'

In [5]:
# Quick look at the frame: its dimensions, its first rows, then its column index.
for summary in (df.shape, df.head(), df.columns):
    print(summary)

(10000, 11)
          y    x1  x2  x3  x4  x5  x6  x7  x8  x9  x10
0 -3.279260   785   1   0   1   3   4   3   7   1    2
1  0.318339   809   1   0   3   3   0   2   6   6    0
2  0.219383   297   1   0   1   1   3   2   2   0    8
3  1.115959  2007   0   0   0   3   5   5   0   4    1
4 -0.948642  3858   0   1   2   2   1   3   1   3    2
Index(['y', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10'], dtype='object')


In [6]:
# Every chunk file receives the complete frame: the per-partition row slicing
# below is disabled, so all n_partitions files are identical copies of df.
#   start_index = int(n_partition * rows_per_partition)
#   end_index = int((n_partition + 1) * rows_per_partition)
#   subset = df.iloc[start_index:end_index, :]
subset = df
for n_partition in range(n_partitions):
    path = os.path.join(prepped_path, 'chunk-{}.csv'.format(n_partition))
    print('Writing dataset of shape:', subset.shape, 'to:', path)
    # index=False keeps the row index out of the file, so each CSV holds only y and the x columns.
    subset.to_csv(path, index=False)

Writing dataset of shape: (10000, 11) to: ./chunk-0.csv
Writing dataset of shape: (10000, 11) to: ./chunk-1.csv
Writing dataset of shape: (10000, 11) to: ./chunk-2.csv
Writing dataset of shape: (10000, 11) to: ./chunk-3.csv
Writing dataset of shape: (10000, 11) to: ./chunk-4.csv
Writing dataset of shape: (10000, 11) to: ./chunk-5.csv
Writing dataset of shape: (10000, 11) to: ./chunk-6.csv
Writing dataset of shape: (10000, 11) to: ./chunk-7.csv
Writing dataset of shape: (10000, 11) to: ./chunk-8.csv
Writing dataset of shape: (10000, 11) to: ./chunk-9.csv
Writing dataset of shape: (10000, 11) to: ./chunk-10.csv
Writing dataset of shape: (10000, 11) to: ./chunk-11.csv
Writing dataset of shape: (10000, 11) to: ./chunk-12.csv
Writing dataset of shape: (10000, 11) to: ./chunk-13.csv
Writing dataset of shape: (10000, 11) to: ./chunk-14.csv
Writing dataset of shape: (10000, 11) to: ./chunk-15.csv


In [7]:
import cudf; print('cuDF Version:', cudf.__version__)
import dask; print('Dask Version:', dask.__version__)
import dask_cudf; print('Dask cuDF Version:', dask_cudf.__version__)
import dask_cuda  # ; print('Dask CUDA Version:', dask_cuda.__version__)
from dask_cuda import LocalCUDACluster
from dask.delayed import delayed
from dask.distributed import Client, wait
import dask_xgboost as dxgb_gpu; print('Dask XGBoost Version:', dxgb_gpu.__version__)
import time
import xgboost as xgb; print('XGBoost Version:', xgb.__version__)

cuDF Version: 0.6.0.dev0+1690.g3a8dc23d
Dask Version: 1.1.1
Dask cuDF Version: 0.6.0.dev0+1690.g3a8dc23d


  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)


Dask XGBoost Version: 0.1.5
XGBoost Version: 0.81


In [8]:
# Start a local dask-cuda cluster with default settings and attach a distributed Client to it.
# `client` is handed to dxgb_gpu.train in the training cell.
# NOTE(review): presumably this launches one worker per visible GPU — confirm against dask_cuda.
cluster = LocalCUDACluster()
client = Client(cluster)

In [9]:
def read_csv(filepath):
    """Load one chunk CSV into a cuDF DataFrame.

    The file's first row is skipped and the columns are named explicitly:
    'y' (read as 'float') followed by 'x1'..'x<n_features>' (read as 'int').
    `n_features` is taken from the notebook's module-level configuration.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.

    Returns
    -------
    The frame returned by cudf.read_csv.
    """
    feature_names = ['x{}'.format(i) for i in range(1, n_features + 1)]
    return cudf.read_csv(
        filepath,
        names=['y'] + feature_names,
        dtype=['float'] + ['int'] * n_features,
        skiprows=1,
    )


def get_dummies(gdf):
    """One-hot encode the categorical columns of a cuDF frame.

    The 'y' column is carried over as float32. Each of 'x1'..'x<n_features>'
    (`n_features` is module-level) is expanded with one_hot_encoding into
    int8 columns, using the column name as prefix and '_' as separator; the
    original column is dropped from the encoded piece before it is appended.

    NOTE(review): the codes come from gdf[column].unique(), i.e. only the
    values present in *this* frame. Frames holding different value sets would
    presumably produce different column sets — confirm that every partition
    sees the same values before partitioning the data for real.

    Parameters
    ----------
    gdf : cuDF DataFrame with columns 'y', 'x1', ..., 'x<n_features>'.

    Returns
    -------
    A frame whose first column is 'y', followed by the indicator columns.
    """
    target_columns = ['y']
    output = gdf[target_columns]
    for name in target_columns:
        output[name] = output[name].astype('float32')

    for index in range(1, n_features + 1):
        column = 'x' + str(index)
        # Encode the single column, then discard the raw codes so only indicators remain.
        encoded = (
            gdf[[column]]
            .one_hot_encoding(column, column, gdf[column].unique(), prefix_sep='_', dtype='int8')
            .drop(column)
        )
        output = cudf.multi.concat([output, encoded], axis=1)
    return output


def make_features(gdf):
    """Return every column of `gdf` except the first one (the feature columns)."""
    feature_columns = gdf.columns[1:]
    return gdf.loc[:, feature_columns]


def make_labels(gdf):
    """Return only the first column of `gdf`, kept as a one-column frame (the labels)."""
    label_columns = gdf.columns[:1]
    return gdf.loc[:, label_columns]

In [10]:
# Directory holding the chunk files written earlier.
prepped_path = '.'
# One lazy task chain is created per chunk file.
# NOTE(review): despite its name this counts chunk files, not dask workers — confirm intent.
n_workers = n_partitions

# dask + cudf
chunk_names = ['chunk-{}.csv'.format(i) for i in range(n_workers)]
filepaths = [os.path.join(prepped_path, name) for name in chunk_names]
# Each entry is a dask Delayed wrapping read_csv; no file is read at this point.
gdfs = [delayed(read_csv)(path) for path in filepaths]

# dask_cudf
# pattern = 'chunk-*.csv'
# gdfs = dask_cudf.read_csv(os.path.join(prepped_path, pattern))

In [11]:
# dask + cudf
# Chain a lazy get_dummies call onto each per-chunk read task.
# `gdfs` is rebound in place, so re-running this cell on its own would wrap
# the already-encoded tasks in get_dummies a second time.
gdfs = [delayed(get_dummies)(gdf) for gdf in gdfs]

# dask_cudf
# categorical_columns = ['x' + str(i + 1) for i in range(n_features)]
# print(gdfs.one_hot_encoding('x1').compute())

In [12]:
# dask + cudf
# Turn each encoded frame into a [features, labels] pair of lazy tasks
# (make_features drops the first column, make_labels keeps only the first column).
# `gdfs` changes shape here: from a list of Delayed frames to a list of two-element lists.
gdfs = [[delayed(make_features)(gdf), delayed(make_labels)(gdf)] for gdf in gdfs]

In [13]:
# dask + cudf
# Lazily build one xgb.DMatrix per chunk from its [features, labels] pair
# (gdf[0] is the features task, gdf[1] the labels task).
gdfs = [delayed(xgb.DMatrix)(gdf[0], gdf[1]) for gdf in gdfs]

In [14]:
# results = client.compute(gdfs, optimize_graph=False, fifo_timeout="0ms")
# time.sleep(1)  # this will give Dask time to execute each worker
# output = [result.result() for result in results]
# print(output[0])

In [15]:
# NOTE(review): `gdfs` holds dask Delayed objects here (built with delayed(...)), not futures.
# The recorded output shows empty done/not_done sets, so this call does not appear to
# trigger or block on any work; presumably the graph only executes inside dxgb_gpu.train — confirm.
wait(gdfs)

DoneAndNotDoneFutures(done=set(), not_done=set())

In [16]:
# Parameter set handed to dxgb_gpu.train. Key order is kept exactly as before.
# NOTE(review): several keys below do not look like XGBoost parameters and some
# duplicate each other; they are preserved unchanged — confirm which are honoured.
dxgb_gpu_params = dict(
    nround=50,                   # read back by the training cell as num_boost_round
    max_depth=8,
    max_leaves=2**8,
    alpha=0.9,
    eta=0.1,                     # XGBoost documents learning_rate as an alias of eta
    gamma=0.1,
    learning_rate=0.1,           # same value as eta above
    subsample=1,
    reg_lambda=1,
    scale_pos_weight=2,
    min_child_weight=30,
    tree_method='gpu_hist',
    n_gpus=1,
    distributed_dask=True,
    loss='ls',                   # NOTE(review): looks like a scikit-learn option — confirm
    objective='reg:linear',
    max_features='auto',         # NOTE(review): looks like a scikit-learn option — confirm
    criterion='friedman_mse',    # NOTE(review): looks like a scikit-learn option — confirm
    grow_policy='lossguide',
    verbose=True,
)

In [17]:
%%time


# Train through dask-xgboost, timed by the %%time magic at the top of this cell.
# NOTE(review): the per-chunk tasks in `gdfs` were never computed beforehand, so the
# measured wall time presumably includes CSV reading, one-hot encoding and DMatrix
# construction as well as boosting — confirm.
print('Starting to train...')
# No separate label collection is passed; each DMatrix task was already built from a features/labels pair.
labels = None
# The number of boosting rounds is taken from the 'nround' entry of the parameter dict.
bst = dxgb_gpu.train(client, dxgb_gpu_params, gdfs, labels, num_boost_round=dxgb_gpu_params['nround'])

Starting to train...
CPU times: user 5.52 s, sys: 1.23 s, total: 6.75 s
Wall time: 1min 34s


In [18]:
# Print the object returned by dxgb_gpu.train (the recorded output shows an xgboost Booster repr).
print(bst)

<xgboost.core.Booster object at 0x7f61002aecf8>
