In [1]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import dask.dataframe as dd
import dask_cudf
import cudf
import numpy as np

import shutil
import time
import os

# Input parquet dataset that the cells below read.
path = "dummy_dataset.parquet"
# Destination of the shuffled copy that is written (and read back) later on.
path_shuffled = "dummy_dataset_shuffled.parquet"

Environment variables with the 'NUMBAPRO' prefix are deprecated, found use of NUMBAPRO_NVVM=/usr/local/cuda-9.2/nvvm/lib64/libnvvm.so.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-numbapro-environment-variables
Environment variables with the 'NUMBAPRO' prefix are deprecated, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda-9.2/nvvm/libdevice.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-numbapro-environment-variables


In [2]:
# Start a local dask-cuda cluster and connect a distributed client to it.
cluster = LocalCUDACluster()
client = Client(cluster)
# Bare expression: lets the notebook render the client/cluster summary.
client

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:35542  Dashboard: http://127.0.0.1:39074/status,Cluster  Workers: 8  Cores: 8  Memory: 540.95 GB


In [3]:
# Open the parquet dataset with `dask_cudf.read_parquet`, using the
# `timestamp` column as the index and converting strings to categoricals.
# Only this call is timed (the variable name suggests it is metadata work).
start = time.time()
gddf_read = dask_cudf.read_parquet(
    path,
    index="timestamp",
    strings_to_categorical=True,
    gather_statistics=True,
)
time_read_meta_cudf = time.time() - start
print("time_read_meta_cudf", time_read_meta_cudf)

time_read_meta_cudf 0.6961123943328857


In [4]:
# Lower bound of the first partition's index range.
gddf_read.divisions[0]

Timestamp('1900-12-31 00:00:00')

In [5]:
# Number of partitions in the dataset.
gddf_read.npartitions

24

In [6]:
# Preprocessing for DL

# Categorical feature columns.
cat_names = ['name']
# Continuous feature columns; the fill-NA and normalize steps iterate over these.
cont_names = ['value', 'id1000']

In [7]:
# Step #1 - Categorify
# Note that this is currently accomplished by the
# `strings_to_categorical` argument to `dask_cudf.read_parquet`

# Another alternative is to reduce the unique strings for each column
# and then use nvcategory.from_strings(my_nvstrings).set_keys(my_keys) to
# convert each column with a map_partitions call

In [8]:
# Step #2 - Fill NA/NaN
# Replace missing values in every continuous column with that column's median.
ts = time.time()
for col in cont_names:
    # .compute() turns the lazy quantile into a concrete scalar.
    median = gddf_read[col].quantile(0.5).compute()
    # Keep the fill value integral for *any* integer column (signed or
    # unsigned, any width), not only int32/int64, so fillna never has to mix
    # a float into integer data.
    if gddf_read[col].dtype.kind in ("i", "u"):
        median = int(median)
    gddf_read[col] = gddf_read[col].fillna(median)
time_fillna = time.time() - ts
print("time_fillna", time_fillna)

time_fillna 13.102049350738525


In [9]:
# Step #3 - Normalize
# Standardise each continuous column as (x - mean) / std and store the
# result as float32.
ts = time.time()
gdf_cont = gddf_read[cont_names]
means = gdf_cont.mean().compute()
stds = gdf_cont.std().compute()
for name in cont_names:
    # Look the statistics up by column label. Integer-position access on a
    # label-indexed Series is deprecated in pandas and would silently pick the
    # wrong column's statistics if the reduction did not keep cont_names order.
    # The 1e-7 term avoids division by zero for constant columns.
    gddf_read[name] = (gddf_read[name]-means[name])/(1e-7+stds[name])
    gddf_read[name] = gddf_read[name].astype('float32')
time_normalize = time.time() - ts
print("time_normalize", time_normalize)

time_normalize 26.21594476699829


In [10]:
# Preview the first rows of the frame as plain text.
print(gddf_read.head())

        id1000         name        value
1900-12-31T00:00:00.000   -1.4230555    240152049  0.095782526
1900-12-31T00:00:01.000  -0.44275093   1785946901   -1.1133134
1900-12-31T00:00:02.000    -1.612792   1413041722  -0.56240535
1900-12-31T00:00:03.000   -1.1700737  -1573697422   0.55096644
1900-12-31T00:00:04.000  -0.88546914   1070281201   -1.4357302


In [11]:
# Let's add a new column to shuffle the dataset

def _assign_rand(df):
    return df.assign(sort_ind=np.random.permutation(len(df)))
# Apply _assign_rand to each partition, so every partition gets its own permutation.
gddf_read_new = gddf_read.map_partitions(_assign_rand)

In [12]:
# Shuffle the dataset by making the random `sort_ind` column the new index.
# dask's set_index function is called directly here; the method-call spelling
# would be:
#     gddf_read_new.set_index('sort_ind')

start = time.time()
gddf_read_shuffled = dd.shuffle.set_index(gddf_read_new, "sort_ind")
time_set_index_sort_ind = time.time() - start
print("time_set_index_sort_ind", time_set_index_sort_ind)

time_set_index_sort_ind 44.708784341812134


In [13]:
# Lower bound of the first partition's index range after re-indexing on `sort_ind`.
gddf_read_shuffled.divisions[0]

0

In [14]:
# Partition count of the re-indexed (shuffled) collection.
gddf_read_shuffled.npartitions

24

In [15]:
# # Show time for shuffle (Shouldn't actually do this step in practice)
#
# ts = time.time()
# gddf_read_shuffled.compute()
# time_shuffle = time.time() - ts
# print("time_shuffle:",time_shuffle)

In [None]:
# Write out "shuffled" dataset
# Total time is shuffle + write (write is currently SLOW here)

# Phase 1: remove the output directory if it already exists.
clean_start = time.time()
if os.path.isdir(path_shuffled):
    shutil.rmtree(path_shuffled)
time_clean = time.time() - clean_start
print("time_clean:", time_clean)

# Phase 2: write the shuffled collection to parquet, without its index.
write_start = time.time()
gddf_read_shuffled.to_parquet(
    path_shuffled,
    write_index=False,
    engine="pyarrow",
)
time_gen_and_write = time.time() - write_start
print("time_gen_and_write:", time_gen_and_write)

time_clean: 2.4299046993255615


In [16]:
# Re-open the "shuffled" parquet dataset, this time without setting an index.
start = time.time()
gddf_read_shuffled_2 = dask_cudf.read_parquet(
    path_shuffled,
    index=False,
    gather_statistics=True,
)
time_read_meta = time.time() - start
print("time_read_meta", time_read_meta)

time_read_meta 0.07684016227722168


In [18]:
# Show the last rows of the shuffled dataset. Calling tail() on the
# collection itself only has to load the final partition, instead of
# materialising the entire dataset with compute() just to display five rows.
# Caveat: tail() only looks at the last partition.
gddf_read_shuffled_2.tail()

Unnamed: 0,id1000,name,value
36649,-0.601083,-997114637,-0.167866
36650,0.568863,240152049,0.685256
36651,-1.771028,793428781,-1.668564
36652,-0.316501,1159665297,0.322561
36653,-0.790803,1656922212,-1.700507


In [13]:
# Shut down the client first, then the cluster it was connected to.
client.close()
cluster.close()