In [1]:
import pandas as pd
import numpy as np

from dask.distributed import Client, wait
import dask.dataframe as dd
import dask_cudf
import dask
import cudf

import shutil
import time
import os

# Directory the parquet dataset is written to and later read back from.
path = "dummy_dataset.parquet"
# Seed passed to `dask.datasets.timeseries` when the dummy data is generated.
random_seed = 42

Environment variables with the 'NUMBAPRO' prefix are deprecated, found use of NUMBAPRO_NVVM=/usr/local/cuda-9.2/nvvm/lib64/libnvvm.so.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-numbapro-environment-variables
Environment variables with the 'NUMBAPRO' prefix are deprecated, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda-9.2/nvvm/libdevice.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-numbapro-environment-variables


In [2]:
# Start a local (CPU) Dask cluster for the `dask.dataframe` half of the benchmark.
from dask.distributed import LocalCluster

# No arguments are passed, so worker and thread counts are LocalCluster's defaults.
cluster = LocalCluster()
client = Client(cluster)
# Bare expression: shows the client/cluster summary as the cell output.
client

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:35895  Dashboard: http://127.0.0.1:34428/status,Cluster  Workers: 10  Cores: 80  Memory: 540.95 GB


In [3]:
# Build a (lazy) dummy dataset with Dask's built-in `timeseries` generator
# and time how long the graph construction takes.

# Column name -> Python type of the generated values.
column_dtypes = {
    'value': float,
    'name': str,
    'id10': int,
    'id100': int,
    'id1000': int,
}

t_start = time.time()
ddf = dask.datasets.timeseries(
    start='1900',
    end='2001',
    freq='1M',  # Note: '1S' instead produces a "huge" df (3152995200 rows)
    partition_freq='1Y',
    seed=random_seed,
    dtypes=column_dtypes,
    # `<column>_lam` keywords control the number of items in each id column
    id10_lam=10,
    id100_lam=100,
    id1000_lam=1000,
)
time_create = time.time() - t_start
print("time_create:",time_create)

time_create: 0.016566753387451172


In [4]:
# Display how many partitions the generated Dask dataframe has.
ddf.npartitions

100

In [6]:
# Write the dummy dataframe to a fresh pyarrow-parquet "dataset" with Dask's to_parquet.

# Step 1: remove any dataset directory left over from a previous run, so the
# write below always starts from an empty location.
ts = time.time()
if os.path.isdir(path):
    shutil.rmtree(path)
time_clean = time.time() - ts
print("time_clean:",time_clean)

# Step 2: write the dataset from scratch.
# `append=True` was dropped: the directory has just been deleted, so there is
# never an existing dataset to append to, and the append mode is documented as
# requiring one. The index is intentionally not written (`write_index=False`).
ts = time.time()
ddf.to_parquet(path, write_index=False, engine="pyarrow")
time_gen_and_write = time.time() - ts
print("time_gen_and_write:",time_gen_and_write)

time_clean: 0.08587431907653809
time_gen_and_write: 0.3599553108215332


In [7]:
# CPU path: open the parquet dataset with `dask.dataframe.read_parquet`
# (row-group statistics are gathered) and time the call.

t_start = time.time()
ddf_read = dd.read_parquet(path, gather_statistics=True)
time_read_meta = time.time() - t_start
print("time_read_meta", time_read_meta)

time_read_meta 0.22771191596984863


In [8]:
# CPU path: time the computation of the mean of column 'id100'.

t_start = time.time()
# Build the lazy reduction and materialise it in one chained expression.
mean = ddf_read['id100'].mean().compute()
time_get_mean_id100 = time.time() - t_start
print("time_get_mean_id100", time_get_mean_id100)
print("mean", mean)

time_get_mean_id100 0.4230837821960449
mean 99.97333333333333


In [9]:
# CPU path: re-index the dataframe on column `id100`, pull the result into
# local memory, and time the whole operation.

t_start = time.time()
ddf_by_id100 = ddf_read.set_index('id100')
df = ddf_by_id100.compute()
time_set_index_id100 = time.time() - t_start
print("time_set_index_id100", time_set_index_id100)

time_set_index_id100 2.4437153339385986


In [10]:
# Preview the first five rows of the re-indexed (computed) dataframe.
df.head(n=5)

Unnamed: 0_level_0,id10,id1000,name,value
id100,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
64,13,1028,Ursula,0.348264
69,6,1070,Wendy,0.587983
70,7,938,Alice,0.768869
72,13,1028,Quinn,-0.828167
72,7,982,Kevin,-0.565578


In [11]:
# Shut down the client first, then the cluster it was connected to.
for handle in (client, cluster):
    handle.close()

In [2]:
# Start a local CUDA cluster (dask_cuda) for the dask_cudf half of the benchmark.
from dask_cuda import LocalCUDACluster

# No arguments are passed, so the worker layout is LocalCUDACluster's default.
cluster = LocalCUDACluster()
client = Client(cluster)
# Bare expression: shows the client/cluster summary as the cell output.
client

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:40857  Dashboard: http://127.0.0.1:41468/status,Cluster  Workers: 8  Cores: 8  Memory: 540.95 GB


In [3]:
# GPU path: open the parquet dataset with dask_cudf's `read_parquet` and time the call.
#
# The dataset was written with `write_index=False`, so it contains no "index"
# column to promote; the previous `index="index"` argument referred to a field
# that does not exist. The reader is now called exactly like the CPU
# `dd.read_parquet` call, which also keeps the two timings comparable.

ts = time.time()
gddf_read = dask_cudf.read_parquet(path, gather_statistics=True)
time_read_meta_cudf = time.time() - ts
print("time_read_meta_cudf", time_read_meta_cudf)

time_read_meta_cudf 1.1809210777282715


In [4]:
# GPU path: time the computation of the mean of column 'id100'.

t_start = time.time()
# Build the lazy reduction and materialise it in one chained expression.
mean = gddf_read['id100'].mean().compute()
time_get_mean_id100_cudf = time.time() - t_start
print("time_get_mean_id100_cudf", time_get_mean_id100_cudf)
print("mean", mean)

time_get_mean_id100_cudf 2.8839406967163086
mean 99.97333333333333


In [5]:
# # # ORIGINAL dask_cudf VERSION: Set index to column `id100`
# # # !! SLOW !!

# # ts = time.time()
# # gdf = gddf_read.set_index('id100').compute()
# # time_set_index_id100_cudf = time.time() - ts
# # print("time_set_index_id100_cudf", time_set_index_id100_cudf)

# # NEW VERSION: Set index to column `id100`

# ts = time.time()
# gdf = dd.shuffle.set_index(gddf_read, 'id100').compute()
# time_set_index_id100_cudf = time.time() - ts
# print("time_set_index_id100_cudf", time_set_index_id100_cudf)

In [6]:
# Preview the GPU dataframe as read from parquet (not re-indexed); `gdf` is
# only assigned in code that is currently commented out, so it is not printed.
gddf_head = gddf_read.head()
print(gddf_head)

   id10  id100  id1000    name                value
0     9     97    1023   Kevin  0.05529776499610839
1     8     98    1014   Quinn  -0.6427707370865254
2     9    111     997   Frank  -0.3247053311901895
3     8     97    1051   Edith  0.31809705969283963
4     9    100     997  Ursula  -0.8289172566938443


In [7]:
# Configure the preprocessing pipeline that will run on the GPUs.
#
# Import the one name this notebook uses explicitly rather than
# `from preproc import *`, so the import cannot silently shadow notebook-level
# names (e.g. `dd`, `time`, `path`) and the origin of `PreprocessDF` is obvious.
from preproc import PreprocessDF

to_cpu = False  # forwarded to PreprocessDF; presumably keeps results on the GPU - confirm in preproc
target = 'id10'  # column handed to PreprocessDF as the label
cat_names = ['name']  # columns handed to PreprocessDF as categorical
cont_names = ['value', 'id1000']  # columns handed to PreprocessDF as continuous
proc = PreprocessDF(cat_names=cat_names, cont_names=cont_names, label_name=target, to_cpu=to_cpu)

In [8]:
#print(gddf_read['name'].compute().hash_values().unique())

In [9]:
#print(gddf_read['name'].unique().compute())

In [10]:
#len(gddf_read['name'])

In [11]:
# Shuffle the GPU dataframe onto an `id100` index, then run the preprocessing
# pipeline over the result in 'train' mode.
# NOTE(review): the saved output of this cell is a TypeError, yet the cells
# that follow print `gddf_read_cats`; the outputs appear to come from
# different runs - re-run on a fresh kernel to confirm.
gddf_by_id100 = dd.shuffle.set_index(gddf_read, 'id100')
gddf_read_cats = proc.preproc_dataframe(gddf_by_id100, mode='train')

TypeError: unhashable type: 'list'

In [12]:
# Preview the first rows of the preprocessed GPU dataframe.
cats_head = gddf_read_cats.head()
print(cats_head)

   id10  id1000         name                value
64    13    1028   1805436790   0.3482640911798984
69     6    1070  -1142878741   0.5879834987378125
70     7     938  -1046842473   0.7688688828235088
72    13    1028  -1538645396  -0.8281673384596697
72     7     982    249248448  -0.5655781793142831


In [20]:
# Materialise the preprocessed 'name' column using the single-threaded
# scheduler and print it.
name_single_threaded = gddf_read_cats['name'].compute(scheduler='single-threaded')
print(name_single_threaded)

64    11
69    13
70     1
72    10
72     5
73    14
73     2
74     6
74     4
74     4
[1190 more rows]
Name: name, dtype: int64


In [13]:
# Materialise the preprocessed 'name' column with the default scheduler and
# print its distinct values.
name_computed = gddf_read_cats['name'].compute()
print(name_computed.unique())

0   -2023749819
1   -1723958038
2   -1708607005
3   -1538645396
4   -1490803936
5   -1403833033
6   -1373745025
7   -1142878741
8   -1088933377
9   -1046842473
[16 more rows]
Name: name, dtype: int64


In [13]:
# Shut down the client first, then the CUDA cluster it was connected to.
for handle in (client, cluster):
    handle.close()