In [1]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import dask_cudf
import dask
import cudf
import glob


column_count = 40
num_files = None # Set to None to read all files
data_path = "/datasets/criteo/raw_csvs/split_train_data"

Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_NVVM=/usr/local/cuda-10.1/nvvm/lib64/libnvvm.so.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')
Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda-10.1/nvvm/libdevice.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')


### Create a cluster of GPU workers

In [2]:
cluster = LocalCUDACluster(device_memory_limit="32GB")
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:46379  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 1.08 TB


### Read in Criteo Dataset, and Calculate Statistics

In [6]:
files = glob.glob(data_path + "/*")
if num_files and num_files < len(files):
    files = files[:num_files]

names = ["Label"] + ["I"+str(i+1) for i in range(13)] + ["C"+str(i+1) for i in range(26)]
dtype = ["int64"] + ["int64" for i in range(13)] + ["str" for i in range(26)]
ddf = dask_cudf.read_csv(files, delimiter="\t", dtype=dtype, names=names)

First, check the first few rows

In [7]:
ddf.head()

Unnamed: 0,Label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,1,5.0,110,,16.0,,1.0,0,14,7,...,d20856aa,b8170bba,9512c20b,c38e2f28,14f65a5d,25b1b089,d7c1fc0b,7caf609c,30436bfc,ed10571d
1,0,32.0,3,5.0,,1.0,0.0,0,61,5,...,d20856aa,a1eb1511,9512c20b,febfd863,a3323ca1,c8e1ee56,1752e9e8,75350c8a,991321ea,b757e957
2,0,,233,1.0,146.0,1.0,0.0,0,99,7,...,d20856aa,628f1b8d,9512c20b,c38e2f28,14f65a5d,25b1b089,d7c1fc0b,34a9b905,ff654802,ed10571d
3,0,,24,,11.0,24.0,,0,56,3,...,1f7fc70b,a1eb1511,9512c20b,,,,dc209cd3,b8a81fb0,30436bfc,b757e957
4,0,60.0,223,6.0,15.0,5.0,0.0,0,1,8,...,d20856aa,d9f758ff,9512c20b,c709ec07,2b07677e,a89a92a5,aa137169,e619743b,cdc3217e,ed10571d


Specify the continuous and categorical columns

In [8]:
cont_cols = ["I"+str(i+1) for i in range(13)]
cat_cols = ["C"+str(i+1) for i in range(26)]

**Time-Consuming Part:** Compute the statistics over the entire dataset (using `ddf.describe`)

In [None]:
stats = ddf[cont_cols].describe(percentiles=[0.5])
%time computed_stats = dask.compute(stats)[0]



In [11]:
median = computed_stats.loc["50%"].iloc[1:]
mean = computed_stats.loc["mean"].iloc[0]
std = computed_stats.loc["std"].iloc[0]
count = computed_stats.loc["count"].iloc[0]
mins = computed_stats.loc["min"].iloc[0]
maxes = computed_stats.loc["max"].iloc[0]

In [14]:
median

column0        0.0
column1       10.0
column10       0.0
column11       2.0
column12    6299.0
column13       5.0
column2      225.0
column3        4.0
column4       39.0
column5        6.0
column6        0.0
column7        0.0
column8        9.0
column9        7.0
Name: 50%, dtype: object

In [15]:
mean

column0         0.033051
column1        34.387808
column2       417.565493
column3         7.196818
column4       127.777254
column5        22.602954
column6         1.719371
column7         0.161513
column8       112.786085
column9         9.704844
column10        0.298463
column11        4.119612
column12    21324.027288
column13        8.947549
dtype: float64

In [16]:
std

column0         0.178770
column1       460.496396
column2       689.674459
column3         9.828282
column4       615.158648
column5        81.441259
column6        21.590572
column7         2.059840
column8       391.429545
column9        16.212257
column10        0.569462
column11        7.121515
column12    75523.126147
column13       22.055711
dtype: float64

In [17]:
count

column0     4195197692
column1     3408640361
column2     3779087859
column3     3162206758
column4     2748262732
column5     2484922855
column6     3829411492
column7     4073075259
column8     4195197692
column9     4195197691
column10    3829411492
column11    2484922855
column12    4124955977
column13    3162206758
dtype: int64

In [18]:
mins

column0     0.0
column1     1.0
column2     1.0
column3     0.0
column4     0.0
column5     1.0
column6     0.0
column7     0.0
column8    -1.0
column9     0.0
column10    0.0
column11    1.0
column12    0.0
column13    0.0
dtype: float64

In [19]:
maxes

column0            1.0
column1        65535.0
column2         8000.0
column3         4933.0
column4       746810.0
column5         8000.0
column6        69472.0
column7         5277.0
column8       400947.0
column9        16050.0
column10          21.0
column11         566.0
column12    35081696.0
column13       21843.0
dtype: float64