In [1]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import dask_cudf
import dask
import cudf
import glob


# Switch between the "_agg" parquet directory and the plain one.
use_agg = True
data_path = (
    "/datasets/criteo/raw_csvs/split_train_data_parquet_agg"
    if use_agg
    else "/datasets/criteo/raw_csvs/split_train_data_parquet"
)

Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_NVVM=/usr/local/cuda-10.1/nvvm/lib64/libnvvm.so.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')
Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda-10.1/nvvm/libdevice.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')


### Create a cluster of GPU workers

In [2]:
# Start a local dask_cuda cluster, passing a 32GB device memory limit,
# and connect a distributed client to it.
cluster = LocalCUDACluster(device_memory_limit="32GB")
client = Client(cluster)
# Keep the client as the cell's last expression so the notebook displays its summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:37183  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 1.08 TB


### Read in Criteo Dataset, and Calculate Statistics

In [3]:
chunksize=None
if use_agg:
    chunksize="250 MB"
%time ddf = dask_cudf.read_parquet(data_path, chunksize=chunksize, gather_statistics=True)

CPU times: user 22.4 s, sys: 1.86 s, total: 24.2 s
Wall time: 22.2 s


**Non-Aggregated:**
```
CPU times: user 35.8 s, sys: 2.28 s, total: 38.1 s
Wall time: 34.2 s
```

First, check the first few rows

In [4]:
# Time the retrieval of the first rows (head() with its default row count).
%time ddf.head()

CPU times: user 252 ms, sys: 61.4 ms, total: 314 ms
Wall time: 4.08 s


Unnamed: 0_level_0,Label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,5.0,110,,16.0,,1.0,0,14,7,...,d20856aa,b8170bba,9512c20b,c38e2f28,14f65a5d,25b1b089,d7c1fc0b,7caf609c,30436bfc,ed10571d
1,0,32.0,3,5.0,,1.0,0.0,0,61,5,...,d20856aa,a1eb1511,9512c20b,febfd863,a3323ca1,c8e1ee56,1752e9e8,75350c8a,991321ea,b757e957
2,0,,233,1.0,146.0,1.0,0.0,0,99,7,...,d20856aa,628f1b8d,9512c20b,c38e2f28,14f65a5d,25b1b089,d7c1fc0b,34a9b905,ff654802,ed10571d
3,0,,24,,11.0,24.0,,0,56,3,...,1f7fc70b,a1eb1511,9512c20b,,,,dc209cd3,b8a81fb0,30436bfc,b757e957
4,0,60.0,223,6.0,15.0,5.0,0.0,0,1,8,...,d20856aa,d9f758ff,9512c20b,c709ec07,2b07677e,a89a92a5,aa137169,e619743b,cdc3217e,ed10571d


Specify the continuous and categorical columns

In [5]:
# Continuous columns are named I1..I13 and categorical columns C1..C26.
cont_cols = [f"I{n}" for n in range(1, 14)]
cat_cols = [f"C{n}" for n in range(1, 27)]

**Time-Consuming Part:** Compute the statistics over the entire dataset (using `ddf.describe`)

In [6]:
# Define the describe() summary for the continuous columns only, requesting just
# the 0.5 percentile (read back later as the "50%" row).
stats = ddf[cont_cols].describe(percentiles=[0.5])
# dask.compute returns a tuple of results; keep the single entry and time the run.
%time computed_stats = dask.compute(stats)[0]



CPU times: user 3min 53s, sys: 20.4 s, total: 4min 13s
Wall time: 21min 27s


**Non-Aggregated** Result:
```
CPU times: user 8min 47s, sys: 33.9 s, total: 9min 21s
Wall time: 30min
```

In [7]:
# Pull each statistic out of the computed describe() result by its row label.
# NOTE(review): the "50%" row is sliced from its second element onward, whereas
# every other row keeps only its first element — presumably a workaround for the
# layout of the describe() result in this dask_cudf version; confirm before reuse.
median = computed_stats.loc["50%"].iloc[1:]
mean = computed_stats.loc["mean"].iloc[0]
std = computed_stats.loc["std"].iloc[0]
count = computed_stats.loc["count"].iloc[0]
mins = computed_stats.loc["min"].iloc[0]
maxes = computed_stats.loc["max"].iloc[0]

In [8]:
median

I1       10.0
I10       0.0
I11       2.0
I12    4194.0
I13       5.0
I2      224.0
I3        4.0
I4       39.0
I5        6.0
I6        0.0
I7        0.0
I8        9.0
I9        7.0
Name: 50%, dtype: object

**Non-Aggregated median** (note that `I2` and `I12` differ from the aggregated result above):
```
I1       10.0
I10       0.0
I11       2.0
I12    6299.0
I13       5.0
I2      225.0
I3        4.0
I4       39.0
I5        6.0
I6        0.0
I7        0.0
I8        9.0
I9        7.0
Name: 50%, dtype: object
```

In [9]:
mean

I1        34.387808
I2       417.565493
I3         7.196818
I4       127.777254
I5        22.602954
I6         1.719371
I7         0.161513
I8       112.786085
I9         9.704844
I10        0.298463
I11        4.119612
I12    21324.027288
I13        8.947549
dtype: float64

**Non-Aggregated mean**:
```
I1        34.387808
I2       417.565493
I3         7.196818
I4       127.777254
I5        22.602954
I6         1.719371
I7         0.161513
I8       112.786085
I9         9.704844
I10        0.298463
I11        4.119612
I12    21324.027288
I13        8.947549
dtype: float64
```

In [10]:
std

I1       460.496390
I2       689.674436
I3         9.828270
I4       615.158633
I5        81.441234
I6        21.590568
I7         2.059838
I8       391.429545
I9        16.212257
I10        0.569462
I11        7.121497
I12    35096.394381
I13       22.055694
dtype: float64

**Non-Aggregated std**:
```
I1       460.496390
I2       689.674436
I3         9.828270
I4       615.158633
I5        81.441234
I6        21.590568
I7         2.059838
I8       391.429545
I9        16.212257
I10        0.569462
I11        7.121497
I12    35096.394381
I13       22.055694
dtype: float64
```

In [11]:
count

I1     3408640361
I2     3779087859
I3     3162206758
I4     2748262732
I5     2484922855
I6     3829411492
I7     4073075259
I8     4195197692
I9     4195197691
I10    3829411492
I11    2484922855
I12    4124955977
I13    3162206758
dtype: int64

**Non-Aggregated count**:
```
I1     3408640361
I2     3779087859
I3     3162206758
I4     2748262732
I5     2484922855
I6     3829411492
I7     4073075259
I8     4195197692
I9     4195197691
I10    3829411492
I11    2484922855
I12    4124955977
I13    3162206758
dtype: int64
```

In [12]:
mins

I1     1
I2     1
I3     0
I4     0
I5     1
I6     0
I7     0
I8    -1
I9     0
I10    0
I11    1
I12    0
I13    0
dtype: int64

**Non-Aggregated mins**:
```
I1     1
I2     1
I3     0
I4     0
I5     1
I6     0
I7     0
I8    -1
I9     0
I10    0
I11    1
I12    0
I13    0
dtype: int64
```

In [13]:
maxes

I1        65535
I2         8000
I3         4933
I4       746810
I5         8000
I6        69472
I7         5277
I8       400947
I9        16050
I10          21
I11         566
I12    35081696
I13       21843
dtype: int64

**Non-Aggregated maxes**:
```
I1        65535
I2         8000
I3         4933
I4       746810
I5         8000
I6        69472
I7         5277
I8       400947
I9        16050
I10          21
I11         566
I12    35081696
I13       21843
dtype: int64
```