In [1]:
import pandas as pd
import numpy as np

from dask.distributed import LocalCluster, Client
import dask.dataframe as dd
import dask

import shutil
import time
import os

# Directory the generated parquet dataset is written to (removed first if it already exists).
path = "dummy_dataset.parquet"
# Directory the shuffled copy of the dataset is written to.
path_shuffled = "dummy_dataset_shuffled.parquet"
# Seed handed to the synthetic data generator.
random_seed = 42

### Create Distributed-Dask Cluster and Client

In [2]:
# Start a local Dask cluster with default settings and attach a client to it.
# Ending the cell with `client` displays the scheduler/dashboard/worker summary.
cluster = LocalCluster()
client = Client(cluster)
client

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:42410  Dashboard: http://127.0.0.1:40728/status,Cluster  Workers: 10  Cores: 80  Memory: 540.95 GB


### Generate Synthetic Dataset (Using Dask's `timeseries`)

In [3]:
# Build a large synthetic dataset with Dask's built-in `timeseries` generator.
#
# * start / end  : year range covered by the timestamp index
# * freq='1S'    : one row per second, which is what makes the frame "huge"
# * partition_freq='1Y' : one partition per year
#
# NOTE(review): the figure of 3152995200 rows previously quoted here presumably
# belongs to the longer run with end year '2001' - TODO confirm for '1925'.
# The timing below is printed as `time_create`; the data itself is generated
# later (see `time_gen_and_write`).
create_started = time.time()
ddf = dask.datasets.timeseries(
    start='1900',
    end='1925',
    freq='1S',
    partition_freq='1Y',
    dtypes={'value': float, 'name': str, 'id1000': int},
    seed=random_seed,
    id1000_lam=1000,  # controls the number of distinct items in the id column
)
time_create = time.time() - create_started
print("time_create:", time_create)

time_create: 0.014232635498046875


In [4]:
# First division (index boundary) of the generated frame, which was requested with yearly partitions.
ddf.divisions[0]

Timestamp('1900-12-31 00:00:00', freq='A-DEC')

In [5]:
# Number of partitions in the generated frame.
ddf.npartitions

24

### Write a Distributed Parquet Dataset (File-per-Partition)

In [6]:
# Persist the generated frame as a pyarrow-parquet "dataset" through Dask's
# `to_parquet`, timing the cleanup and the write separately.

# Step 1: remove any dataset directory left behind by a previous run.
clean_started = time.time()
if os.path.isdir(path):
    shutil.rmtree(path)
time_clean = time.time() - clean_started
print("time_clean:", time_clean)

# Step 2: write the frame (index included). As the variable name says, this
# timing is meant to cover generating the data as well as writing it.
write_started = time.time()
ddf.to_parquet(
    path,
    engine="pyarrow",
    write_index=True,
)
time_gen_and_write = time.time() - write_started
print("time_gen_and_write:", time_gen_and_write)

time_clean: 0.08940005302429199
time_gen_and_write: 268.7560443878174


### Test: Read Dataset and Write back Shuffled/Processed Data (CPU Version)

In [7]:
# Re-open the parquet dataset on CPU with `dd.read_parquet`, using the
# `timestamp` column as the index and asking for statistics to be gathered.
# The timing is reported as `time_read_meta`.

read_started = time.time()
ddf_read = dd.read_parquet(
    path,
    engine="pyarrow",
    index="timestamp",
    gather_statistics=True,
)
time_read_meta = time.time() - read_started
print("time_read_meta", time_read_meta)

time_read_meta 1.3047316074371338


In [8]:
# First division of the frame read back from parquet.
ddf_read.divisions[0]

Timestamp('1900-12-31 00:00:00')

In [9]:
# Number of partitions of the frame read back from parquet.
ddf_read.npartitions

24

In [10]:
# Lets add a new column to shuffle the dataset

def _assign_rand(df):
    return df.assign(sort_ind=np.random.permutation(len(df)))
# Run `_assign_rand` on every partition of `ddf_read` to attach the `sort_ind` column.
ddf_read_new = ddf_read.map_partitions(_assign_rand)

In [11]:
# Shuffle the dataset by making the random `sort_ind` column the new index.
# The time spent in `set_index` itself is reported as `time_set_index_sort_ind`.

set_index_started = time.time()
ddf_read_shuffled = ddf_read_new.set_index('sort_ind')
set_index_finished = time.time()
time_set_index_sort_ind = set_index_finished - set_index_started
print("time_set_index_sort_ind", time_set_index_sort_ind)

time_set_index_sort_ind 63.44454836845398


In [12]:
# First division after re-indexing on `sort_ind`.
ddf_read_shuffled.divisions[0]

0

In [13]:
# Number of partitions after re-indexing on `sort_ind`.
ddf_read_shuffled.npartitions

24

In [14]:
# Measure the shuffle on its own by computing the shuffled frame.
# Diagnostic only: this step should not be part of a real pipeline, and the
# computed result is discarded.

shuffle_started = time.time()
ddf_read_shuffled.compute()
shuffle_finished = time.time()
time_shuffle = shuffle_finished - shuffle_started
print("time_shuffle:", time_shuffle)

time_shuffle: 300.01719093322754


In [15]:
# Persist the "shuffled" dataset as parquet.
# The write timing below covers the shuffle plus the write.
# Note that `time_clean` and `time_gen_and_write` are reassigned here.

# Step 1: remove any shuffled dataset directory left behind by a previous run.
clean_started = time.time()
if os.path.isdir(path_shuffled):
    shutil.rmtree(path_shuffled)
time_clean = time.time() - clean_started
print("time_clean:", time_clean)

# Step 2: write the shuffled frame without its index.
write_started = time.time()
ddf_read_shuffled.to_parquet(
    path_shuffled,
    engine="pyarrow",
    write_index=False,
)
time_gen_and_write = time.time() - write_started
print("time_gen_and_write:", time_gen_and_write)

time_clean: 0.07477879524230957
time_gen_and_write: 295.2648060321808


In [16]:
# Re-open the "shuffled" parquet dataset on CPU. No column is used as the
# index (`index=False`); statistics gathering is requested as before.
read_started = time.time()
ddf_read_shuffled_2 = dd.read_parquet(
    path_shuffled,
    engine="pyarrow",
    index=False,
    gather_statistics=True,
)
time_read_meta = time.time() - read_started
print("time_read_meta", time_read_meta)

time_read_meta 1.7234370708465576


In [17]:
# Preview the first rows of the shuffled dataset.
# `head()` on the Dask frame only reads from the first partition, whereas
# `.compute().head()` would first load the entire dataset into the client's
# memory just to display five rows. The rows shown are the same.
ddf_read_shuffled_2.head()

Unnamed: 0,id1000,name,value
0,1021,Xavier,0.823693
1,1025,George,-0.172319
2,982,Frank,-0.017929
3,994,Ursula,-0.817381
4,992,Michael,-0.356457


In [18]:
# Tear down: close the client connection first, then the local cluster it was attached to.
client.close()
cluster.close()