# Google Cloud Parallel Data Read Speeds with Dask

## Imports & Client Initialization

In [1]:
import dask.array as dsa
import fsspec
import numpy as np
import dask.dataframe as dd
from contextlib import contextmanager
import intake
import time
import dask
from matplotlib import pyplot as plt
import pandas as pd

In [2]:
from dask.distributed import Client, LocalCluster
cluster = LocalCluster()
client = Client(cluster)

distributed.diskutils - INFO - Found stale lock file and directory '/home/ubuntu/dask-worker-space/worker-uiw8cpiu', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/ubuntu/dask-worker-space/worker-uk4tq3ws', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/ubuntu/dask-worker-space/worker-udq4p0di', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/ubuntu/dask-worker-space/worker-zhl_bg79', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/ubuntu/dask-worker-space/worker-em0h4d8m', purging


## Benchmarking Setup

First, we will create this null storage object (GitHub Abernathy, Ryan). To measure our throughput, all the data will need to be accessed at a single time, which can be achieved by storing the data into this null storage target.

In [3]:
class DevNullStore:
    def __init__(self):
        pass
    def __setitem__(*args, **kwargs):
        pass

null_store = DevNullStore()

This class is taken directly from GitHub
Abernathy, Ryan, 

The Diagnostic Timer will keep track of data retrieval times and store them within a pandas dataframe for processing.

In [4]:
class DiagnosticTimer:
    def __init__(self):
        self.diagnostics = []
        
    @contextmanager
    def time(self, **kwargs):
        tic = time.time()
        yield
        toc = time.time()
        kwargs["runtime"] = toc - tic
        self.diagnostics.append(kwargs)
        
    def dataframe(self):
        return pd.DataFrame(self.diagnostics)
    
diag_timer = DiagnosticTimer()

## Perform Benchmarking

Though we are accessing the data from different formats, the core process will be the exact same. Dask is lazy by default, which means we will be using the previously defined null_store to measure throughput. Converting from DataFrames to Arrays, for example, will not affect the access speed because the data is not actually read from the source until we "store" it. So, these read speed should be the same whether you are using a dataframe or array.

In [5]:
token = '/home/ubuntu/Cloud-Data-Transfer-Speed-Benchmarks/cloud-data-benchmarks.json'

In [6]:
def total_nthreads():
    return sum([v for v in client.nthreads().values()])

def total_ncores():
    return sum([v for v in client.ncores().values()])

def total_workers():
    return len(client.ncores())

### CSV

#### Single File

In [7]:
tic1 = time.time()
df0 = dd.read_csv('gs://cloud-data-benchmarks/ETOPO1_Ice_g_gmt4.csv', storage_options={'token':token}, assume_missing=True)
toc1 = time.time()
connectTime = toc1-tic1

da = df0.to_dask_array(lengths=True)
del df0
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

Unnamed: 0,Array,Chunk
Bytes,5.21 GiB,54.47 MiB
Shape,"(233312400, 3)","(2379761, 3)"
Count,208 Tasks,104 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 5.21 GiB 54.47 MiB Shape (233312400, 3) (2379761, 3) Count 208 Tasks 104 Chunks Type float64 numpy.ndarray",3  233312400,

Unnamed: 0,Array,Chunk
Bytes,5.21 GiB,54.47 MiB
Shape,"(233312400, 3)","(2379761, 3)"
Count,208 Tasks,104 Chunks
Type,float64,numpy.ndarray


This is the main loop, pulled from (ec_20_abernathy_etal/cloud_storage.ipynb). A few modifications have been made, but most notably we also want to measure the connection time to the cloud server. These times will be different depending on the connection method made, and in this case we are using the dask.dataframe.read_csv(. . .) function. This shouldn't matter if we are purely measuring access times, but it is important when we use different connection methods in the API.

In [8]:
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='csv')

for nworkers in [1, 3, 4]:
    cluster.scale(nworkers)
    time.sleep(10)
    client.wait_for_workers(nworkers)
    print(nworkers)
    with diag_timer.time(nthreads=total_nthreads(), ncores=total_ncores(), nworkers=total_workers(), connectTime=connectTime, **diag_kwargs):
        future = dsa.store(da, null_store, lock=False, compute=False)
        dask.compute(future, retries=5)
    del future

1
3
4


In [9]:
df = diag_timer.dataframe()
df['throughput_Mbps'] = da.nbytes / 1e6 / df['runtime']
del da
df

Unnamed: 0,nthreads,ncores,nworkers,connectTime,nbytes,chunksize,format,runtime,throughput_Mbps
0,2,2,1,0.573894,5599497600,57114264,csv,73.443891,76.241843
1,6,6,3,0.573894,5599497600,57114264,csv,31.221264,179.348843
2,8,8,4,0.573894,5599497600,57114264,csv,26.074355,214.751144


#### Multiple Files

In [10]:
tic1 = time.time()
df0 = dd.read_csv('gs://cloud-data-benchmarks/csvpartitions/*', storage_options={'token':token}, assume_missing=True)
toc1 = time.time()
connectTime = toc1-tic1

da = df0.to_dask_array(lengths=True)
del df0
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

Unnamed: 0,Array,Chunk
Bytes,6.95 GiB,72.62 MiB
Shape,"(233312400, 4)","(2379761, 4)"
Count,208 Tasks,104 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 6.95 GiB 72.62 MiB Shape (233312400, 4) (2379761, 4) Count 208 Tasks 104 Chunks Type float64 numpy.ndarray",4  233312400,

Unnamed: 0,Array,Chunk
Bytes,6.95 GiB,72.62 MiB
Shape,"(233312400, 4)","(2379761, 4)"
Count,208 Tasks,104 Chunks
Type,float64,numpy.ndarray


In [11]:
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='part_csv')

for nworkers in [1, 3, 4]:
    cluster.scale(nworkers)
    time.sleep(10)
    client.wait_for_workers(nworkers)
    print(nworkers)
    with diag_timer.time(nthreads=total_nthreads(), ncores=total_ncores(), nworkers=total_workers(), connectTime=connectTime, **diag_kwargs):
        future = dsa.store(da, null_store, lock=False, compute=False)
        dask.compute(future, retries=5)
    del future

1
3
4


In [12]:
df = diag_timer.dataframe()
df['throughput_Mbps'] = da.nbytes / 1e6 / df['runtime']
del da
df

Unnamed: 0,nthreads,ncores,nworkers,connectTime,nbytes,chunksize,format,runtime,throughput_Mbps
0,2,2,1,0.573894,5599497600,57114264,csv,73.443891,101.65579
1,6,6,3,0.573894,5599497600,57114264,csv,31.221264,239.131791
2,8,8,4,0.573894,5599497600,57114264,csv,26.074355,286.334859
3,2,2,1,8.429575,7465996800,76152352,part_csv,81.249845,91.889366
4,6,6,3,8.429575,7465996800,76152352,part_csv,35.104092,212.681668
5,8,8,4,8.429575,7465996800,76152352,part_csv,32.68339,228.433976


### NetCDF

Read the XArray documentation about reading and writing files [here](https://docs.xarray.dev/en/stable/user-guide/io.html).

In [None]:
tic1 = time.time()
data = intake.open_netcdf('gs://cloud-data-benchmarks/ETOPO1_Ice_g_gmt4.nc', chunks="auto").to_dask()
toc1 = time.time()
connectTime = toc1-tic1
data = data.to_array() # Changes from DataSet to DataArray

In [None]:
da = data.data # Retrieves raw values from wrapped Xarray DataArray object
da = dsa.rechunk(da, chunks={0:"auto", 1:5436, 2:"auto"})
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

In [None]:
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='NetCDF')

for nworkers in [1, 3, 4]:
    cluster.scale(nworkers)
    time.sleep(10)
    client.wait_for_workers(nworkers)
    print(nworkers)
    with diag_timer.time(nthreads=total_nthreads(), ncores=total_ncores(), nworkers=total_workers(), connectTime=connectTime, **diag_kwargs):
        future = dsa.store(da, null_store, lock=False, compute=False)
        dask.compute(future, retries=5)
    del future

In [None]:
df = diag_timer.dataframe()
df['throughput_Mbps'] = da.nbytes / 1e6 / df['runtime']
del da
df

### Parquet

Throughput will be tested in two different ways:
* A CSV file was partitioned into a DataFrame by Dask, and 1 parquet file was written per DataFrame partition. All of these files will be read into the null storage target
* A single parquet file containing all of the original CSV information will be read into the null storage target

#### Multiple Files

In [13]:
tic1 = time.time()
df0 = dd.read_parquet("gs://cloud-data-benchmarks/parquetpartitions/*")
toc1 = time.time()
connectTime = toc1 - tic1

da = df0.to_dask_array(lengths=True)
del df0
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

Unnamed: 0,Array,Chunk
Bytes,5.21 GiB,54.47 MiB
Shape,"(233312400, 3)","(2379761, 3)"
Count,208 Tasks,104 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 5.21 GiB 54.47 MiB Shape (233312400, 3) (2379761, 3) Count 208 Tasks 104 Chunks Type float64 numpy.ndarray",3  233312400,

Unnamed: 0,Array,Chunk
Bytes,5.21 GiB,54.47 MiB
Shape,"(233312400, 3)","(2379761, 3)"
Count,208 Tasks,104 Chunks
Type,float64,numpy.ndarray


In [14]:
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='part_parquet')

for nworkers in [1, 3, 4]:
    cluster.scale(nworkers)
    time.sleep(10)
    client.wait_for_workers(nworkers)
    print(nworkers)
    with diag_timer.time(nthreads=total_nthreads(), ncores=total_ncores(), nworkers=total_workers(), connectTime=connectTime, **diag_kwargs):
        future = dsa.store(da, null_store, lock=False, compute=False)
        dask.compute(future, retries=5)
    del future

1
3
4


In [15]:
df = diag_timer.dataframe()
df['throughput_Mbps'] = da.nbytes / 1e6 / df['runtime']
del da
df

Unnamed: 0,nthreads,ncores,nworkers,connectTime,nbytes,chunksize,format,runtime,throughput_Mbps
0,2,2,1,0.573894,5599497600,57114264,csv,73.443891,76.241843
1,6,6,3,0.573894,5599497600,57114264,csv,31.221264,179.348843
2,8,8,4,0.573894,5599497600,57114264,csv,26.074355,214.751144
3,2,2,1,8.429575,7465996800,76152352,part_csv,81.249845,68.917025
4,6,6,3,8.429575,7465996800,76152352,part_csv,35.104092,159.511251
5,8,8,4,8.429575,7465996800,76152352,part_csv,32.68339,171.325482
6,2,2,1,2.407014,5599497600,57114264,part_parquet,31.959592,175.20554
7,6,6,3,2.407014,5599497600,57114264,part_parquet,12.126901,461.741836
8,8,8,4,2.407014,5599497600,57114264,part_parquet,9.171095,610.559303


#### Single File

In [None]:
tic1 = time.time()
df0 = dd.read_parquet("gs://cloud-data-benchmarks/ETOPO1_Ice_g_gmt4.parquet")
toc1 = time.time()
connectTime = toc1 - tic1

# Memory Problem between this print statements, similar to the NetCDF issue
print("Calculating size of array")
da = df0.to_dask_array(lengths=True)
print("Size of array has been calculated")

del df0
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

Calculating size of array




In [None]:
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='parquet')

for nworkers in [1, 3, 4]:
    cluster.scale(nworkers)
    time.sleep(10)
    client.wait_for_workers(nworkers)
    print(nworkers)
    with diag_timer.time(nthreads=total_nthreads(), ncores=total_ncores(), nworkers=total_workers(), connectTime=connectTime, **diag_kwargs):
        future = dsa.store(da, null_store, lock=False, compute=False)
        dask.compute(future, retries=5)
    del future

In [None]:
df = diag_timer.dataframe()
df['throughput_Mbps'] = da.nbytes / 1e6 / df['runtime']
del da
df

## Zarr

In [15]:
tic1 = time.time()
da = dsa.from_zarr('gs://cloud-data-benchmarks/ETOPO1_Ice_g_gm4.zarr', component='ETOPO1_Ice_g_gmt4.zarr/z1')
toc1 = time.time()
connectTime = toc1 - tic1

chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

ArrayNotFoundError: array not found at path %r' 'ETOPO1_Ice_g_gmt4.zarr/z1'

In [None]:
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='part_parquet')

for nworkers in [1, 3, 4]:
    cluster.scale(nworkers)
    time.sleep(10)
    client.wait_for_workers(nworkers)
    print(nworkers)
    with diag_timer.time(nthreads=total_nthreads(), ncores=total_ncores(), nworkers=total_workers(), connectTime=connectTime, **diag_kwargs):
        future = dsa.store(da, null_store, lock=False, compute=False)
        dask.compute(future, retries=5)
    del future

In [None]:
df = diag_timer.dataframe()
df['throughput_Mbps'] = da.nbytes / 1e6 / df['runtime']
del da
df

## Plots