# Google Cloud Parallel Data Read Speeds with Dask - SLURM Cluster

## Imports

In [None]:
import dask.array as dsa
import numpy as np
import dask.dataframe as dd
from contextlib import contextmanager
import xarray as xr
import intake
import time
import dask
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm
from matplotlib.ticker import MaxNLocator
import matplotlib.colors
import pandas as pd
from scipy.stats import sem
import tiledb

## Slurm Job Script Configuration

In [None]:
import os
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

# Shared Dask directory and the per-user conda install whose environment
# every SLURM job activates before starting its workers.
dask_dir = '/mnt/shared/dask'
conda_dir = os.path.join('/contrib', os.environ['USER'], 'miniconda3')
conda_env = 'cloud-data'

# Port numbers are recorded here; they are not passed to SLURMCluster below.
dask_port = 8786
dashboard_port = 8787

print('Conda Directory:', conda_dir, '\nConda Environment:', conda_env)

cluster = SLURMCluster(
    project='cg-cloudmgmt',
    cores=4,                 # Number of cores in the job
    memory='32GB',           # Worker memory limit will be memory/processes
    processes=4,             # Number of Dask workers; threads per worker will be cores/processes
    name='gcpslurmv2basic',  # Name of cluster
    queue='compute',         # Partition name
    job_cpu=4,               # Number of cpus per job
    job_mem='32GB',          # Amount of memory per job
    walltime='01:00:00',
    log_directory=os.path.join(dask_dir, 'logs'),
    # Shell line added to the job script so the conda environment is active.
    env_extra=[
        f"source {os.path.join(conda_dir, 'etc/profile.d/conda.sh')}; conda activate {conda_env}"
    ],
    header_skip=['--mem'],
)

client = Client(cluster)
print('Job Script:\n', cluster.job_script())

Conda Directory: /contrib/Jacob.Green/miniconda3 
Conda Environment: cloud-data


## Benchmarking Setup

In [3]:
class DevNullStore:
    """Write-only sink: item assignment is accepted and the data is discarded."""

    def __init__(self):
        # There is no state to set up.
        return None

    def __setitem__(*args, **kwargs):
        # The catch-all signature absorbs the instance, key and value alike;
        # nothing is stored.
        return None


# Shared sink that the benchmark loop passes to dsa.store().
null_store = DevNullStore()

############################################################################################################################

class DiagnosticTimer:
    """Collect one timing record per benchmarked ``with`` block.

    Attributes:
        diagnostics: list of dicts, one per completed ``time`` block. Each dict
            holds the keyword arguments given to ``time`` plus a ``runtime``
            entry (elapsed seconds).
        names: list that the notebook uses to keep finished result tables.
    """

    def __init__(self):
        self.diagnostics = []
        self.names = []

    @contextmanager
    def time(self, **kwargs):
        """Time the body of the ``with`` block and append a record.

        Args:
            **kwargs: arbitrary metadata stored alongside the measured runtime.

        If the body raises, the exception propagates and no record is added.
        """
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments the way time.time() differences can.
        tic = time.perf_counter()
        yield
        kwargs["runtime"] = time.perf_counter() - tic
        self.diagnostics.append(kwargs)

    def dataframe(self):
        """Return the collected records as a DataFrame, one row per record."""
        return pd.DataFrame(self.diagnostics)


diag_timer = DiagnosticTimer()

############################################################################################################################

def name(fileType, daf):
    """Store a finished result table and reset the per-format scratch state.

    Args:
        fileType: suffix of the module-level variable to create; the table
            becomes available as ``df_<fileType>``.
        daf: result table to store. It is also appended to ``diag_timer.names``.

    Side effects:
        Removes the module-level ``df`` and ``da`` if they exist and empties
        ``diag_timer.diagnostics`` so the next format starts with no records.
    """
    globals()[f"df_{fileType}"] = daf
    diag_timer.names.append(daf)

    # pop() instead of `del`: a missing `df` or `da` (e.g. when the cell is
    # re-run) must not raise after the table has already been recorded.
    for stale in ("df", "da"):
        globals().pop(stale, None)
    diag_timer.diagnostics = []
    
############################################################################################################################     

def total_nthreads():
    """Return the sum of the per-worker values reported by ``client.nthreads()``."""
    per_worker = client.nthreads()
    return sum(per_worker.values())

def total_ncores():
    """Return the sum of the per-worker values reported by ``client.ncores()``."""
    per_worker = client.ncores()
    return sum(per_worker.values())

def total_workers():
    """Return the number of entries in ``client.ncores()`` (one per worker)."""
    per_worker = client.ncores()
    return len(per_worker)

############################################################################################################################

class mainLoop:
    """Run the scaled read benchmark and summarise repeated measurements."""

    def errorCalc(self, df0, n_tests=None):
        """Collapse each group of repeated runs into one summary row.

        Args:
            df0: table with one row per timed run. It must contain ``runtime``
                and ``throughput_Mbps`` columns; every other column is treated
                as metadata and taken from the first row of each group.
            n_tests: number of consecutive rows that form one group. Defaults
                to the notebook-level ``tests`` setting.

        Returns:
            DataFrame with one row per complete group: the metadata columns,
            the mean ``runtime``, the mean ``throughput_Mbps`` and ``errors``
            (standard error of the mean of the throughput).

        Raises:
            ValueError: if ``n_tests`` is smaller than 1.
        """
        if n_tests is None:
            n_tests = tests
        if n_tests < 1:
            raise ValueError("n_tests must be at least 1")

        measured = ['runtime', 'throughput_Mbps']
        # Derive the metadata columns instead of assuming there are exactly
        # seven of them in front of the measured columns.
        meta_cols = [col for col in df0.columns if col not in measured]
        # Start index of every complete group; a trailing partial group is
        # ignored so groups never straddle a boundary.
        starts = list(range(0, len(df0) - n_tests + 1, n_tests))

        newVals = []
        for start in starts:
            group = df0.iloc[start:start + n_tests]
            newVals.append(dict(runtime=group['runtime'].mean(),
                                throughput_Mbps=group['throughput_Mbps'].mean(),
                                errors=sem(group['throughput_Mbps'])))

        df1 = df0.iloc[starts][meta_cols].reset_index(drop=True)
        # Explicit columns keep the layout stable even when there are no groups.
        df2 = pd.DataFrame(newVals, columns=measured + ['errors'])
        return pd.concat([df1, df2], axis=1)

    def loop(self, da, diag_kwargs):
        """Time storing ``da`` into the null store at each worker count.

        The cluster is scaled through ascending worker counts derived from the
        notebook-level ``max_workers`` and ``worker_step``; at each count the
        read is repeated ``tests`` times and recorded by ``diag_timer``.

        Args:
            da: Dask array to read.
            diag_kwargs: metadata stored with every timing record.

        Returns:
            DataFrame of timing records with a ``throughput_Mbps`` column.
            When ``tests`` is greater than 1 the repeats are summarised by
            ``errorCalc``.
        """
        # Same sequence as flipping the descending arange; tolist() yields
        # plain Python ints for the cluster API.
        worker_counts = np.arange(max_workers, 0, -worker_step)[::-1].tolist()
        for nworkers in worker_counts:
            cluster.scale(nworkers)
            time.sleep(10)
            client.wait_for_workers(nworkers)
            print('Number of Workers:', nworkers)
            for _ in range(tests):
                with diag_timer.time(nworkers=total_workers(), nthreads=total_nthreads(),
                                     ncores=total_ncores(), **diag_kwargs):
                    future = dsa.store(da, null_store, lock=False, compute=False)
                    dask.compute(future, retries=5)
                del future

        df = diag_timer.dataframe()
        # NOTE: bytes / 1e6 / seconds is megabytes per second, despite the
        # "Mbps" in the column name (kept because other cells read it).
        df['throughput_Mbps'] = da.nbytes / 1e6 / df['runtime']
        # Summarise only when there are repeats; decided from the setting
        # itself rather than from a leftover loop index.
        if tests > 1:
            df = self.errorCalc(df)
        return df


# Rebinds the name to a single instance; cells call mainLoop.loop(...) on it.
mainLoop = mainLoop()

---------------

## Perform Benchmarking

###  User Input for Testing Conditions

Here, the user will define the testing benchmarking conditions:
* **`tests = (int)`:** The number of times each individual file format will be read at each worker count. Entering a number greater than 1 will take much longer to run, but the results will include the standard error of the throughput, and the throughput plot will show error bars.


* **`max_workers = (int)`:** Maximum number of parallel reads (Dask workers) to be tested.


* **`worker_step = (int)`:** Workers will be reduced by this number starting from the value of `max_workers` until the lowest possible value is reached. For instance, when `max_workers = 8` & `worker_step = 2`, the resulting worker scaling scheme will be `[2, 4, 6, 8]`.


* **`root = (string)`:** Root uniform resource identifier (URI) of the object storage location. Can be changed to a public URL for public data.


* **`data = (string)`:** The data set to test. Within the `gs://cloud-data-benchmarks` bucket, each file format begins with the same naming convention, with the only difference being the extension at the end of the file name -- e.g. `.nc`, `.zarr`, etc. If the user is providing their own data and bucket, ensure that the naming convention follows what was done for this notebook, or hardcode the object storage URIs in each applicable function call.

Note: When using a data set that only has gridded formats available in cloud object storage, only run the **Gridded Data** section of the notebook. The **Tabular Data** section will *not* work.

In [4]:
# Loop Parameters
# tests: number of timed reads at each worker count (inner loop of mainLoop.loop).
tests = 1
# max_workers: largest worker count the cluster is scaled to.
max_workers = 8
# worker_step: spacing between successive worker counts, counted down from max_workers.
worker_step = 4 # Should be the same or a multiple of the number of processes you set in SLURMCluster(...)

# Data Location
# root is the bucket prefix; data is the file-name stem to which each read cell
# appends a format-specific suffix.
root = 'gs://cloud-data-benchmarks/'
data = 'slp.1948-2009.100MB'

# Cloud Storage Access Token File
# NOTE(review): user-specific absolute path — adjust for the account running the notebook.
token = '/contrib/Jacob.Green/cloud-data-benchmarks.json'

### Tabular Data

#### CSV

##### Single File

In [None]:
# Time only the dd.read_csv call; the elapsed time is kept as connectTime.
# NOTE(review): no storage_options/token is passed here, unlike the gridded-data
# reads — confirm credentials are picked up some other way.
tic1 = time.time()
df0 = dd.read_csv(root + data + '.csv', assume_missing=True, names=['lon', 'lat', 'z'])
toc1 = time.time()
connectTime = toc1-tic1

# Convert the dataframe to a Dask array; this happens outside the timed section.
da = df0.to_dask_array(lengths=True)
del df0
# Bytes per chunk: elements in one chunk times bytes per element.
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

In [None]:
# Metadata stored with every timing record for this format.
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='CSV', connectTime=connectTime)

# Run the benchmark, then file the table as df_csv; name() also removes the
# module-level df/da and clears the timer records.
df = mainLoop.loop(da, diag_kwargs)
name('csv', df)
df_csv

##### Multiple Files

In [None]:
# Time only the dd.read_csv call over the glob of part files; kept as connectTime.
# NOTE(review): no storage_options/token is passed here, unlike the gridded-data
# reads — confirm credentials are picked up some other way.
tic1 = time.time()
df0 = dd.read_csv(root + data + '.50MB.partcsv/*', assume_missing=True, names=['lon', 'lat', 'z'])
toc1 = time.time()
connectTime = toc1-tic1

# Convert the dataframe to a Dask array; this happens outside the timed section.
da = df0.to_dask_array(lengths=True)
del df0
# Bytes per chunk: elements in one chunk times bytes per element.
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

In [None]:
# Metadata stored with every timing record for this format.
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='Partitioned CSV', connectTime=connectTime)

# Run the benchmark, then file the table as df_partcsv; name() also removes the
# module-level df/da and clears the timer records.
df = mainLoop.loop(da, diag_kwargs)
name('partcsv', df)
df_partcsv

#### Parquet

In [None]:
# Time only the dd.read_parquet call over the glob of part files; kept as connectTime.
# NOTE(review): no storage_options/token is passed here, unlike the gridded-data
# reads — confirm credentials are picked up some other way.
tic1 = time.time()
df0 = dd.read_parquet(root + data + '.100MB.partparquet/*')
toc1 = time.time()
connectTime = toc1 - tic1

# Convert the dataframe to a Dask array; this happens outside the timed section.
da = df0.to_dask_array(lengths=True)
del df0
# Bytes per chunk: elements in one chunk times bytes per element.
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

In [None]:
# Metadata stored with every timing record for this format.
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='Partitioned Parquet', connectTime=connectTime)

# Run the benchmark, then file the table as df_partparquet; name() also removes
# the module-level df/da and clears the timer records.
df = mainLoop.loop(da, diag_kwargs)
name('partparquet', df)
df_partparquet

### Gridded Data

In [8]:
# Open the NetCDF object with the access token and show its data variables so
# one can be picked for the read tests.
intake.open_netcdf(root + data + '.nc',
                   storage_options={'token':token}).to_dask().data_vars # Lists all data variables contained in the data set.

Data variables:
    SLP      (TIME, LAT, LON) float32 ...

`variable = (string)` Choose a data variable from the list in the output above to use in read testing.

In [9]:
# Data variable read by the gridded-format cells that index ds[variable].
variable = 'SLP'

#### NetCDF

In [None]:
# Time only opening the NetCDF object as a dataset; kept as connectTime.
tic1 = time.time()
ds = intake.open_netcdf(root + data + '.nc', storage_options={'token':token}).to_dask()
toc1 = time.time()
connectTime = toc1-tic1

# Set Dask chunks to match internal chunks
# NOTE(review): presumably encoding['chunksizes'] is missing or None for a
# variable stored without internal chunking, which would break the zip below —
# TODO confirm for other data sets.
internal_chunks = ds[variable].encoding['chunksizes']
coords = ds[variable].dims
# Pair each dimension name with its internal chunk length and rechunk accordingly.
da = ds[variable].chunk(chunks=dict(zip(coords, internal_chunks))).data

# Bytes per chunk: elements in one chunk times bytes per element.
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
del ds
da

In [None]:
# Metadata stored with every timing record for this format.
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='NetCDF', connectTime=connectTime)

# Run the benchmark, then file the table as df_netcdf; name() also removes the
# module-level df/da and clears the timer records.
df = mainLoop.loop(da, diag_kwargs)
name('netcdf', df)
df_netcdf

#### Zarr

##### Zarr Array

In [None]:
# Time only the dsa.from_zarr call; kept as connectTime.
tic1 = time.time()
da = dsa.from_zarr(root + data + '.zarray', storage_options={'token':token})
toc1 = time.time()
connectTime = toc1 - tic1
# Bytes per chunk: elements in one chunk times bytes per element.
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

In [None]:
# Metadata stored with every timing record for this format.
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='Zarr Array', connectTime=connectTime)

# Run the benchmark, then file the table as df_zarray; name() also removes the
# module-level df/da and clears the timer records.
df = mainLoop.loop(da, diag_kwargs)
name('zarray', df)
df_zarray

##### Zarr Hierarchical Group

In [10]:
# Time only the xr.open_zarr call; kept as connectTime.
tic1 = time.time()
ds = xr.open_zarr(store = root + data + '.zarr', consolidated=True, storage_options={'token':token})
toc1 = time.time()
connectTime = toc1-tic1
# Keep only the underlying array of the chosen variable; the dataset is dropped.
da = ds[variable].data
del ds
# Bytes per chunk: elements in one chunk times bytes per element.
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

Unnamed: 0,Array,Chunk
Bytes,6.09 GiB,99.45 MiB
Shape,"(90520, 94, 192)","(22630, 24, 48)"
Count,65 Tasks,64 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 6.09 GiB 99.45 MiB Shape (90520, 94, 192) (22630, 24, 48) Count 65 Tasks 64 Chunks Type float32 numpy.ndarray",192  94  90520,

Unnamed: 0,Array,Chunk
Bytes,6.09 GiB,99.45 MiB
Shape,"(90520, 94, 192)","(22630, 24, 48)"
Count,65 Tasks,64 Chunks
Type,float32,numpy.ndarray


In [11]:
# Metadata stored with every timing record for this format.
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='Zarr Group', connectTime=connectTime)

# Run the benchmark, then file the table as df_zgroup; name() also removes the
# module-level df/da and clears the timer records.
df = mainLoop.loop(da, diag_kwargs)
name('zgroup', df)
df_zgroup

Number of Workers: 4
Number of Workers: 8


Unnamed: 0,nworkers,nthreads,ncores,nbytes,chunksize,format,connectTime,runtime,throughput_Mbps
0,4,4,4,6534819840,104279040,Zarr Group,0.539969,18.946113,344.916121
1,8,8,8,6534819840,104279040,Zarr Group,0.539969,8.561764,763.256287


#### TileDB Embedded 

In [12]:
# Export the token file path through the environment instead of storage_options.
# NOTE(review): presumably this is how TileDB obtains GCS credentials, and it
# only affects this process — confirm the workers see the same credentials.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = token
# Time only the dsa.from_tiledb call; kept as connectTime.
tic1 = time.time()
da = dsa.from_tiledb(root + data + '.tldb',
                     storage_options={'sm.compute_concurrency_level': 4, 'sm.io_concurrency_level': 4})
toc1 = time.time()
connectTime = toc1 - tic1
# Bytes per chunk: elements in one chunk times bytes per element.
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

Unnamed: 0,Array,Chunk
Bytes,6.09 GiB,99.45 MiB
Shape,"(90520, 94, 192)","(22630, 24, 48)"
Count,65 Tasks,64 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 6.09 GiB 99.45 MiB Shape (90520, 94, 192) (22630, 24, 48) Count 65 Tasks 64 Chunks Type float32 numpy.ndarray",192  94  90520,

Unnamed: 0,Array,Chunk
Bytes,6.09 GiB,99.45 MiB
Shape,"(90520, 94, 192)","(22630, 24, 48)"
Count,65 Tasks,64 Chunks
Type,float32,numpy.ndarray


In [13]:
# Metadata stored with every timing record for this format.
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='TileDB Embedded', connectTime=connectTime)

# Run the benchmark, then file the table as df_tldb; name() also removes the
# module-level df/da and clears the timer records.
df = mainLoop.loop(da, diag_kwargs)
name('tldb', df)
df_tldb

Number of Workers: 4
Number of Workers: 8


Unnamed: 0,nworkers,nthreads,ncores,nbytes,chunksize,format,connectTime,runtime,throughput_Mbps
0,4,4,4,6534819840,104279040,TileDB Embedded,0.298915,17.318837,377.3244
1,8,8,8,6534819840,104279040,TileDB Embedded,0.298915,9.719304,672.354722


-------------------------------------------------------------------------------------------------------------------------------

In [15]:
# Scale the cluster down to zero workers now that the benchmarks are done.
cluster.scale(0)

In [None]:
# Close the client connection first, then the cluster object itself.
client.close()
cluster.close()

## Plot Throughput

In [None]:
class errorPlot:
    """Overlay error bars on the throughput plot for one result table."""

    def plot(self):
        """Draw error bars for the table and colour stored by ``errorCheck``.

        Reads the ``nworkers``, ``throughput_Mbps`` and ``errors`` columns of
        ``self.df`` and draws on the current matplotlib axes.
        """
        x = self.df['nworkers']
        y = self.df['throughput_Mbps']
        error = self.df['errors']
        plt.errorbar(x, y, yerr=error, color=self.c, fmt='o', capsize=5, capthick=2)

    def errorCheck(self, daf, color):
        """Plot error bars for ``daf`` if it carries an ``errors`` column.

        Args:
            daf: result table for one format.
            color: colour used for the error bars.

        Tables without an ``errors`` column are skipped. The bars are drawn
        exactly once, and real plotting failures are no longer swallowed.
        """
        self.c = color
        self.df = daf
        if 'errors' not in daf.columns:
            return
        self.plot()


# Rebinds the name to a single instance; the plotting cell calls errorPlot.errorCheck(...).
errorPlot = errorPlot()

############################################################################################################################

# One colour per stored result table, spread evenly over the rainbow colormap.
color = cm.rainbow(np.linspace(0, 1, len(diag_timer.names)))
legend = []
# Combined table of every format's results (shown in the next cell).
df_results = pd.concat(diag_timer.names, ignore_index=True)

ax = None
for i, daf in enumerate(diag_timer.names):
    # Positional lookup: every row carries the same format name, and a table
    # with a single row has no index label 1.
    legend.append(daf['format'].iloc[0])
    c = matplotlib.colors.to_hex(color[i, :], keep_alpha=True)

    # The first call (ax=None) creates the axes; later calls draw onto them.
    ax = daf.plot(x='nworkers', y='throughput_Mbps', kind='line', color=c, ax=ax, marker='o')
    errorPlot.errorCheck(daf, c)

# Decorations are the same for every format, so apply them once after the loop.
plt.grid(True)
plt.title('Cloud Data Read Speeds with Dask')
plt.xlabel('Number of Parallel Reads')
# The plotted column is bytes / 1e6 / seconds, i.e. megabytes per second.
plt.ylabel('Throughput (MB/s)')
plt.legend(legend, bbox_to_anchor=[1.25, 0.5], loc='center', title='Store Formats')
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
#plt.yscale('symlog') ACTIVATE THIS LINE IF YOU ARE USING A LARGE AMOUNT OF WORKERS

In [None]:
# Display the combined results table assembled in the plotting cell.
df_results