# AWS Slurm Parallel Data Read Speeds with Dask

## Imports

In [1]:
import dask.array as dsa
import numpy as np
import dask.dataframe as dd
from contextlib import contextmanager
import xarray as xr
import intake
import time
import dask
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm
from matplotlib.ticker import MaxNLocator
import matplotlib.colors
import pandas as pd
from scipy.stats import sem
import tiledb

## Slurm Job Script Configuration

In [2]:
# Start a local Dask cluster and connect a client to it. `cluster` and `client`
# are notebook-level globals that the benchmarking helpers below rely on.
from dask.distributed import Client, LocalCluster
cluster = LocalCluster(threads_per_worker=2, memory_limit='8GB')  # limits apply per worker
client = Client(cluster)
cluster  # last expression of the cell: render the cluster summary

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 30
Total threads: 60,Total memory: 223.52 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:34017,Workers: 30
Dashboard: http://127.0.0.1:8787/status,Total threads: 60
Started: Just now,Total memory: 223.52 GiB

0,1
Comm: tcp://127.0.0.1:38723,Total threads: 2
Dashboard: http://127.0.0.1:39717/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:46323,
Local directory: /home/ubuntu/dask-worker-space/worker-qub81ikl,Local directory: /home/ubuntu/dask-worker-space/worker-qub81ikl

0,1
Comm: tcp://127.0.0.1:41523,Total threads: 2
Dashboard: http://127.0.0.1:42573/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:38341,
Local directory: /home/ubuntu/dask-worker-space/worker-hihj73g7,Local directory: /home/ubuntu/dask-worker-space/worker-hihj73g7

0,1
Comm: tcp://127.0.0.1:40787,Total threads: 2
Dashboard: http://127.0.0.1:42929/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:36667,
Local directory: /home/ubuntu/dask-worker-space/worker-_wxbfolw,Local directory: /home/ubuntu/dask-worker-space/worker-_wxbfolw

0,1
Comm: tcp://127.0.0.1:44431,Total threads: 2
Dashboard: http://127.0.0.1:38433/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:36151,
Local directory: /home/ubuntu/dask-worker-space/worker-amqnr3d3,Local directory: /home/ubuntu/dask-worker-space/worker-amqnr3d3

0,1
Comm: tcp://127.0.0.1:36009,Total threads: 2
Dashboard: http://127.0.0.1:36183/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:42079,
Local directory: /home/ubuntu/dask-worker-space/worker-eirjyfzu,Local directory: /home/ubuntu/dask-worker-space/worker-eirjyfzu

0,1
Comm: tcp://127.0.0.1:45069,Total threads: 2
Dashboard: http://127.0.0.1:46221/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:41093,
Local directory: /home/ubuntu/dask-worker-space/worker-xz1t_rsy,Local directory: /home/ubuntu/dask-worker-space/worker-xz1t_rsy

0,1
Comm: tcp://127.0.0.1:32981,Total threads: 2
Dashboard: http://127.0.0.1:45333/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:40835,
Local directory: /home/ubuntu/dask-worker-space/worker-em_l5775,Local directory: /home/ubuntu/dask-worker-space/worker-em_l5775

0,1
Comm: tcp://127.0.0.1:45929,Total threads: 2
Dashboard: http://127.0.0.1:32965/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:45169,
Local directory: /home/ubuntu/dask-worker-space/worker-vtk0mwrk,Local directory: /home/ubuntu/dask-worker-space/worker-vtk0mwrk

0,1
Comm: tcp://127.0.0.1:42489,Total threads: 2
Dashboard: http://127.0.0.1:38615/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:39697,
Local directory: /home/ubuntu/dask-worker-space/worker-arvt10wj,Local directory: /home/ubuntu/dask-worker-space/worker-arvt10wj

0,1
Comm: tcp://127.0.0.1:37165,Total threads: 2
Dashboard: http://127.0.0.1:35401/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:33139,
Local directory: /home/ubuntu/dask-worker-space/worker-7pgkw9jm,Local directory: /home/ubuntu/dask-worker-space/worker-7pgkw9jm

0,1
Comm: tcp://127.0.0.1:39241,Total threads: 2
Dashboard: http://127.0.0.1:45673/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:44829,
Local directory: /home/ubuntu/dask-worker-space/worker-mbs98qjv,Local directory: /home/ubuntu/dask-worker-space/worker-mbs98qjv

0,1
Comm: tcp://127.0.0.1:37045,Total threads: 2
Dashboard: http://127.0.0.1:41717/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:34941,
Local directory: /home/ubuntu/dask-worker-space/worker-6jo_wmo2,Local directory: /home/ubuntu/dask-worker-space/worker-6jo_wmo2

0,1
Comm: tcp://127.0.0.1:44195,Total threads: 2
Dashboard: http://127.0.0.1:33241/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:36943,
Local directory: /home/ubuntu/dask-worker-space/worker-wup05r6t,Local directory: /home/ubuntu/dask-worker-space/worker-wup05r6t

0,1
Comm: tcp://127.0.0.1:40305,Total threads: 2
Dashboard: http://127.0.0.1:36489/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:40067,
Local directory: /home/ubuntu/dask-worker-space/worker-mnfap_us,Local directory: /home/ubuntu/dask-worker-space/worker-mnfap_us

0,1
Comm: tcp://127.0.0.1:36191,Total threads: 2
Dashboard: http://127.0.0.1:36697/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:35275,
Local directory: /home/ubuntu/dask-worker-space/worker-msqs3qfc,Local directory: /home/ubuntu/dask-worker-space/worker-msqs3qfc

0,1
Comm: tcp://127.0.0.1:36703,Total threads: 2
Dashboard: http://127.0.0.1:43289/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:43511,
Local directory: /home/ubuntu/dask-worker-space/worker-uno32a3o,Local directory: /home/ubuntu/dask-worker-space/worker-uno32a3o

0,1
Comm: tcp://127.0.0.1:37459,Total threads: 2
Dashboard: http://127.0.0.1:34327/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:40253,
Local directory: /home/ubuntu/dask-worker-space/worker-y9go0jws,Local directory: /home/ubuntu/dask-worker-space/worker-y9go0jws

0,1
Comm: tcp://127.0.0.1:45339,Total threads: 2
Dashboard: http://127.0.0.1:40401/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:34843,
Local directory: /home/ubuntu/dask-worker-space/worker-o59_hurq,Local directory: /home/ubuntu/dask-worker-space/worker-o59_hurq

0,1
Comm: tcp://127.0.0.1:38501,Total threads: 2
Dashboard: http://127.0.0.1:41119/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:39523,
Local directory: /home/ubuntu/dask-worker-space/worker-zhou939e,Local directory: /home/ubuntu/dask-worker-space/worker-zhou939e

0,1
Comm: tcp://127.0.0.1:39967,Total threads: 2
Dashboard: http://127.0.0.1:38399/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:36531,
Local directory: /home/ubuntu/dask-worker-space/worker-9dc5z932,Local directory: /home/ubuntu/dask-worker-space/worker-9dc5z932

0,1
Comm: tcp://127.0.0.1:41721,Total threads: 2
Dashboard: http://127.0.0.1:37629/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:43507,
Local directory: /home/ubuntu/dask-worker-space/worker-8_lg6z7n,Local directory: /home/ubuntu/dask-worker-space/worker-8_lg6z7n

0,1
Comm: tcp://127.0.0.1:35073,Total threads: 2
Dashboard: http://127.0.0.1:35441/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:42071,
Local directory: /home/ubuntu/dask-worker-space/worker-v3zq8vff,Local directory: /home/ubuntu/dask-worker-space/worker-v3zq8vff

0,1
Comm: tcp://127.0.0.1:34321,Total threads: 2
Dashboard: http://127.0.0.1:35149/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:39863,
Local directory: /home/ubuntu/dask-worker-space/worker-umj5n7sy,Local directory: /home/ubuntu/dask-worker-space/worker-umj5n7sy

0,1
Comm: tcp://127.0.0.1:46795,Total threads: 2
Dashboard: http://127.0.0.1:45273/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:45513,
Local directory: /home/ubuntu/dask-worker-space/worker-tmb5hb1k,Local directory: /home/ubuntu/dask-worker-space/worker-tmb5hb1k

0,1
Comm: tcp://127.0.0.1:40429,Total threads: 2
Dashboard: http://127.0.0.1:33537/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:46079,
Local directory: /home/ubuntu/dask-worker-space/worker-zr9ljsg8,Local directory: /home/ubuntu/dask-worker-space/worker-zr9ljsg8

0,1
Comm: tcp://127.0.0.1:45357,Total threads: 2
Dashboard: http://127.0.0.1:39373/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:39549,
Local directory: /home/ubuntu/dask-worker-space/worker-xblhq6gv,Local directory: /home/ubuntu/dask-worker-space/worker-xblhq6gv

0,1
Comm: tcp://127.0.0.1:37903,Total threads: 2
Dashboard: http://127.0.0.1:39019/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:40251,
Local directory: /home/ubuntu/dask-worker-space/worker-4_xtqhq_,Local directory: /home/ubuntu/dask-worker-space/worker-4_xtqhq_

0,1
Comm: tcp://127.0.0.1:36445,Total threads: 2
Dashboard: http://127.0.0.1:41511/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:41063,
Local directory: /home/ubuntu/dask-worker-space/worker-7fjx3qdh,Local directory: /home/ubuntu/dask-worker-space/worker-7fjx3qdh

0,1
Comm: tcp://127.0.0.1:38261,Total threads: 2
Dashboard: http://127.0.0.1:38209/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:37959,
Local directory: /home/ubuntu/dask-worker-space/worker-9bleroha,Local directory: /home/ubuntu/dask-worker-space/worker-9bleroha

0,1
Comm: tcp://127.0.0.1:42621,Total threads: 2
Dashboard: http://127.0.0.1:37983/status,Memory: 7.45 GiB
Nanny: tcp://127.0.0.1:42031,
Local directory: /home/ubuntu/dask-worker-space/worker-yft8g0mb,Local directory: /home/ubuntu/dask-worker-space/worker-yft8g0mb


## Benchmarking Setup

In [3]:
class DevNullStore:
    """Write-only sink: item assignment is accepted and the data is discarded.

    Used as the target of ``dsa.store`` so that every chunk is read and
    computed without being kept anywhere.
    """

    def __setitem__(self, *args, **kwargs):
        """Accept any key/value pair and throw it away."""
        return None


# Shared sink instance handed to ``dsa.store`` by the benchmark loop.
null_store = DevNullStore()

############################################################################################################################

class DiagnosticTimer:
    """Collects one timing record (metadata plus elapsed seconds) per timed block."""

    def __init__(self):
        # One dict per completed `time()` block: the caller's keyword
        # arguments plus a "runtime" entry in seconds.
        self.diagnostics = []
        # Result DataFrames registered by `name()`, one per benchmarked format.
        self.names = []

    @contextmanager
    def time(self, **kwargs):
        """Time the body of a ``with`` block and record it.

        Parameters
        ----------
        **kwargs
            Arbitrary metadata stored alongside the measurement.

        Notes
        -----
        The elapsed time is measured with ``time.perf_counter()``, a monotonic
        clock, so a system clock adjustment during a run cannot distort (or
        make negative) the recorded runtime. If the block raises, nothing is
        recorded and the exception propagates.
        """
        tic = time.perf_counter()
        yield
        kwargs["runtime"] = time.perf_counter() - tic
        self.diagnostics.append(kwargs)

    def dataframe(self):
        """Return the recorded measurements as a DataFrame, one row per block."""
        return pd.DataFrame(self.diagnostics)


diag_timer = DiagnosticTimer()

############################################################################################################################

def name(fileType, daf):
    """Register a finished benchmark result and reset the scratch state.

    Parameters
    ----------
    fileType : str
        Suffix for the notebook-level variable that will hold the result;
        the result becomes available as ``df_<fileType>``.
    daf : pandas.DataFrame
        Result frame returned by the benchmark loop.

    Side effects
    ------------
    * binds ``df_<fileType>`` in the notebook namespace,
    * appends ``daf`` to ``diag_timer.names`` (used later for plotting),
    * removes the scratch globals ``df`` and ``da`` if they exist,
    * clears ``diag_timer.diagnostics`` so the next format starts fresh.
    """
    namespace = globals()
    namespace[f"df_{fileType}"] = daf
    diag_timer.names.append(daf)

    # `del df, da` would raise NameError if either name is already gone
    # (e.g. the cell is re-run), leaving a duplicate entry in `names` on the
    # retry. Removing them tolerantly keeps the function safe to call again.
    for scratch in ("df", "da"):
        namespace.pop(scratch, None)
    diag_timer.diagnostics = []
    
############################################################################################################################     

def total_nthreads():
    """Return the thread count summed over every worker the client reports."""
    threads_by_worker = client.nthreads()
    return sum(threads_by_worker.values())

def total_ncores():
    """Return the sum of the per-worker values reported by ``client.ncores()``."""
    cores_by_worker = client.ncores()
    return sum(cores_by_worker.values())

def total_workers():
    """Return how many workers the client currently reports."""
    per_worker = client.ncores()  # one entry per worker
    worker_count = len(per_worker)
    return worker_count

############################################################################################################################

class mainLoop:
    """Runs the worker-scaling read benchmark and summarises repeated runs.

    Relies on notebook-level globals: ``tests``, ``max_workers``,
    ``worker_step``, ``cluster``, ``client``, ``diag_timer`` and ``null_store``.
    """

    def errorCalc(self, df0):
        """Collapse each consecutive group of ``tests`` rows into one summary row.

        Parameters
        ----------
        df0 : pandas.DataFrame
            Raw measurements, one row per timed read, containing ``runtime``
            and ``throughput_Mbps`` plus metadata columns. Rows belonging to
            the same worker count must be adjacent.

        Returns
        -------
        pandas.DataFrame
            One row per group: the metadata of the group's first row, the mean
            ``runtime``, the mean ``throughput_Mbps`` and ``errors`` (standard
            error of the mean of the throughput). A trailing incomplete group
            is ignored.
        """
        measured = ['runtime', 'throughput_Mbps']
        # Integer arithmetic gives exact group starts; the previous
        # float-based linspace could truncate to the wrong row.
        starts = list(range(0, len(df0) - tests + 1, tests))

        stats = []
        for start in starts:
            block = df0.iloc[start:start + tests]
            stats.append(dict(runtime=block['runtime'].mean(),
                              throughput_Mbps=block['throughput_Mbps'].mean(),
                              errors=sem(block['throughput_Mbps'])))

        # Everything that is not a measurement is run metadata; selecting it by
        # name avoids depending on the number/position of metadata columns.
        meta = df0.drop(columns=measured).iloc[starts].reset_index(drop=True)
        summary = pd.DataFrame(stats, columns=measured + ['errors'])
        return pd.concat([meta, summary], axis=1)

    def loop(self, da, diag_kwargs):
        """Benchmark reading ``da`` at increasing worker counts.

        Parameters
        ----------
        da : dask.array.Array
            Lazily loaded array whose chunks are read during each timed run.
        diag_kwargs : dict
            Metadata recorded with every measurement.

        Returns
        -------
        pandas.DataFrame
            Raw measurements when ``tests == 1``; otherwise the per-worker-count
            summary produced by ``errorCalc``. ``throughput_Mbps`` is computed
            as ``da.nbytes / 1e6 / runtime``, i.e. megabytes per second.

        Raises
        ------
        ValueError
            If ``tests`` is below 1 or the worker settings yield no worker
            count to test.
        """
        if tests < 1:
            raise ValueError("tests must be at least 1")
        # Ascending worker counts ending at max_workers, e.g. [2, 4, 6, 8].
        worker_counts = sorted(range(max_workers, 0, -worker_step))
        if not worker_counts:
            raise ValueError("max_workers/worker_step yield no worker counts to test")

        for nworkers in worker_counts:
            cluster.scale(nworkers)
            time.sleep(10)  # fixed pause after rescaling, before waiting for workers
            client.wait_for_workers(nworkers)
            print(cluster)
            print('Number of Workers:', nworkers)
            for _ in range(tests):
                with diag_timer.time(nworkers=total_workers(), nthreads=total_nthreads(), ncores=total_ncores(),
                                     **diag_kwargs):
                    # Storing into the null sink forces every chunk to be read
                    # without keeping the data.
                    future = dsa.store(da, null_store, lock=False, compute=False)
                    dask.compute(future, retries=5)
                del future

        df = diag_timer.dataframe()
        df['throughput_Mbps'] = da.nbytes / 1e6 / df['runtime']
        # Summary statistics only make sense with repeated measurements. The
        # previous check used the leaked inner-loop index instead of `tests`.
        if tests > 1:
            df = self.errorCalc(df)
        return df

# The class name is rebound to a single shared instance; cells call
# `mainLoop.loop(...)` on it.
mainLoop = mainLoop()

---------------

## Perform Benchmarking

###  User Input for Testing Conditions

Here, the user will define the testing benchmarking conditions:
* **`tests = (int)`:** The number of times each individual file format will be read at each worker count. Entering a number greater than 1 will take much longer to run, but the results will include standard errors and the throughput plot will have error bars.


* **`max_workers = (int)`:** Maximum number of parallel reads (Dask workers) to be tested.


* **`worker_step = (int)`:** Workers will be reduced by this number starting from the value of `max_workers` until the lowest possible value is reached. For instance, when `max_workers = 8` & `worker_step = 2`, the resulting worker scaling scheme will be `[2, 4, 6, 8]`.


* **`root = (string)`:** Root uniform resource identifier (URI) of the object storage location. Can be changed to a public URL for public data.


* **`data = (string)`:** The data set to test. Within the `gs://cloud-data-benchmarks` bucket, each file format begins with the same naming convention, with the only difference being the extension at the end of the file name -- e.g. `.nc`, `.zarr`, etc. If the user is providing their own data and bucket, ensure that the naming convention follows what was done for this notebook, or hardcode the object storage URIs in each applicable function call.

Note: When using a data set that only has gridded formats available in cloud object storage, only run the **Gridded Data** section of the notebook. The **Tabular Data** section will *not* work.

In [4]:
# Loop Parameters
tests = 5
max_workers = 40
worker_step = 4

# Data Location
root = 'gs://cloud-data-benchmarks/'
data = 'slp.1948-2009.100MB'

# Cloud Storage Access Token File
token = '/home/ubuntu/cloud-data-files/cloud-data-benchmarks.json'

### Tabular Data

#### CSV

##### Single File

In [None]:
# Time the creation of the Dask dataframe; recorded later as `connectTime`.
# NOTE(review): no `storage_options` token is passed here, unlike the gridded
# cells -- confirm the CSV object is readable without credentials.
tic1 = time.time()
df0 = dd.read_csv(root + data + '.csv', assume_missing=True, names=['lon', 'lat', 'z'])
toc1 = time.time()
connectTime = toc1-tic1

# Convert to a Dask array (the form the benchmark loop reads) and drop the dataframe.
da = df0.to_dask_array(lengths=True)
del df0
# Bytes per chunk: elements in one chunk times bytes per element.
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

In [None]:
# Metadata recorded with every timed read of this format.
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='CSV', connectTime=connectTime)

# Run the scaling benchmark, then register the result as `df_csv`.
# `name()` also removes the scratch globals `df` and `da`.
df = mainLoop.loop(da, diag_kwargs)
name('csv', df)
df_csv

##### Multiple Files

In [None]:
# Time the creation of the Dask dataframe over all partition files matched by
# the glob; recorded later as `connectTime`.
tic1 = time.time()
df0 = dd.read_csv(root + data + '.50MB.partcsv/*', assume_missing=True, names=['lon', 'lat', 'z'])
toc1 = time.time()
connectTime = toc1-tic1

# Convert to a Dask array (the form the benchmark loop reads) and drop the dataframe.
da = df0.to_dask_array(lengths=True)
del df0
# Bytes per chunk: elements in one chunk times bytes per element.
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

In [None]:
# Metadata recorded with every timed read of this format.
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='Partitioned CSV', connectTime=connectTime)

# Run the scaling benchmark, then register the result as `df_partcsv`.
# `name()` also removes the scratch globals `df` and `da`.
df = mainLoop.loop(da, diag_kwargs)
name('partcsv', df)
df_partcsv

#### Parquet

In [None]:
# Time the creation of the Dask dataframe over the Parquet partitions;
# recorded later as `connectTime`.
# NOTE(review): `data` already ends in '.100MB', so this path contains
# '.100MB' twice -- confirm it matches the object names in the bucket.
tic1 = time.time()
df0 = dd.read_parquet(root + data + '.100MB.partparquet/*')
toc1 = time.time()
connectTime = toc1 - tic1

# Convert to a Dask array (the form the benchmark loop reads) and drop the dataframe.
da = df0.to_dask_array(lengths=True)
del df0
# Bytes per chunk: elements in one chunk times bytes per element.
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

In [None]:
# Metadata recorded with every timed read of this format.
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='Partitioned Parquet', connectTime=connectTime)

# Run the scaling benchmark, then register the result as `df_partparquet`.
# `name()` also removes the scratch globals `df` and `da`.
df = mainLoop.loop(da, diag_kwargs)
name('partparquet', df)
df_partparquet

### Gridded Data

In [5]:
# Open the NetCDF object with the access token and show its data variables so
# one of them can be picked for the read tests.
intake.open_netcdf(root + data + '.nc',
                   storage_options={'token':token}).to_dask().data_vars # Lists all data variables contained in the data set.

Data variables:
    SLP      (TIME, LAT, LON) float32 ...

`variable = (string)` Choose a data variable from the list in the output above to use in read testing.

In [6]:
# Data variable read in the gridded-format tests (used by the NetCDF and Zarr group cells).
variable = 'SLP'

#### NetCDF

In [None]:
# Time opening the NetCDF data set; recorded later as `connectTime`.
tic1 = time.time()
ds = intake.open_netcdf(root + data + '.nc', storage_options={'token':token}).to_dask()
toc1 = time.time()
connectTime = toc1-tic1

# Set Dask chunks to match internal chunks
# NOTE(review): assumes the variable's encoding carries a usable 'chunksizes'
# entry (one size per dimension) -- confirm for other data sets.
internal_chunks = ds[variable].encoding['chunksizes']
coords = ds[variable].dims
da = ds[variable].chunk(chunks=dict(zip(coords, internal_chunks))).data

# Bytes per chunk: elements in one chunk times bytes per element.
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
del ds
da

In [None]:
# Metadata recorded with every timed read of this format.
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='NetCDF', connectTime=connectTime)

# Run the scaling benchmark, then register the result as `df_netcdf`.
# `name()` also removes the scratch globals `df` and `da`.
df = mainLoop.loop(da, diag_kwargs)
name('netcdf', df)
df_netcdf

#### Zarr

##### Zarr Array

In [7]:
# Time opening the Zarr array as a Dask array; recorded later as `connectTime`.
tic1 = time.time()
da = dsa.from_zarr(root + data + '.zarray', storage_options={'token':token})
toc1 = time.time()
connectTime = toc1 - tic1
# Bytes per chunk: elements in one chunk times bytes per element.
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

Unnamed: 0,Array,Chunk
Bytes,6.09 GiB,99.45 MiB
Shape,"(90520, 94, 192)","(22630, 24, 48)"
Count,65 Tasks,64 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 6.09 GiB 99.45 MiB Shape (90520, 94, 192) (22630, 24, 48) Count 65 Tasks 64 Chunks Type float32 numpy.ndarray",192  94  90520,

Unnamed: 0,Array,Chunk
Bytes,6.09 GiB,99.45 MiB
Shape,"(90520, 94, 192)","(22630, 24, 48)"
Count,65 Tasks,64 Chunks
Type,float32,numpy.ndarray


In [8]:
# Metadata recorded with every timed read of this format.
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='Zarr Array', connectTime=connectTime)

# Run the scaling benchmark, then register the result as `df_zarray`.
# `name()` also removes the scratch globals `df` and `da`.
df = mainLoop.loop(da, diag_kwargs)
name('zarray', df)
df_zarray

LocalCluster(a1e26dca, 'tcp://127.0.0.1:34017', workers=4, threads=8, memory=29.80 GiB)
Number of Workers: 4
LocalCluster(a1e26dca, 'tcp://127.0.0.1:34017', workers=8, threads=16, memory=59.60 GiB)
Number of Workers: 8
LocalCluster(a1e26dca, 'tcp://127.0.0.1:34017', workers=12, threads=24, memory=89.41 GiB)
Number of Workers: 12
LocalCluster(a1e26dca, 'tcp://127.0.0.1:34017', workers=16, threads=32, memory=119.21 GiB)
Number of Workers: 16
LocalCluster(a1e26dca, 'tcp://127.0.0.1:34017', workers=20, threads=40, memory=149.01 GiB)
Number of Workers: 20
LocalCluster(a1e26dca, 'tcp://127.0.0.1:34017', workers=24, threads=48, memory=178.81 GiB)
Number of Workers: 24
LocalCluster(a1e26dca, 'tcp://127.0.0.1:34017', workers=28, threads=56, memory=208.62 GiB)
Number of Workers: 28
LocalCluster(a1e26dca, 'tcp://127.0.0.1:34017', workers=32, threads=64, memory=238.42 GiB)
Number of Workers: 32
LocalCluster(a1e26dca, 'tcp://127.0.0.1:34017', workers=36, threads=72, memory=268.22 GiB)
Number of Wor

Unnamed: 0,nworkers,nthreads,ncores,nbytes,chunksize,format,connectTime,runtime,throughput_Mbps,errors
0,4,8,8,6534819840,104279040,Zarr Array,0.033587,4.923106,1355.299088,87.987516
1,8,16,16,6534819840,104279040,Zarr Array,0.033587,2.796472,2345.260059,69.554217
2,12,24,24,6534819840,104279040,Zarr Array,0.033587,2.037848,3226.592497,127.701378
3,16,32,32,6534819840,104279040,Zarr Array,0.033587,2.044801,3236.257724,176.628887
4,20,40,40,6534819840,104279040,Zarr Array,0.033587,2.281933,2888.934548,133.864162
5,24,48,48,6534819840,104279040,Zarr Array,0.033587,1.624337,4045.843272,148.887363
6,28,56,56,6534819840,104279040,Zarr Array,0.033587,1.490463,4479.406811,295.484287
7,32,64,64,6534819840,104279040,Zarr Array,0.033587,2.125349,3583.45505,522.095915
8,36,72,72,6534819840,104279040,Zarr Array,0.033587,1.676586,3990.330425,328.522161
9,40,80,80,6534819840,104279040,Zarr Array,0.033587,1.348293,4856.876958,109.819266


##### Zarr Hierarchical Group

In [None]:
# Time opening the Zarr group with xarray; recorded later as `connectTime`.
tic1 = time.time()
ds = xr.open_zarr(store = root + data + '.zarr', consolidated=True, storage_options={'token':token})
toc1 = time.time()
connectTime = toc1-tic1
# Keep only the underlying array of the chosen variable.
da = ds[variable].data
del ds
# Bytes per chunk: elements in one chunk times bytes per element.
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

In [None]:
# Metadata recorded with every timed read of this format.
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='Zarr Group', connectTime=connectTime)

# Run the scaling benchmark, then register the result as `df_zgroup`.
# `name()` also removes the scratch globals `df` and `da`.
df = mainLoop.loop(da, diag_kwargs)
name('zgroup', df)
df_zgroup

#### TileDB Embedded 

In [None]:
# `os` is not imported by the notebook's import cell, so this cell failed with
# a NameError on a fresh kernel; import it here where it is needed.
import os

# Expose the token file through the environment -- presumably this is how the
# TileDB cloud backend picks up credentials (no token is passed to from_tiledb).
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = token

# Time opening the TileDB array as a Dask array; recorded later as `connectTime`.
tic1 = time.time()
da = dsa.from_tiledb(root + data + '.tldb',
                     storage_options={'sm.compute_concurrency_level': 4, 'sm.io_concurrency_level': 4})
toc1 = time.time()
connectTime = toc1 - tic1
# Bytes per chunk: elements in one chunk times bytes per element.
chunksize = np.prod(da.chunksize) * da.dtype.itemsize
da

In [None]:
# Metadata recorded with every timed read of this format.
diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='TileDB Embedded', connectTime=connectTime)

# Run the scaling benchmark, then register the result as `df_tldb`.
# `name()` also removes the scratch globals `df` and `da`.
df = mainLoop.loop(da, diag_kwargs)
name('tldb', df)
df_tldb

-------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Benchmarks are finished: scale the cluster down to zero workers.
cluster.scale(0)

In [None]:
# Disconnect the client first, then shut the cluster down.
client.close()
cluster.close()

## Plot Throughput

In [None]:
class errorPlot:
    """Draws error bars for one benchmark result frame on the current axes."""

    def plot(self):
        """Draw error bars from ``self.df`` using colour ``self.c``.

        Reads the ``nworkers``, ``throughput_Mbps`` and ``errors`` columns.
        """
        x = self.df['nworkers']
        y = self.df['throughput_Mbps']
        error = self.df['errors']
        plt.errorbar(x, y, yerr=error, color=self.c, fmt='o', capsize=5, capthick=2)

    def errorCheck(self, daf, color):
        """Draw error bars for ``daf`` once, if it carries an ``errors`` column.

        Parameters
        ----------
        daf : pandas.DataFrame
            Benchmark result frame.
        color
            Colour passed to ``plt.errorbar``.

        Notes
        -----
        Results produced with a single test per worker count have no
        ``errors`` column; those are skipped. Any other failure now
        propagates instead of being silently swallowed, and the bars are
        drawn exactly once (previously the ``try``/``else`` pair drew them
        twice on success).
        """
        self.c = color
        self.df = daf
        if 'errors' not in daf.columns:
            return
        self.plot()


# The class name is rebound to a single shared instance used by the plotting loop.
errorPlot = errorPlot()

############################################################################################################################

# Plot throughput against worker count, one line (plus error bars when
# available) per benchmarked format registered in `diag_timer.names`.
if not diag_timer.names:
    raise ValueError("No benchmark results registered; run at least one benchmark section first.")

color = cm.rainbow(np.linspace(0,1,len(diag_timer.names)))
legend = []
df_results = pd.concat(diag_timer.names, ignore_index=True)

# One explicit figure/axes pair shared by every format.
fig, ax = plt.subplots()

for i in range(len(diag_timer.names)):
    # Positional lookup: a label lookup of row 1 fails for single-row results.
    legend.append(diag_timer.names[i]['format'].iloc[0])
    c = matplotlib.colors.to_hex(color[i,:], keep_alpha=True)

    diag_timer.names[i].plot(x='nworkers', y='throughput_Mbps', kind='line', color=c, ax=ax, marker='o')
    errorPlot.errorCheck(diag_timer.names[i], c)

# Figure decorations are applied once, after all formats are drawn.
plt.grid(True)
plt.title('Cloud Data Read Speeds with Dask')
plt.xlabel('Number of Parallel Reads')
# The plotted values are bytes / 1e6 / seconds, i.e. megabytes per second
# (the column keeps its historical name `throughput_Mbps`).
plt.ylabel('Throughput (MB/s)')
plt.legend(legend, bbox_to_anchor=[1.25, 0.5], loc='center', title='Store Formats')
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
#plt.yscale('symlog') ACTIVATE THIS LINE IF YOU ARE USING A LARGE AMOUNT OF WORKERS

In [None]:
# Combined results of every benchmarked format (concatenated in the plotting cell).
df_results