# Transfer Speeds - Compression

## Imports & Client Initialization

In [None]:
import dask.array as dsa
import numpy as np
import dask.dataframe as dd
from contextlib import contextmanager
import xarray as xr
import intake
import time
import dask
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm
from matplotlib.ticker import MaxNLocator
import matplotlib.colors
import pandas as pd
from scipy.stats import sem
import tiledb
import socket
# Report the hostname of the machine this kernel is running on.
print(socket.gethostname())

In [None]:
import os
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

# Filesystem locations and conda environment used by the worker jobs.
dask_dir = '/mnt/shared/dask'
conda_dir = '/var/lib/pworks/miniconda3'
conda_env = 'cloud-data'
print('Conda Directory:', conda_dir, '\nConda Environment:', conda_env)

# Shell line handed to the cluster via env_extra: source conda's shell hook,
# then activate the benchmark environment.
conda_sh = os.path.join(conda_dir, 'etc/profile.d/conda.sh')
activate_cmd = f'source {conda_sh}; conda activate {conda_env}'

cluster = SLURMCluster(
    project='ca-cloudmgmt',
    cores=4,                 # Number of cores in the job
    memory='16GB',           # Worker memory limit will be memory/processes
    processes=4,             # Number of Dask workers; threads per worker will be cores/processes
    name='gcpv2slurmbasic',  # Name of cluster
    queue='compute',         # Partition name
    job_cpu=4,               # Number of cpus per job
    job_mem='16GB',          # Amount of memory per job
    walltime='01:00:00',
    log_directory=os.path.join(dask_dir, 'logs'),
    env_extra=[activate_cmd],
    header_skip=['--mem'],
)

# Attach a client to the cluster and show the generated job script.
client = Client(cluster)
print('Job Script:\n',cluster.job_script())

## Benchmarking Setup

In [None]:
class DevNullStore:
    """Write-only sink: anything assigned into it by index is discarded."""

    def __setitem__(*_ignored, **_also_ignored):
        """Accept any item assignment and do nothing with it."""
        return None


# Shared sink instance used as the store target by the benchmark loop.
null_store = DevNullStore()

class DiagnosticTimer:
    """Collect timing records for benchmark runs.

    Attributes:
        diagnostics: list of dicts, one per completed ``time`` block; each
            holds the keyword arguments given to ``time`` plus a
            ``"runtime"`` entry in seconds.
        names: list that callers append finished result frames to.
    """

    def __init__(self):
        self.diagnostics = []
        self.names = []

    @contextmanager
    def time(self, **kwargs):
        """Time the body of a ``with`` block and record it.

        The keyword arguments are stored alongside the measured
        ``"runtime"``. If the body raises, the exception propagates and no
        record is appended.
        """
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way a time.time() difference can.
        tic = time.perf_counter()
        yield
        toc = time.perf_counter()
        kwargs["runtime"] = toc - tic
        self.diagnostics.append(kwargs)

    def dataframe(self):
        """Return the recorded diagnostics as a DataFrame (one row per record)."""
        return pd.DataFrame(self.diagnostics)


# Module-wide timer shared by the benchmark loop and the result bookkeeping.
diag_timer = DiagnosticTimer()

def name(fileType, daf):
    """Register a finished result frame and reset state for the next run.

    Args:
        fileType: suffix used to build the global variable name
            ``df_<fileType>`` under which ``daf`` is published.
        daf: result frame to publish and to append to ``diag_timer.names``.

    Side effects:
        Removes the module-level ``df`` and ``da`` variables if they exist
        and clears ``diag_timer.diagnostics``.
    """
    globals()[f"df_{fileType}"] = daf
    diag_timer.names.append(daf)

    # Drop the per-run globals. pop() with a default tolerates a missing
    # name, so a repeated call cannot abort with NameError before the
    # diagnostics are cleared.
    for stale in ("df", "da"):
        globals().pop(stale, None)
    diag_timer.diagnostics = []

def total_nthreads():
    """Return the sum of the per-worker values reported by ``client.nthreads()``."""
    per_worker = client.nthreads()
    return sum(per_worker.values())

def total_ncores():
    """Return the sum of the per-worker values reported by ``client.ncores()``."""
    per_worker = client.ncores()
    return sum(per_worker.values())

def total_workers():
    """Return how many workers ``client.ncores()`` currently reports."""
    per_worker = client.ncores()
    return len(per_worker)

class mainLoop:
    """Benchmark driver: sweeps the worker count and times array stores.

    Relies on the module-level ``tests``, ``max_workers``, ``worker_step``,
    ``cluster``, ``client``, ``diag_timer`` and ``null_store``.
    """

    def errorCalc(self, df0):
        """Collapse repeated measurements into one row per group.

        Rows of ``df0`` are taken in consecutive groups of ``tests``. For
        each complete group the result holds the first seven columns of the
        group's first row, followed by the mean ``runtime``, the mean
        ``throughput_Mbps`` and ``errors`` (standard error of the mean of
        the throughput, via ``scipy.stats.sem``).

        Args:
            df0: frame with ``runtime`` and ``throughput_Mbps`` columns.

        Returns:
            A DataFrame with one row per complete group.
        """
        global tests
        thrPut = df0['throughput_Mbps']
        rTime = df0['runtime']
        info = []
        newVals = []
        # Integer start offsets of each complete group. range() keeps these
        # exact, where a float linspace cast to int could round or yield
        # overlapping windows when the row count is not a multiple of `tests`.
        for start in range(0, len(thrPut) - tests + 1, tests):
            window = slice(start, start + tests)
            newVals.append(dict(runtime=rTime[window].mean(),
                                throughput_Mbps=thrPut[window].mean(),
                                errors=sem(thrPut[window])))
            # NOTE(review): 0:7 keeps the leading metadata columns only;
            # with the kwargs passed by loop() this appears to drop
            # connectTime - confirm that is intended.
            info.append(df0.iloc[start, 0:7])

        df1 = pd.DataFrame(info, index=range(len(info)))
        df2 = pd.DataFrame(newVals)
        return pd.concat([df1, df2], axis=1)

    def loop(self, da, diag_kwargs):
        """Time storing ``da`` into the null store across worker counts.

        For every worker count in the sweep the cluster is rescaled and the
        store is executed ``tests`` times; each run is recorded through
        ``diag_timer`` together with ``diag_kwargs``.

        Args:
            da: dask array to be stored.
            diag_kwargs: extra fields recorded with every timing row.

        Returns:
            A DataFrame of the recorded diagnostics with an added
            ``throughput_Mbps`` column, aggregated by ``errorCalc`` when
            more than one test per worker count was run.
        """
        global tests, max_workers, worker_step
        worker_range = np.arange(max_workers, 0, -worker_step)
        # max_workers is inserted a second time at the front, so the largest
        # worker count is benchmarked twice.
        # NOTE(review): presumably a warm-up pass - confirm; its row is kept
        # in the returned frame.
        worker_range = np.insert(worker_range, 0, max_workers)
        for nworkers in worker_range:
            cluster.scale(nworkers)
            time.sleep(10)
            client.wait_for_workers(nworkers)
            print('Number of Workers:', nworkers)
            for _ in range(tests):
                with diag_timer.time(nworkers=total_workers(), nthreads=total_nthreads(),
                                     ncores=total_ncores(), **diag_kwargs):
                    future = dsa.store(da, null_store, lock=False, compute=False)
                    dask.compute(future, retries=5)
                del future

        df = diag_timer.dataframe()
        # nbytes / 1e6 / seconds is megabytes per second, despite the
        # "Mbps" in the column name (kept for downstream compatibility).
        df['throughput_Mbps'] = da.nbytes / 1e6 / df['runtime']
        # Aggregate only when there are repeated runs to average; test the
        # configured repeat count directly instead of a leaked loop index.
        if tests > 1:
            df = self.errorCalc(df)
        return df


# Rebind the class name to a single shared instance; later cells call
# mainLoop.loop(...) on this instance.
mainLoop = mainLoop()

---------------

## Perform Benchmarking

In [None]:
# Loop Parameters
# tests: number of timed repetitions per worker count.
# max_workers / worker_step: the sweep starts at max_workers and steps down
# by worker_step while the count stays above zero.
tests = 5
max_workers = 40
worker_step = 4

# Data Location
# root and data are concatenated with a format-specific suffix to build each
# dataset path; token is passed as the 'token' storage option when opening
# remote data.
root = 'gs://cloud-data-benchmarks/'
data = 'ETOPO1_Ice_g_gmt4'
token = '/var/lib/pworks/cloud-data-benchmarks.json'

### Tabular Data

#### Parquet

In [None]:
# Dataset suffixes, one per codec; the '.100MB' dataset is recorded as 'snappy'.
compressors = ['.100MB', '.lz4', '.gzip', '.zstd']

for i in compressors:
    suffix_label = i.split('.')[-1]
    print('Compression Algorithm:', suffix_label)

    # Scale to full size before opening the dataset.
    cluster.scale(max_workers)
    client.wait_for_workers(max_workers)

    # Time how long it takes to open the remote Parquet dataset.
    tic1 = time.time()
    df0 = dd.read_parquet(f'{root}{data}{i}.partparquet/*',
                          storage_options={'token': token})
    toc1 = time.time()
    connectTime = toc1 - tic1

    # Convert to a dask array, then scale the cluster back to zero before
    # the benchmark loop rescales it.
    da = df0.to_dask_array(lengths=True)
    del df0
    chunksize = np.prod(da.chunksize) * da.dtype.itemsize
    cluster.scale(0)

    diag_kwargs = dict(
        nbytes=da.nbytes,
        chunksize=chunksize,
        format='parquet',
        compressor='snappy' if i == '.100MB' else suffix_label,
        connectTime=connectTime,
    )

    df = mainLoop.loop(da, diag_kwargs)
    name('partparquet', df)

### Gridded Data

In [None]:
# Open the '.100MB.nc' dataset and show its data variables, to help choose
# the variable benchmarked below.
intake.open_netcdf(root + data + '.100MB.nc',
                   storage_options={'token':token}).to_dask().data_vars # Lists all data variables contained in the data set.

In [None]:
# Name of the data variable pulled out of each gridded dataset (ds[variable]).
variable = 'SLP'

#### Zarr

In [None]:
# Dataset suffixes, one per codec; the '.100MB' dataset is recorded as 'lz4'.
compressors = ['.blosclz', '.100MB', '.lz4hc', '.zlib', '.zstd', '.gzip', '.bzip2']

for i in compressors:
    suffix_label = i.split('.')[-1]
    print('Compression Algorithm:', suffix_label)

    # Time how long it takes to open the remote Zarr store.
    tic1 = time.time()
    ds = xr.open_zarr(
        store=f'{root}{data}{i}.zarr',
        consolidated=True,
        storage_options={'token': token},
    )
    toc1 = time.time()
    connectTime = toc1 - tic1

    # Keep only the dask array backing the selected variable.
    da = ds[variable].data
    del ds
    chunksize = np.prod(da.chunksize) * da.dtype.itemsize

    diag_kwargs = dict(
        nbytes=da.nbytes,
        chunksize=chunksize,
        format='Zarr',
        compressor='lz4' if i == '.100MB' else suffix_label,
        connectTime=connectTime,
    )

    df = mainLoop.loop(da, diag_kwargs)
    name('zgroup', df)

#### TileDB Embedded 

In [None]:
# No TileDB dataset suffixes are configured, so the loop below currently
# does nothing; add suffixes here to enable it.
compressors = []

# os.environ is a mapping: assign with square brackets. A call expression on
# the left-hand side of "=" is a SyntaxError and prevents the cell from running.
# Presumably TileDB's GCS backend reads the key file from this variable -
# TODO confirm.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = token

for i in compressors:
    print('Compression Algorithm',i.split('.')[-1])

    # Time how long it takes to open the remote TileDB array.
    # NOTE(review): the path does not include `i`, so every compressor entry
    # would open the same array - confirm the intended dataset naming.
    tic1 = time.time()
    da = dsa.from_tiledb(root + data + '.tldb')
    toc1 = time.time()
    connectTime = toc1 - tic1
    chunksize = np.prod(da.chunksize) * da.dtype.itemsize

    # The '.100MB' dataset is recorded as 'LZ4'; any other suffix is
    # recorded under its own name.
    if i == '.100MB':
        diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='tiledb',
                           compressor='LZ4', connectTime=connectTime)
    else:
        diag_kwargs = dict(nbytes=da.nbytes, chunksize=chunksize, format='tiledb',
                           compressor=i.split('.')[-1], connectTime=connectTime)

    df = mainLoop.loop(da, diag_kwargs)
    name('tldb', df)

-------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Scale the cluster down to zero workers now that the benchmarks are done.
cluster.scale(0)

In [None]:
# Close the client first, then the cluster it was attached to.
client.close()
cluster.close()

## Plot Throughput

In [None]:
# Stack every result frame registered through name() into a single table.
df_results = pd.concat(diag_timer.names, ignore_index=True)
# Lift the row-display limit so the whole table is shown.
pd.set_option('display.max_rows', None)
# Bare expression: the notebook renders the frame as the cell output.
df_results