# File Transformations - Chunks

In [None]:
import pandas as pd
import time
import dask.dataframe as dd
import dask.array as dsa
import zarr
import xarray as xr
import numpy as np
import intake
from contextlib import contextmanager
import tiledb

In [None]:
import os
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

# Shared directory for Dask job logs, plus the conda installation and
# environment that every SLURM job activates before starting its workers.
dask_dir = '/mnt/shared/dask'
conda_dir = '/var/lib/pworks/miniconda3'
conda_env = 'cloud-data'
print('Conda Directory:', conda_dir, '\nConda Environment:', conda_env)

# Shell line handed to each job through env_extra so the job runs inside
# the conda environment named above.
conda_sh = os.path.join(conda_dir, 'etc/profile.d/conda.sh')
activate_conda = f'source {conda_sh}; conda activate {conda_env}'

cluster_options = dict(
    project='cg-cloudmgmt',
    cores=4,                    # Number of cores in the job
    memory='16GB',              # Worker memory limit will be memory/processes
    processes=4,                # Number of Dask workers; threads per worker will be cores/processes
    name='gcpslurmv2basic',     # Name of cluster
    queue='compute',            # Partition name
    job_cpu=4,                  # Number of cpus per job
    job_mem='16GB',             # Amount of memory per job
    walltime='01:00:00',
    log_directory=os.path.join(dask_dir, 'logs'),
    env_extra=[activate_conda],
    header_skip=['--mem'],
)
cluster = SLURMCluster(**cluster_options)

# Attach a client to the cluster and show the job script it generates.
client = Client(cluster)
print('Job Script:\n',cluster.job_script())

In [None]:
# Path to the Google Cloud credentials JSON passed as `token` to every
# storage call below. (Alternative: export GOOGLE_APPLICATION_CREDENTIALS
# and read it with os.environ.get('GOOGLE_APPLICATION_CREDENTIALS').)
token = '/var/lib/pworks/cloud-data-benchmarks.json'

# Bucket name/public URL that holds the data to convert, and the base name
# of the data set inside it.
root = 'gs://cloud-data-benchmarks/'
data = 'slp.1948-2009'

# Common prefix; each conversion appends its own suffix to this.
path = f'{root}{data}'

## Timing Setup

In [None]:
class DiagnosticTimer:
    """Collect elapsed-time measurements for labelled blocks of work.

    Every completed ``with timer.time(...)`` block appends one record to
    ``diagnostics``; ``dataframe()`` turns the records into a table.
    """

    def __init__(self):
        # One dict per completed timed block, in completion order.
        self.diagnostics = []

    @contextmanager
    def time(self, **kwargs):
        """Time the body of a ``with`` block and store the result.

        Args:
            **kwargs: Arbitrary labels (e.g. ``conversionType=...``) saved
                alongside the measurement.

        The elapsed seconds are stored under the key "Preprocessing Time".
        If the body raises, the exception propagates and nothing is
        recorded.
        """
        # perf_counter is monotonic, so the interval cannot be distorted by
        # system clock adjustments the way time.time() can.
        tic = time.perf_counter()
        yield
        kwargs["Preprocessing Time"] = time.perf_counter() - tic
        self.diagnostics.append(kwargs)

    def dataframe(self):
        """Return all recorded measurements as a pandas DataFrame."""
        return pd.DataFrame(self.diagnostics)


diag_timer = DiagnosticTimer()

## Tabular Data 

In [None]:
# Column names to assign to the CSV data. Without explicit names,
# Dask/Pandas would take the first line of data as the header row.
names = ["lon", "lat", "z"]

### CSV to Partitioned Parquets

In [None]:
def name_function(x):
    """Return the file name used for output partition number ``x``."""
    return f"ETOPO1_Ice_g_gmt4_{x}.parquet"


# Requested partition sizes. For this particular data set, these will
# output 50MB, 100MB, & 500MB sizes.
partition_sizes = ['100MB', '150MB', '500MB']

for i in partition_sizes:
    # Time the full read -> repartition -> write cycle for this size.
    with diag_timer.time(conversionType='csv2partparqet/' + i):
        df = dd.read_csv(path + '.csv', assume_missing=True, header=None, names=names, storage_options={'token':token})
        df = df.repartition(partition_size=i)
        dd.to_parquet(df, path + '.' + i + '.partparquet2', name_function=name_function, storage_options={'token':token})

    del df

# Rename the output directories so their labels match the sizes actually
# produced. The locations are derived from `path` so they always refer to
# the directories written above. Order matters: the '100MB' directory has
# to be moved away before the '150MB' one takes over its name.
for requested, actual in (('100MB', '50MB'), ('150MB', '100MB')):
    os.system('gsutil mv ' + path + '.' + requested + '.partparquet2 ' +
              path + '.' + actual + '.partparquet2')

### CSV to Partitioned CSVs

In [None]:
def name_function(i):
    """Return the file name used for output partition number ``i``."""
    return "ETOPO1_Ice_g_gmt4_" + str(i) + ".csv"

with diag_timer.time(conversionType='csv2partcsv'):
    # Read with the same credentials used for the write below and for
    # every other bucket access in this notebook.
    df = dd.read_csv(path + '.csv', assume_missing=True, storage_options={'token':token})

    #df = df.repartition(partition_size='500MB') # Only use this line if you wish to change chunksize

    # The header is written to the first partition only.
    dd.to_csv(df, path + '.partcsv', name_function=name_function, storage_options={'token':token},
              header_first_partition_only=True)

del df

## Gridded Data

In [None]:
# Lists all data variables contained in the data set (the 100MB NetCDF file
# is used as the sample).
(
    intake.open_netcdf(path + '.100MB.nc', storage_options={'token':token})
    .to_dask()
    .data_vars
)

In [None]:
# Data variable to convert, and the size labels of the NetCDF inputs
# (each label is part of an input file name: <path>.<label>.nc).
variable = "SLP"
labels = ["50MB", "100MB", "500MB"]

### Zarr Group

In [None]:
# Convert each NetCDF input into a consolidated Zarr group, timing every
# conversion separately.
for size_label in labels:
    print('Writing Size', size_label)

    # Request 40 workers and block until all of them have joined.
    cluster.scale(40)
    client.wait_for_workers(40)

    with diag_timer.time(conversionType=f'netcdf2zgroup.{size_label}'):
        source = intake.open_netcdf(f'{path}.{size_label}.nc', storage_options={'token':token})
        ds = source.to_dask()
        da = ds[variable]

        # Rechunk to the chunk sizes recorded in the variable's encoding,
        # pairing each dimension name with its chunk length.
        chunk_sizes = da.encoding['chunksizes']
        dim_names = da.dims
        da = da.chunk(chunks=dict(zip(dim_names, chunk_sizes)))

        # Write only the selected variable, as a single-variable dataset.
        ds = da.to_dataset()
        ds.to_zarr(store=f'{path}.{size_label}.zarr2', storage_options={'token':token}, consolidated=True)

    # Scale the cluster back to zero workers between conversions.
    cluster.scale(0)
    del ds, da

### Zarr Array

In [None]:
# Convert each NetCDF input into a bare Zarr array (no group or coordinate
# variables), timing every conversion separately.
for size_label in labels:
    print('Writing Size', size_label)

    # Request 40 workers and block until all of them have joined.
    cluster.scale(40)
    client.wait_for_workers(40)

    with diag_timer.time(conversionType=f'netcdf2zarray.{size_label}'):
        ds = intake.open_netcdf(f'{path}.{size_label}.nc', storage_options={'token':token}).to_dask()
        selected = ds[variable]  # Change the variable name as needed

        # Rechunk to the chunk sizes recorded in the variable's encoding,
        # then keep only the underlying Dask array.
        chunk_map = dict(zip(selected.dims, selected.encoding['chunksizes']))
        da = selected.chunk(chunks=chunk_map).data
        dsa.to_zarr(da, f'{path}.{size_label}.zarray2', storage_options={'token':token})

    # Scale the cluster back to zero workers between conversions.
    cluster.scale(0)
    del ds, da

### NetCDF to TileDB Embedded

In [None]:
# TileDB configuration; it is extended with a consolidation mode for each
# consolidation pass further down.
config = tiledb.Config()
config['vfs.gcs.project_id'] = 'modular-magpie-167320' # Input your project ID here
ctx = tiledb.Ctx(config)
filters = [tiledb.LZ4Filter(level=5)]

for i in labels:
    uri = path + '.' + i + '.tldb'

    with diag_timer.time(conversionType='netcdf2tldb.' + i):
        # Pass the same credentials as every other NetCDF read in this notebook.
        ds = intake.open_netcdf(path + '.' + i + '.nc', storage_options={'token':token}).to_dask()
        da = ds[variable]
        internal_chunks = da.encoding['chunksizes']
        original_shape = da.encoding['original_shape']
        coords = da.dims
        # Rechunk to the chunk sizes recorded in the encoding and keep only
        # the underlying Dask array.
        da = da.chunk(chunks=dict(zip(coords, internal_chunks))).data

        # --- TileDB custom schema creation --------------------------------
        filter_list = tiledb.FilterList(filters)

        # One dimension per NetCDF dimension: a zero-based index domain
        # covering the original shape, tiled with the NetCDF chunk sizes.
        dims = [
            tiledb.Dim(name=dim_name, domain=(0, extent - 1), tile=tile,
                       dtype=np.uint64, filters=filter_list)
            for dim_name, extent, tile in zip(coords, original_shape, internal_chunks)
        ]

        attr = [tiledb.Attr(name=variable, dtype=np.float32, filters=filter_list)]
        dom = tiledb.Domain(dims)

        schema = tiledb.ArraySchema(domain=dom, attrs=attr, sparse=False)
        tiledb.Array.create(uri, schema)
        # -------------------------------------------------------------------

        # Write through a context manager so the write handle is closed
        # before consolidation starts (it was previously left open).
        # The second option key used to carry a trailing space
        # ("sm.io_concurrency_level "), which made it a different key.
        with tiledb.open(uri, "w") as tdb_array:
            da.to_tiledb(tdb_array, storage_options={"sm.compute_concurrency_level": 2,
                                                     "sm.io_concurrency_level": 2})

        # Consolidation is performed on the array for increased read speed
        # from cloud object storage. One pass per mode, in this order.
        for mode in ('fragment_meta', 'fragments', 'array_meta'):
            config['sm.consolidation.mode'] = mode
            ctx = tiledb.Ctx(config)
            tiledb.consolidate(uri, ctx=ctx)

    del ds, da, uri, dims, attr, dom, schema, tdb_array

## Present Timing Results

In [None]:
# Scale the cluster to zero workers now that all conversions have run.
cluster.scale(0)

In [None]:
# Build a table with one row per timed conversion (the labels passed to
# diag_timer.time plus the "Preprocessing Time" value) and display it as
# the cell output.
df = diag_timer.dataframe()
df