# Rechunk the kerchunked dataset

In [1]:
import fsspec
import fsspec.implementations.reference
import zarr
import xarray as xr
from pathlib import Path

from rechunker import rechunk

In [2]:
fsspec.__version__

'2023.3.0+13.g95eb5f9'

In [3]:
import rechunker
rechunker.__version__

'0.5.1'

In [4]:
import zarr
zarr.__version__

'2.13.3'

#### Start a Dask Gateway cluster
Use the custom helper function `ebd.start_dask_cluster` to set options on this cluster. The helper is optional; it just cuts down on the lines of code needed in notebooks.

In [5]:
import os
import sys

# Extend the import path before importing ebdpy
# (presumably the helper package lives in this shared directory).
sys.path.append('/shared/users/rsignell/lib')
import ebdpy as ebd

os.environ['AWS_PROFILE'] = 'esip-qhub'  # use env vars for AWS credentials to write

# Options handed to the helper, which returns the client, cluster and gateway.
cluster_options = dict(
    profile=os.environ['AWS_PROFILE'],
    worker_max=30,
    region='us-west-2',
    worker_profile='Medium Worker',
    use_existing_cluster=False,
    adaptive_scaling=False,
    wait_for_cluster=False,
    propagate_env=True,
)
client, cluster, gateway = ebd.start_dask_cluster(**cluster_options)

Region: us-west-2
No Cluster running.
Starting new cluster.
Setting Cluster Environment Variable AWS_DEFAULT_REGION us-west-2
Setting Fixed Scaling workers=30
Reconnect client to clear cache
client.dashboard_link (for new browser tab/window or dashboard searchbar in Jupyterhub):
https://nebari.esipfed.org/gateway/clusters/dev.62dbc6b34d734c7990efe16cb454e36e/status
Propagating environment variables to workers
Using environment: users/users-pangeo


#### Open Entire Kerchunked Dataset, lazy loaded from parquet

In [None]:
#client.close()

In [7]:
%%time
# references are on an OSN pod (no credentials needed)
url = 's3://rsignellbucket2/noaa/nwm/grid1km/refs/'

target_opts = {'anon':True, 'skip_instance_cache':True,
              'client_kwargs': {'endpoint_url': 'https://renc.osn.xsede.org'}}

# netcdf files are on the AWS public dataset program (no credentials needed)
remote_opts = {'anon':True}

fs = fsspec.filesystem("reference", fo=url, 
                       remote_protocol='s3', remote_options=remote_opts,
                      target_options=target_opts)
m = fs.get_mapper("")

CPU times: user 73 µs, sys: 5 µs, total: 78 µs
Wall time: 80.8 µs


In [None]:
ds = xr.open_dataset(m, engine='zarr', backend_kwargs=dict(consolidated=False))

In [6]:
ds = xr.open_dataset(m, engine='zarr', chunks={}, 
                     backend_kwargs=dict(consolidated=False))

KeyboardInterrupt: 

In [None]:
s3_lazy_refs = 's3://esip-qhub-public/nwm/LDAS-1k/lazyrefs'

In [None]:
%%time
fs = fsspec.implementations.reference.DFReferenceFileSystem(s3_lazy_refs, lazy=True, target_options={"anon": True},
                                                            remote_protocol="s3", remote_options={"anon": True})
m = fs.get_mapper("")

ds = xr.open_dataset(m, engine="zarr", chunks={'time':1, 'y':3840, 'x':4608}, 
                     backend_kwargs=dict(consolidated=False))

In [None]:
ds

#### Select high-priority vars only

In [None]:
ds = ds[['ACCET', 'SNEQV', 'FSNO']]

In [None]:
ds

In [None]:
ds['ACCET'].isel(time=slice(0,72))

In [None]:
ds.attrs

#### Set up the temporary and final Zarr stores on S3

In [None]:
fs_write = fsspec.filesystem('s3', anon=False, skip_instance_cache=True)

In [None]:
temp_name = 'esip-qhub/testing/usgs/nwm1km.tmp'
target_name = 'esip-qhub/testing/usgs/nwm1km.zarr'

In [None]:
# Start from a clean slate: remove any temporary or target store left over
# from a previous run so the rechunk does not write into stale data.
for store_path in (temp_name, target_name):
    try:
        fs_write.rm(store_path, recursive=True)
    except FileNotFoundError:
        # Nothing to delete (e.g. first run). Any other failure, such as a
        # credentials or permission error, is allowed to surface instead of
        # being silently ignored.
        pass

In [None]:
temp_store = fs_write.get_mapper(temp_name)
target_store = fs_write.get_mapper(target_name)

In [None]:
# Number of complete 72-step blocks along the time dimension.
a = len(ds.time) // 72
a

In [None]:
# Number of (96*2) x (132*2) tiles needed to cover the x/y grid (may be fractional).
b = len(ds.x) * len(ds.y) / ((96 * 2) * (132 * 2))
b

In [None]:
a/b

In [None]:
#client.close()

In [None]:
#from dask.distributed import Client

In [None]:
#client = Client(threads_per_worker=1)

In [None]:
#client.amm.start()

In [None]:
import zarr.storage
from numcodecs import Zstd
# Replace zarr's module-level default compressor with Zstandard at level 9.
# NOTE(review): presumably this is the compressor applied to arrays written
# later in this notebook when none is given explicitly -- confirm.
zarr.storage.default_compressor = Zstd(level=9)

In [None]:
# Size of one 192 x 264 x 72 chunk in units of 1e6 bytes; these match the
# y (96*2), x (132*2) and time (72) target chunk lengths passed to rechunk.
# NOTE(review): the factor 4 is presumably bytes per element (e.g. float32) -- confirm.
(192 * 264 * 72)*4/1e6

#### Rechunk!

In [None]:
# Build the rechunking plan for the first 72 time steps only.
rechunked = rechunk(
    ds.isel(time=slice(0, 72)),
    target_chunks=dict(y=96 * 2, x=132 * 2, time=72),
    max_mem='2.8GiB',
    target_store=target_store,
    temp_store=temp_store,
)

In [None]:
%%time
rechunked.execute(retries=10)

In [None]:
# Consolidate the metadata of the rechunked target store.
zarr.consolidate_metadata(target_store)

#### Explore the rechunked dataset

In [None]:
ds2 = xr.open_dataset(target_store, engine='zarr', chunks={})

In [None]:
ds2

In [None]:
ds2.ACCET

In [None]:
import hvplot.xarray


In [None]:
ds2.ACCET[:,2000,2000].hvplot(x='time')