# Rechunk the kerchunked dataset

In [None]:
import fsspec
from rechunker import rechunk
import zarr
import xarray as xr
from pathlib import Path

#### Start a Dask Gateway cluster
Use the custom helper function `ebd.start_dask_cluster` to set options on this cluster. The helper is optional; it just cuts down on the lines of code needed in notebooks.

In [None]:
import os
import sys

# The helper package is not installed; it is imported from a shared
# directory that has to be put on the module search path first.
sys.path.append('/shared/users/lib')
import ebdpy as ebd

# AWS credentials for writing are selected via an environment variable.
os.environ['AWS_PROFILE'] = 'esip-qhub'

# All cluster options are handed to the helper; their exact semantics are
# defined in `ebdpy`, not in this notebook.
client, cluster = ebd.start_dask_cluster(
    profile=os.environ['AWS_PROFILE'],
    region='us-west-2',
    worker_max=30,
    adaptive_scaling=False,
    use_existing_cluster=True,
    wait_for_cluster=True,
    propagate_env=True,
)

#### Open Kerchunked Dataset

In [None]:
# Location of the kerchunk reference JSON for a single year.
# Earlier variant, kept for reference:
#   's3://esip-qhub/noaa/nwm/grid1km/combined.json'
year = '1980'
combined_json = (
    's3://esip-qhub/noaa/nwm/grid1km/'
    f'combined_{year}.json'
)

In [None]:
# s_opts: how to fetch the reference JSON itself.
# r_opts: how to fetch the remote chunks that the references point to.
s_opts = dict(requester_pays=True, skip_instance_cache=True)
r_opts = dict(anon=True)

fs = fsspec.filesystem(
    "reference",
    fo=combined_json,
    ref_storage_args=s_opts,
    remote_protocol='s3',
    remote_options=r_opts,
)
m = fs.get_mapper("")

# Open lazily through the zarr engine with one time step per chunk;
# the reference store is read without consolidated metadata.
ds = xr.open_dataset(
    m,
    engine="zarr",
    chunks=dict(time=1, y=3840, x=4608),
    backend_kwargs={'consolidated': False},
)

#### Select high-priority vars only

In [None]:
# Narrow the dataset to the selected variables (plus `crs`).
ds = ds[[
    'ACCET',
    'SNEQV',
    'FSNO',
    'crs',
]]

In [None]:
# Display the dataset so its variables, dimensions and chunking can be inspected.
ds

In [None]:
# Preview ACCET restricted to the first 144 time steps.
ds.ACCET.isel(time=slice(None, 144))

In [None]:
# Display the dataset-level attributes.
ds.attrs

#### Set up the temporary and final zarr stores on S3

In [None]:
# Non-anonymous S3 filesystem used for the output stores; a fresh
# instance is requested rather than a cached one.
fs_write = fsspec.filesystem(
    's3',
    skip_instance_cache=True,
    anon=False,
)

In [None]:
# S3 locations (bucket/key prefix) of the two stores used by the rechunk:
# the final output and the intermediate scratch store.
target_name = 'esip-qhub/testing/usgs/nwm1km.zarr'
temp_name = 'esip-qhub/testing/usgs/nwm1km.tmp'

In [None]:
# Remove any leftover temporary store from a previous run.  On a first run
# the prefix does not exist yet and the removal raises FileNotFoundError;
# that case is harmless, so it is ignored to keep the cell re-runnable.
try:
    fs_write.rm(temp_name, recursive=True)
except FileNotFoundError:
    pass

In [None]:
# Remove any previous output store so the rechunk writes into a clean
# location.  A missing prefix (first run) raises FileNotFoundError, which
# is harmless here and is ignored to keep the cell re-runnable.
try:
    fs_write.rm(target_name, recursive=True)
except FileNotFoundError:
    pass

In [None]:
# Key-value mappers over the two S3 prefixes, in the order (temp, target).
temp_store, target_store = (
    fs_write.get_mapper(location)
    for location in (temp_name, target_name)
)

In [None]:
# Drop the `crs` variable before rechunking.  `Dataset.drop` is deprecated
# for removing variables, so `drop_vars` is used instead; errors='ignore'
# lets the cell be re-run after `crs` has already been removed.
ds = ds.drop_vars('crs', errors='ignore')

#### Rechunk!

In [None]:
# Build the rechunking plan for the first 144 time steps only.
# Chunk lengths must be integers: `144 / 2` is true division and yields the
# float 72.0, so floor division is used to get the int 72.
rechunked = rechunk(
    ds.isel(time=slice(0, 144)),
    target_chunks={'y': 96 * 2, 'x': 132 * 2, 'time': 144 // 2},
    target_store=target_store,
    temp_store=temp_store,
    max_mem='3.5GiB',
)

In [None]:
%%time
# Run the rechunking plan and time the whole cell.  `retries=10` is passed
# to the executor (presumably the Dask per-task retry count; confirm).
rechunked.execute(retries=10)

In [None]:
# Consolidate the metadata of the freshly written target store.
zarr.convenience.consolidate_metadata(store=target_store)

#### Explore the rechunked dataset

In [None]:
# Reopen the rechunked store through the zarr engine.  `chunks={}` requests
# dask-backed arrays (presumably using the chunking stored on disk; confirm).
ds2 = xr.open_dataset(
    target_store,
    chunks={},
    engine='zarr',
)

In [None]:
# Display the reopened dataset.
ds2

In [None]:
# Display the ACCET variable of the reopened dataset to inspect its chunking.
ds2['ACCET']