# Rechunk the kerchunked dataset

In [None]:
import fsspec
import fsspec.implementations.reference
import zarr
import xarray as xr
from pathlib import Path

from rechunker import rechunk

In [None]:
# Show the installed rechunker version as the cell output.
import rechunker
rechunker.__version__

In [None]:
# Show the installed zarr version as the cell output.
import zarr
zarr.__version__

In [None]:
#client.close(); cluster.close()

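#### Start a Dask cluster
This notebook starts a `LocalCluster` with 40 single-threaded workers directly. The custom helper function `ebd.start_dask_cluster` (which sets options on a Dask Gateway cluster) is not used here; we don't have to use that helper, as it only cuts down on lines of code in notebooks.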

In [None]:
from dask.distributed import Client, LocalCluster

# Local Dask cluster: 40 workers, each limited to a single thread.
cluster = LocalCluster(n_workers=40, threads_per_worker=1)

# Connect a client to the cluster; the trailing expression displays its summary.
client = Client(cluster)
client

#### Open Combined Kerchunked Dataset

In [None]:
# Path to the combined kerchunk reference file (JSON) describing the source dataset.
combined_json = '/caldera/hytest_scratch/scratch/rsignell/sudhir.json'

In [None]:
%%time
fs = fsspec.filesystem("reference", fo=combined_json, ref_storage_args={'skip_instance_cache':True})
m = fs.get_mapper("")

ds = xr.open_dataset(m, engine="zarr", mask_and_scale=False,
                     backend_kwargs={'consolidated':False}, chunks={})

In [None]:
# Display the full kerchunked dataset (dimensions, variables, chunking).
ds

In [None]:
# Keep only the first 144 time steps so the rechunking test runs on a small subset.
ds = ds.isel(time=slice(0,144))

In [None]:
# Display the dataset again after the time subset was applied.
ds

In [None]:
# Uncompressed (in-memory) size of the subset dataset, in GB.
ds.nbytes/1e9

#### Set up the temporary and final Zarr stores on the local filesystem

In [None]:
# fsspec filesystem for the local ('file') protocol; used for all writes below.
fs_write = fsspec.filesystem('file')

In [None]:
# Locations of the intermediate (.tmp) store and the final rechunked (.zarr) store.
temp_name = '/caldera/hytest_scratch/scratch/rsignell/AORC/rechunk/test01.tmp'
target_name = '/caldera/hytest_scratch/scratch/rsignell/AORC/rechunk/test01.zarr'

In [None]:
# Wrap both paths as fsspec key-value mappers so they can be used as zarr stores.
temp_store = fs_write.get_mapper(temp_name)
target_store = fs_write.get_mapper(target_name)

In [None]:
# Start from a clean slate: remove any temp/target store left over from a
# previous run before new data is written to those locations.
for stale_path in (temp_name, target_name):
    try:
        fs_write.rm(stale_path, recursive=True)
    except FileNotFoundError:
        # Nothing to clean up (e.g. first run). Any other failure, such as a
        # permissions problem, is allowed to surface instead of being hidden.
        pass

In [None]:
# Number of time steps actually present in the (subset) dataset ...
nt = len(ds.time)
# ... immediately overridden with 365*24*30 for the estimates below.
# NOTE(review): presumably ~30 years of hourly steps (ignoring leap days) -- confirm.
nt = 365*24*30

In [None]:
# Number of 144-step blocks along the time axis (float division).
# NOTE(review): the rechunk cell below requests time chunks of 168, not 144 -- confirm which is intended.
a = int(nt)/144
a

In [None]:
# Number of 400 x 400 (i.e. (2*200) x (2*200)) tiles needed to cover the horizontal grid.
b = (len(ds.longitude) * len(ds.latitude))/((2*200)*(2*200))
b

In [None]:
# Ratio of time blocks (a) to spatial tiles (b).
a/b

In [None]:
# NOTE(review): scratch arithmetic; the meaning of 2.8 and 4 is not evident from this notebook.
2.8/4

In [None]:
#client.close()

In [None]:
#from dask.distributed import Client

In [None]:
#client = Client(threads_per_worker=1)

In [None]:
#client.amm.start()

In [None]:
import zarr.storage
from numcodecs import Zstd
# Replace zarr's module-level default compressor with Zstandard at level 9.
# NOTE(review): presumably intended to control compression of the rechunked
# output -- confirm that rechunker actually picks up this default.
zarr.storage.default_compressor = Zstd(level=9)

In [None]:
# NOTE(review): presumably the size in MB of a 200 x 200 x 144 chunk at 2 bytes per value -- confirm.
(200*200*144)*2/1e6

In [None]:
# NOTE(review): presumably a per-worker memory budget (190 split over 40 workers, times 0.7) -- confirm units and intent.
190/40*.7

In [None]:
# Requested chunk length per named dimension, verbosity flag, and the memory
# limit string handed to rechunker.
chunks = {'latitude': 400, 'longitude': 400, 'time': 168}
verbose = True
mem = '3.0GB'

# One chunk tuple per variable, ordered like that variable's dimensions:
#   * dimension listed in `chunks`  -> requested length, clipped to the
#     dimension's actual length when the request is larger than the array;
#   * any other dimension           -> a single full-length chunk.
group_chunks = {
    name: tuple(
        min(chunks[dim], len(ds[dim])) if dim in chunks else len(ds[dim])
        for dim in ds[name].dims
    )
    for name in ds.variables
}

if verbose:
    print(f"Rechunking to: {group_chunks}")
    print(f"mem:{mem}")


#### Rechunk!

In [None]:
%%time
rechunked = rechunker.rechunk(ds, target_chunks=group_chunks, max_mem=mem,
                              target_store=target_store, temp_store=temp_store)
rechunked.execute(retries=10)

In [None]:
# Write consolidated metadata for the target store.
zarr.consolidate_metadata(target_store)

#### Explore the rechunked dataset

In [None]:
# Open the rechunked Zarr store lazily as dask arrays (chunks={}).
ds2 = xr.open_dataset(target_store, engine='zarr', chunks={})

In [None]:
# Uncompressed data size of the rechunked dataset, in GB.
ds2.nbytes/1e9   #uncompressed data size in GB

In [None]:
# Display the rechunked dataset (dimensions, variables, chunking).
ds2

In [None]:
# Inspect the APCP_surface variable of the rechunked dataset.
ds2.APCP_surface

In [None]:
# Imported for its side effect: provides the `.hvplot` accessor used on the DataArray below.
import hvplot.xarray

In [None]:
%%time
da = ds.APCP_surface.sel(longitude=-115.18, latitude=46.65, method='nearest').load()

In [None]:
# Interactive plot of the extracted time series with time on the x axis and a grid.
da.hvplot(x='time', grid=True)

In [None]:
cluster.close()

In [None]:
client.close()

In [None]:
# Display the client once more to check its state after shutdown.
client