# Try rechunking xarray dataset to Zarr using Rechunker

In [1]:
import shutil

import xarray as xr
from dask.distributed import Client, performance_report, LocalCluster
from rechunker import rechunk
import zarr

In [2]:
# Start a local Dask cluster: 12 workers, one thread each.
cluster = LocalCluster(n_workers=12, threads_per_worker=1)

In [3]:
# Render the cluster's notebook widget as this cell's output.
cluster

VBox(children=(HTML(value='<h2>LocalCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    …

In [4]:
# Connect a Dask client to the local cluster created above.
client = Client(cluster)

In [5]:
# Remote source dataset, read through the THREDDS `dodsC` endpoint.
src = 'http://thredds.ucar.edu/thredds/dodsC/grib/NCEP/HRRR/CONUS_2p5km/Best'
# Scratch location handed to rechunker as its temporary store.
temp = '/caldera/projects/usgs/hazards/cmgp/woodshole/rsignell/tmp'
# Destination of the rechunked output.
target = '/caldera/projects/usgs/hazards/cmgp/woodshole/rsignell/EarthMap/Projects/de_bay/foo'

# Open the dataset as Dask arrays with a chunk length of 1 along `time`.
ds = xr.open_dataset(src, chunks={'time': 1})

Initialize every variable in the chunk plan as unchunked (`None`); explicit chunk sizes are then assigned only to the 4D data variables.

In [6]:
# Try just one variable for now. The double brackets select a list of
# variables, so `ds` stays a Dataset (the later cells rely on
# `ds.variables` and `ds.data_vars`).
ds = ds[['Temperature_height_above_ground']]

In [7]:
# Display the dataset to inspect its variables, shapes and Dask chunking.
ds

Unnamed: 0,Array,Chunk
Bytes,840 B,8 B
Shape,"(105,)","(1,)"
Count,106 Tasks,105 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 840 B 8 B Shape (105,) (1,) Count 106 Tasks 105 Chunks Type datetime64[ns] numpy.ndarray",105  1,

Unnamed: 0,Array,Chunk
Bytes,840 B,8 B
Shape,"(105,)","(1,)"
Count,106 Tasks,105 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.24 GB,11.81 MB
Shape,"(105, 1, 1377, 2145)","(1, 1, 1377, 2145)"
Count,106 Tasks,105 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.24 GB 11.81 MB Shape (105, 1, 1377, 2145) (1, 1, 1377, 2145) Count 106 Tasks 105 Chunks Type float32 numpy.ndarray",105  1  2145  1377  1,

Unnamed: 0,Array,Chunk
Bytes,1.24 GB,11.81 MB
Shape,"(105, 1, 1377, 2145)","(1, 1, 1377, 2145)"
Count,106 Tasks,105 Chunks
Type,float32,numpy.ndarray


In [8]:
# Seed the plan with an entry for every variable in the dataset, each mapped
# to None; the 4D data variables get explicit chunk sizes in a later cell.
chunk_plan = {name: None for name in ds.variables}

Specify chunks for 4D variables

In [9]:
# Target chunk length for each dimension name.
chunks = {'time': 40, 'x': 300, 'y': 300, 'height_above_ground1': 1}

# For every 4D data variable, build a {dimension: chunk length} mapping from
# the table above. A dimension missing from `chunks` raises KeyError.
for name, data_array in ds.data_vars.items():
    if data_array.ndim == 4:
        chunk_plan[name] = {dim: chunks[dim] for dim in data_array.dims}

In [10]:
# Remove leftovers from a previous run so the rechunk starts with empty temp
# and target locations. This stays best-effort: a missing directory is the
# normal case on a first run, and any other filesystem problem is reported
# instead of being hidden. Only OSError is handled, so programming errors
# (e.g. an undefined name) and KeyboardInterrupt still propagate.
for store_path in (temp, target):
    try:
        shutil.rmtree(store_path)
    except FileNotFoundError:
        pass  # nothing to clean up
    except OSError as err:
        print(f"Could not remove {store_path}: {err}")

In [11]:
# Memory limit string passed to rechunker as `max_mem`.
max_mem = '4GB'

# Build (but do not yet run) the rechunking plan from the dataset to `target`,
# using `temp` as the temporary store.
# NOTE(review): in the saved run this call raised
# "AttributeError: 'Array' object has no attribute 'attrs'" - confirm that the
# installed rechunker version accepts an xarray Dataset as the source.
array_plan = rechunk(
    ds,
    target_chunks=chunk_plan,
    max_mem=max_mem,
    target_store=target,
    temp_store=temp,
)

AttributeError: 'Array' object has no attribute 'attrs'

In [None]:
# Execute the rechunking plan inside a Dask performance report, which is
# written to dask-report.html. `retries=10` is forwarded to execute().
with performance_report(filename="dask-report.html"):
    result = array_plan.execute(retries=10)

In [None]:
# Release Dask resources. The client is closed first so it is not left
# connected to a scheduler that is about to be shut down with the cluster.
client.close()
cluster.close()