## Time to read zarr consolidated metadata depends on the number of chunks

In [4]:
import os
import glob
import numpy as np
import xarray as xr
import dask.array as da
import timeit

### Functions to write and read dummy zarr stores

In [2]:
# Length of each axis (time, lat, lon) of every dummy variable.
NT = 20
NY = 20
NX = 20
# Number of data variables written to each store.
NV = 8

def create_store(chunksize, store):
    """Write a dummy dataset to a zarr store with consolidated metadata.

    The dataset holds NV variables named "v1" ... "v{NV}", each backed by the
    same dask zero array of shape (NT, NY, NX) that is chunked with
    ``chunksize`` along all three axes.

    Parameters
    ----------
    chunksize : int
        Chunk length applied to the time, lat and lon dimensions alike.
    store
        Target passed to ``Dataset.to_zarr``; it is written with mode="w".
    """
    print(f"Writing zarr store with chunksize={chunksize}")

    dims = ("time", "lat", "lon")
    shape = (NT, NY, NX)
    zeros = xr.DataArray(
        data=da.zeros(shape, chunks=(chunksize,) * len(shape)),
        coords={dim: np.arange(size) for dim, size in zip(dims, shape)},
        dims=dims,
    )
    # Every variable shares the very same underlying array.
    dset = xr.Dataset({f"v{n}": zeros for n in range(1, NV + 1)})
    dset.to_zarr(store, consolidated=True, mode="w")


def read_metadata(store):
    """Open ``store`` with ``xr.open_zarr`` using consolidated metadata and return the dataset."""
    return xr.open_zarr(store, consolidated=True)

### Create zarr stores with varying chunksizes

In [5]:
chunksizes = [1, 2, 20]
stores = ["store-chunk-{:02.0f}".format(chunksize) for chunksize in chunksizes]

# Stores already present on disk are reused, so re-running this cell is cheap.
for chunksize, store in zip(chunksizes, stores):
    if os.path.isdir(store):
        continue
    create_store(chunksize, store)

# Display the dataset opened from the last store in the list.
read_metadata(stores[-1])

Writing zarr store with chunksize=1
Writing zarr store with chunksize=2
Writing zarr store with chunksize=20


Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 64.00 kB 64.00 kB Shape (20, 20, 20) (20, 20, 20) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",20  20  20,

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 64.00 kB 64.00 kB Shape (20, 20, 20) (20, 20, 20) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",20  20  20,

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 64.00 kB 64.00 kB Shape (20, 20, 20) (20, 20, 20) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",20  20  20,

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 64.00 kB 64.00 kB Shape (20, 20, 20) (20, 20, 20) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",20  20  20,

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 64.00 kB 64.00 kB Shape (20, 20, 20) (20, 20, 20) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",20  20  20,

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 64.00 kB 64.00 kB Shape (20, 20, 20) (20, 20, 20) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",20  20  20,

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 64.00 kB 64.00 kB Shape (20, 20, 20) (20, 20, 20) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",20  20  20,

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 64.00 kB 64.00 kB Shape (20, 20, 20) (20, 20, 20) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",20  20  20,

Unnamed: 0,Array,Chunk
Bytes,64.00 kB,64.00 kB
Shape,"(20, 20, 20)","(20, 20, 20)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray


### Read metadata from each store 50 times

In [6]:
# Number of chunks per store: each axis is split into ceil(size / chunksize)
# pieces (the last piece may be partial), for each of the NV variables.
# Integer ceil-division keeps the count exact even when chunksize does not
# divide the axis length evenly.
nchunks = [
    NV * -(-NT // chunksize) * -(-NY // chunksize) * -(-NX // chunksize)
    for chunksize in chunksizes
]

n_repeats = 50  # number of metadata reads timed per store

results = {}
for store, nchunk in zip(stores, nchunks):
    # timeit returns the total time for all n_repeats executions.
    result = timeit.timeit(
        f"read_metadata('{store}')",
        setup="from __main__ import read_metadata",
        number=n_repeats,
    )
    results[f"{nchunk} chunks"] = f"{result:0.2f}s"
results

{'64000 chunks': '1.08s', '8000 chunks': '0.31s', '8 chunks': '0.21s'}

## Profiling

### Fast store (chunksize=20, 8 chunks)

In [28]:
# Profile a single metadata read of the store written with chunksize=20;
# "-l 10" restricts the printed report to 10 entries.
%prun -l 10 read_metadata("store-chunk-20")

 

         17463 function calls (17119 primitive calls) in 0.020 seconds

   Ordered by: internal time
   List reduced from 472 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      262    0.001    0.000    0.002    0.000 indexing.py:556(shape)
2931/2906    0.001    0.000    0.001    0.000 {built-in method builtins.isinstance}
        6    0.000    0.000    0.000    0.000 {built-in method posix.stat}
      135    0.000    0.000    0.003    0.000 variable.py:308(__init__)
       98    0.000    0.000    0.001    0.000 util.py:277(normalize_storage_path)
       24    0.000    0.000    0.001    0.000 core.py:2434(normalize_chunks)
       16    0.000    0.000    0.000    0.000 inspect.py:2760(__init__)
       11    0.000    0.000    0.004    0.000 conventions.py:262(decode_cf_variable)
1619/1577    0.000    0.000    0.000    0.000 {built-in method builtins.len}
       16    0.000    0.000    0.001    0.000 inspect.py:2112(_signature_from_

### Slow store (chunksize=1, 64000 chunks)

In [29]:
# Profile a single metadata read of the store written with chunksize=1;
# "-l 10" restricts the printed report to 10 entries.
%prun -l 10 read_metadata("store-chunk-01")


 

         20623 function calls (20279 primitive calls) in 0.172 seconds

   Ordered by: internal time
   List reduced from 472 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        8    0.066    0.008    0.066    0.008 core.py:259(<listcomp>)
        8    0.064    0.008    0.141    0.018 core.py:222(getem)
        8    0.007    0.001    0.008    0.001 core.py:203(slices_from_chunks)
        1    0.005    0.005    0.173    0.173 <string>:1(<module>)
      262    0.002    0.000    0.003    0.000 indexing.py:556(shape)
        8    0.001    0.000    0.145    0.018 core.py:2760(from_array)
2931/2906    0.001    0.000    0.001    0.000 {built-in method builtins.isinstance}
       98    0.001    0.000    0.001    0.000 util.py:277(normalize_storage_path)
        4    0.001    0.000    0.001    0.000 {built-in method io.open}
      135    0.001    0.000    0.003    0.000 variable.py:556(_parse_dimensions)