# Earthmover HH Lightning Talk: Zarr-Python 3 demo

<img src="https://raw.githubusercontent.com/zarr-developers/zarr-logo/main/zarr-pink-stacked.svg" alt="drawing" width="250"/>




In [1]:
import asyncio
import tempfile
from pprint import pprint

import numpy as np
from directory_tree import display_tree

import zarr
from zarr.buffer import default_buffer_prototype

In [2]:
# show which Zarr-Python release this demo is running against
zarr.__version__

'3.0.0a0'

## The basics

Most things will feel the same.

- As much as possible, we're keeping the top-level API the same.
- Expect some deprecations to parts of the v2 API.
- Some folks will notice changes to the Store API.

In [3]:
# Back the demo hierarchy with a throwaway directory on local disk.
# new in 3.0: stores are opened with an explicit mode
store = zarr.store.LocalStore(mode="w", root=tempfile.mkdtemp())

# Create the root group of the hierarchy.
root = zarr.group(
    attributes=dict(name="demo root group"),  # new in 3.0: pass attributes at creation time
    zarr_format=3,                            # new in 3.0: support for v2 and v3 specs
    store=store,
)

In [None]:
# NOTE(review): this cell repeats the store/group setup from the cell above;
# running both rebinds `store` and `root` to a second temporary directory.

# new in 3.0: stores are opened in read or write mode
store = zarr.store.LocalStore(mode="w", root=tempfile.mkdtemp())

# create a root group
root = zarr.group(
    store=store,                              # new in 3.0: transition to keyword only constructors (wip)
    attributes=dict(name="demo root group"),  # new in 3.0: pass attributes at group creation time
    zarr_format=3,                            # new in 3.0: support for v2 and v3 specs
)

In [4]:
# Create a 10x10 array named "foo" under the root group, chunked 5x5.
arr = root.create_array(
    name="foo",
    dtype="i4",
    shape=(10, 10),
    chunks=(5, 5),
    attributes=dict(units="foo"),  # new in 3.0: pass attributes at array creation time
    dimension_names=("x", "y"),    # new in 3.0: support array dimension names (v3 only)
)

# Attributes can still be edited after the array exists.
arr.attrs["name"] = "foo-array"

# Fill the whole array with random integers in [0, 10).
arr[:] = np.random.randint(low=0, high=10, size=arr.shape)

In [5]:
# v3 spec store layout / metadata keys / chunk key encoding
# (print the directory tree under the store's root to see them on disk)
display_tree(store.root)

tmp9jxsaz9c/
├── foo/
│   ├── c/
│   │   ├── 0/
│   │   │   ├── 0
│   │   │   └── 1
│   │   └── 1/
│   │       ├── 0
│   │       └── 1
│   └── zarr.json
└── zarr.json


## Some new things

In [None]:
# create a sharded array
from zarr.codecs import ShardingCodec, TransposeCodec, BytesCodec, BloscCodec

In [6]:
# Create a 100x100 array "bar" whose 50x50 outer chunks are shards, each one
# packing many 5x5 inner chunks into a single stored object.
sharded_arr = root.create_array(
    name="bar",
    shape=(100, 100),
    chunks=(50, 50),
    dtype='i4',
    dimension_names=('x', 'y'),    # new in 3.0: support array dimension names (v3 only)
    attributes={'units': 'bar'},   # new in 3.0: pass attributes at array creation time
    codecs=[                       # new in 3.0: codec pipelines
        zarr.codecs.ShardingCodec( # new in 3.0: sharding codec
            chunk_shape=(5, 5),    # shape of the inner chunks inside each shard
            codecs=[               # pipeline applied to every inner chunk
                zarr.codecs.TransposeCodec(order=(0, 1)),
                zarr.codecs.BytesCodec(),
                zarr.codecs.BloscCodec(cname="lz4"),
            ],
            index_location="start",
        )
    ],
)

# and write data to it
# (the shape is taken from `sharded_arr`; the previous `shard_arr` name was
# never defined and raised NameError)
sharded_arr[:] = np.random.randint(0, 10, size=sharded_arr.shape)

In [7]:
# bar/c only has 4 objects despite there being 400 chunks!
# (100x100 elements in 5x5 inner chunks = 20 * 20 = 400 chunks,
#  grouped into 50x50 shards = 2 * 2 = 4 stored objects)
display_tree(store.root)

tmp9jxsaz9c/
├── bar/
│   ├── c/
│   │   ├── 0/
│   │   │   ├── 0
│   │   │   └── 1
│   │   └── 1/
│   │       ├── 0
│   │       └── 1
│   └── zarr.json
├── foo/
│   ├── c/
│   │   ├── 0/
│   │   │   ├── 0
│   │   │   └── 1
│   │   └── 1/
│   │       ├── 0
│   │       └── 1
│   └── zarr.json
└── zarr.json


In [8]:
# the store interface is entirely async... `await` everything
# get a metadata object out of the store
(await store.get("foo/zarr.json", prototype=default_buffer_prototype)).to_bytes()

b'{"shape": [10, 10], "fill_value": 0, "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": [5, 5]}}, "attributes": {"units": "foo", "name": "foo-array"}, "zarr_format": 3, "data_type": "int32", "chunk_key_encoding": {"name": "default", "configuration": {"separator": "/"}}, "codecs": [{"name": "bytes", "configuration": {"endian": "little"}}], "dimension_names": ["x", "y"], "node_type": "array"}'

In [9]:
# Metadata is exposed as dataclasses through the new `.metadata` property;
# pretty-print it for the root group and then for the "foo" array.
for node in (root, arr):
    pprint(node.metadata)
# pprint(sharded_arr.metadata)

GroupMetadata(attributes={'name': 'demo root group'},
              zarr_format=3,
              node_type='group')
ArrayV3Metadata(shape=(10, 10),
                fill_value=0,
                chunk_grid=RegularChunkGrid(chunk_shape=(5, 5)),
                attributes={'units': 'foo'},
                zarr_format=3,
                data_type=dtype('int32'),
                chunk_key_encoding=DefaultChunkKeyEncoding(name='default',
                                                           separator='/'),
                codecs=BatchedCodecPipeline(array_array_codecs=(),
                                            array_bytes_codec=BytesCodec(endian=<Endian.little: 'little'>),
                                            bytes_bytes_codecs=(),
                                            batch_size=1),
                dimension_names=('x', 'y'),
                node_type='array')


In [10]:
# 100% type hint coverage
# (IPython's trailing `?` prints the annotated signature of Array.create)
zarr.Array.create?

Signature:
zarr.Array.create(
    store: 'StoreLike',
    *,
    shape: 'ChunkCoords',
    dtype: 'npt.DTypeLike',
    zarr_format: 'ZarrFormat' = 3,
    fill_value: 'Any | None' = None,
    attributes: 'dict[str, JSON] | None' = None,
    chunk_shape: 'ChunkCoords | None' = None,
    chunk_key_encoding: "Chun

In [11]:
# new global config (uses donfig)
# set() is used as a context manager here; printing the config inside the
# block shows the overridden codec_pipeline batch_size of 4
with zarr.config.set({'codec_pipeline.batch_size': 4}):
    zarr.config.pprint()

{'array': {'order': 'C'},
 'async': {'concurrency': None, 'timeout': None},
 'codec_pipeline': {'batch_size': 4}}


## New Async API

Zarr-Python 3 will include an AsyncIO interface. Why? We want to take advantage of concurrency everywhere we can.

In [13]:
# create a store
store = zarr.store.LocalStore(
    root=tempfile.mkdtemp(),
    mode='w'
)

# create a root group
root = await zarr.api.asynchronous.group(
    store=store,                            # new in 3.0: transition to keyword only constructors (wip)
    zarr_format=3,                          # new in 3.0: support for v2 and v3 specs
    attributes={"name": "demo root group"}  # new in 3.0: pass attributes at group creation time
)
root

<AsyncGroup file:///tmp/tmpb_sv9ysd>

In [13]:
# create 5 arrays concurrently
awaitables = []
for name in ["foo", "bar", "spam", "baz", "qux"]:
    awaitables.append(
        root.create_array(
            path=name,
            shape=(10, 10),
            chunks=(5, 5),
            dtype='i4',
            attributes={'title': f'{name} demo'}
        )
    )
arrays = await asyncio.gather(*awaitables)

In [14]:
keys = [k async for k in root.array_keys()]
keys

['foo', 'bar', 'spam', 'baz', 'qux']

In [15]:
# now load all these arrays concurrently
await asyncio.gather(*[root.getitem(k) for k in keys])

[<AsyncArray file:///tmp/tmpd30e2e5g/foo shape=(10, 10) dtype=int32>,
 <AsyncArray file:///tmp/tmpd30e2e5g/bar shape=(10, 10) dtype=int32>,
 <AsyncArray file:///tmp/tmpd30e2e5g/spam shape=(10, 10) dtype=int32>,
 <AsyncArray file:///tmp/tmpd30e2e5g/baz shape=(10, 10) dtype=int32>,
 <AsyncArray file:///tmp/tmpd30e2e5g/qux shape=(10, 10) dtype=int32>]

In [None]:
# we can read/write data using the asyncio interface too
# (arrays[0] is the "foo" array: gather returns results in submission order)
arr = arrays[0]

In [18]:
arr = await root.create_array('foo/bar', shape=(10, 10), chunks=(5, 5), dtype='i4')  

data = np.random.randint(0, 10, size=(10, 10))
await arr.setitem(slice(None), data)

part = await arr.getitem((slice(5), slice(3)))
part

array([[2, 6, 5],
       [5, 2, 1],
       [2, 5, 6],
       [6, 9, 0],
       [4, 2, 3]], dtype=int32)

In [17]:
# rather than the __getitem__ syntax, we use the getitem method
# (the call is awaited; the selection is the first 5 rows and first 3 columns)
await arr.getitem((slice(5), slice(3)))

array([[9, 0, 9],
       [3, 7, 1],
       [5, 0, 5],
       [9, 7, 1],
       [4, 7, 3]], dtype=int32)

In [18]:
# show the on-disk layout of the store used by the async examples
display_tree(store.root)

tmpd30e2e5g/
├── bar/
│   └── zarr.json
├── baz/
│   └── zarr.json
├── foo/
│   ├── c/
│   │   ├── 0/
│   │   │   ├── 0
│   │   │   └── 1
│   │   └── 1/
│   │       ├── 0
│   │       └── 1
│   └── zarr.json
├── qux/
│   └── zarr.json
├── spam/
│   └── zarr.json
└── zarr.json


In [6]:
# Standalone example: the same synchronous API against a remote store.
# NOTE(review): 's3://my-bucket/example.zarr' looks like a placeholder URL;
# presumably this needs a real, writable bucket and credentials. Confirm before running.
import numpy as np
import zarr

store = zarr.store.RemoteStore('s3://my-bucket/example.zarr', mode='w')
root = zarr.group(store=store)
# 1000x1000 array of dtype 'i4', split into 100x100 chunks
z = root.create_array('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4')
# fill it with random integers in [-100, 100)
z[:] = np.random.randint(-100, 100, size=(1000, 1000))
