# Examples of extend_zarr()

The `extend_zarr()` function writes an `xarray.DataArray` into an existing zarr store that already contains a data array.

The differences from the `append_dim` option of `xarray.DataArray.to_zarr()` are as follows:

- Data can be written even if the lengths of multiple coordinates do not match. (Example 2) 
- Data can be written to non-contiguous coordinates. (Example 3)
- Data whose coordinates match existing ones overwrites the existing data in the zarr file. (Example 4)

Limitations:
- Coordinates need to be unique 1D arrays.
- A DataArray can only be appended to the tail of the coordinates in the zarr store.

In [1]:
from pathlib import Path

import dask.array as da
import numpy as np
import xarray as xr

from extend_zarr import extend_zarr

In [2]:
def init_sample_zarr(path: Path = Path('/tmp/test.zarr')) -> Path:
    """Write a small random 4D sample dataset to a zarr store.

    The store holds a single variable named ``value`` with dimensions
    ``(x, y, band, year)``. It is written with ``mode='w'``, so an existing
    store at ``path`` is replaced. The random generator uses a fixed seed.

    Args:
        path: Location of the zarr store to create.

    Returns:
        The same ``path`` that was passed in.
    """
    axes = {
        'x': np.arange(10_000, 10_010, dtype=np.int32),
        'y': np.arange(20_000, 20_010, dtype=np.int32),
        'band': ['red', 'green', 'blue'],
        'year': [2012, 2013],
    }
    dim_sizes = tuple(map(len, axes.values()))
    # Per-dimension chunk sizes: x in one chunk, y in two halves,
    # band split 2 + 1, and one chunk per year.
    chunk_layout = ((10,), (5, 5), (2, 1), (1, 1))

    # Draw the values as a flat array first, then reshape and rechunk.
    generator = da.random.default_rng(1111)
    flat_values = generator.integers(0, 100, np.prod(dim_sizes))
    cube = flat_values.reshape(dim_sizes).rechunk(chunk_layout)

    # Wrap in a Dataset so the variable is stored under the name 'value'.
    sample = xr.DataArray(cube, dims=axes.keys(), coords=axes)
    sample.to_dataset(name='value').to_zarr(path, mode='w')
    return path


# Create the sample store at the default location and keep its path.
path = init_sample_zarr()

In [3]:
# Open the store and display the 'value' variable before any extension.
xr.open_zarr(path).value

Unnamed: 0,Array,Chunk
Bytes,4.69 kiB,800 B
Shape,"(10, 10, 3, 2)","(10, 5, 2, 1)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 4.69 kiB 800 B Shape (10, 10, 3, 2) (10, 5, 2, 1) Dask graph 8 chunks in 2 graph layers Data type int64 numpy.ndarray",10  1  2  3  10,

Unnamed: 0,Array,Chunk
Bytes,4.69 kiB,800 B
Shape,"(10, 10, 3, 2)","(10, 5, 2, 1)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


## Example 1

Add a DataArray to Zarr.

![test_dataarray.png](img/test_dataarray.png)

In [4]:
# Same x, y and band coordinates as the sample store, but two new years
# (2015, 2016) that the store does not contain yet.
new_coords = {
    'x': np.arange(10_000, 10_010, dtype=np.int32),
    'y': np.arange(20_000, 20_010, dtype=np.int32),
    'band': ['red', 'green', 'blue'],
    'year': [2015, 2016],
}
shape = tuple(len(c) for c in new_coords.values())
# Sequential integers (0, 1, 2, ...) make the newly written cells easy to
# tell apart from the random sample data.
new_array = da.arange(int(np.prod(shape))).reshape(shape)
new_dataarray = xr.DataArray(
    data=new_array,
    dims=new_coords.keys(),
    coords=new_coords,
)

In [5]:
# Recreate the sample store so this example starts from the initial data.
path = init_sample_zarr()

# Write new_dataarray into the 'value' variable of the store.
extend_zarr(dataarray=new_dataarray, store=path, var_name='value')

# Reopen the store to inspect the result (the last axis now has length 4).
xr.open_zarr(path).value

Unnamed: 0,Array,Chunk
Bytes,9.38 kiB,800 B
Shape,"(10, 10, 3, 4)","(10, 5, 2, 1)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 9.38 kiB 800 B Shape (10, 10, 3, 4) (10, 5, 2, 1) Dask graph 16 chunks in 2 graph layers Data type int64 numpy.ndarray",10  1  4  3  10,

Unnamed: 0,Array,Chunk
Bytes,9.38 kiB,800 B
Shape,"(10, 10, 3, 4)","(10, 5, 2, 1)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


In [6]:
# Select the first x/y position; the remaining axes are (band, year), so
# rows are bands and columns are years.
xr.open_zarr(path).value.isel(x=0, y=0).values

array([[ 9, 49,  0,  1],
       [58, 74,  2,  3],
       [ 6, 65,  4,  5]])

## Example 2

Add a DataArray with multiple unmatched coordinates (`band` and `year`).

![test_multi_nonaligned_middle.png](img/test_multi_nonaligned_middle.png)

In [7]:
# Only the 'green' band and two new years (2015, 2016): both the band and
# the year coordinates differ from those of the sample store.
new_coords = {
    'x': np.arange(10_000, 10_010, dtype=np.int32),
    'y': np.arange(20_000, 20_010, dtype=np.int32),
    'band': ['green'],
    'year': [2015, 2016],
}
shape = tuple(len(c) for c in new_coords.values())
# Sequential integers (0, 1, 2, ...) make the newly written cells easy to
# tell apart from the random sample data.
new_array = da.arange(int(np.prod(shape))).reshape(shape)
new_dataarray = xr.DataArray(
    data=new_array,
    dims=new_coords.keys(),
    coords=new_coords,
)

In [8]:
# Recreate the sample store so this example starts from the initial data.
path = init_sample_zarr()

# fill_value=-9999: in the output below, the red and blue cells of the new
# years, which new_dataarray does not cover, show up as -9999.
extend_zarr(
    dataarray=new_dataarray, store=path, var_name='value', fill_value=-9999
)

# Reopen the store to inspect the result.
xr.open_zarr(path).value

Unnamed: 0,Array,Chunk
Bytes,9.38 kiB,800 B
Shape,"(10, 10, 3, 4)","(10, 5, 2, 1)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 9.38 kiB 800 B Shape (10, 10, 3, 4) (10, 5, 2, 1) Dask graph 16 chunks in 2 graph layers Data type int64 numpy.ndarray",10  1  4  3  10,

Unnamed: 0,Array,Chunk
Bytes,9.38 kiB,800 B
Shape,"(10, 10, 3, 4)","(10, 5, 2, 1)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


In [9]:
# Select the first x/y position; rows are bands (red, green, blue) and
# columns are years.
xr.open_zarr(path).value.isel(x=0, y=0).values

array([[    9,    49, -9999, -9999],
       [   58,    74,     0,     1],
       [    6,    65, -9999, -9999]])

## Example 3

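Add a DataArray to non-contiguous coordinates. The `coords` option specifies the full `year` coordinate, including 2014, for which no data is written.

![test_coord_option.png](img/test_coord_option.png)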

In [10]:
# Bands 'red' and 'green' for the years 2015 and 2016. The sample store ends
# at 2013, so year 2014 is skipped.
new_coords = {
    'x': np.arange(10_000, 10_010, dtype=np.int32),
    'y': np.arange(20_000, 20_010, dtype=np.int32),
    'band': ['red', 'green'],
    'year': [2015, 2016],
}
shape = tuple(len(c) for c in new_coords.values())
# Sequential integers (0, 1, 2, ...) make the newly written cells easy to
# tell apart from the random sample data.
new_array = da.arange(int(np.prod(shape))).reshape(shape)
new_dataarray = xr.DataArray(
    data=new_array,
    dims=new_coords.keys(),
    coords=new_coords,
)

In [11]:
# Recreate the sample store so this example starts from the initial data.
path = init_sample_zarr()

# coords passes the complete year coordinate, including 2014, which neither
# the store nor new_dataarray contains. In the output below that year (and
# the blue band of 2015/2016) shows up as fill_value.
extend_zarr(
    dataarray=new_dataarray,
    store=path,
    var_name='value',
    fill_value=-9999,
    coords={'year': np.array([2012, 2013, 2014, 2015, 2016])},
)

# Reopen the store to inspect the result (the last axis now has length 5).
xr.open_zarr(path).value

Unnamed: 0,Array,Chunk
Bytes,11.72 kiB,800 B
Shape,"(10, 10, 3, 5)","(10, 5, 2, 1)"
Dask graph,20 chunks in 2 graph layers,20 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 11.72 kiB 800 B Shape (10, 10, 3, 5) (10, 5, 2, 1) Dask graph 20 chunks in 2 graph layers Data type int64 numpy.ndarray",10  1  5  3  10,

Unnamed: 0,Array,Chunk
Bytes,11.72 kiB,800 B
Shape,"(10, 10, 3, 5)","(10, 5, 2, 1)"
Dask graph,20 chunks in 2 graph layers,20 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


In [12]:
# Select the first x/y position; rows are bands (red, green, blue) and
# columns are the years 2012 to 2016.
xr.open_zarr(path).value.isel(x=0, y=0).values

array([[    9,    49, -9999,     0,     1],
       [   58,    74, -9999,     2,     3],
       [    6,    65, -9999, -9999, -9999]])

## Example 4

Add a DataArray whose coordinates partly overlap the existing ones. Data at matching coordinates (year 2013) overwrites the existing data.

![test_overwrite.png](img/test_overwrite.png)

In [13]:
# Bands 'green' and 'blue' for the years 2013 and 2014. Year 2013 already
# exists in the sample store; 2014 is new.
new_coords = {
    'x': np.arange(10_000, 10_010, dtype=np.int32),
    'y': np.arange(20_000, 20_010, dtype=np.int32),
    'band': ['green', 'blue'],
    'year': [2013, 2014],
}
shape = tuple(len(c) for c in new_coords.values())
# Sequential integers (0, 1, 2, ...) make the newly written cells easy to
# tell apart from the random sample data.
new_array = da.arange(int(np.prod(shape))).reshape(shape)
new_dataarray = xr.DataArray(
    data=new_array,
    dims=new_coords.keys(),
    coords=new_coords,
)

In [14]:
# Recreate the sample store so this example starts from the initial data.
path = init_sample_zarr()

# In the output below, the green/blue values of 2013 are replaced by the new
# data, and the red band of the new year 2014 shows up as fill_value.
extend_zarr(
    dataarray=new_dataarray,
    store=path,
    var_name='value',
    fill_value=-9999,
)

# Reopen the store to inspect the result (the last axis now has length 3).
xr.open_zarr(path).value

Unnamed: 0,Array,Chunk
Bytes,7.03 kiB,800 B
Shape,"(10, 10, 3, 3)","(10, 5, 2, 1)"
Dask graph,12 chunks in 2 graph layers,12 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 7.03 kiB 800 B Shape (10, 10, 3, 3) (10, 5, 2, 1) Dask graph 12 chunks in 2 graph layers Data type int64 numpy.ndarray",10  1  3  3  10,

Unnamed: 0,Array,Chunk
Bytes,7.03 kiB,800 B
Shape,"(10, 10, 3, 3)","(10, 5, 2, 1)"
Dask graph,12 chunks in 2 graph layers,12 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


In [15]:
# Select the first x/y position; rows are bands (red, green, blue) and
# columns are the years 2012 to 2014.
xr.open_zarr(path).value.isel(x=0, y=0).values

array([[    9,    49, -9999],
       [   58,     0,     1],
       [    6,     2,     3]])