# Concat Operator with caching

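References:

* DiskCache (disk-backed cache used in this notebook): https://grantjenks.com/docs/diskcache/
* joblib (alternative: function-level caching on disk): https://joblib.readthedocs.io/en/latest/index.html
* `xarray.tutorial.open_dataset` (example of a dataset loader with a local cache): https://xarray.pydata.org/en/stable/generated/xarray.tutorial.open_dataset.html#xarray.tutorial.open_dataset
* Pooch (download and cache data files): https://www.fatiando.org/pooch/latest/
* appdirs (locate a platform-specific user cache directory): https://pypi.org/project/appdirs/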

In [1]:
import xarray as xr
import pandas as pd
from pathlib import Path
from diskcache import Cache
import tempfile
import time

In [2]:
def _concat(dataset, dim=None):
    data_dir = Path(dataset)
    paths = sorted(data_dir.glob("**/*.nc"))

    # aggregation
    ds_avg = None
    with xr.open_mfdataset(
        paths,
        concat_dim=dim,
        combine="nested",
        chunks={dim: 10, "time": 10},
        # parallel=True,
        # preprocess=lambda ds: ds.isel(time=0)
    ) as ds:
        # average
        ds_avg = ds.mean(dim="realization", skipna=True, keep_attrs=True)
    return ds_avg

In [3]:
def concat(dataset, dim=None, cache=True, cache_dir=None):
    """Return the ``dim``-averaged dataset for ``dataset``, using a disk cache.

    The result of ``_concat`` is written to a NetCDF file inside the cache
    directory, and the file path is stored in the cache under a key built
    from ``dim`` and ``dataset``.

    Parameters
    ----------
    dataset : str or path-like
        Directory holding the input NetCDF files (passed on to ``_concat``).
    dim : str, optional
        Dimension to concatenate and average over. Defaults to
        ``"realization"``.
    cache : bool, optional
        If true (default), reuse a previously written result when one is
        registered for the key. If false, always recompute and overwrite the
        cache entry.
    cache_dir : str or path-like, optional
        Directory of the disk cache, passed to ``diskcache.Cache``.

    Returns
    -------
    xarray.Dataset
        The dataset opened from the cached file on a cache hit, otherwise the
        freshly computed result of ``_concat``.
    """
    dim = dim or "realization"
    key = f"concat_{dim}_{dataset}"
    print(key)
    # Use the cache as a context manager so its handle is closed on exit,
    # including when computing or writing the dataset raises.
    with Cache(cache_dir) as dcache:
        print("cache stats: ", dcache.stats(enable=True))
        # A single lookup replaces the membership test followed by ``get``.
        ds_path = dcache.get(key) if cache else None
        # Only trust the entry when the file it points to still exists;
        # otherwise fall through and rebuild it.
        if ds_path is not None and Path(ds_path).is_file():
            print("use cache")
            ds = xr.open_dataset(ds_path)
        else:
            print("concat")
            ds = _concat(dataset, dim)
            tempdir = tempfile.mkdtemp(dir=dcache.directory, prefix="concat_")
            ds_path = (Path(tempdir) / "out.nc").as_posix()
            print("write netcdf", ds_path)
            ds.to_netcdf(ds_path)
            print("add to cache")
            dcache.set(key, ds_path, expire=None, tag="concat")
    return ds

In [4]:
# Run concat() on the daily CMIP6 decadal data and report the wall-clock time.
dataset = "/Users/pingu/data/cmip6-decadal/orig/day"
start = time.time()
ds = concat(dataset, cache_dir="/Users/pingu/data/cache")
elapsed = time.time() - start
print(f"{elapsed} secs")

concat_realization_/Users/pingu/data/cmip6-decadal/orig/day
cache stats:  (2, 0)
use cache
0.3773081302642822 secs


In [5]:
# Show the dataset returned by concat() as this cell's output.
ds