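# Efficient access to Kerchunk references
This notebook opens a large set of Kerchunk references through the `ParquetReferenceMapper` helper from `parquet_refs.py`, then times opening the dataset and loading individual time steps.

Background:
* Pangeo Discourse thread: https://discourse.pangeo.io/t/trick-for-improving-kerchunk-performance-for-large-numbers-of-chunks-files/3090
* Kerchunk issue #293 (comment): https://github.com/fsspec/kerchunk/issues/293#issuecomment-1416290468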

In [1]:
import fsspec
import xarray as xr

In [2]:
# Fetch the helper script from a gist URL pinned to a specific revision hash,
# saving it next to the notebook so a later cell can execute it.
# NOTE(review): the downloaded file is executed with %run -- review it before running.
helper_url = 'https://gist.githubusercontent.com/agoodm/25d41ce0c47cd714271be66d0db0459d/raw/34cd54fd4a884979470e4ccd8df7ee5065c1daf9/parquet_refs.py'
helper_path = 'parquet_refs.py'

fs_http = fsspec.filesystem('https')
fs_http.download(helper_url, helper_path)

[None]

In [3]:
# Execute the downloaded helper script inside the notebook namespace.
# Presumably this is what defines `ParquetReferenceMapper`, which a later cell
# uses without importing -- confirm against parquet_refs.py.
%run parquet_refs.py

In [4]:
# Anonymous (unauthenticated) S3 filesystem that talks to the custom endpoint
# at ncsa.osn.xsede.org instead of the default S3 endpoint.
s3_client_kwargs = {'endpoint_url': 'https://ncsa.osn.xsede.org'}
fs = fsspec.filesystem('s3', anon=True, client_kwargs=s3_client_kwargs)

In [5]:
# Storage options: anonymous access against the ncsa.osn.xsede.org endpoint.
t_opts = dict(
    anon=True,
    client_kwargs=dict(endpoint_url='https://ncsa.osn.xsede.org'),
)
# Bucket prefix that holds the reference files listed and opened below.
lazy_refs = 's3://esip/noaa/nwm/zarr_lazy_refs'

In [6]:
# Report how many reference files sit under the prefix and their combined size.
ref_entries = fs.ls(lazy_refs)
ref_bytes = fs.du(lazy_refs)  # total size in bytes; converted to GB (1e9) below

print(f'Number of reference files: {len(ref_entries)}')
print(f'Total size of references: {ref_bytes/1e9} GB')

Number of reference files: 28
Total size of references: 0.756072188 GB


In [7]:
%%time

mapper = ParquetReferenceMapper(lazy_refs, fs=fs)
r_opts = {'anon': True}
fs = fsspec.filesystem('reference', fo=mapper, remote_protocol='s3', 
                       remote_options=r_opts, target_options=t_opts)
ds = xr.open_dataset(fs.get_mapper(''), engine='zarr')

CPU times: user 1.85 s, sys: 192 ms, total: 2.05 s
Wall time: 4.35 s


In [8]:
%%time
da = ds.TRAD.sel(time='1990-01-01 00:00').load()

CPU times: user 393 ms, sys: 130 ms, total: 523 ms
Wall time: 1.84 s


In [9]:
%%time
da = ds.TRAD.sel(time='2015-01-01 00:00').load()

CPU times: user 392 ms, sys: 96.5 ms, total: 489 ms
Wall time: 1.13 s


In [10]:
# Reduce the loaded TRAD slice to its mean and display the underlying array value.
trad_mean = da.mean()
trad_mean.data

array(266.92635398)