# Try Alex Goodman's approach for efficient access to references
* https://discourse.pangeo.io/t/trick-for-improving-kerchunk-performance-for-large-numbers-of-chunks-files/3090
* https://github.com/fsspec/kerchunk/issues/293#issuecomment-1416290468

In [1]:
import fsspec
import xarray as xr

In [2]:
# Execute parquet_refs.py; %run makes the names that script defines available in this notebook.
# NOTE(review): presumably this is where ParquetReferenceMapper (used further down) comes from — confirm.
%run parquet_refs.py

In [14]:
# Anonymous S3 filesystem; the custom endpoint URL is handed over via client_kwargs.
fs = fsspec.filesystem(
    's3',
    anon=True,
    client_kwargs={'endpoint_url': 'https://ncsa.osn.xsede.org'},
)

In [15]:
# S3 location of the reference files inspected by the listing cell.
lazy_refs = 's3://esip/noaa/nwm/zarr_lazy_refs'

# Anonymous-access options with an explicit endpoint URL.
# NOTE(review): t_opts is not referenced by any cell visible in this notebook —
# presumably meant as target options for reading the remote references; confirm or remove.
t_opts = {
    'anon': True,
    'client_kwargs': {'endpoint_url': 'https://ncsa.osn.xsede.org'},
}

In [16]:
# Count the entries returned by listing the reference location.
print(f'Number of reference files: {len(fs.ls(lazy_refs))}')
# fs.du() result is divided by 1e9 so the figure is reported in GB.
print(f'Total size of references: {fs.du(lazy_refs)/1e9} GB')

Number of reference files: 28
Total size of references: 0.756072188 GB


In [19]:
# Rebind lazy_refs to a relative path: the mapper built below reads from here,
# not from the S3 location assigned earlier in the notebook.
# NOTE(review): './refs_test' is not created by any cell visible here — presumably a
# local copy of the references; confirm it exists before a fresh Restart & Run All.
lazy_refs = './refs_test'

In [20]:
%%time
mapper = ParquetReferenceMapper(lazy_refs)
r_opts = {'anon': True}
fs = fsspec.filesystem('reference', fo=mapper, remote_protocol='s3', 
                       remote_options=r_opts)
ds = xr.open_dataset(fs.get_mapper(''), engine='zarr')

CPU times: user 1.68 s, sys: 233 ms, total: 1.91 s
Wall time: 6.09 s


In [21]:
%%time
da = ds.TRAD.sel(time='1990-01-01 00:00').load()

CPU times: user 390 ms, sys: 101 ms, total: 491 ms
Wall time: 1.33 s


In [22]:
%%time
da = ds.TRAD.sel(time='2015-01-01 00:00').load()

CPU times: user 365 ms, sys: 102 ms, total: 467 ms
Wall time: 799 ms


In [23]:
# Mean over every element of `da` (whichever slice was loaded last), displayed as the underlying array.
da.mean().data

array(266.92635398)