# Kerchunk the AORC-OWP NetCDF files

In [1]:
import fsspec
import xarray as xr
import ujson   # fast json
from kerchunk.hdf import SingleHdf5ToZarr 
from kerchunk.combine import MultiZarrToZarr
from pathlib import Path

In [2]:
# S3 filesystem with requester-pays enabled; used below to list and open the NetCDF files.
fs_read = fsspec.filesystem('s3', requester_pays=True)

In [3]:
# Non-anonymous S3 filesystem; used below to write and list the reference JSONs.
fs_write = fsspec.filesystem('s3', anon=False)

In [4]:
# Collect every .nc4 object under the 200002 prefix, then display how many were found.
nc_files = fs_read.glob("s3://esip-qhub/noaa/my_ncfiles/200002/*.nc4")

len(nc_files)

387

In [21]:
import os
import sys

# Put the shared helper directory on sys.path so that ebdpy can be imported.
sys.path.append(os.path.join(os.environ['HOME'], 'shared', 'users', 'lib'))
import ebdpy as ebd

profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)

# Request a Dask cluster (reusing an existing one if present) with a fixed
# worker count; worker_max is reused later as the Dask bag partition count.
worker_max = 10
client, cluster = ebd.start_dask_cluster(
    profile=profile,
    worker_max=worker_max,
    region=region,
    use_existing_cluster=True,
    adaptive_scaling=False,
    wait_for_cluster=False,
    worker_profile='Small Worker',
    propagate_env=True,
)

Region: us-west-2
No Cluster running.
Starting new cluster.
{}
Setting Cluster Environment Variable AWS_DEFAULT_REGION us-west-2
Setting Fixed Scaling workers=10
Reconnect client to clear cache
client.dashboard_link (for new browser tab/window or dashboard searchbar in Jupyterhub):
https://nebari.esipfed.org/gateway/clusters/dev.afae234cd3e54864aeaefb6ef8808669/status
Propagating environment variables to workers
Using environment: users/users-pangeo


In [5]:
# S3 prefix under which the per-file reference JSONs are written.
json_dir = "s3://esip-qhub/noaa/AORC/jsons"

In [6]:
# Peek at the first path; the glob result carries no "s3://" prefix.
nc_files[0]

'esip-qhub/noaa/my_ncfiles/200002/AORC-OWP_2000020100z.nc4'

In [None]:
# cluster.close()  # uncomment and run to shut down the Dask cluster when finished

In [7]:
def gen_json(u):
    """Write a Kerchunk reference JSON for a single NetCDF4/HDF5 file.

    Parameters
    ----------
    u : str
        Path of the source file as accepted by ``fs_read.open`` (for example
        one entry of ``nc_files``). The same string is passed to
        ``SingleHdf5ToZarr`` as the URL of the file.

    Returns
    -------
    None
        The references are written to ``{json_dir}/{stem of u}.json`` through
        ``fs_write``; the output path is printed.

    Notes
    -----
    Uses the notebook globals ``fs_read``, ``fs_write``, ``so`` and
    ``json_dir``, which are looked up when the function is called.
    """
    outf = f'{json_dir}/{Path(u).stem}.json'

    # Scan the source and serialise the references *before* opening the
    # destination, so an error while reading the HDF5 file is raised before
    # any output object is created.
    with fs_read.open(u, **so) as infile:
        refs = SingleHdf5ToZarr(infile, u, inline_threshold=300).translate()
    payload = ujson.dumps(refs).encode()

    print(outf)
    with fs_write.open(outf, 'wb') as f:
        f.write(payload)

In [8]:
# Keyword arguments that gen_json unpacks into fs_read.open().
so = {
    "mode": "rb",
    "requester_pays": True,
    "default_fill_cache": False,
    "default_cache_type": "first",
}

#### Test one file

In [10]:
%%time 
# Generate the reference JSON for the first file only, as a check before the parallel run.
gen_json(nc_files[0])

s3://esip-qhub/noaa/AORC/jsons/AORC-OWP_2000020100z.json
CPU times: user 592 ms, sys: 117 ms, total: 709 ms
Wall time: 1.56 s


In [11]:
# Path of the JSON written for the first file, derived the same way gen_json
# builds it so it stays in step with json_dir and nc_files.
single_json = f'{json_dir}/{Path(nc_files[0]).stem}.json'

In [12]:
%%time
fs = fsspec.filesystem("reference", fo=single_json, ref_storage_args={'skip_instance_cache':True},
                       remote_protocol='s3', remote_options={'requester_pays':True})

CPU times: user 97.2 ms, sys: 25.6 ms, total: 123 ms
Wall time: 353 ms


In [None]:
# List the entries under the 'PRES_surface' path of the reference filesystem.
fs.ls('PRES_surface')

In [15]:
# Map the root of the reference filesystem to a key-value store and open it
# with the Zarr engine; chunks={} yields Dask-backed arrays.
m = fs.get_mapper("")
ds = xr.open_dataset(
    m,
    engine="zarr",
    backend_kwargs={"consolidated": False},
    chunks={},
)

In [17]:
# Display the dataset summary for the single-file references.
ds

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 134.63 MiB 8.00 MiB Shape (1, 4201, 8401) (1, 1024, 2048) Dask graph 25 chunks in 2 graph layers Data type float32 numpy.ndarray",8401  4201  1,

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 134.63 MiB 8.00 MiB Shape (1, 4201, 8401) (1, 1024, 2048) Dask graph 25 chunks in 2 graph layers Data type float32 numpy.ndarray",8401  4201  1,

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 134.63 MiB 8.00 MiB Shape (1, 4201, 8401) (1, 1024, 2048) Dask graph 25 chunks in 2 graph layers Data type float32 numpy.ndarray",8401  4201  1,

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 134.63 MiB 8.00 MiB Shape (1, 4201, 8401) (1, 1024, 2048) Dask graph 25 chunks in 2 graph layers Data type float32 numpy.ndarray",8401  4201  1,

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 134.63 MiB 8.00 MiB Shape (1, 4201, 8401) (1, 1024, 2048) Dask graph 25 chunks in 2 graph layers Data type float32 numpy.ndarray",8401  4201  1,

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 134.63 MiB 8.00 MiB Shape (1, 4201, 8401) (1, 1024, 2048) Dask graph 25 chunks in 2 graph layers Data type float32 numpy.ndarray",8401  4201  1,

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 134.63 MiB 8.00 MiB Shape (1, 4201, 8401) (1, 1024, 2048) Dask graph 25 chunks in 2 graph layers Data type float32 numpy.ndarray",8401  4201  1,

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 134.63 MiB 8.00 MiB Shape (1, 4201, 8401) (1, 1024, 2048) Dask graph 25 chunks in 2 graph layers Data type float32 numpy.ndarray",8401  4201  1,

Unnamed: 0,Array,Chunk
Bytes,134.63 MiB,8.00 MiB
Shape,"(1, 4201, 8401)","(1, 1024, 2048)"
Dask graph,25 chunks in 2 graph layers,25 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


#### Use Dask bag to compute the JSONs in parallel

In [19]:
import dask.bag as db

In [22]:
# Build a Dask bag over the file list, using worker_max partitions.
b = db.from_sequence(nc_files, npartitions=worker_max)

In [24]:
# Map gen_json over every file path; the work is executed by the compute() call below.
b1 = b.map(gen_json)

In [26]:
%%time
# Run gen_json for all files, passing retries=10 to compute(); the returned list is discarded.
_ = b1.compute(retries=10)

CPU times: user 152 ms, sys: 39.6 ms, total: 192 ms
Wall time: 2min 10s


In [27]:
# List the reference JSONs now present under json_dir.
json_list = fs_write.glob(f'{json_dir}/*.json')  

In [28]:
# Print how many reference JSONs were found.
print(len(json_list))

387


Exception in callback None()
handle: <Handle cancelled>
Traceback (most recent call last):
  File "/home/conda/users/def60d898570988b3144fa6932cb0087c18afd4f3261c08887f846a32495ba96-20230328-131936-847877-164-pangeo/lib/python3.10/site-packages/tornado/iostream.py", line 1389, in _do_ssl_handshake
    self.socket.do_handshake()
  File "/home/conda/users/def60d898570988b3144fa6932cb0087c18afd4f3261c08887f846a32495ba96-20230328-131936-847877-164-pangeo/lib/python3.10/ssl.py", line 1342, in do_handshake
    self._sslobj.do_handshake()
ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate (_ssl.c:997)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/conda/users/def60d898570988b3144fa6932cb0087c18afd4f3261c08887f846a32495ba96-20230328-131936-847877-164-pangeo/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  