# Kerchunk the AORC-OWP NetCDF files

In [1]:
import fsspec
import xarray as xr
import ujson   # fast json
from kerchunk.hdf import SingleHdf5ToZarr 
from kerchunk.combine import MultiZarrToZarr
from pathlib import Path

In [2]:
# S3 filesystem used to list and read the source NetCDF files (created with
# requester_pays=True).
fs_read = fsspec.filesystem('s3', requester_pays=True)

In [35]:
# S3 filesystem used to write and list the reference JSONs; anon=False means
# requests are made with credentials rather than anonymously.
fs_write = fsspec.filesystem('s3', anon=False)

In [25]:
# All NetCDF4 files under the 200002 prefix; these are the inputs to gen_json.
nc_files = fs_read.glob('s3://esip-qhub/noaa/my_ncfiles/200002/*.nc4') 

# Number of input files found (compared against the JSON count at the end).
len(nc_files)

387

In [34]:
import os
import sys

# ebdpy is imported from $HOME/shared/users/lib, so that directory has to be
# on sys.path before the import below.
sys.path.append(
    os.path.join(os.environ['HOME'], 'shared', 'users', 'lib')
)
import ebdpy as ebd

# AWS / cluster settings used by the two ebdpy calls below.
profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
worker_max = 10  # also reused later as the number of Dask bag partitions

ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)

# Reuse a running cluster if there is one, otherwise start a new one with a
# fixed number of workers (adaptive scaling off).
client, cluster = ebd.start_dask_cluster(
    profile=profile,
    worker_max=worker_max,
    region=region,
    use_existing_cluster=True,
    adaptive_scaling=False,
    wait_for_cluster=False,
    worker_profile='Small Worker',
    propagate_env=True,
)

Region: us-west-2
No Cluster running.
Starting new cluster.
{}
Setting Cluster Environment Variable AWS_DEFAULT_REGION us-west-2
Setting Fixed Scaling workers=10
Reconnect client to clear cache
client.dashboard_link (for new browser tab/window or dashboard searchbar in Jupyterhub):
https://nebari.esipfed.org/gateway/clusters/dev.4eb3f3df285e4b4e90d772b2c6dc9f3b/status
Propagating environment variables to workers
Using environment: users/users-pangeo


In [6]:
# S3 prefix under which gen_json writes one reference JSON per input file.
json_dir = 's3://esip-qhub/noaa/AORC/jsons'

In [10]:
# Peek at the first path: glob results are "bucket/key" strings without the
# "s3://" prefix.
nc_files[0]

'esip-qhub/noaa/my_ncfiles/200002/AORC-OWP_2000020100z.nc4'

In [7]:
# Uncomment to shut down the Dask cluster when finished:
# cluster.close()

In [13]:
# Keyword arguments unpacked into every `fs_read.open(...)` call in gen_json.
# NOTE(review): `default_fill_cache` / `default_cache_type` are the names of
# s3fs filesystem-constructor options (the per-open spellings are
# `fill_cache` / `cache_type`) -- confirm these two keys actually take effect
# when passed to `open()`.
so = {
    'mode': 'rb',
    'requester_pays': True,
    'default_fill_cache': False,
    'default_cache_type': 'first',
}

In [36]:
def gen_json(u):
    """Write a Kerchunk reference JSON for one NetCDF4/HDF5 file.

    The file is opened through ``fs_read`` with the options in ``so``,
    scanned with ``SingleHdf5ToZarr``, and the resulting references are
    written via ``fs_write`` to ``{json_dir}/<stem of u>.json``.

    Parameters
    ----------
    u : str
        Path of the input file as accepted by ``fs_read.open`` (e.g. an
        entry of ``nc_files``). It is also stored as the target URL inside
        the generated references.

    Returns
    -------
    None
        The function is called for its side effects: it prints the output
        path and writes the JSON object.
    """
    outf = f'{json_dir}/{Path(u).stem}.json'
    print(outf)

    # Build and serialise the references *before* the output object is opened:
    # if scanning the HDF5 file fails, no empty or partial JSON is left behind
    # to be mistaken for a successful conversion by a later glob.
    with fs_read.open(u, **so) as infile:
        # inline_threshold=300: chunks smaller than this many bytes are stored
        # inline in the references (kerchunk option).
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        payload = ujson.dumps(h5chunks.translate()).encode()

    with fs_write.open(outf, 'wb') as f:
        f.write(payload)

#### Test one file

In [15]:
# Convert just the first file as a check before processing the full list.
gen_json(nc_files[0])

s3://esip-qhub/noaa/AORC/jsons/AORC-OWP_2000020100z.json


In [16]:
# Reference JSON produced by the single-file test call. It is built the same
# way gen_json names its output, so it follows json_dir and the chosen input
# file instead of repeating a hard-coded path.
single_json = f'{json_dir}/{Path(nc_files[0]).stem}.json'

In [37]:
%%time
fs = fsspec.filesystem("reference", fo=single_json, ref_storage_args={'skip_instance_cache':True},
                       remote_protocol='s3', remote_options={'requester_pays':True})
m = fs.get_mapper("")

ds = xr.open_dataset(m, engine="zarr", backend_kwargs={'consolidated':False}, chunks={})
print(ds)

<xarray.Dataset>
Dimensions:              (time: 1, latitude: 4201, longitude: 8401)
Coordinates:
  * latitude             (latitude) float64 20.0 20.01 20.02 ... 54.99 55.0
  * longitude            (longitude) float64 -130.0 -130.0 ... -60.01 -60.0
  * time                 (time) datetime64[ns] 2000-02-01
Data variables:
    APCP_surface         (time, latitude, longitude) float32 dask.array<chunksize=(1, 1024, 2048), meta=np.ndarray>
    DLWRF_surface        (time, latitude, longitude) float32 dask.array<chunksize=(1, 1024, 2048), meta=np.ndarray>
    DSWRF_surface        (time, latitude, longitude) float32 dask.array<chunksize=(1, 1024, 2048), meta=np.ndarray>
    PRES_surface         (time, latitude, longitude) float32 dask.array<chunksize=(1, 1024, 2048), meta=np.ndarray>
    SPFH_2maboveground   (time, latitude, longitude) float32 dask.array<chunksize=(1, 1024, 2048), meta=np.ndarray>
    TMP_2maboveground    (time, latitude, longitude) float32 dask.array<chunksize=(1, 1024, 2048

In [38]:
import dask.bag as db

In [39]:
# Split the list of input files into `worker_max` partitions -- the same
# number that was requested as workers when the cluster was started.
b = db.from_sequence(nc_files, npartitions=worker_max)

In [40]:
# Apply gen_json to each file path in the bag; the work happens when
# `b1.compute()` is called.
b1 = b.map(gen_json)

In [None]:
%%time
# Run the conversion for every file. gen_json returns nothing, so the gathered
# results are discarded; `retries=10` asks Dask to retry failed tasks.
_ = b1.compute(retries=10)

In [44]:
# List the reference JSONs now present under json_dir.
json_list = fs_write.glob(f'{json_dir}/*.json')  

In [45]:
# Should equal the number of input files (len(nc_files)) if every conversion
# wrote its JSON.
print(len(json_list))

387
