## Kerchunk exploration

We can use Kerchunk to build a reference file that maps the chunking scheme of existing data sets.<br>
This lets us access the data efficiently using the zarr machinery without actually copying the data. The caveat is that we are stuck with the native chunking and compression scheme.

See: https://fsspec.github.io/kerchunk/tutorial.html

In [None]:
! pip install git+https://github.com/fsspec/kerchunk h5py

In [None]:
from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr
import fsspec
import xarray as xr
import ujson

In [None]:
%%time

# Establish a GCS file system to manage our file in the GCS bucket of interest
fs = fsspec.filesystem('gcs', anon=True)

# Let's checkout some existing 1C data
# flist = (fs.glob('gs://oc-flood/jma-hist-rainfall-interpolated-nc-v2/*.nc'))  # JMA
flist = (fs.glob('gs://oc-flood/jwa-historical-reanalysis-rainfall-netcdf/*.nc'))  # JWA
# flist = (fs.glob("gs://flood-pipeline/dev/data/weatherdata/gfs_3h/v3.0.0/coastal/*"))  # GFS

# You need to add the gs:// part
flist = ["gcs://" + p for p in flist]
flist[:4]

In [None]:
# Number of source file URLs collected in `flist`.
len(flist)

In [None]:
%%time
# Get the single jsons into a list from the file gcs urls
so = dict(
    anon=True, default_fill_cache=False, default_cache_type='first'
)
lst_singles = []
for file_url in flist:
    with fs.open(file_url, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, file_url, inline_threshold=300)
        lst_singles.append(h5chunks.translate())

In [None]:
%%time
# Now aggregate as a multi-file into variable out
mzz = MultiZarrToZarr(
    lst_singles[61:],
    remote_protocol="gcs",
    remote_options={'anon': True},
    concat_dims=["time"],
)

out = mzz.translate()

In [None]:
# Open the combined references (`out`) as one xarray dataset via the zarr engine.
reference_options = {
    "fo": out,  # the in-memory references built in the previous cell
    "remote_protocol": "gcs",
    "remote_options": {"anon": True},
}

ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "storage_options": reference_options,
        "consolidated": False,
    },
)

# Display the dataset summary.
ds

In [None]:
# Optional: persist the combined references (`out`) to a local JSON file.
fs2 = fsspec.filesystem('')  # local file system that receives the final JSON
local_json_path = "/Users/slamont/jwa/jwa_historical_reanalysis_rainfall_netcdf_20191101_20201231.json"

# Serialise to JSON text, then encode to bytes because the file is opened in binary mode.
serialized_refs = ujson.dumps(out).encode()
with fs2.open(local_json_path, 'wb') as json_file:
    json_file.write(serialized_refs)

#### Now we can read in the Kerchunk reference json file as an xarray dataset

In [None]:
# Open a previously saved Kerchunk reference JSON straight from the bucket.
# Reference files listed by the original author:
# sam-temp-dev/kerchunk/jma_hist_rainfall_interpolated_nc_v2_consolidated_19890101_20010331.json
# sam-temp-dev/kerchunk/jma_hist_rainfall_interpolated_nc_v2_consolidated_20020401_20051231.json
# sam-temp-dev/kerchunk/jma_hist_rainfall_interpolated_nc_v2_consolidated_20060101_20190930.json

# Select ONE reference file.
# remote_path = "gs://sam-temp-dev/kerchunk/jma_hist_rainfall_interpolated_nc_v2_consolidated_20060101_20190930.json"  # JMA
remote_path = "gs://sam-temp-dev/kerchunk/jwa_historical_reanalysis_rainfall_netcdf_20191101_20201231.json"  # JWA

remote_reference_options = {
    "fo": remote_path,  # URL of the reference JSON instead of an in-memory dict
    "remote_protocol": "gcs",
    "remote_options": {"anon": True},
}

ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "storage_options": remote_reference_options,
        "consolidated": False,
    },
)

# Display the dataset summary.
ds

In [None]:
%%time
import rioxarray

# Now we can do computations and stuff!
# ds.rainrate.sel(latitude=25.07, longitude=125.6, method="nearest").values
# ds.rainrate.sel(time="1989-01-30 12:00").plot()

ds.rainrate.sel(time=slice("2006-01-01 00:00", "2006-01-01 23:00")).max(dim="time").plot()
ds = ds.rio.write_crs(4326, inplace=True)
gdf = gpd.read_file("/Users/slamont/japan_gis/geo_boundaries_shp/cities_107_with_grid_num.geojson")
ds.rainrate.sel(time="1989-01-30 12:00").rio.clip(gdf.geometry).plot()