# Generate references for HYCOM using kerchunk
HYCOM data on AWS Open Data are stored in 63,341 NetCDF 64-bit offset files. The data in these files are stored as short integers with scale_factor and add_offset, but because these are not NetCDF4 files, there is no compression and no chunking. Each file contains one time step of data. 

We generate references for each file, and use kerchunk.utils.subchunk to create virtual chunks so that each vertical layer is treated as a chunk.  For this uncompressed data, the byte-ranges are the same for each file, so we only need to create references for one file and then clone that for all the files, changing only the URL and the time value. 

In [None]:
from kerchunk.netCDF3 import NetCDF3ToZarr
from kerchunk.combine import MultiZarrToZarr, auto_dask, JustLoad
from kerchunk.utils import subchunk, inline_array
from fsspec.implementations.reference import LazyReferenceMapper
import fsspec
import xarray as xr
import datetime as dt
import copy
import kerchunk
import base64
import struct
import numpy as np

In [None]:
fs = fsspec.filesystem('s3', anon=True)

In [None]:
%%time
flist = fs.glob('hycom-gofs-3pt1-reanalysis/*/*.nc')

In [None]:
len(flist)

In [None]:
flist[0]

Generate the references with `NetCDF3ToZarr`. This only needs to be done for the first file!

In [None]:
%%time
# Build the template reference set from the first file only;
# inline_threshold=400 is the size limit passed to kerchunk for inlining
# small byte blocks directly into the references.
d0 = NetCDF3ToZarr(
    f"s3://{flist[0]}",
    storage_options=dict(anon=True),
    inline_threshold=400,
    version=2,
).translate()

Subchunk the 4D data vars

In [None]:
# Split every 4D variable by a factor of 40 so that each vertical layer
# becomes its own virtual chunk.
for var_name in ('salinity', 'water_temp', 'water_u', 'water_v'):
    d0 = subchunk(store=d0, variable=var_name, factor=40)

#### Open the references for the first file in the dataset

Storage options (for accessing the NetCDF files from AWS)

In [None]:
# Anonymous access, bypassing fsspec's filesystem instance cache
so = {'anon': True, 'skip_instance_cache': True}

In [None]:
# Open the single-file template references with xarray, leaving out 'tau'.
ds = xr.open_dataset(
    d0,
    engine='kerchunk',
    chunks={},
    drop_variables='tau',
    backend_kwargs={
        'storage_options': {
            'remote_protocol': 's3',
            'lazy': False,
            'remote_options': so,
        },
    },
)

In [None]:
ds

Define helper functions to base64-encode the time value and to replace all the URLs in the reference dict

In [None]:
def float_to_base64(number):
    """Return *number* as the base64 text of its big-endian float64 bytes.

    The value is packed with the ``struct`` format ``'>d'`` (8 bytes,
    big-endian double), base64-encoded, and returned as a ``str``.
    """
    raw_bytes = struct.pack('>d', number)
    return base64.b64encode(raw_bytes).decode('utf-8')


# Quick demonstration of the encoder
float_num = 122748.
encoded_str = float_to_base64(float_num)
print(f"Original number: {float_num}")
print(f"Base64 encoded: {encoded_str}")

In [None]:
def replace_first_item(d, target_string, replacement_string):
    """Swap the leading element of matching list values, in place.

    Walks *d* and every dictionary nested inside it as a value. Whenever a
    value is a non-empty list whose first element is a string equal to
    *target_string*, that first element is overwritten with
    *replacement_string*. Lists nested inside other lists are not visited.

    Returns the same (mutated) dictionary object that was passed in.
    """
    pending = [d]  # dictionaries still waiting to be scanned
    while pending:
        mapping = pending.pop()
        for entry in mapping.values():
            if isinstance(entry, dict):
                # Nested dictionary: queue it for scanning
                pending.append(entry)
                continue
            if not (isinstance(entry, list) and entry):
                continue
            head = entry[0]
            if isinstance(head, str) and head == target_string:
                entry[0] = replacement_string
    return d
#replace_first_item(d, f's3://{flist[0]}', f's3://{flist[1]}')

Function to generate the time from the filename

In [None]:
def name2date(f):
    """Return the date plus forecast offset encoded in a HYCOM file name.

    The file name is expected to contain a ``YYYYMMDDHH_tTTT`` stamp, e.g.
    ``hycom_GLBv0.08_530_1994010112_t000.nc``. The stamp is located by
    pattern in the final path component, so the result does not depend on
    the length of the bucket/prefix or on a leading ``s3://``.

    Parameters
    ----------
    f : str
        Path, object key, or URL of a HYCOM file.

    Returns
    -------
    datetime.datetime
        Midnight of the stamped day plus ``TTT`` hours. The two-digit hour
        field of the stamp is deliberately ignored (callers add it
        themselves).

    Raises
    ------
    ValueError
        If no ``YYYYMMDDHH_tTTT`` stamp is found in the file name.
    """
    import re  # local import keeps this cell self-contained

    basename = f.rsplit('/', 1)[-1]
    match = re.search(
        r'(?<!\d)(\d{4})(\d{2})(\d{2})\d{2}_t(\d{3})(?!\d)', basename)
    if match is None:
        raise ValueError(
            f"cannot find a YYYYMMDDHH_tTTT stamp in file name {f!r}")
    year, month, day, tau = (int(group) for group in match.groups())
    # Add tau as a timedelta so all three digits count and values of
    # 24 hours or more roll over into later days.
    return dt.datetime(year, month, day) + dt.timedelta(hours=tau)

Loop through all the files, generating the references for each file by replacing the URL and date in the reference dict template

In [None]:
%%time
# Build one reference dict per file by cloning the template (d0) and
# patching only the stored time value and the file URL.
dlist = []
# Origin for the "hours since" time values written below
# (presumably matches the files' time units -- verify).
time0 = dt.datetime(2000, 1, 1, 0)
# URL embedded in the template references; loop-invariant, so built once.
template_url = f's3://{flist[0]}'
for path in flist:
    dmod = copy.deepcopy(d0)  # deep copy so the template is never mutated
    # name2date() leaves out the hour field of the file name, which is
    # always 12 for this dataset, so it is added here.
    time1 = name2date(path) + dt.timedelta(hours=12)
    time_val = (time1 - time0).total_seconds() / 3600  # hours since time0
    dmod['time/0'] = f'base64:{float_to_base64(time_val)}'
    dlist.append(replace_first_item(dmod, template_url, f's3://{path}'))

Generate the combined references and save to Parquet storage

In [None]:
combined_parquet = 'hycom.parq'

In [None]:
# Local Parquet-backed reference store that receives the combined references
out = LazyReferenceMapper.create(
    combined_parquet,
    fs=None,
    record_size=100_000,
)

In [None]:
%%time
# Concatenate the per-file references along "time", dropping "tau", and
# write the result into the Parquet-backed store.
mzz = MultiZarrToZarr(
    dlist,
    remote_protocol="s3",
    concat_dims="time",
    identical_dims=['lon', 'lat', 'depth'],
    preprocess=kerchunk.combine.drop("tau"),
    out=out,
)
_ = mzz.translate()
out.flush()

In [None]:
# Filesystem used to write to OSN: 'osn-esip' profile, custom endpoint,
# with fsspec's instance and listings caches turned off.
fs_write = fsspec.filesystem(
    's3',
    profile='osn-esip',
    skip_instance_cache=True,
    use_listings_cache=False,
    client_kwargs={'endpoint_url': 'https://ncsa.osn.xsede.org'},
)

In [None]:
combined_parquet_aws = 's3://esip/rsignell/hycom.parq'

In [None]:
# Delete any existing refs on OSN. The existence check lets the notebook
# run the first time, when there is nothing to delete yet and rm() on a
# missing path would raise FileNotFoundError.
if fs_write.exists(combined_parquet_aws):
    fs_write.rm(combined_parquet_aws, recursive=True)

In [None]:
# Upload the local Parquet reference directory to OSN
_ = fs_write.upload(
    combined_parquet,
    combined_parquet_aws,
    recursive=True,
)

Check to make sure the references got updated

In [None]:
fs_write.info(f'{combined_parquet_aws}/.zmetadata')

#### Open the references for the entire dataset

Target options (for accessing the reference files from OSN)

In [None]:
# Anonymous access to the OSN endpoint, bypassing the instance cache
to = {
    'anon': True,
    'skip_instance_cache': True,
    'client_kwargs': {'endpoint_url': 'https://ncsa.osn.xsede.org'},
}

In [None]:
# Open the combined references from OSN: `to` is used for the reference
# files, `so` for the NetCDF files on AWS.
ds = xr.open_dataset(
    combined_parquet_aws,
    engine='kerchunk',
    chunks={},
    backend_kwargs={
        'storage_options': {
            'target_options': to,
            'remote_protocol': 's3',
            'lazy': True,
            'remote_options': so,
        },
    },
)

In [None]:
ds