# Convert NWB Zarr to NetCDF
Convert the National Water Balance Zarr dataset to NetCDF, specifying the variable encoding and adding some metadata.

In [None]:
import xarray as xr
import numpy as np
import hvplot.xarray
import fsspec

In [None]:
# Skip to NetCDF creation: request a cluster size of 1.
# NOTE(review): `cluster` is not defined in any cell visible here -- presumably a
# Dask cluster object created elsewhere; confirm before running this cell.
cluster.scale(1)

In [None]:
# Request a cluster size of 30 for computing scale_factor and add_offset.
# NOTE(review): `cluster` is not defined in any cell visible here -- confirm
# where it is created.
cluster.scale(30)

In [None]:
from dask.distributed import Client
# Create a Dask distributed client, asking for a single worker.
# NOTE(review): the client is not constructed from `cluster`, so it looks like the
# `cluster.scale(...)` calls in other cells do not affect this client -- confirm.
client = Client(n_workers=1)

In [None]:
# Display the cluster object as this cell's output.
cluster

In [None]:
# Display the client object as this cell's output.
client

In [None]:
# fsspec filesystem for S3, configured with the 'esip-qhub' profile.
fs = fsspec.filesystem('s3', profile='esip-qhub')

In [None]:
# Output NetCDF target: the relative path 'nbm.nc'.
# Alternative kept for reference (an S3 file object opened for writing):
#nc_out = fs.open('s3://esip-qhub/usgs/nbm.nc',mode='w')
nc_out = 'nbm.nc'

In [None]:
# Key/value mapper over the input Zarr store, addressed with the standard
# `s3://<bucket>/<key>` URL form (the previous `s3:///` had a stray slash).
mapper = fs.get_mapper('s3://esip-qhub/usgs/nbm.zarr')

In [None]:
# Open the Zarr store through the mapper as an xarray Dataset.
# NOTE(review): chunks={} presumably yields dask-backed variables using the
# store's own chunking -- confirm against the xarray version in use.
ds = xr.open_dataset(mapper, engine='zarr', chunks={})

#### Specify input Zarr dataset and NetCDF to be created:

In [None]:
#nc_out = '/home/jovyan/WBM/Climgrid_wbm.nc'

In [None]:
# Request a cluster size of 30.
# NOTE(review): `cluster` is not defined in any cell visible here -- confirm
# where it is created.
cluster.scale(30)

In [None]:
def compute_scale_and_offset(da, n=16):
    """Calculate the scale factor and offset for packing data into n-bit ints.

    The values are meant for the packing convention
    ``unpacked = packed * scale_factor + add_offset``. The data minimum packs
    to ``-2**(n-1)`` and the data maximum to ``2**(n-1) - 1``, i.e. the full
    signed n-bit range is used. Consequently any ``_FillValue`` a caller
    picks inside that range (e.g. -32767 for ``n=16``) coincides with the
    packed form of real data close to the minimum.

    Based on Krios101's code.

    Parameters
    ----------
    da : array-like
        Data to pack. Must provide ``min()`` and ``max()``; their results may
        be plain scalars (NumPy) or objects exposing ``.values`` (e.g. the
        result of an xarray reduction).
    n : int, default 16
        Number of bits of the packed integer type.

    Returns
    -------
    tuple of (float, float)
        ``(scale_factor, add_offset)``.
    """
    lowest = da.min()
    highest = da.max()

    # Accept both reductions that wrap their result in `.values` and ones
    # that already return a scalar.
    vmin = float(getattr(lowest, "values", lowest))
    vmax = float(getattr(highest, "values", highest))

    # A constant field has zero range; a zero scale_factor would make the
    # packing step divide by zero. Use a unit scale so the single value
    # packs to 0 and unpacks exactly.
    if vmax == vmin:
        return 1.0, vmin

    # stretch/compress data to the available packed range
    scale_factor = (vmax - vmin) / (2 ** n - 1)

    # translate the range to be symmetric about zero
    add_offset = vmin + 2 ** (n - 1) * scale_factor

    return scale_factor, add_offset

In [None]:
# Display the client object as this cell's output.
client

In [None]:
%%time
scale_factor = {}
add_offset = {}
for var in ds.data_vars:    
    scale_factor[var], add_offset[var] = compute_scale_and_offset(ds[var])

#### Specify variable encoding
Here we specify compression and other encoding options to match the Zarr dataset.

Take a look at the encoding of one Zarr variable:

In [None]:
# Display the encoding recorded on the `tmean` variable as this cell's output.
ds.tmean.encoding

Specify the encoding for all NetCDF variables, using zlib compression and same chunk sizes as the Zarr dataset:

In [None]:
# Base NetCDF encoding for every variable (coordinates included): zlib
# compression at level 5, no checksum, no shuffle filter, and the chunk
# shape taken from the variable's existing encoding.
encoding = {}
for var in ds.variables:
    encoding[var] = {
        'zlib': True,
        'complevel': 5,
        'fletcher32': False,
        'shuffle': False,
        'chunksizes': ds[var].encoding['chunks'],
    }

In [None]:
# Data variables are additionally packed as int16 using the per-variable
# scale_factor / add_offset, with -32767 as the fill value.
for var in ds.data_vars:
    encoding[var].update(
        scale_factor=scale_factor[var],
        add_offset=add_offset[var],
        dtype='int16',
        _FillValue=-32767,
    )

In [None]:
# Display the assembled encoding dict as this cell's output.
encoding

In [None]:
# Hard-coded encoding for the NetCDF write, rebuilt from a compact table.
# Every variable shares the same compression settings; data variables add
# int16 packing with their own (scale_factor, add_offset) pair, while the
# coordinate variables (lat, lon, time) carry no packing.
encoding = {
    name: {
        'zlib': True,
        'complevel': 5,
        'fletcher32': False,
        'shuffle': False,
        'chunksizes': chunksizes,
        **(
            {}
            if packing is None
            else {
                'scale_factor': packing[0],
                'add_offset': packing[1],
                'dtype': 'int16',
                '_FillValue': -32767,
            }
        ),
    }
    # (variable name, chunk shape, (scale_factor, add_offset) or None)
    for name, chunksizes, packing in [
        ('aet', (120, 300, 700), (0.0031155336600760804, 102.089806973373)),
        ('lat', (596,), None),
        ('lon', (1385,), None),
        ('pet', (120, 300, 700), (0.005908711038247357, 195.65293728474887)),
        ('prcp', (120, 300, 700), (0.03998992889653429, 1310.3899900816357)),
        ('rain', (120, 300, 700), (0.021082069174033914, 690.8172426947433)),
        ('runoff', (120, 300, 700), (0.012947468209377623, 424.26263828488595)),
        ('snow', (120, 300, 700), (0.03998992889653429, 1310.3899900816357)),
        ('soilstorage', (120, 300, 700), (0.008209353780422675, 269.0041046768902)),
        ('swe', (120, 300, 700), (9.748828870069428, 319449.624414435)),
        ('time', (1512,), None),
        ('tmean', (120, 300, 700), (0.0010586709554911914, 5.85052971694747)),
    ]
}

#### Specify metadata
Ideally we would specify at least the Highly Recommended attributes from the [ACDD Conventions](https://wiki.esipfed.org/Attribute_Convention_for_Data_Discovery_1-3).  

Standard names are taken from the [CF standard_name list](https://cfconventions.org/Data/cf-standard-names/77/build/cf-standard-name-table.html).

In [None]:
# --- Global (dataset-level) attributes -------------------------------------
ds.attrs['title'] = 'USGS Water Balance Model for CONUS (1895-2020)'
ds.attrs['Conventions'] = "CF-1.7"
ds.attrs['summary'] = 'These data represent the monthly inputs and outputs from a United States Geological Survey water-balance for the conterminous United States for the period 01-01-1895 to 12-31-2020'
ds.attrs['keywords'] = ('water balance model, conterminous United States, temperature, precipitation,'
' snow fall, actual evapotranspiration, potential evapotranspiration, snow water equivalent, soil moisture storage,'
' surface water runoff')
ds.attrs['source'] = 'Climgrid outputs from McCabe and Wolock water balance model,  DOI: https://doi.org/10.1029/2011WR010630'
# Adjacent string literals are joined verbatim, so every fragment that is
# followed by another one ends with a space (previously the text ran together
# as "1895,the" and "throughinitial").
ds.attrs['comment'] = ('Although the monthly water balance model output starts in 1895, '
'the output for 1895 through 1899 should be discarded. During this period the model is spinning up and working through '
'initial model conditions.')

# --- Coordinate variables ----------------------------------------------------
ds.time.attrs['standard_name'] = 'time'
# The time units are supplied through the write encoding rather than attrs.
encoding['time']['units'] = "days since 1858-11-17 00:00:00"

ds.lon.attrs['units'] = 'degrees_east'
ds.lon.attrs['standard_name'] = 'longitude'

ds.lat.attrs['units'] = 'degrees_north'
ds.lat.attrs['standard_name'] = 'latitude'

# --- Data variables ----------------------------------------------------------
ds.tmean.attrs['units'] = 'degC'
# 'temperature' is not an entry in the CF standard-name table; the CF name
# for near-surface air temperature is 'air_temperature'.
ds.tmean.attrs['standard_name'] = 'air_temperature'

ds.prcp.attrs['units'] = 'mm'
ds.prcp.attrs['standard_name'] = 'precipitation_amount'
ds.prcp.attrs['long_name'] = 'total precipitation amount including snow and rain'

ds.rain.attrs['units'] = 'mm'
# Rain only: use the rain-specific CF name, mirroring 'snowfall_amount' for
# snow, instead of repeating the total-precipitation name used by `prcp`.
ds.rain.attrs['standard_name'] = 'rainfall_amount'
ds.rain.attrs['long_name'] = 'total precipitation amount for rain'

ds.runoff.attrs['units'] = 'mm'
ds.runoff.attrs['standard_name'] = 'runoff_amount'
ds.runoff.attrs['long_name'] = 'streamflow per unit area'

ds.snow.attrs['units'] = 'mm'
ds.snow.attrs['standard_name'] = 'snowfall_amount'

ds.swe.attrs['units'] = 'mm'
ds.swe.attrs['standard_name'] = 'liquid_water_content_of_surface_snow'
ds.swe.attrs['long_name'] = 'liquid water equivalent in the snow pack'

ds.soilstorage.attrs['units'] = 'mm'
ds.soilstorage.attrs['standard_name'] = 'liquid_water_content_of_soil_layer'

ds.aet.attrs['units'] = 'mm'
# NOTE(review): verify this name against the CF standard-name table; the
# table may only list 'water_evaporation_amount' for actual evaporation.
ds.aet.attrs['standard_name'] = 'water_actual_evaporation_amount'
ds.aet.attrs['long_name'] = 'actual evapotranspiration'

ds.pet.attrs['units'] = 'mm'
ds.pet.attrs['standard_name'] = 'water_potential_evaporation_amount'
ds.pet.attrs['long_name'] = 'potential evapotranspiration'

In [None]:
# Request a cluster size of 1.
# NOTE(review): `cluster` is not defined in any cell visible here -- confirm
# where it is created.
cluster.scale(1)

#### Write the NetCDF file

In [None]:
%%time
ds.to_netcdf(nc_out, mode='w', encoding=encoding, engine='netcdf4')

In [None]:
import dask

In [None]:
# Display the installed dask version as this cell's output.
dask.__version__