In [2]:
import xarray as xr 
from pathlib import Path

In [3]:
work_dir = Path('/u/aurora-r0/govorcin/01_OPERA/TROPO/interface/data')

In [19]:
ds = xr.open_dataset(work_dir / 'ECMWF_TROP_202402151200_202402151200_1.nc',  
                     chunks={},
                     mask_and_scale=True)

In [17]:
chunksizes = [{key: value[0]} for key, value in ds.chunksizes.items()]
chunksizes

[{'time': 1}, {'level': 28}, {'latitude': 512}, {'longitude': 1024}]

In [148]:
import logging
logger = logging.getLogger(__name__)

In [164]:
EXPECTED_COORDS = frozenset(['longitude', 'latitude', 'level', 'time'])
EXPECTED_VARS = frozenset(['z', 't', 'q', 'lnsp'])

# Valid range with a buffer
VALID_RANGE = {
    't': [160, 350],       # Temperature (K)
    'q': [1e-10, 0.05],    # Specific humidity (kg/kg)
    'z': [-5000, 65000],   # Geopotential (m²/s²)
    'lnsp': [10, 12]       # Log of surface pressure (unitless)
}


def check_input(ds : xr.Dataset)->None:
   logger.info("Performing checkup of input file") 
   checks = []

   # Check Coordinates
   coords = set(ds.coords.keys())
   if coords != EXPECTED_COORDS:
      missing_coords = EXPECTED_COORDS - coords
      extra_coords = coords - EXPECTED_COORDS
      checks.append(f'Unexpected coordinates. Missing: {missing_coords}, Extra: {extra_coords}')

   if (ds.latitude.min() < -90) |  (ds.latitude.max() > 90):
      checks.append('Latitude values must be within (-90, 90)')

   if (ds.longitude.min() < 0) | (ds.longitude.max() > 360):
      checks.append('Longitude values must be within (0, 360)')

   if (ds.level.min() < 0) |  (ds.level.max() > 137):
      checks.append('Level values must be within (0, 137)')

   # Check Data Variables
   # Check Data Variables
   data_vars = set(ds.data_vars.keys())
   if data_vars != EXPECTED_VARS:
      missing_vars = EXPECTED_VARS - data_vars
      extra_vars = data_vars - EXPECTED_VARS
      checks.append(f'Unexpected data variables. Missing: {missing_vars}, Extra: {extra_vars}')

   # Check NaN values and valid range
   for var in EXPECTED_VARS:
      var_data = ds[var].isel(time=0, level=0 if var in ['z', 'lnsp'] else slice(None))
      var_name = getattr(ds[var], 'long_name', var)

      if var_data.isnull().any():
         checks.append(f'Data Variable "{var}" ({var_name}) contains NaN values.')

      min_val, max_val = var_data.min().values, var_data.max().values
      valid_min, valid_max = VALID_RANGE[var]

      # NOTE: make second part wrong intentionally to check how reporting works
      if  (min_val < valid_min) | (max_val < valid_max):
         checks.append((f'Data Variable "{var}" ({var_name}) is out of valid range' 
                        f' {VALID_RANGE[var]}. Found min: {min_val}, max: {max_val}'))

   # Raise error if any check fails
   if checks:
      for check in checks:
        logger.error(check)
      #raise ValueError(f'Failed validation checks:\n' + "\n".join(checks))

In [131]:
from dask.distributed import Client

client = Client(
    n_workers=4,
    threads_per_worker=2,
    memory_limit=f"{4}GB",
    )

In [133]:
client.dashboard_link

'http://127.0.0.1:8787/status'

In [163]:
%%time
check_input(ds)

Data Variable "z" (Geopotential) is out of valid range [-5000, 65000]. Found min: -4563.2822265625, max: 62700.71875
Data Variable "t" (Temperature) is out of valid range [160, 350]. Found min: 176.68382263183594, max: 314.7515869140625
Data Variable "lnsp" (Logarithm of surface pressure) is out of valid range [10, 12]. Found min: 10.734353065490723, max: 11.577614784240723
Data Variable "q" (Specific humidity) is out of valid range [1e-10, 0.05]. Found min: 2.2825387802072328e-08, max: 0.024700453504920006


CPU times: user 1min 7s, sys: 43 s, total: 1min 50s
Wall time: 1min 40s
