# Complex real-world dataset

This example demonstrates a realistic ocean model output schema with multiple variables at different resolutions, detailed metadata, and CF Convention compliance, in both eager and lazy modes.

This schema validates a global ocean model dataset with:

- **Surface Variables (2D + time)**
  - **sst**: Sea surface temperature
  - **sss**: Sea surface salinity
  - **ice_concentration**: Sea ice concentration
- **3D Variables (depth + time)**
  - **u_velocity**: Eastward ocean current component
  - **v_velocity**: Northward ocean current component
- **Coordinates**
  - **time**: Monthly data for one year (12 months)
  - **lat**: Latitude at 1° resolution (180 points)
  - **lon**: Longitude at 1° resolution (360 points)
  - **depth**: 50 vertical levels from surface to 7000 m

In [None]:
# Basic imports
import numpy as np
import xarray as xr

from xarray_validate import DatasetSchema, SchemaError

# Load the dataset schema from the YAML file "schema.yaml".
# NOTE(review): the relative path presumably resolves against the current
# working directory, so the notebook must run next to the file - confirm.
schema = DatasetSchema.from_yaml("schema.yaml")

In [None]:
print("Creating a complex ocean model dataset...\n")

# Create coordinates

# Time: one timestamp per month of 2024, stored as integer day offsets from
# 2024-01-01 so the values agree with the "days since 2024-01-01" units given
# to the time coordinate. Plain 0..11 would encode twelve consecutive days
# rather than twelve months.
n_time = 12
month_starts = np.datetime64("2024-01", "M") + np.arange(n_time).astype(
    "timedelta64[M]"
)
time = (
    month_starts.astype("datetime64[D]") - np.datetime64("2024-01-01", "D")
).astype(np.int64)

# Latitude / longitude: cell centres of a regular global grid
n_lat = 180
lat = np.linspace(-89.5, 89.5, n_lat).astype(np.float32)  # 1° resolution
n_lon = 360
lon = np.linspace(-179.5, 179.5, n_lon).astype(np.float32)  # 1° resolution

# Depth: 50 irregularly spaced levels in metres, from the surface (0 m) down
# to 7000 m, with finer spacing near the surface.
n_depth = 50
depth = np.array(
    [
        0, 5, 10, 15, 20, 25, 30, 40, 50, 75,
        100, 125, 150, 200, 250, 300, 400, 500, 600, 700,
        800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1750, 2000,
        2250, 2500, 2750, 3000, 3250, 3500, 3750, 4000, 4250, 4500,
        4750, 5000, 5250, 5500, 5750, 6000, 6250, 6500, 6750, 7000,
    ],
    dtype=np.float32,
)

# Create a Dataset that matches the schema.
# The fields are filled with synthetic random data for demonstration; in
# reality this would be real model output. The two 4D velocity fields dominate
# memory: each holds n_time x n_depth x n_lat x n_lon float32 values.
ds = xr.Dataset(
    data_vars={
        # Surface fields (2D + time)
        "sst": (
            ["time", "lat", "lon"],
            15.0 + 10.0 * np.random.randn(n_time, n_lat, n_lon).astype(np.float32),
            {
                "long_name": "Sea Surface Temperature",
                "units": "degrees_C",
                "valid_min": -2.0,
                "valid_max": 35.0,
                "standard_name": "sea_surface_temperature",
            },
        ),
        "sss": (
            ["time", "lat", "lon"],
            35.0 + 2.0 * np.random.randn(n_time, n_lat, n_lon).astype(np.float32),
            {
                "long_name": "Sea Surface Salinity",
                "units": "psu",
                "valid_min": 0.0,
                "valid_max": 42.0,
                "standard_name": "sea_surface_salinity",
            },
        ),
        "ice_concentration": (
            ["time", "lat", "lon"],
            np.clip(np.random.rand(n_time, n_lat, n_lon), 0, 1).astype(np.float32),
            {
                "long_name": "Sea Ice Concentration",
                "units": "1",
                "valid_min": 0.0,
                "valid_max": 1.0,
            },
        ),
        # 3D fields (depth + time)
        "u_velocity": (
            ["time", "depth", "lat", "lon"],
            0.1 * np.random.randn(n_time, n_depth, n_lat, n_lon).astype(np.float32),
            {
                "long_name": "Eastward Current Velocity",
                "units": "m/s",
                "standard_name": "eastward_sea_water_velocity",
            },
        ),
        "v_velocity": (
            ["time", "depth", "lat", "lon"],
            0.1 * np.random.randn(n_time, n_depth, n_lat, n_lon).astype(np.float32),
            {
                "long_name": "Northward Current Velocity",
                "units": "m/s",
                "standard_name": "northward_sea_water_velocity",
            },
        ),
    },
    coords={
        "time": (
            "time",
            time,
            {
                "long_name": "Time",
                "units": "days since 2024-01-01",
                "calendar": "gregorian",
            },
        ),
        "lat": (
            "lat",
            lat,
            {
                "long_name": "Latitude",
                "units": "degrees_north",
                "valid_min": -90.0,
                "valid_max": 90.0,
                "standard_name": "latitude",
            },
        ),
        "lon": (
            "lon",
            lon,
            {
                "long_name": "Longitude",
                "units": "degrees_east",
                "valid_min": -180.0,
                "valid_max": 180.0,
                "standard_name": "longitude",
            },
        ),
        "depth": (
            "depth",
            depth,
            {
                "long_name": "Depth",
                "units": "meters",
                "positive": "down",
                "standard_name": "depth",
            },
        ),
    },
    # Global attributes
    attrs={
        "title": "Global Ocean Model Output",
        "institution": "Ocean Modeling Center",
        "source": "OCEAN-MODEL v2.0",
        "Conventions": "CF-1.8",
        "history": "Created on 2024-01-01",
        "references": "https://example.org/ocean-model",
        # The dataset holds one time step per month, so describe it as monthly
        "comment": "Monthly output from global ocean model simulation",
    },
)

# Summarise what was built: in-memory size plus variable and coordinate names
print("Dataset created successfully!")
print(f"Dataset size: {ds.nbytes / 1e9:.2f} GB")
print(f"Variables: {list(ds.data_vars)}")
print(f"Coordinates: {list(ds.coords)}\n")

In [None]:
# Validate the Dataset in eager mode.
# A failed validation surfaces as a SchemaError, which is caught and reported;
# the else branch reports success so the cell always produces visible output.
try:
    result = schema.validate(ds, mode="eager")
except SchemaError as e:
    print(f"Validation failed: {e}\n")
else:
    print("Validation passed!\n")

In [None]:
# Lazy validation: report several schema violations from a single call

# Start from a copy of the dataset and introduce three intentional,
# mutually independent errors.
ds_with_errors = ds.copy()
# Error 1: sst gets the wrong dtype
ds_with_errors["sst"] = ds_with_errors["sst"].astype(np.float64)
# Error 2: sss loses its required "units" attribute
del ds_with_errors["sss"].attrs["units"]
# Error 3: the required global "Conventions" attribute is removed
del ds_with_errors.attrs["Conventions"]

# In lazy mode the call returns a result object that carries the errors
result = schema.validate(ds_with_errors, mode="lazy")

if not result.has_errors:
    print("No errors found")
else:
    print(f"Found {len(result.errors)} validation errors:\n")
    # Each entry unpacks into a location and the error reported for it
    for idx, (location, problem) in enumerate(result.errors, start=1):
        print(f"{idx}. {location}: {problem}")