Skip to content

Commit

Permalink
Merge pull request #2524 from ghiggi/feature-cf-dataset
Browse files Browse the repository at this point in the history
Refactor CFWriter utility into CF directory
  • Loading branch information
mraspaud committed Nov 28, 2023
2 parents ed220be + b4e8fa5 commit 72e2a71
Show file tree
Hide file tree
Showing 26 changed files with 2,258 additions and 1,935 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ include satpy/version.py
include pyproject.toml
include setup.py
include setup.cfg
include satpy/py.typed
global-exclude *.py[cod]
63 changes: 25 additions & 38 deletions satpy/_scene_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,48 +52,35 @@ def to_xarray(scn,
If Scene DataArrays are on different areas, currently it fails, although
in future we might return a DataTree object, grouped by area.
Parameters
----------
scn: satpy.Scene
Satpy Scene.
datasets (iterable):
List of Satpy Scene datasets to include in the output xr.Dataset.
Elements can be string name, a wavelength as a number, a DataID,
or DataQuery object.
If None (the default), it include all loaded Scene datasets.
header_attrs:
Global attributes of the output xr.Dataset.
epoch (str):
Reference time for encoding the time coordinates (if available).
Example format: "seconds since 1970-01-01 00:00:00".
If None, the default reference time is retrieved using "from satpy.cf_writer import EPOCH"
flatten_attrs (bool):
If True, flatten dict-type attributes.
exclude_attrs (list):
List of xr.DataArray attribute names to be excluded.
include_lonlats (bool):
If True, it includes 'latitude' and 'longitude' coordinates.
If the 'area' attribute is a SwathDefinition, it always includes
latitude and longitude coordinates.
pretty (bool):
Don't modify coordinate names, if possible. Makes the file prettier,
but possibly less consistent.
include_orig_name (bool).
Include the original dataset name as a variable attribute in the xr.Dataset.
numeric_name_prefix (str):
Prefix to add the each variable with name starting with a digit.
Use '' or None to leave this out.
Args:
scn (satpy.Scene): Satpy Scene.
datasets (iterable, optional): List of Satpy Scene datasets to include in
the output xr.Dataset. Elements can be string name, a wavelength as a
number, a DataID, or DataQuery object. If None (the default), it
includes all loaded Scene datasets.
header_attrs: Global attributes of the output xr.Dataset.
epoch (str, optional): Reference time for encoding the time coordinates
(if available). Format example: "seconds since 1970-01-01 00:00:00".
If None, the default reference time is retrieved using
"from satpy.cf_writer import EPOCH".
flatten_attrs (bool, optional): If True, flatten dict-type attributes.
exclude_attrs (list, optional): List of xr.DataArray attribute names to
be excluded.
include_lonlats (bool, optional): If True, includes 'latitude' and
'longitude' coordinates. If the 'area' attribute is a SwathDefinition,
it always includes latitude and longitude coordinates.
pretty (bool, optional): Don't modify coordinate names, if possible. Makes
the file prettier, but possibly less consistent.
include_orig_name (bool, optional): Include the original dataset name as a
variable attribute in the xr.Dataset.
numeric_name_prefix (str, optional): Prefix to add to each variable with
name starting with a digit. Use '' or None to leave this out.
Returns:
-------
ds, xr.Dataset
A CF-compliant xr.Dataset
xr.Dataset: A CF-compliant xr.Dataset
"""
from satpy.writers.cf_writer import EPOCH, collect_cf_datasets

if epoch is None:
epoch = EPOCH
from satpy.cf.datasets import collect_cf_datasets

# Get list of DataArrays
if datasets is None:
Expand Down
2 changes: 0 additions & 2 deletions satpy/writers/cf/__init__.py → satpy/cf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Code for generation of CF-compliant datasets."""
78 changes: 78 additions & 0 deletions satpy/cf/area.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Copyright (c) 2017-2023 Satpy developers
#
# This file is part of satpy.
#
# satpy is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# satpy is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# satpy. If not, see <http://www.gnu.org/licenses/>.
"""CF processing of pyresample area information."""
import logging

import xarray as xr
from packaging.version import Version
from pyresample.geometry import AreaDefinition, SwathDefinition

logger = logging.getLogger(__name__)


def _add_lonlat_coords(data_arr: xr.DataArray) -> xr.DataArray:
    """Return a copy of *data_arr* with 'longitude' and 'latitude' coordinates added."""
    data_arr = data_arr.copy()
    area = data_arr.attrs["area"]
    # Select a single (y, x) slice so chunking of any extra dims is ignored.
    extra_dims = {dim: 0 for dim in data_arr.dims if dim not in ("x", "y")}
    chunks = getattr(data_arr.isel(**extra_dims), "chunks", None)
    lons, lats = area.get_lonlats(chunks=chunks)
    coord_specs = {
        "longitude": (lons, "degrees_east"),
        "latitude": (lats, "degrees_north"),
    }
    for coord_name, (values, units) in coord_specs.items():
        data_arr[coord_name] = xr.DataArray(
            values,
            dims=["y", "x"],
            attrs={"name": coord_name, "standard_name": coord_name, "units": units},
            name=coord_name,
        )
    return data_arr


def _create_grid_mapping(area):
    """Create the grid mapping instance for `area`.

    Returns:
        tuple: (grid mapping variable name, dict of CF grid mapping attributes).
    """
    import pyproj

    # crs.to_cf() technically needs pyproj 2.2, but 2.4.1 has important bug fixes.
    if Version(pyproj.__version__) < Version("2.4.1"):
        raise ImportError("'cf' writer requires pyproj 2.4.1 or greater")
    # Let pyproj do the heavy lifting of building the CF grid mapping dict.
    return area.area_id, area.crs.to_cf()


def _add_grid_mapping(data_arr: xr.DataArray) -> tuple[xr.DataArray, xr.DataArray]:
    """Convert the DataArray's area to a CF grid mapping.

    Returns:
        tuple: copy of *data_arr* with its ``grid_mapping`` attribute set,
        plus a scalar DataArray carrying the grid mapping attributes.
    """
    data_arr = data_arr.copy()
    gmapping_name, gmapping_attrs = _create_grid_mapping(data_arr.attrs["area"])
    data_arr.attrs["grid_mapping"] = gmapping_name
    gmapping_var = xr.DataArray(0, attrs=gmapping_attrs, name=gmapping_name)
    return data_arr, gmapping_var


def area2cf(data_arr: xr.DataArray, include_lonlats: bool = False, got_lonlats: bool = False) -> list[xr.DataArray]:
    """Convert an area to a CF grid mapping and/or lon/lat coordinates.

    Swath geometries have no projection, so lon/lat coordinates are always
    added for them; gridded areas additionally get a grid mapping variable.
    """
    area = data_arr.attrs["area"]
    want_lonlats = include_lonlats or isinstance(area, SwathDefinition)
    if want_lonlats and not got_lonlats:
        data_arr = _add_lonlat_coords(data_arr)
    result = []
    if isinstance(area, AreaDefinition):
        data_arr, gmapping = _add_grid_mapping(data_arr)
        result.append(gmapping)
    result.append(data_arr)
    return result
231 changes: 231 additions & 0 deletions satpy/cf/attrs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
# Copyright (c) 2017-2023 Satpy developers
#
# This file is part of satpy.
#
# satpy is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# satpy is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# satpy. If not, see <http://www.gnu.org/licenses/>.
"""CF processing of attributes."""
from __future__ import annotations

import datetime
import json
import logging
from collections import OrderedDict

import numpy as np
import xarray as xr

from satpy.writers.utils import flatten_dict

logger = logging.getLogger(__name__)


class AttributeEncoder(json.JSONEncoder):
    """JSON encoder for dataset attributes."""

    def default(self, obj):
        """Return a json-serializable object for *obj*.

        Dictionaries, lists/tuples and multi-dimensional arrays are encoded
        recursively so that their elements can also be decoded later.
        """
        if isinstance(obj, dict):
            return {key: self.default(value) for key, value in obj.items()}
        if isinstance(obj, (list, tuple, np.ndarray)):
            return [self.default(element) for element in obj]
        return self._encode(obj)

    def _encode(self, obj):
        """Encode the given object as a json-serializable datatype."""
        # bool must be tested before int because bool is a subclass of int.
        if isinstance(obj, (bool, np.bool_)):
            return str(obj).lower()
        if isinstance(obj, (int, float, str)):
            return obj
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.void):
            return tuple(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Fall back to the string representation for anything else.
        return str(obj)


def _encode_numpy_array(obj):
    """Encode numpy array as a netCDF4 serializable datatype."""
    from satpy.writers.cf_writer import NC4_DTYPES

    # Record arrays and multi-dimensional arrays cannot be represented as
    # netCDF attributes; only plain 1-d (or 0-d) arrays are supported.
    if obj.dtype.fields or obj.ndim > 1:
        raise ValueError("Only a 1D numpy array can be encoded as netCDF attribute.")
    if obj.dtype in NC4_DTYPES:
        return obj
    if obj.dtype == np.bool_:
        # Boolean arrays are not supported, convert to array of strings.
        return [entry.lower() for entry in obj.astype(str)]
    return obj.tolist()


def _encode_object(obj):
"""Try to encode `obj` as a netCDF/Zarr compatible datatype which most closely resembles the object's nature.
Raises:
ValueError if no such datatype could be found
"""
is_nonbool_int = isinstance(obj, int) and not isinstance(obj, (bool, np.bool_))
is_encode_type = isinstance(obj, (float, str, np.integer, np.floating))
if is_nonbool_int or is_encode_type:
return obj
elif isinstance(obj, np.ndarray):
return _encode_numpy_array(obj)
raise ValueError("Unable to encode")


def _try_decode_object(obj):
"""Try to decode byte string."""
try:
decoded = obj.decode()
except AttributeError:
decoded = obj
return decoded


def _encode_python_objects(obj):
    """Try to find the datatype which most closely resembles the object's nature.

    If on failure, encode as a string. Plain (non-nested) lists are encoded
    recursively element by element.
    """
    is_flat_sequence = (
        isinstance(obj, (list, tuple))
        and not any(isinstance(item, (list, tuple)) for item in obj)
    )
    if is_flat_sequence:
        return [_encode_to_cf(item) for item in obj]
    try:
        return _encode_object(obj)
    except ValueError:
        decoded = _try_decode_object(obj)
        return json.dumps(decoded, cls=AttributeEncoder).strip('"')


def _encode_to_cf(obj):
    """Encode the given object as a netcdf compatible datatype.

    Objects that provide a ``to_cf`` method are encoded via that method;
    everything else falls back to the generic python-object encoding.

    NOTE(review): the except clause also swallows AttributeErrors raised
    *inside* a ``to_cf`` implementation, silently falling back to the
    generic encoding — presumably intentional best-effort; confirm.
    """
    try:
        return obj.to_cf()
    except AttributeError:
        return _encode_python_objects(obj)


def _encode_nc_attrs(attrs):
"""Encode dataset attributes in a netcdf compatible datatype.
Args:
attrs (dict):
Attributes to be encoded
Returns:
dict: Encoded (and sorted) attributes
"""
encoded_attrs = []
for key, val in sorted(attrs.items()):
if val is not None:
encoded_attrs.append((key, _encode_to_cf(val)))
return OrderedDict(encoded_attrs)


def preprocess_attrs(
        data_arr: xr.DataArray,
        flatten_attrs: bool,
        exclude_attrs: list[str] | None
) -> xr.DataArray:
    """Preprocess DataArray attributes to be written into CF-compliant netCDF/Zarr.

    Drops unwanted attributes, stringifies ancillary variables and
    prerequisites, guarantees a descriptive name attribute, optionally
    flattens nested dict attributes, and finally encodes everything into
    netCDF-compatible datatypes. *data_arr* is modified in place and returned.
    """
    _drop_attrs(data_arr, exclude_attrs)
    _add_ancillary_variables_attrs(data_arr)
    _format_prerequisites_attrs(data_arr)

    attrs = data_arr.attrs
    # CF prefers a descriptive name attribute; fall back to the variable name.
    if not ("long_name" in attrs or "standard_name" in attrs):
        attrs["long_name"] = data_arr.name

    if flatten_attrs:
        attrs = flatten_dict(attrs)

    data_arr.attrs = _encode_nc_attrs(attrs)
    return data_arr


def _drop_attrs(
    data_arr: xr.DataArray,
    user_excluded_attrs: list[str] | None
) -> None:
    """Remove undesirable attributes from *data_arr* in place."""
    excluded = list(user_excluded_attrs or [])
    excluded.extend(_get_satpy_attrs(data_arr))
    excluded.extend(_get_none_attrs(data_arr))
    # The 'area' object is written separately as grid mapping / lon-lats.
    excluded.append("area")
    for attr_name in excluded:
        data_arr.attrs.pop(attr_name, None)


def _get_satpy_attrs(data_arr: xr.DataArray) -> list[str]:
"""Remove _satpy attribute."""
return [key for key in data_arr.attrs if key.startswith("_satpy")] + ["_last_resampler"]


def _get_none_attrs(data_arr: xr.DataArray) -> list[str]:
"""Remove attribute keys with None value."""
return [attr_name for attr_name, attr_val in data_arr.attrs.items() if attr_val is None]


def _add_ancillary_variables_attrs(data_arr: xr.DataArray) -> None:
"""Replace ancillary_variables DataArray with a list of their name."""
list_ancillary_variable_names = [da_ancillary.attrs["name"]
for da_ancillary in data_arr.attrs.get("ancillary_variables", [])]
if list_ancillary_variable_names:
data_arr.attrs["ancillary_variables"] = " ".join(list_ancillary_variable_names)
else:
data_arr.attrs.pop("ancillary_variables", None)


def _format_prerequisites_attrs(data_arr: xr.DataArray) -> None:
"""Reformat prerequisites attribute value to string."""
if "prerequisites" in data_arr.attrs:
data_arr.attrs["prerequisites"] = [np.bytes_(str(prereq)) for prereq in data_arr.attrs["prerequisites"]]


def _add_history(attrs):
"""Add 'history' attribute to dictionary."""
_history_create = "Created by pytroll/satpy on {}".format(datetime.datetime.utcnow())
if "history" in attrs:
if isinstance(attrs["history"], list):
attrs["history"] = "".join(attrs["history"])
attrs["history"] += "\n" + _history_create
else:
attrs["history"] = _history_create
return attrs


def preprocess_header_attrs(header_attrs, flatten_attrs=False):
    """Prepare file header attributes.

    Optionally flattens nested dict attributes, encodes them into
    netCDF-compatible datatypes and always appends a 'history' entry.
    """
    if header_attrs is None:
        header_attrs = {}
    else:
        if flatten_attrs:
            header_attrs = flatten_dict(header_attrs)
        header_attrs = _encode_nc_attrs(header_attrs)  # OrderedDict
    return _add_history(header_attrs)

0 comments on commit 72e2a71

Please sign in to comment.