Skip to content

Commit

Permalink
Merge pull request #2524 from ghiggi/feature-cf-dataset
Browse files Browse the repository at this point in the history
Refactor CFWriter utility into CF directory
  • Loading branch information
mraspaud committed Nov 28, 2023
2 parents ed220be + b4e8fa5 commit 72e2a71
Show file tree
Hide file tree
Showing 26 changed files with 2,258 additions and 1,935 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ include satpy/version.py
include pyproject.toml
include setup.py
include setup.cfg
include satpy/py.typed
global-exclude *.py[cod]
63 changes: 25 additions & 38 deletions satpy/_scene_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,48 +52,35 @@ def to_xarray(scn,
If Scene DataArrays are on different areas, currently it fails, although
in future we might return a DataTree object, grouped by area.
Parameters
----------
scn: satpy.Scene
Satpy Scene.
datasets (iterable):
List of Satpy Scene datasets to include in the output xr.Dataset.
Elements can be string name, a wavelength as a number, a DataID,
or DataQuery object.
If None (the default), it include all loaded Scene datasets.
header_attrs:
Global attributes of the output xr.Dataset.
epoch (str):
Reference time for encoding the time coordinates (if available).
Example format: "seconds since 1970-01-01 00:00:00".
If None, the default reference time is retrieved using "from satpy.cf_writer import EPOCH"
flatten_attrs (bool):
If True, flatten dict-type attributes.
exclude_attrs (list):
List of xr.DataArray attribute names to be excluded.
include_lonlats (bool):
If True, it includes 'latitude' and 'longitude' coordinates.
If the 'area' attribute is a SwathDefinition, it always includes
latitude and longitude coordinates.
pretty (bool):
Don't modify coordinate names, if possible. Makes the file prettier,
but possibly less consistent.
include_orig_name (bool).
Include the original dataset name as a variable attribute in the xr.Dataset.
numeric_name_prefix (str):
Prefix to add the each variable with name starting with a digit.
Use '' or None to leave this out.
Args:
scn (satpy.Scene): Satpy Scene.
datasets (iterable, optional): List of Satpy Scene datasets to include in
the output xr.Dataset. Elements can be string name, a wavelength as a
number, a DataID, or DataQuery object. If None (the default), it
includes all loaded Scene datasets.
header_attrs: Global attributes of the output xr.Dataset.
epoch (str, optional): Reference time for encoding the time coordinates
(if available). Format example: "seconds since 1970-01-01 00:00:00".
If None, the default reference time is retrieved using
"from satpy.cf_writer import EPOCH".
flatten_attrs (bool, optional): If True, flatten dict-type attributes.
exclude_attrs (list, optional): List of xr.DataArray attribute names to
be excluded.
include_lonlats (bool, optional): If True, includes 'latitude' and
'longitude' coordinates. If the 'area' attribute is a SwathDefinition,
it always includes latitude and longitude coordinates.
pretty (bool, optional): Don't modify coordinate names, if possible. Makes
the file prettier, but possibly less consistent.
include_orig_name (bool, optional): Include the original dataset name as a
variable attribute in the xr.Dataset.
numeric_name_prefix (str, optional): Prefix to add to each variable with
name starting with a digit. Use '' or None to leave this out.
Returns:
-------
ds, xr.Dataset
A CF-compliant xr.Dataset
xr.Dataset: A CF-compliant xr.Dataset
"""
from satpy.writers.cf_writer import EPOCH, collect_cf_datasets

if epoch is None:
epoch = EPOCH
from satpy.cf.datasets import collect_cf_datasets

# Get list of DataArrays
if datasets is None:
Expand Down
2 changes: 0 additions & 2 deletions satpy/writers/cf/__init__.py → satpy/cf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Code for generation of CF-compliant datasets."""
78 changes: 78 additions & 0 deletions satpy/cf/area.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Copyright (c) 2017-2023 Satpy developers
#
# This file is part of satpy.
#
# satpy is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# satpy is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# satpy. If not, see <http://www.gnu.org/licenses/>.
"""CF processing of pyresample area information."""
import logging

import xarray as xr
from packaging.version import Version
from pyresample.geometry import AreaDefinition, SwathDefinition

logger = logging.getLogger(__name__)


def _add_lonlat_coords(data_arr: xr.DataArray) -> xr.DataArray:
    """Return a copy of *data_arr* with 'longitude' and 'latitude' coordinates added."""
    data_arr = data_arr.copy()
    area = data_arr.attrs["area"]
    # Select a single (y, x) slice so chunking of any extra dims is ignored.
    extra_dims = {dim: 0 for dim in data_arr.dims if dim not in ("x", "y")}
    chunks = getattr(data_arr.isel(**extra_dims), "chunks", None)
    lons, lats = area.get_lonlats(chunks=chunks)
    coord_specs = {
        "longitude": (lons, "degrees_east"),
        "latitude": (lats, "degrees_north"),
    }
    for coord_name, (values, units) in coord_specs.items():
        data_arr[coord_name] = xr.DataArray(
            values,
            dims=["y", "x"],
            attrs={"name": coord_name, "standard_name": coord_name, "units": units},
            name=coord_name,
        )
    return data_arr


def _create_grid_mapping(area):
    """Create the grid mapping instance for `area`.

    Returns:
        tuple: (grid mapping variable name, dict of CF grid mapping attributes).
    """
    import pyproj

    # crs.to_cf() technically needs pyproj 2.2, but 2.4.1 has important bug fixes.
    if Version(pyproj.__version__) < Version("2.4.1"):
        raise ImportError("'cf' writer requires pyproj 2.4.1 or greater")
    # Let pyproj do the heavy lifting of building the CF grid mapping dict.
    return area.area_id, area.crs.to_cf()


def _add_grid_mapping(data_arr: xr.DataArray) -> tuple[xr.DataArray, xr.DataArray]:
    """Convert the DataArray's area to a CF grid mapping.

    Returns:
        tuple: copy of *data_arr* with its ``grid_mapping`` attribute set,
        plus a scalar DataArray carrying the grid mapping attributes.
    """
    data_arr = data_arr.copy()
    gmapping_name, gmapping_attrs = _create_grid_mapping(data_arr.attrs["area"])
    data_arr.attrs["grid_mapping"] = gmapping_name
    gmapping_var = xr.DataArray(0, attrs=gmapping_attrs, name=gmapping_name)
    return data_arr, gmapping_var


def area2cf(data_arr: xr.DataArray, include_lonlats: bool = False, got_lonlats: bool = False) -> list[xr.DataArray]:
    """Convert an area to a CF grid mapping and/or lon/lat coordinates.

    Swath geometries have no projection, so lon/lat coordinates are always
    added for them; gridded areas additionally get a grid mapping variable.
    """
    area = data_arr.attrs["area"]
    want_lonlats = include_lonlats or isinstance(area, SwathDefinition)
    if want_lonlats and not got_lonlats:
        data_arr = _add_lonlat_coords(data_arr)
    result = []
    if isinstance(area, AreaDefinition):
        data_arr, gmapping = _add_grid_mapping(data_arr)
        result.append(gmapping)
    result.append(data_arr)
    return result
231 changes: 231 additions & 0 deletions satpy/cf/attrs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
# Copyright (c) 2017-2023 Satpy developers
#
# This file is part of satpy.
#
# satpy is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# satpy is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# satpy. If not, see <http://www.gnu.org/licenses/>.
"""CF processing of attributes."""
from __future__ import annotations

import datetime
import json
import logging
from collections import OrderedDict

import numpy as np
import xarray as xr

from satpy.writers.utils import flatten_dict

logger = logging.getLogger(__name__)


class AttributeEncoder(json.JSONEncoder):
    """JSON encoder for dataset attributes."""

    def default(self, obj):
        """Return a json-serializable object for *obj*.

        Dictionaries, lists/tuples and multi-dimensional arrays are encoded
        recursively so that their elements can also be decoded later.
        """
        if isinstance(obj, dict):
            return {key: self.default(value) for key, value in obj.items()}
        if isinstance(obj, (list, tuple, np.ndarray)):
            return [self.default(element) for element in obj]
        return self._encode(obj)

    def _encode(self, obj):
        """Encode the given object as a json-serializable datatype."""
        # bool must be tested before int because bool is a subclass of int.
        if isinstance(obj, (bool, np.bool_)):
            return str(obj).lower()
        if isinstance(obj, (int, float, str)):
            return obj
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.void):
            return tuple(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Fall back to the string representation for anything else.
        return str(obj)


def _encode_numpy_array(obj):
    """Encode numpy array as a netCDF4 serializable datatype."""
    from satpy.writers.cf_writer import NC4_DTYPES

    # Record arrays and multi-dimensional arrays cannot be represented as
    # netCDF attributes; only plain 1-d (or 0-d) arrays are supported.
    if obj.dtype.fields or obj.ndim > 1:
        raise ValueError("Only a 1D numpy array can be encoded as netCDF attribute.")
    if obj.dtype in NC4_DTYPES:
        return obj
    if obj.dtype == np.bool_:
        # Boolean arrays are not supported, convert to array of strings.
        return [entry.lower() for entry in obj.astype(str)]
    return obj.tolist()


def _encode_object(obj):
"""Try to encode `obj` as a netCDF/Zarr compatible datatype which most closely resembles the object's nature.
Raises:
ValueError if no such datatype could be found
"""
is_nonbool_int = isinstance(obj, int) and not isinstance(obj, (bool, np.bool_))
is_encode_type = isinstance(obj, (float, str, np.integer, np.floating))
if is_nonbool_int or is_encode_type:
return obj
elif isinstance(obj, np.ndarray):
return _encode_numpy_array(obj)
raise ValueError("Unable to encode")


def _try_decode_object(obj):
"""Try to decode byte string."""
try:
decoded = obj.decode()
except AttributeError:
decoded = obj
return decoded


def _encode_python_objects(obj):
    """Try to find the datatype which most closely resembles the object's nature.

    If on failure, encode as a string. Plain (non-nested) lists are encoded
    recursively element by element.
    """
    is_flat_sequence = (
        isinstance(obj, (list, tuple))
        and not any(isinstance(item, (list, tuple)) for item in obj)
    )
    if is_flat_sequence:
        return [_encode_to_cf(item) for item in obj]
    try:
        return _encode_object(obj)
    except ValueError:
        decoded = _try_decode_object(obj)
        return json.dumps(decoded, cls=AttributeEncoder).strip('"')


def _encode_to_cf(obj):
    """Encode the given object as a netcdf compatible datatype.

    Objects that provide a ``to_cf`` method are encoded via that method;
    everything else falls back to the generic python-object encoding.

    NOTE(review): the except clause also swallows AttributeErrors raised
    *inside* a ``to_cf`` implementation, silently falling back to the
    generic encoding — presumably intentional best-effort; confirm.
    """
    try:
        return obj.to_cf()
    except AttributeError:
        return _encode_python_objects(obj)


def _encode_nc_attrs(attrs):
"""Encode dataset attributes in a netcdf compatible datatype.
Args:
attrs (dict):
Attributes to be encoded
Returns:
dict: Encoded (and sorted) attributes
"""
encoded_attrs = []
for key, val in sorted(attrs.items()):
if val is not None:
encoded_attrs.append((key, _encode_to_cf(val)))
return OrderedDict(encoded_attrs)


def preprocess_attrs(
        data_arr: xr.DataArray,
        flatten_attrs: bool,
        exclude_attrs: list[str] | None
) -> xr.DataArray:
    """Preprocess DataArray attributes to be written into CF-compliant netCDF/Zarr.

    Drops unwanted attributes, stringifies ancillary variables and
    prerequisites, guarantees a descriptive name attribute, optionally
    flattens nested dict attributes, and finally encodes everything into
    netCDF-compatible datatypes. *data_arr* is modified in place and returned.
    """
    _drop_attrs(data_arr, exclude_attrs)
    _add_ancillary_variables_attrs(data_arr)
    _format_prerequisites_attrs(data_arr)

    attrs = data_arr.attrs
    # CF prefers a descriptive name attribute; fall back to the variable name.
    if not ("long_name" in attrs or "standard_name" in attrs):
        attrs["long_name"] = data_arr.name

    if flatten_attrs:
        attrs = flatten_dict(attrs)

    data_arr.attrs = _encode_nc_attrs(attrs)
    return data_arr


def _drop_attrs(
    data_arr: xr.DataArray,
    user_excluded_attrs: list[str] | None
) -> None:
    """Remove undesirable attributes from *data_arr* in place."""
    excluded = list(user_excluded_attrs or [])
    excluded.extend(_get_satpy_attrs(data_arr))
    excluded.extend(_get_none_attrs(data_arr))
    # The 'area' object is written separately as grid mapping / lon-lats.
    excluded.append("area")
    for attr_name in excluded:
        data_arr.attrs.pop(attr_name, None)


def _get_satpy_attrs(data_arr: xr.DataArray) -> list[str]:
"""Remove _satpy attribute."""
return [key for key in data_arr.attrs if key.startswith("_satpy")] + ["_last_resampler"]


def _get_none_attrs(data_arr: xr.DataArray) -> list[str]:
"""Remove attribute keys with None value."""
return [attr_name for attr_name, attr_val in data_arr.attrs.items() if attr_val is None]


def _add_ancillary_variables_attrs(data_arr: xr.DataArray) -> None:
"""Replace ancillary_variables DataArray with a list of their name."""
list_ancillary_variable_names = [da_ancillary.attrs["name"]
for da_ancillary in data_arr.attrs.get("ancillary_variables", [])]
if list_ancillary_variable_names:
data_arr.attrs["ancillary_variables"] = " ".join(list_ancillary_variable_names)
else:
data_arr.attrs.pop("ancillary_variables", None)


def _format_prerequisites_attrs(data_arr: xr.DataArray) -> None:
"""Reformat prerequisites attribute value to string."""
if "prerequisites" in data_arr.attrs:
data_arr.attrs["prerequisites"] = [np.bytes_(str(prereq)) for prereq in data_arr.attrs["prerequisites"]]


def _add_history(attrs):
"""Add 'history' attribute to dictionary."""
_history_create = "Created by pytroll/satpy on {}".format(datetime.datetime.utcnow())
if "history" in attrs:
if isinstance(attrs["history"], list):
attrs["history"] = "".join(attrs["history"])
attrs["history"] += "\n" + _history_create
else:
attrs["history"] = _history_create
return attrs


def preprocess_header_attrs(header_attrs, flatten_attrs=False):
    """Prepare file header attributes.

    Optionally flattens nested dict attributes, encodes them into
    netCDF-compatible datatypes and always appends a 'history' entry.
    """
    if header_attrs is None:
        header_attrs = {}
    else:
        if flatten_attrs:
            header_attrs = flatten_dict(header_attrs)
        header_attrs = _encode_nc_attrs(header_attrs)  # OrderedDict
    return _add_history(header_attrs)

0 comments on commit 72e2a71

Please sign in to comment.