From 09402890bcc54f0465073299176f82e272d4bb2c Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Mon, 25 May 2020 16:12:45 +0200
Subject: [PATCH 1/8] Regression test for combine_arrays failure

Added a regression test triggering the ValueError raised when trying to
read an FCI composite (see #1215).
---
 satpy/tests/test_dataset.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/satpy/tests/test_dataset.py b/satpy/tests/test_dataset.py
index f72bb147e5..7de12f0303 100644
--- a/satpy/tests/test_dataset.py
+++ b/satpy/tests/test_dataset.py
@@ -84,3 +84,17 @@ def test_combine_times(self):
         ret = combine_metadata(*dts, average_times=False)
         # times are not equal so don't include it in the final result
         self.assertNotIn('start_time', ret)
+
+    def test_combine_arrays(self):
+        """Test combine_metadata with arrays."""
+        from satpy.dataset import combine_metadata
+        from numpy import arange, ones
+        from xarray import DataArray
+        dts = [
+            {"quality": (arange(25) % 2).reshape(5, 5).astype("?")},
+            {"quality": (arange(1, 26) % 3).reshape(5, 5).astype("?")},
+            {"quality": ones((5, 5,), "?")},
+        ]
+        combine_metadata(*dts)
+        dts2 = [{"quality": DataArray(d["quality"])} for d in dts]
+        combine_metadata(*dts2)

From b3973721b4bae2f70b2d4a8879dc35dd1376cf71 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Mon, 25 May 2020 16:17:39 +0200
Subject: [PATCH 2/8] Replace isinstance check with array interface check

In combine_metadata, replace an ndarray instance check with a check for
the array interface (the __array__ attribute).
---
 satpy/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/satpy/dataset.py b/satpy/dataset.py
index db1055d385..38d972955d 100644
--- a/satpy/dataset.py
+++ b/satpy/dataset.py
@@ -98,7 +98,7 @@ def combine_metadata(*metadata_objects, **kwargs):
     shared_info = {}
     for k in shared_keys:
         values = [nfo[k] for nfo in info_dicts]
-        any_arrays = any([isinstance(val, np.ndarray) for val in values])
+        any_arrays = any([hasattr(val, "__array__") for val in values])
         if any_arrays:
             if all(np.all(val == values[0]) for val in values[1:]):
                 shared_info[k] = values[0]

From bb87b3ceb1dabe08715c7e60af1eeb6fd1885068 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Mon, 25 May 2020 16:19:27 +0200
Subject: [PATCH 3/8] Added regression test better simulating FCI

Added another regression test for combine_metadata that better
simulates the situation with the FCI reader. The ancillary_variables
attribute is actually List[xarray.DataArray], so this needs to be
handled as well.
---
 satpy/tests/test_dataset.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/satpy/tests/test_dataset.py b/satpy/tests/test_dataset.py
index 7de12f0303..d224731c6c 100644
--- a/satpy/tests/test_dataset.py
+++ b/satpy/tests/test_dataset.py
@@ -98,3 +98,6 @@ def test_combine_arrays(self):
         combine_metadata(*dts)
         dts2 = [{"quality": DataArray(d["quality"])} for d in dts]
         combine_metadata(*dts2)
+        # the ancillary_variables attribute is actually a list of data arrays
+        dts3 = [{"quality": [d["quality"]]} for d in dts]
+        combine_metadata(*dts3)

From 479eccf6c0a15979c5f20887e2a1a988b5f80ec5 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Mon, 25 May 2020 16:33:44 +0200
Subject: [PATCH 4/8] Cover List[xarray.DataArray] in combine_metadata

In combine_metadata, cover lists of arrays such as occur when trying to
read an FCI composite.
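
An illustrative sketch of the failure mode (simplified; not the actual
FCI metadata):

    import numpy as np
    import xarray as xr

    # ancillary_variables-style metadata: one list of DataArrays per dataset
    a = [xr.DataArray(np.zeros(5))]
    b = [xr.DataArray(np.zeros(5))]
    # comparing the lists compares the DataArrays element-wise and then
    # calls bool() on the resulting boolean array, raising the
    # ValueError reported in #1215
    a == b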
---
 satpy/dataset.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/satpy/dataset.py b/satpy/dataset.py
index 38d972955d..aa87180a71 100644
--- a/satpy/dataset.py
+++ b/satpy/dataset.py
@@ -17,10 +17,10 @@
 # satpy.  If not, see <http://www.gnu.org/licenses/>.
 """Dataset objects."""
 
-import sys
 import logging
 import numbers
 from collections import namedtuple
+from collections.abc import Collection
 from datetime import datetime
 
 import numpy as np
@@ -99,9 +99,19 @@ def combine_metadata(*metadata_objects, **kwargs):
     for k in shared_keys:
         values = [nfo[k] for nfo in info_dicts]
         any_arrays = any([hasattr(val, "__array__") for val in values])
+        # in the real world, the `ancillary_variables` attribute may be
+        # List[xarray.DataArray], which means our values are now
+        # List[List[xarray.DataArray]].
+        any_list_of_arrays = any(
+            [isinstance(val, Collection) and all([hasattr(subval, "__array__")
+                                                  for subval in val])
+             for val in values])
         if any_arrays:
             if all(np.all(val == values[0]) for val in values[1:]):
                 shared_info[k] = values[0]
+        elif any_list_of_arrays:
+            if all(np.array_equal(val, values[0]) for val in values[1:]):
+                shared_info[k] = values[0]
         elif 'time' in k and isinstance(values[0], datetime) and average_times:
             shared_info[k] = average_datetimes(values)
         elif all(val == values[0] for val in values[1:]):

From be63cc8add7fe5734d08b8d22149307026442674 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Mon, 25 May 2020 16:37:32 +0200
Subject: [PATCH 5/8] Don't catch empty collections

The list-of-arrays check was catching too much, such as [()]. Only
catch lists of arrays when they are non-empty.
---
 satpy/dataset.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/satpy/dataset.py b/satpy/dataset.py
index aa87180a71..c63182f2b4 100644
--- a/satpy/dataset.py
+++ b/satpy/dataset.py
@@ -103,7 +103,8 @@ def combine_metadata(*metadata_objects, **kwargs):
         # List[xarray.DataArray], which means our values are now
         # List[List[xarray.DataArray]].
         any_list_of_arrays = any(
-            [isinstance(val, Collection) and all([hasattr(subval, "__array__")
+            [isinstance(val, Collection) and len(val) > 0 and
+             all([hasattr(subval, "__array__")
                                                  for subval in val])
             for val in values])
         if any_arrays:

From 6f58f830a55c176c59f07a44e12bb5b6b27de0c2 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Mon, 25 May 2020 16:39:34 +0200
Subject: [PATCH 6/8] PEP8 fix

PEP 8 fix; I never know where it wants those pesky list comprehensions
indented!
---
 satpy/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/satpy/dataset.py b/satpy/dataset.py
index c63182f2b4..48c8b92485 100644
--- a/satpy/dataset.py
+++ b/satpy/dataset.py
@@ -105,7 +105,7 @@ def combine_metadata(*metadata_objects, **kwargs):
         any_list_of_arrays = any(
             [isinstance(val, Collection) and len(val) > 0 and
              all([hasattr(subval, "__array__")
-                                                 for subval in val])
+                  for subval in val])
             for val in values])
         if any_arrays:
             if all(np.all(val == values[0]) for val in values[1:]):

From bec5d720dd9d76779f42abb1ba8984a3a04351ce Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Wed, 27 May 2020 11:31:03 +0200
Subject: [PATCH 7/8] Compare with object identity in combine_metadata

In combine_metadata, compare arrays by object identity rather than by
value, avoiding expensive computation. Refactor the combine_metadata
function with three small helpers to reduce code complexity. Expand
the unit tests for combine_metadata.
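
A sketch of the new identity semantics (illustrative only; uses the
helper introduced below):

    import numpy as np

    arr = np.arange(5)
    _share_metadata_key_array([arr, arr])         # True: same object
    _share_metadata_key_array([arr, arr.copy()])  # False: equal values,
                                                  # distinct objects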
---
 satpy/dataset.py            | 76 ++++++++++++++++++++++++++++++----------
 satpy/tests/test_dataset.py | 26 +++++++++++--
 2 files changed, 76 insertions(+), 26 deletions(-)

diff --git a/satpy/dataset.py b/satpy/dataset.py
index 48c8b92485..d50679cca0 100644
--- a/satpy/dataset.py
+++ b/satpy/dataset.py
@@ -62,11 +62,13 @@ def average_datetimes(dt_list):
 def combine_metadata(*metadata_objects, **kwargs):
     """Combine the metadata of two or more Datasets.
 
-    If any keys are not equal or do not exist in all provided dictionaries
-    then they are not included in the returned dictionary.
-    By default any keys with the word 'time' in them and consisting
-    of datetime objects will be averaged. This is to handle cases where
-    data were observed at almost the same time but not exactly.
+    If the values corresponding to any keys are not equal or do not
+    exist in all provided dictionaries then they are not included in
+    the returned dictionary. By default any keys with the word 'time'
+    in them and consisting of datetime objects will be averaged. This
+    is to handle cases where data were observed at almost the same time
+    but not exactly. In the interest of performance, arrays are compared
+    by object identity rather than by their contents.
 
     Args:
         *metadata_objects: MetadataObject or dict objects to combine
@@ -98,29 +100,57 @@ def combine_metadata(*metadata_objects, **kwargs):
     shared_info = {}
     for k in shared_keys:
         values = [nfo[k] for nfo in info_dicts]
-        any_arrays = any([hasattr(val, "__array__") for val in values])
-        # in the real world, the `ancillary_variables` attribute may be
-        # List[xarray.DataArray], which means our values are now
-        # List[List[xarray.DataArray]].
-        any_list_of_arrays = any(
-            [isinstance(val, Collection) and len(val) > 0 and
-             all([hasattr(subval, "__array__")
-                  for subval in val])
-             for val in values])
-        if any_arrays:
-            if all(np.all(val == values[0]) for val in values[1:]):
+        if _share_metadata_key(k, values, average_times):
+            if 'time' in k and isinstance(values[0], datetime) and average_times:
+                shared_info[k] = average_datetimes(values)
+            else:
                 shared_info[k] = values[0]
-        elif any_list_of_arrays:
-            if all(np.array_equal(val, values[0]) for val in values[1:]):
-                shared_info[k] = values[0]
-        elif 'time' in k and isinstance(values[0], datetime) and average_times:
-            shared_info[k] = average_datetimes(values)
-        elif all(val == values[0] for val in values[1:]):
-            shared_info[k] = values[0]
 
     return shared_info
 
 
+def _share_metadata_key(k, values, average_times):
+    """Helper for combine_metadata, deciding whether a key is shared."""
+    any_arrays = any([hasattr(val, "__array__") for val in values])
+    # in the real world, the `ancillary_variables` attribute may be
+    # List[xarray.DataArray], which means our values are now
+    # List[List[xarray.DataArray]].
+    # note that this list_of_arrays check is also true for any
+    # higher-dimensional ndarray, but we only use this check after we
+    # have checked any_arrays, so this false positive should have no impact
+    list_of_arrays = any(
+        [isinstance(val, Collection) and len(val) > 0 and
+         all([hasattr(subval, "__array__")
+              for subval in val])
+         for val in values])
+    if any_arrays:
+        return _share_metadata_key_array(values)
+    elif list_of_arrays:
+        return _share_metadata_key_list_arrays(values)
+    elif 'time' in k and isinstance(values[0], datetime) and average_times:
+        return True
+    elif all(val == values[0] for val in values[1:]):
+        return True
+    return False
+
+
+def _share_metadata_key_array(values):
+    """Helper for combine_metadata, checking object identity in a list of arrays."""
+    for val in values[1:]:
+        if val is not values[0]:
+            return False
+    return True
+
+
+def _share_metadata_key_list_arrays(values):
+    """Helper for combine_metadata, checking object identity in a list of lists of arrays."""
+    for val in values[1:]:
+        for arr, ref in zip(val, values[0]):
+            if arr is not ref:
+                return False
+    return True
+
+
 DATASET_KEYS = ("name", "wavelength", "resolution", "polarization",
                 "calibration", "level", "modifiers")
 DatasetID = namedtuple("DatasetID", " ".join(DATASET_KEYS))
diff --git a/satpy/tests/test_dataset.py b/satpy/tests/test_dataset.py
index d224731c6c..cb02b7f8cd 100644
--- a/satpy/tests/test_dataset.py
+++ b/satpy/tests/test_dataset.py
@@ -95,9 +95,29 @@ def test_combine_arrays(self):
             {"quality": (arange(1, 26) % 3).reshape(5, 5).astype("?")},
             {"quality": ones((5, 5,), "?")},
         ]
-        combine_metadata(*dts)
+        assert "quality" not in combine_metadata(*dts)
         dts2 = [{"quality": DataArray(d["quality"])} for d in dts]
-        combine_metadata(*dts2)
+        assert "quality" not in combine_metadata(*dts2)
         # the ancillary_variables attribute is actually a list of data arrays
         dts3 = [{"quality": [d["quality"]]} for d in dts]
-        combine_metadata(*dts3)
+        assert "quality" not in combine_metadata(*dts3)
+        # check cases with repeated arrays
+        dts4 = [
+            {"quality": dts[0]["quality"]},
+            {"quality": dts[0]["quality"]},
+        ]
+        assert "quality" in combine_metadata(*dts4)
+        dts5 = [
+            {"quality": dts3[0]["quality"]},
+            {"quality": dts3[0]["quality"]},
+        ]
+        assert "quality" in combine_metadata(*dts5)
+        # check with other types
+        dts6 = [
+            DataArray(arange(5), attrs=dts[0]),
+            DataArray(arange(5), attrs=dts[0]),
+            DataArray(arange(5), attrs=dts[1]),
+            object()
+        ]
+        #breakpoint()
+        assert "quality" not in combine_metadata(*dts6)

From f307d98903c70551244922a6cdccec0a7411c191 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Wed, 27 May 2020 11:34:06 +0200
Subject: [PATCH 8/8] PEP8 fixes

PEP 8 fixes: remove the now-unused numpy import from satpy/dataset.py,
and remove the leftover #breakpoint() from the test code.
---
 satpy/dataset.py            | 2 --
 satpy/tests/test_dataset.py | 1 -
 2 files changed, 3 deletions(-)

diff --git a/satpy/dataset.py b/satpy/dataset.py
index d50679cca0..14f2042224 100644
--- a/satpy/dataset.py
+++ b/satpy/dataset.py
@@ -23,8 +23,6 @@
 from collections.abc import Collection
 from datetime import datetime
 
-import numpy as np
-
 logger = logging.getLogger(__name__)
 
 
diff --git a/satpy/tests/test_dataset.py b/satpy/tests/test_dataset.py
index cb02b7f8cd..82bd360dc2 100644
--- a/satpy/tests/test_dataset.py
+++ b/satpy/tests/test_dataset.py
@@ -119,5 +119,4 @@ def test_combine_arrays(self):
             DataArray(arange(5), attrs=dts[1]),
             object()
         ]
-        #breakpoint()
         assert "quality" not in combine_metadata(*dts6)
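
With the full series applied, the intended behaviour of combine_metadata
can be sketched as follows (illustrative only; it mirrors the assertions
in test_combine_arrays):

    import numpy as np
    from satpy.dataset import combine_metadata

    arr = np.arange(5)
    # the same array object in all inputs: the key is kept
    assert "q" in combine_metadata({"q": arr}, {"q": arr})
    # equal contents but distinct objects: the key is dropped
    assert "q" not in combine_metadata({"q": arr}, {"q": arr.copy()})
    # lists of arrays are compared element-wise by object identity
    assert "q" in combine_metadata({"q": [arr]}, {"q": [arr]})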