From 09402890bcc54f0465073299176f82e272d4bb2c Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Mon, 25 May 2020 16:12:45 +0200
Subject: [PATCH 1/8] Regression test for combine_arrays failure

Added a regression test triggering the ValueError raised when trying to
read an FCI composite (see #1215).
---
 satpy/tests/test_dataset.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/satpy/tests/test_dataset.py b/satpy/tests/test_dataset.py
index f72bb147e5..7de12f0303 100644
--- a/satpy/tests/test_dataset.py
+++ b/satpy/tests/test_dataset.py
@@ -84,3 +84,17 @@ def test_combine_times(self):
         ret = combine_metadata(*dts, average_times=False)
         # times are not equal so don't include it in the final result
         self.assertNotIn('start_time', ret)
+
+    def test_combine_arrays(self):
+        """Test combine_metadata with arrays."""
+        from satpy.dataset import combine_metadata
+        from numpy import arange, ones
+        from xarray import DataArray
+        dts = [
+            {"quality": (arange(25) % 2).reshape(5, 5).astype("?")},
+            {"quality": (arange(1, 26) % 3).reshape(5, 5).astype("?")},
+            {"quality": ones((5, 5,), "?")},
+        ]
+        combine_metadata(*dts)
+        dts2 = [{"quality": DataArray(d["quality"])} for d in dts]
+        combine_metadata(*dts2)

From b3973721b4bae2f70b2d4a8879dc35dd1376cf71 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Mon, 25 May 2020 16:17:39 +0200
Subject: [PATCH 2/8] Replace isinstance check with array interface check

In combine_metadata, replace an ndarray instance check with a check for
the array interface (the __array__ attribute).
---
 satpy/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/satpy/dataset.py b/satpy/dataset.py
index db1055d385..38d972955d 100644
--- a/satpy/dataset.py
+++ b/satpy/dataset.py
@@ -98,7 +98,7 @@ def combine_metadata(*metadata_objects, **kwargs):
     shared_info = {}
     for k in shared_keys:
         values = [nfo[k] for nfo in info_dicts]
-        any_arrays = any([isinstance(val, np.ndarray) for val in values])
+        any_arrays = any([hasattr(val, "__array__") for val in values])
         if any_arrays:
             if all(np.all(val == values[0]) for val in values[1:]):
                 shared_info[k] = values[0]

From bb87b3ceb1dabe08715c7e60af1eeb6fd1885068 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Mon, 25 May 2020 16:19:27 +0200
Subject: [PATCH 3/8] Added regression test better simulating FCI

Added another regression test for combine_metadata that better
simulates the situation with the FCI reader. The ancillary_variables
attribute is actually List[xarray.DataArray], so this needs to be
handled as well.
---
 satpy/tests/test_dataset.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/satpy/tests/test_dataset.py b/satpy/tests/test_dataset.py
index 7de12f0303..d224731c6c 100644
--- a/satpy/tests/test_dataset.py
+++ b/satpy/tests/test_dataset.py
@@ -98,3 +98,6 @@ def test_combine_arrays(self):
         combine_metadata(*dts)
         dts2 = [{"quality": DataArray(d["quality"])} for d in dts]
         combine_metadata(*dts2)
+        # the ancillary_variables attribute is actually a list of data arrays
+        dts3 = [{"quality": [d["quality"]]} for d in dts]
+        combine_metadata(*dts3)

From 479eccf6c0a15979c5f20887e2a1a988b5f80ec5 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Mon, 25 May 2020 16:33:44 +0200
Subject: [PATCH 4/8] Cover List[xarray.DataArray] in combine_metadata

In combine_metadata, cover lists of arrays such as occur when trying to
read an FCI composite.
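
An illustrative sketch of the failure mode (simplified; not the actual
FCI metadata):

    import numpy as np
    import xarray as xr

    # ancillary_variables-style metadata: one list of DataArrays per dataset
    a = [xr.DataArray(np.zeros(5))]
    b = [xr.DataArray(np.zeros(5))]
    # comparing the lists compares the DataArrays element-wise and then
    # calls bool() on the resulting boolean array, raising the
    # ValueError reported in #1215
    a == b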
---
 satpy/dataset.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/satpy/dataset.py b/satpy/dataset.py
index 38d972955d..aa87180a71 100644
--- a/satpy/dataset.py
+++ b/satpy/dataset.py
@@ -17,10 +17,10 @@
 # satpy.  If not, see <http://www.gnu.org/licenses/>.
 """Dataset objects."""
 
-import sys
 import logging
 import numbers
 from collections import namedtuple
+from collections.abc import Collection
 from datetime import datetime
 
 import numpy as np
@@ -99,9 +99,19 @@ def combine_metadata(*metadata_objects, **kwargs):
     for k in shared_keys:
         values = [nfo[k] for nfo in info_dicts]
         any_arrays = any([hasattr(val, "__array__") for val in values])
+        # in the real world, the `ancillary_variables` attribute may be
+        # List[xarray.DataArray], which means our values are now
+        # List[List[xarray.DataArray]].
+        any_list_of_arrays = any(
+            [isinstance(val, Collection) and all([hasattr(subval, "__array__")
+                                                  for subval in val])
+             for val in values])
         if any_arrays:
             if all(np.all(val == values[0]) for val in values[1:]):
                 shared_info[k] = values[0]
+        elif any_list_of_arrays:
+            if all(np.array_equal(val, values[0]) for val in values[1:]):
+                shared_info[k] = values[0]
         elif 'time' in k and isinstance(values[0], datetime) and average_times:
             shared_info[k] = average_datetimes(values)
         elif all(val == values[0] for val in values[1:]):

From be63cc8add7fe5734d08b8d22149307026442674 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Mon, 25 May 2020 16:37:32 +0200
Subject: [PATCH 5/8] Don't catch empty collections

The list-of-arrays check was catching too much, such as [()]. Only
catch lists of arrays when they are non-empty.
---
 satpy/dataset.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/satpy/dataset.py b/satpy/dataset.py
index aa87180a71..c63182f2b4 100644
--- a/satpy/dataset.py
+++ b/satpy/dataset.py
@@ -103,7 +103,8 @@ def combine_metadata(*metadata_objects, **kwargs):
         # List[xarray.DataArray], which means our values are now
         # List[List[xarray.DataArray]].
         any_list_of_arrays = any(
-            [isinstance(val, Collection) and all([hasattr(subval, "__array__")
+            [isinstance(val, Collection) and len(val) > 0 and
+             all([hasattr(subval, "__array__")
                                                  for subval in val])
             for val in values])
         if any_arrays:

From 6f58f830a55c176c59f07a44e12bb5b6b27de0c2 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Mon, 25 May 2020 16:39:34 +0200
Subject: [PATCH 6/8] PEP8 fix

PEP 8 fix; I never know where it wants those pesky list comprehensions
indented!
---
 satpy/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/satpy/dataset.py b/satpy/dataset.py
index c63182f2b4..48c8b92485 100644
--- a/satpy/dataset.py
+++ b/satpy/dataset.py
@@ -105,7 +105,7 @@ def combine_metadata(*metadata_objects, **kwargs):
         any_list_of_arrays = any(
             [isinstance(val, Collection) and len(val) > 0 and
              all([hasattr(subval, "__array__")
-                                                 for subval in val])
+                  for subval in val])
             for val in values])
         if any_arrays:
             if all(np.all(val == values[0]) for val in values[1:]):

From bec5d720dd9d76779f42abb1ba8984a3a04351ce Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Wed, 27 May 2020 11:31:03 +0200
Subject: [PATCH 7/8] Compare with object identity in combine_metadata

In combine_metadata, compare arrays by object identity rather than by
value, avoiding expensive computation. Refactor the combine_metadata
function with three small helpers to reduce code complexity. Expand
the unit tests for combine_metadata.
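
A sketch of the new identity semantics (illustrative only; uses the
helper introduced below):

    import numpy as np

    arr = np.arange(5)
    _share_metadata_key_array([arr, arr])         # True: same object
    _share_metadata_key_array([arr, arr.copy()])  # False: equal values,
                                                  # distinct objects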
---
 satpy/dataset.py            | 76 ++++++++++++++++++++++++++++++----------
 satpy/tests/test_dataset.py | 26 +++++++++++--
 2 files changed, 76 insertions(+), 26 deletions(-)

diff --git a/satpy/dataset.py b/satpy/dataset.py
index 48c8b92485..d50679cca0 100644
--- a/satpy/dataset.py
+++ b/satpy/dataset.py
@@ -62,11 +62,13 @@ def average_datetimes(dt_list):
 def combine_metadata(*metadata_objects, **kwargs):
     """Combine the metadata of two or more Datasets.
 
-    If any keys are not equal or do not exist in all provided dictionaries
-    then they are not included in the returned dictionary.
-    By default any keys with the word 'time' in them and consisting
-    of datetime objects will be averaged. This is to handle cases where
-    data were observed at almost the same time but not exactly.
+    If the values corresponding to any keys are not equal or do not
+    exist in all provided dictionaries then they are not included in
+    the returned dictionary. By default any keys with the word 'time'
+    in them and consisting of datetime objects will be averaged. This
+    is to handle cases where data were observed at almost the same time
+    but not exactly. In the interest of performance, arrays are compared
+    by object identity rather than by their contents.
 
     Args:
         *metadata_objects: MetadataObject or dict objects to combine
@@ -98,29 +100,57 @@ def combine_metadata(*metadata_objects, **kwargs):
     shared_info = {}
     for k in shared_keys:
         values = [nfo[k] for nfo in info_dicts]
-        any_arrays = any([hasattr(val, "__array__") for val in values])
-        # in the real world, the `ancillary_variables` attribute may be
-        # List[xarray.DataArray], which means our values are now
-        # List[List[xarray.DataArray]].
-        any_list_of_arrays = any(
-            [isinstance(val, Collection) and len(val) > 0 and
-             all([hasattr(subval, "__array__")
-                  for subval in val])
-             for val in values])
-        if any_arrays:
-            if all(np.all(val == values[0]) for val in values[1:]):
+        if _share_metadata_key(k, values, average_times):
+            if 'time' in k and isinstance(values[0], datetime) and average_times:
+                shared_info[k] = average_datetimes(values)
+            else:
                 shared_info[k] = values[0]
-        elif any_list_of_arrays:
-            if all(np.array_equal(val, values[0]) for val in values[1:]):
-                shared_info[k] = values[0]
-        elif 'time' in k and isinstance(values[0], datetime) and average_times:
-            shared_info[k] = average_datetimes(values)
-        elif all(val == values[0] for val in values[1:]):
-            shared_info[k] = values[0]
 
     return shared_info
 
 
+def _share_metadata_key(k, values, average_times):
+    """Helper for combine_metadata, deciding whether a key is shared."""
+    any_arrays = any([hasattr(val, "__array__") for val in values])
+    # in the real world, the `ancillary_variables` attribute may be
+    # List[xarray.DataArray], which means our values are now
+    # List[List[xarray.DataArray]].
+    # note that this list_of_arrays check is also true for any
+    # higher-dimensional ndarray, but we only use this check after we
+    # have checked any_arrays, so this false positive should have no impact
+    list_of_arrays = any(
+        [isinstance(val, Collection) and len(val) > 0 and
+         all([hasattr(subval, "__array__")
+              for subval in val])
+         for val in values])
+    if any_arrays:
+        return _share_metadata_key_array(values)
+    elif list_of_arrays:
+        return _share_metadata_key_list_arrays(values)
+    elif 'time' in k and isinstance(values[0], datetime) and average_times:
+        return True
+    elif all(val == values[0] for val in values[1:]):
+        return True
+    return False
+
+
+def _share_metadata_key_array(values):
+    """Helper for combine_metadata, checking object identity in a list of arrays."""
+    for val in values[1:]:
+        if val is not values[0]:
+            return False
+    return True
+
+
+def _share_metadata_key_list_arrays(values):
+    """Helper for combine_metadata, checking object identity in a list of lists of arrays."""
+    for val in values[1:]:
+        for arr, ref in zip(val, values[0]):
+            if arr is not ref:
+                return False
+    return True
+
+
 DATASET_KEYS = ("name", "wavelength", "resolution", "polarization",
                 "calibration", "level", "modifiers")
 DatasetID = namedtuple("DatasetID", " ".join(DATASET_KEYS))
diff --git a/satpy/tests/test_dataset.py b/satpy/tests/test_dataset.py
index d224731c6c..cb02b7f8cd 100644
--- a/satpy/tests/test_dataset.py
+++ b/satpy/tests/test_dataset.py
@@ -95,9 +95,29 @@ def test_combine_arrays(self):
             {"quality": (arange(1, 26) % 3).reshape(5, 5).astype("?")},
             {"quality": ones((5, 5,), "?")},
         ]
-        combine_metadata(*dts)
+        assert "quality" not in combine_metadata(*dts)
         dts2 = [{"quality": DataArray(d["quality"])} for d in dts]
-        combine_metadata(*dts2)
+        assert "quality" not in combine_metadata(*dts2)
         # the ancillary_variables attribute is actually a list of data arrays
         dts3 = [{"quality": [d["quality"]]} for d in dts]
-        combine_metadata(*dts3)
+        assert "quality" not in combine_metadata(*dts3)
+        # check cases with repeated arrays
+        dts4 = [
+            {"quality": dts[0]["quality"]},
+            {"quality": dts[0]["quality"]},
+        ]
+        assert "quality" in combine_metadata(*dts4)
+        dts5 = [
+            {"quality": dts3[0]["quality"]},
+            {"quality": dts3[0]["quality"]},
+        ]
+        assert "quality" in combine_metadata(*dts5)
+        # check with other types
+        dts6 = [
+            DataArray(arange(5), attrs=dts[0]),
+            DataArray(arange(5), attrs=dts[0]),
+            DataArray(arange(5), attrs=dts[1]),
+            object()
+        ]
+        #breakpoint()
+        assert "quality" not in combine_metadata(*dts6)

From f307d98903c70551244922a6cdccec0a7411c191 Mon Sep 17 00:00:00 2001
From: Gerrit Holl
Date: Wed, 27 May 2020 11:34:06 +0200
Subject: [PATCH 8/8] PEP8 fixes

PEP 8 fixes: remove the now-unused numpy import from satpy/dataset.py,
and remove the leftover #breakpoint() from the test code.
---
 satpy/dataset.py            | 2 --
 satpy/tests/test_dataset.py | 1 -
 2 files changed, 3 deletions(-)

diff --git a/satpy/dataset.py b/satpy/dataset.py
index d50679cca0..14f2042224 100644
--- a/satpy/dataset.py
+++ b/satpy/dataset.py
@@ -23,8 +23,6 @@
 from collections.abc import Collection
 from datetime import datetime
 
-import numpy as np
-
 logger = logging.getLogger(__name__)
 
 
diff --git a/satpy/tests/test_dataset.py b/satpy/tests/test_dataset.py
index cb02b7f8cd..82bd360dc2 100644
--- a/satpy/tests/test_dataset.py
+++ b/satpy/tests/test_dataset.py
@@ -119,5 +119,4 @@ def test_combine_arrays(self):
             DataArray(arange(5), attrs=dts[1]),
             object()
         ]
-        #breakpoint()
         assert "quality" not in combine_metadata(*dts6)
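
With the full series applied, the intended behaviour of combine_metadata
can be sketched as follows (illustrative only; it mirrors the assertions
in test_combine_arrays):

    import numpy as np
    from satpy.dataset import combine_metadata

    arr = np.arange(5)
    # the same array object in all inputs: the key is kept
    assert "q" in combine_metadata({"q": arr}, {"q": arr})
    # equal contents but distinct objects: the key is dropped
    assert "q" not in combine_metadata({"q": arr}, {"q": arr.copy()})
    # lists of arrays are compared element-wise by object identity
    assert "q" in combine_metadata({"q": [arr]}, {"q": [arr]})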