Merge pull request #2052 from djhoese/feature-modisl1b-dyn-chunks
Add resolution dependent chunk sizing to 'modis_l1b' reader
mraspaud committed Oct 5, 2023
2 parents 666dcaa + 2036251 commit 8d68425
Showing 12 changed files with 303 additions and 353 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -28,7 +28,7 @@ repos:
           - types-pkg-resources
           - types-PyYAML
           - types-requests
-        args: ["--python-version", "3.8", "--ignore-missing-imports"]
+        args: ["--python-version", "3.9", "--ignore-missing-imports"]
   - repo: https://github.com/pycqa/isort
     rev: 5.12.0
     hooks:
71 changes: 1 addition & 70 deletions satpy/_compat.py
@@ -17,80 +17,11 @@
 # satpy. If not, see <http://www.gnu.org/licenses/>.
 """Backports and compatibility fixes for satpy."""
 
-from threading import RLock
-
-_NOT_FOUND = object()
-
-
-class CachedPropertyBackport:
-    """Backport of cached_property from Python-3.8.
-    Source: https://github.com/python/cpython/blob/v3.8.0/Lib/functools.py#L930
-    """
-
-    def __init__(self, func):  # noqa
-        self.func = func
-        self.attrname = None
-        self.__doc__ = func.__doc__
-        self.lock = RLock()
-
-    def __set_name__(self, owner, name):  # noqa
-        if self.attrname is None:
-            self.attrname = name
-        elif name != self.attrname:
-            raise TypeError(
-                "Cannot assign the same cached_property to two different names "
-                f"({self.attrname!r} and {name!r})."
-            )
-
-    def __get__(self, instance, owner=None):  # noqa
-        if instance is None:
-            return self
-        if self.attrname is None:
-            raise TypeError(
-                "Cannot use cached_property instance without calling __set_name__ on it.")
-        try:
-            cache = instance.__dict__  # noqa
-        except AttributeError:  # not all objects have __dict__ (e.g. class defines slots)
-            msg = (
-                f"No '__dict__' attribute on {type(instance).__name__!r} "
-                f"instance to cache {self.attrname!r} property."
-            )
-            raise TypeError(msg) from None
-        val = cache.get(self.attrname, _NOT_FOUND)
-        if val is _NOT_FOUND:
-            with self.lock:
-                # check if another thread filled cache while we awaited lock
-                val = cache.get(self.attrname, _NOT_FOUND)
-                if val is _NOT_FOUND:
-                    val = self.func(instance)
-                    try:
-                        cache[self.attrname] = val
-                    except TypeError:
-                        msg = (
-                            f"The '__dict__' attribute on {type(instance).__name__!r} instance "
-                            f"does not support item assignment for caching {self.attrname!r} property."
-                        )
-                        raise TypeError(msg) from None
-        return val
-
-
-try:
-    from functools import cached_property  # type: ignore
-except ImportError:
-    # for python < 3.8
-    cached_property = CachedPropertyBackport  # type: ignore
+from functools import cache, cached_property  # noqa
 
 try:
     from numpy.typing import ArrayLike, DTypeLike  # noqa
 except ImportError:
     # numpy <1.20
     from numpy import dtype as DTypeLike  # noqa
     from numpy import ndarray as ArrayLike  # noqa
-
-
-try:
-    from functools import cache  # type: ignore
-except ImportError:
-    from functools import lru_cache as cache  # noqa
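
With Python 3.8 support dropped, functools.cached_property and functools.cache are always available, so the backport above is dead code. A minimal sketch of the behavior the deleted CachedPropertyBackport class replicated (a toy class, not satpy code):

    from functools import cached_property

    class Granule:
        """Toy example, not part of satpy."""

        def __init__(self, counts):
            self.counts = counts

        @cached_property
        def calibrated(self):
            # Computed on first access, then cached in the instance __dict__.
            return [c * 0.01 for c in self.counts]

    g = Granule([100, 200])
    assert g.calibrated is g.calibrated  # second access hits the cache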
13 changes: 1 addition & 12 deletions satpy/_config.py
@@ -26,20 +26,9 @@
 import tempfile
 from collections import OrderedDict
 from importlib.metadata import EntryPoint, entry_points
-from pathlib import Path
+from importlib.resources import files as impr_files
 from typing import Iterable
 
-try:
-    from importlib.resources import files as impr_files  # type: ignore
-except ImportError:
-    # Python 3.8
-    def impr_files(module_name: str) -> Path:
-        """Get path to module as a backport for Python 3.8."""
-        from importlib.resources import path as impr_path
-
-        with impr_path(module_name, "__init__.py") as pkg_init_path:
-            return pkg_init_path.parent
-
 import appdirs
 from donfig import Config
 
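
The same reasoning applies here: importlib.resources.files has been in the standard library since Python 3.9, so the path-based fallback is unnecessary. An illustrative use (it returns a Traversable that supports joining with "/"; the package name is just an example):

    from importlib.resources import files

    etc_dir = files("satpy") / "etc"
    print(etc_dir.is_dir())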
9 changes: 0 additions & 9 deletions satpy/etc/readers/modis_l1b.yaml
@@ -9,15 +9,6 @@ reader:
   sensors: [modis]
   default_datasets: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]
 
-navigations:
-  hdf_eos_geo:
-    description: MODIS navigation
-    file_type: hdf_eos_geo
-    latitude_key: Latitude
-    longitude_key: Longitude
-    nadir_resolution: [1000]
-    rows_per_scan: 10
-
 datasets:
   '1':
     name: '1'
32 changes: 29 additions & 3 deletions satpy/readers/hdfeos_base.py
@@ -32,10 +32,9 @@
 
 from satpy import DataID
 from satpy.readers.file_handlers import BaseFileHandler
-from satpy.utils import get_legacy_chunk_size
+from satpy.utils import normalize_low_res_chunks
 
 logger = logging.getLogger(__name__)
-CHUNK_SIZE = get_legacy_chunk_size()
 
 
 def interpolate(clons, clats, csatz, src_resolution, dst_resolution):
@@ -215,14 +214,41 @@ def load_dataset(self, dataset_name, is_category=False):
         from satpy.readers.hdf4_utils import from_sds
 
         dataset = self._read_dataset_in_file(dataset_name)
-        dask_arr = from_sds(dataset, chunks=CHUNK_SIZE)
+        chunks = self._chunks_for_variable(dataset)
+        dask_arr = from_sds(dataset, chunks=chunks)
         dims = ('y', 'x') if dask_arr.ndim == 2 else None
         data = xr.DataArray(dask_arr, dims=dims,
                             attrs=dataset.attributes())
         data = self._scale_and_mask_data_array(data, is_category=is_category)
 
         return data
 
+    def _chunks_for_variable(self, hdf_dataset):
+        scan_length_250m = 40
+        var_shape = hdf_dataset.info()[2]
+        res_multiplier = self._get_res_multiplier(var_shape)
+        num_nonyx_dims = len(var_shape) - 2
+        return normalize_low_res_chunks(
+            (1,) * num_nonyx_dims + ("auto", -1),
+            var_shape,
+            (1,) * num_nonyx_dims + (scan_length_250m, -1),
+            (1,) * num_nonyx_dims + (res_multiplier, res_multiplier),
+            np.float32,
+        )
+
+    @staticmethod
+    def _get_res_multiplier(var_shape):
+        num_columns_to_multiplier = {
+            271: 20,  # 5km
+            1354: 4,  # 1km
+            2708: 2,  # 500m
+            5416: 1,  # 250m
+        }
+        for max_columns, res_multiplier in num_columns_to_multiplier.items():
+            if var_shape[-1] <= max_columns:
+                return res_multiplier
+        return 1
+
     def _scale_and_mask_data_array(self, data, is_category=False):
         """Unscale byte data and mask invalid/fill values.
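
These two new helpers are the core of the PR: _get_res_multiplier infers the resolution from the variable's column count, and _chunks_for_variable asks normalize_low_res_chunks for row chunks that are whole multiples of a MODIS scan (40 rows at 250 m) spanning full rows. A standalone sketch of that arithmetic, mirroring the mapping above rather than calling satpy (the helper name here is made up):

    SCAN_LENGTH_250M = 40  # rows per MODIS scan at 250 m resolution

    def res_multiplier_for(num_columns: int) -> int:
        # Same thresholds as _get_res_multiplier: 5 km, 1 km, 500 m, 250 m swath widths.
        for max_columns, multiplier in ((271, 20), (1354, 4), (2708, 2), (5416, 1)):
            if num_columns <= max_columns:
                return multiplier
        return 1

    # The minimum row chunk at each resolution works out to one whole scan:
    for cols, rows_per_scan in ((271, 2), (1354, 10), (2708, 20), (5416, 40)):
        assert SCAN_LENGTH_250M // res_multiplier_for(cols) == rows_per_scan

These per-scan row counts are exactly the chunk sizes asserted in the updated tests further down.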
8 changes: 4 additions & 4 deletions satpy/readers/modis_l1b.py
@@ -78,10 +78,8 @@
 
 from satpy.readers.hdf4_utils import from_sds
 from satpy.readers.hdfeos_base import HDFEOSBaseFileReader, HDFEOSGeoReader
-from satpy.utils import get_legacy_chunk_size
 
 logger = logging.getLogger(__name__)
-CHUNK_SIZE = get_legacy_chunk_size()
 
 
 class HDFEOSBandReader(HDFEOSBaseFileReader):
@@ -118,7 +116,8 @@ def get_dataset(self, key, info):
         subdata = self.sd.select(var_name)
         var_attrs = subdata.attributes()
         uncertainty = self.sd.select(var_name + "_Uncert_Indexes")
-        array = xr.DataArray(from_sds(subdata, chunks=CHUNK_SIZE)[band_index, :, :],
+        chunks = self._chunks_for_variable(subdata)
+        array = xr.DataArray(from_sds(subdata, chunks=chunks)[band_index, :, :],
                              dims=['y', 'x']).astype(np.float32)
         valid_range = var_attrs['valid_range']
         valid_min = np.float32(valid_range[0])
@@ -214,7 +213,8 @@ def _mask_invalid(self, array, valid_min, valid_max):
     def _mask_uncertain_pixels(self, array, uncertainty, band_index):
         if not self._mask_saturated:
             return array
-        band_uncertainty = from_sds(uncertainty, chunks=CHUNK_SIZE)[band_index, :, :]
+        uncertainty_chunks = self._chunks_for_variable(uncertainty)
+        band_uncertainty = from_sds(uncertainty, chunks=uncertainty_chunks)[band_index, :, :]
         array = array.where(band_uncertainty < 15)
         return array
 
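
The uncertainty handling reduces to an xarray.DataArray.where call: pixels whose uncertainty index reaches 15 are replaced with NaN. A small self-contained reproduction with toy data (not a real MODIS band):

    import numpy as np
    import xarray as xr

    band = xr.DataArray(
        np.array([[300.0, 301.0], [302.0, 303.0]], dtype=np.float32),
        dims=("y", "x"))
    uncertainty = xr.DataArray(np.array([[0, 3], [15, 7]]), dims=("y", "x"))

    masked = band.where(uncertainty < 15)  # 302.0 becomes NaN
    assert np.isnan(masked.values[1, 0])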
9 changes: 1 addition & 8 deletions satpy/resample.py
@@ -143,6 +143,7 @@
 import os
 import warnings
 from logging import getLogger
+from math import lcm  # type: ignore
 from weakref import WeakValueDictionary
 
 import dask
@@ -157,14 +158,6 @@
 
 from satpy.utils import PerformanceWarning, get_legacy_chunk_size
 
-try:
-    from math import lcm  # type: ignore
-except ImportError:
-    def lcm(a, b):
-        """Get 'Least Common Multiple' with Python 3.8 compatibility."""
-        from math import gcd
-        return abs(a * b) // gcd(a, b)
-
 try:
     from pyresample.resampler import BaseResampler as PRBaseResampler
 except ImportError:
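
math.lcm has been built in since Python 3.9, which is what makes this fallback removable. A quick check that the builtin matches the deleted backport's formula:

    from math import gcd, lcm

    assert lcm(10, 4) == abs(10 * 4) // gcd(10, 4) == 20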
2 changes: 1 addition & 1 deletion satpy/tests/reader_tests/modis_tests/_modis_fixtures.py
@@ -62,7 +62,7 @@ def _shape_for_resolution(resolution: int) -> tuple[int, int]:
     return factor * shape_1km[0], factor * shape_1km[1]
 
 
-def _generate_lonlat_data(resolution: int) -> np.ndarray:
+def _generate_lonlat_data(resolution: int) -> tuple[np.ndarray, np.ndarray]:
     shape = _shape_for_resolution(resolution)
     lat = np.repeat(np.linspace(35., 45., shape[0])[:, None], shape[1], 1)
     lat *= np.linspace(0.9, 1.1, shape[1])
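
The corrected annotation reflects that the helper returns a pair of arrays (longitude and latitude), not a single array. A hypothetical self-contained illustration of such a pair-returning helper (the name and the longitude values are invented; only the latitude construction mirrors the fixture):

    import numpy as np

    def make_lonlat(shape: tuple[int, int]) -> tuple[np.ndarray, np.ndarray]:
        """Hypothetical helper returning (lon, lat) arrays of the given shape."""
        lat = np.repeat(np.linspace(35., 45., shape[0])[:, None], shape[1], 1)
        lon = np.repeat(np.linspace(-45., -35., shape[1])[None, :], shape[0], 0)
        return lon, lat

    lon, lat = make_lonlat((20, 27))
    assert lon.shape == lat.shape == (20, 27)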
24 changes: 20 additions & 4 deletions satpy/tests/reader_tests/modis_tests/test_modis_l1b.py
@@ -51,6 +51,18 @@ def _check_shared_metadata(data_arr):
     assert "rows_per_scan" in data_arr.attrs
     assert isinstance(data_arr.attrs["rows_per_scan"], int)
     assert data_arr.attrs['reader'] == 'modis_l1b'
+    assert "resolution" in data_arr.attrs
+    res = data_arr.attrs["resolution"]
+    if res == 5000:
+        assert data_arr.chunks == ((2, 2, 2), (data_arr.shape[1],))
+    elif res == 1000:
+        assert data_arr.chunks == ((10, 10, 10), (data_arr.shape[1],))
+    elif res == 500:
+        assert data_arr.chunks == ((20, 20, 20), (data_arr.shape[1],))
+    elif res == 250:
+        assert data_arr.chunks == ((40, 40, 40), (data_arr.shape[1],))
+    else:
+        raise ValueError(f"Unexpected resolution: {res}")
 
 
 def _load_and_check_geolocation(scene, resolution, exp_res, exp_shape, has_res,
@@ -147,7 +159,8 @@ def test_load_longitude_latitude(self, input_files, has_5km, has_500, has_250, default_res):
         shape_500m = _shape_for_resolution(500)
         shape_250m = _shape_for_resolution(250)
         default_shape = _shape_for_resolution(default_res)
-        with dask.config.set(scheduler=CustomScheduler(max_computes=1 + has_5km + has_500 + has_250)):
+        scheduler = CustomScheduler(max_computes=1 + has_5km + has_500 + has_250)
+        with dask.config.set({'scheduler': scheduler, 'array.chunk-size': '1 MiB'}):
             _load_and_check_geolocation(scene, "*", default_res, default_shape, True)
             _load_and_check_geolocation(scene, 5000, 5000, shape_5km, has_5km)
             _load_and_check_geolocation(scene, 500, 500, shape_500m, has_500)
@@ -157,7 +170,8 @@ def test_load_sat_zenith_angle(self, modis_l1b_nasa_mod021km_file):
         """Test loading satellite zenith angle band."""
         scene = Scene(reader='modis_l1b', filenames=modis_l1b_nasa_mod021km_file)
         dataset_name = 'satellite_zenith_angle'
-        scene.load([dataset_name])
+        with dask.config.set({'array.chunk-size': '1 MiB'}):
+            scene.load([dataset_name])
         dataset = scene[dataset_name]
         assert dataset.shape == _shape_for_resolution(1000)
         assert dataset.attrs['resolution'] == 1000
@@ -167,7 +181,8 @@ def test_load_vis(self, modis_l1b_nasa_mod021km_file):
         """Test loading visible band."""
         scene = Scene(reader='modis_l1b', filenames=modis_l1b_nasa_mod021km_file)
         dataset_name = '1'
-        scene.load([dataset_name])
+        with dask.config.set({'array.chunk-size': '1 MiB'}):
+            scene.load([dataset_name])
         dataset = scene[dataset_name]
         assert dataset[0, 0] == 300.0
         assert dataset.shape == _shape_for_resolution(1000)
@@ -180,7 +195,8 @@ def test_load_vis_saturation(self, mask_saturated, modis_l1b_nasa_mod021km_file):
         scene = Scene(reader='modis_l1b', filenames=modis_l1b_nasa_mod021km_file,
                       reader_kwargs={"mask_saturated": mask_saturated})
         dataset_name = '2'
-        scene.load([dataset_name])
+        with dask.config.set({'array.chunk-size': '1 MiB'}):
+            scene.load([dataset_name])
         dataset = scene[dataset_name]
         assert dataset.shape == _shape_for_resolution(1000)
         assert dataset.attrs['resolution'] == 1000
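
The tests now pin dask's array.chunk-size so the reader's "auto" row chunking resolves the same way on every machine. An illustrative standalone demonstration of that configuration knob (the shape and budget here are arbitrary):

    import dask
    import dask.array as da

    with dask.config.set({"array.chunk-size": "1 MiB"}):
        # "auto" rows, full-width columns, as in the reader's chunk request
        arr = da.zeros((4060, 1354), dtype="float32", chunks=("auto", -1))
    print(arr.chunks)  # row chunks capped so each chunk holds roughly 1 MiB of float32 data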
48 changes: 0 additions & 48 deletions satpy/tests/test_compat.py

This file was deleted.
