Merge pull request #2052 from djhoese/feature-modisl1b-dyn-chunks
Add resolution dependent chunk sizing to 'modis_l1b' reader
mraspaud committed Oct 5, 2023
2 parents 666dcaa + 2036251 commit 8d68425
Showing 12 changed files with 303 additions and 353 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -28,7 +28,7 @@ repos:
           - types-pkg-resources
           - types-PyYAML
           - types-requests
-        args: ["--python-version", "3.8", "--ignore-missing-imports"]
+        args: ["--python-version", "3.9", "--ignore-missing-imports"]
   - repo: https://github.com/pycqa/isort
     rev: 5.12.0
     hooks:
71 changes: 1 addition & 70 deletions satpy/_compat.py
@@ -17,80 +17,11 @@
 # satpy. If not, see <http://www.gnu.org/licenses/>.
 """Backports and compatibility fixes for satpy."""
 
-from threading import RLock
-
-_NOT_FOUND = object()
-
-
-class CachedPropertyBackport:
-    """Backport of cached_property from Python-3.8.
-    Source: https://github.com/python/cpython/blob/v3.8.0/Lib/functools.py#L930
-    """
-
-    def __init__(self, func):  # noqa
-        self.func = func
-        self.attrname = None
-        self.__doc__ = func.__doc__
-        self.lock = RLock()
-
-    def __set_name__(self, owner, name):  # noqa
-        if self.attrname is None:
-            self.attrname = name
-        elif name != self.attrname:
-            raise TypeError(
-                "Cannot assign the same cached_property to two different names "
-                f"({self.attrname!r} and {name!r})."
-            )
-
-    def __get__(self, instance, owner=None):  # noqa
-        if instance is None:
-            return self
-        if self.attrname is None:
-            raise TypeError(
-                "Cannot use cached_property instance without calling __set_name__ on it.")
-        try:
-            cache = instance.__dict__  # noqa
-        except AttributeError:  # not all objects have __dict__ (e.g. class defines slots)
-            msg = (
-                f"No '__dict__' attribute on {type(instance).__name__!r} "
-                f"instance to cache {self.attrname!r} property."
-            )
-            raise TypeError(msg) from None
-        val = cache.get(self.attrname, _NOT_FOUND)
-        if val is _NOT_FOUND:
-            with self.lock:
-                # check if another thread filled cache while we awaited lock
-                val = cache.get(self.attrname, _NOT_FOUND)
-                if val is _NOT_FOUND:
-                    val = self.func(instance)
-                    try:
-                        cache[self.attrname] = val
-                    except TypeError:
-                        msg = (
-                            f"The '__dict__' attribute on {type(instance).__name__!r} instance "
-                            f"does not support item assignment for caching {self.attrname!r} property."
-                        )
-                        raise TypeError(msg) from None
-        return val
-
-
-try:
-    from functools import cached_property  # type: ignore
-except ImportError:
-    # for python < 3.8
-    cached_property = CachedPropertyBackport  # type: ignore
+from functools import cache, cached_property  # noqa
 
 try:
     from numpy.typing import ArrayLike, DTypeLike  # noqa
 except ImportError:
     # numpy <1.20
     from numpy import dtype as DTypeLike  # noqa
     from numpy import ndarray as ArrayLike  # noqa
-
-
-try:
-    from functools import cache  # type: ignore
-except ImportError:
-    from functools import lru_cache as cache  # noqa
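
With Python 3.8 support dropped, functools.cached_property and functools.cache are always available, so the backport above is dead code. A minimal sketch of the behavior the deleted CachedPropertyBackport class replicated (a toy class, not satpy code):

    from functools import cached_property

    class Granule:
        """Toy example, not part of satpy."""

        def __init__(self, counts):
            self.counts = counts

        @cached_property
        def calibrated(self):
            # Computed on first access, then cached in the instance __dict__.
            return [c * 0.01 for c in self.counts]

    g = Granule([100, 200])
    assert g.calibrated is g.calibrated  # second access hits the cache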
13 changes: 1 addition & 12 deletions satpy/_config.py
@@ -26,20 +26,9 @@
 import tempfile
 from collections import OrderedDict
 from importlib.metadata import EntryPoint, entry_points
-from pathlib import Path
+from importlib.resources import files as impr_files
 from typing import Iterable
 
-try:
-    from importlib.resources import files as impr_files  # type: ignore
-except ImportError:
-    # Python 3.8
-    def impr_files(module_name: str) -> Path:
-        """Get path to module as a backport for Python 3.8."""
-        from importlib.resources import path as impr_path
-
-        with impr_path(module_name, "__init__.py") as pkg_init_path:
-            return pkg_init_path.parent
-
 import appdirs
 from donfig import Config
 
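
The same reasoning applies here: importlib.resources.files has been in the standard library since Python 3.9, so the path-based fallback is unnecessary. An illustrative use (it returns a Traversable that supports joining with "/"; the package name is just an example):

    from importlib.resources import files

    etc_dir = files("satpy") / "etc"
    print(etc_dir.is_dir())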
9 changes: 0 additions & 9 deletions satpy/etc/readers/modis_l1b.yaml
@@ -9,15 +9,6 @@ reader:
   sensors: [modis]
   default_datasets: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]
 
-navigations:
-  hdf_eos_geo:
-    description: MODIS navigation
-    file_type: hdf_eos_geo
-    latitude_key: Latitude
-    longitude_key: Longitude
-    nadir_resolution: [1000]
-    rows_per_scan: 10
-
 datasets:
   '1':
     name: '1'
32 changes: 29 additions & 3 deletions satpy/readers/hdfeos_base.py
@@ -32,10 +32,9 @@
 
 from satpy import DataID
 from satpy.readers.file_handlers import BaseFileHandler
-from satpy.utils import get_legacy_chunk_size
+from satpy.utils import normalize_low_res_chunks
 
 logger = logging.getLogger(__name__)
-CHUNK_SIZE = get_legacy_chunk_size()
 
 
 def interpolate(clons, clats, csatz, src_resolution, dst_resolution):
@@ -215,14 +214,41 @@ def load_dataset(self, dataset_name, is_category=False):
         from satpy.readers.hdf4_utils import from_sds
 
         dataset = self._read_dataset_in_file(dataset_name)
-        dask_arr = from_sds(dataset, chunks=CHUNK_SIZE)
+        chunks = self._chunks_for_variable(dataset)
+        dask_arr = from_sds(dataset, chunks=chunks)
         dims = ('y', 'x') if dask_arr.ndim == 2 else None
         data = xr.DataArray(dask_arr, dims=dims,
                             attrs=dataset.attributes())
         data = self._scale_and_mask_data_array(data, is_category=is_category)
 
         return data
 
+    def _chunks_for_variable(self, hdf_dataset):
+        scan_length_250m = 40
+        var_shape = hdf_dataset.info()[2]
+        res_multiplier = self._get_res_multiplier(var_shape)
+        num_nonyx_dims = len(var_shape) - 2
+        return normalize_low_res_chunks(
+            (1,) * num_nonyx_dims + ("auto", -1),
+            var_shape,
+            (1,) * num_nonyx_dims + (scan_length_250m, -1),
+            (1,) * num_nonyx_dims + (res_multiplier, res_multiplier),
+            np.float32,
+        )
+
+    @staticmethod
+    def _get_res_multiplier(var_shape):
+        num_columns_to_multiplier = {
+            271: 20,  # 5km
+            1354: 4,  # 1km
+            2708: 2,  # 500m
+            5416: 1,  # 250m
+        }
+        for max_columns, res_multiplier in num_columns_to_multiplier.items():
+            if var_shape[-1] <= max_columns:
+                return res_multiplier
+        return 1
+
     def _scale_and_mask_data_array(self, data, is_category=False):
         """Unscale byte data and mask invalid/fill values.
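
These two new helpers are the core of the PR: _get_res_multiplier infers the resolution from the variable's column count, and _chunks_for_variable asks normalize_low_res_chunks for row chunks that are whole multiples of a MODIS scan (40 rows at 250 m) spanning full rows. A standalone sketch of that arithmetic, mirroring the mapping above rather than calling satpy (the helper name here is made up):

    SCAN_LENGTH_250M = 40  # rows per MODIS scan at 250 m resolution

    def res_multiplier_for(num_columns: int) -> int:
        # Same thresholds as _get_res_multiplier: 5 km, 1 km, 500 m, 250 m swath widths.
        for max_columns, multiplier in ((271, 20), (1354, 4), (2708, 2), (5416, 1)):
            if num_columns <= max_columns:
                return multiplier
        return 1

    # The minimum row chunk at each resolution works out to one whole scan:
    for cols, rows_per_scan in ((271, 2), (1354, 10), (2708, 20), (5416, 40)):
        assert SCAN_LENGTH_250M // res_multiplier_for(cols) == rows_per_scan

These per-scan row counts are exactly the chunk sizes asserted in the updated tests further down.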
8 changes: 4 additions & 4 deletions satpy/readers/modis_l1b.py
@@ -78,10 +78,8 @@
 
 from satpy.readers.hdf4_utils import from_sds
 from satpy.readers.hdfeos_base import HDFEOSBaseFileReader, HDFEOSGeoReader
-from satpy.utils import get_legacy_chunk_size
 
 logger = logging.getLogger(__name__)
-CHUNK_SIZE = get_legacy_chunk_size()
 
 
 class HDFEOSBandReader(HDFEOSBaseFileReader):
@@ -118,7 +116,8 @@ def get_dataset(self, key, info):
         subdata = self.sd.select(var_name)
         var_attrs = subdata.attributes()
         uncertainty = self.sd.select(var_name + "_Uncert_Indexes")
-        array = xr.DataArray(from_sds(subdata, chunks=CHUNK_SIZE)[band_index, :, :],
+        chunks = self._chunks_for_variable(subdata)
+        array = xr.DataArray(from_sds(subdata, chunks=chunks)[band_index, :, :],
                              dims=['y', 'x']).astype(np.float32)
         valid_range = var_attrs['valid_range']
         valid_min = np.float32(valid_range[0])
@@ -214,7 +213,8 @@ def _mask_invalid(self, array, valid_min, valid_max):
     def _mask_uncertain_pixels(self, array, uncertainty, band_index):
         if not self._mask_saturated:
             return array
-        band_uncertainty = from_sds(uncertainty, chunks=CHUNK_SIZE)[band_index, :, :]
+        uncertainty_chunks = self._chunks_for_variable(uncertainty)
+        band_uncertainty = from_sds(uncertainty, chunks=uncertainty_chunks)[band_index, :, :]
         array = array.where(band_uncertainty < 15)
         return array
 
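
The uncertainty handling reduces to an xarray.DataArray.where call: pixels whose uncertainty index reaches 15 are replaced with NaN. A small self-contained reproduction with toy data (not a real MODIS band):

    import numpy as np
    import xarray as xr

    band = xr.DataArray(
        np.array([[300.0, 301.0], [302.0, 303.0]], dtype=np.float32),
        dims=("y", "x"))
    uncertainty = xr.DataArray(np.array([[0, 3], [15, 7]]), dims=("y", "x"))

    masked = band.where(uncertainty < 15)  # 302.0 becomes NaN
    assert np.isnan(masked.values[1, 0])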
9 changes: 1 addition & 8 deletions satpy/resample.py
@@ -143,6 +143,7 @@
 import os
 import warnings
 from logging import getLogger
+from math import lcm  # type: ignore
 from weakref import WeakValueDictionary
 
 import dask
@@ -157,14 +158,6 @@
 
 from satpy.utils import PerformanceWarning, get_legacy_chunk_size
 
-try:
-    from math import lcm  # type: ignore
-except ImportError:
-    def lcm(a, b):
-        """Get 'Least Common Multiple' with Python 3.8 compatibility."""
-        from math import gcd
-        return abs(a * b) // gcd(a, b)
-
 try:
     from pyresample.resampler import BaseResampler as PRBaseResampler
 except ImportError:
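
math.lcm has been built in since Python 3.9, which is what makes this fallback removable. A quick check that the builtin matches the deleted backport's formula:

    from math import gcd, lcm

    assert lcm(10, 4) == abs(10 * 4) // gcd(10, 4) == 20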
2 changes: 1 addition & 1 deletion satpy/tests/reader_tests/modis_tests/_modis_fixtures.py
@@ -62,7 +62,7 @@ def _shape_for_resolution(resolution: int) -> tuple[int, int]:
     return factor * shape_1km[0], factor * shape_1km[1]
 
 
-def _generate_lonlat_data(resolution: int) -> np.ndarray:
+def _generate_lonlat_data(resolution: int) -> tuple[np.ndarray, np.ndarray]:
     shape = _shape_for_resolution(resolution)
     lat = np.repeat(np.linspace(35., 45., shape[0])[:, None], shape[1], 1)
     lat *= np.linspace(0.9, 1.1, shape[1])
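
The corrected annotation reflects that the helper returns a pair of arrays (longitude and latitude), not a single array. A hypothetical self-contained illustration of such a pair-returning helper (the name and the longitude values are invented; only the latitude construction mirrors the fixture):

    import numpy as np

    def make_lonlat(shape: tuple[int, int]) -> tuple[np.ndarray, np.ndarray]:
        """Hypothetical helper returning (lon, lat) arrays of the given shape."""
        lat = np.repeat(np.linspace(35., 45., shape[0])[:, None], shape[1], 1)
        lon = np.repeat(np.linspace(-45., -35., shape[1])[None, :], shape[0], 0)
        return lon, lat

    lon, lat = make_lonlat((20, 27))
    assert lon.shape == lat.shape == (20, 27)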
24 changes: 20 additions & 4 deletions satpy/tests/reader_tests/modis_tests/test_modis_l1b.py
@@ -51,6 +51,18 @@ def _check_shared_metadata(data_arr):
     assert "rows_per_scan" in data_arr.attrs
     assert isinstance(data_arr.attrs["rows_per_scan"], int)
     assert data_arr.attrs['reader'] == 'modis_l1b'
+    assert "resolution" in data_arr.attrs
+    res = data_arr.attrs["resolution"]
+    if res == 5000:
+        assert data_arr.chunks == ((2, 2, 2), (data_arr.shape[1],))
+    elif res == 1000:
+        assert data_arr.chunks == ((10, 10, 10), (data_arr.shape[1],))
+    elif res == 500:
+        assert data_arr.chunks == ((20, 20, 20), (data_arr.shape[1],))
+    elif res == 250:
+        assert data_arr.chunks == ((40, 40, 40), (data_arr.shape[1],))
+    else:
+        raise ValueError(f"Unexpected resolution: {res}")
 
 
 def _load_and_check_geolocation(scene, resolution, exp_res, exp_shape, has_res,
@@ -147,7 +159,8 @@ def test_load_longitude_latitude(self, input_files, has_5km, has_500, has_250, default_res):
         shape_500m = _shape_for_resolution(500)
         shape_250m = _shape_for_resolution(250)
         default_shape = _shape_for_resolution(default_res)
-        with dask.config.set(scheduler=CustomScheduler(max_computes=1 + has_5km + has_500 + has_250)):
+        scheduler = CustomScheduler(max_computes=1 + has_5km + has_500 + has_250)
+        with dask.config.set({'scheduler': scheduler, 'array.chunk-size': '1 MiB'}):
             _load_and_check_geolocation(scene, "*", default_res, default_shape, True)
             _load_and_check_geolocation(scene, 5000, 5000, shape_5km, has_5km)
             _load_and_check_geolocation(scene, 500, 500, shape_500m, has_500)
@@ -157,7 +170,8 @@ def test_load_sat_zenith_angle(self, modis_l1b_nasa_mod021km_file):
         """Test loading satellite zenith angle band."""
         scene = Scene(reader='modis_l1b', filenames=modis_l1b_nasa_mod021km_file)
         dataset_name = 'satellite_zenith_angle'
-        scene.load([dataset_name])
+        with dask.config.set({'array.chunk-size': '1 MiB'}):
+            scene.load([dataset_name])
         dataset = scene[dataset_name]
         assert dataset.shape == _shape_for_resolution(1000)
         assert dataset.attrs['resolution'] == 1000
@@ -167,7 +181,8 @@ def test_load_vis(self, modis_l1b_nasa_mod021km_file):
         """Test loading visible band."""
         scene = Scene(reader='modis_l1b', filenames=modis_l1b_nasa_mod021km_file)
         dataset_name = '1'
-        scene.load([dataset_name])
+        with dask.config.set({'array.chunk-size': '1 MiB'}):
+            scene.load([dataset_name])
         dataset = scene[dataset_name]
         assert dataset[0, 0] == 300.0
         assert dataset.shape == _shape_for_resolution(1000)
@@ -180,7 +195,8 @@ def test_load_vis_saturation(self, mask_saturated, modis_l1b_nasa_mod021km_file):
         scene = Scene(reader='modis_l1b', filenames=modis_l1b_nasa_mod021km_file,
                       reader_kwargs={"mask_saturated": mask_saturated})
         dataset_name = '2'
-        scene.load([dataset_name])
+        with dask.config.set({'array.chunk-size': '1 MiB'}):
+            scene.load([dataset_name])
         dataset = scene[dataset_name]
         assert dataset.shape == _shape_for_resolution(1000)
         assert dataset.attrs['resolution'] == 1000
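
The tests now pin dask's array.chunk-size so the reader's "auto" row chunking resolves the same way on every machine. An illustrative standalone demonstration of that configuration knob (the shape and budget here are arbitrary):

    import dask
    import dask.array as da

    with dask.config.set({"array.chunk-size": "1 MiB"}):
        # "auto" rows, full-width columns, as in the reader's chunk request
        arr = da.zeros((4060, 1354), dtype="float32", chunks=("auto", -1))
    print(arr.chunks)  # row chunks capped so each chunk holds roughly 1 MiB of float32 data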
48 changes: 0 additions & 48 deletions satpy/tests/test_compat.py

This file was deleted.
