From 0faba22661555516180963225c392770b60faafb Mon Sep 17 00:00:00 2001 From: Aaron Finke Date: Sun, 16 Mar 2025 09:24:00 +0100 Subject: [PATCH 1/3] add bitshuffle support, compress binned datasets using bitshuffle/LZ4 --- pyproject.toml | 1 + src/ess/nmx/nexus.py | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fd9c800..b1fda2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dependencies = [ "pandas", "gemmi", "defusedxml", + "bitshuffle", ] dynamic = ["version"] diff --git a/src/ess/nmx/nexus.py b/src/ess/nmx/nexus.py index b241893..ef2144f 100644 --- a/src/ess/nmx/nexus.py +++ b/src/ess/nmx/nexus.py @@ -7,6 +7,7 @@ from functools import partial from typing import Any, TypeVar +import bitshuffle.h5 import h5py import numpy as np import sciline as sl @@ -34,6 +35,7 @@ def _create_dataset_from_var( long_name: str | None = None, compression: str | None = None, compression_opts: int | None = None, + chunks: tuple[int] | int | bool | None = None, dtype: Any = None, ) -> h5py.Dataset: compression_options = {} @@ -45,6 +47,7 @@ def _create_dataset_from_var( dataset = root_entry.create_dataset( name, data=var.values if dtype is None else var.values.astype(dtype, copy=False), + chunks = chunks, **compression_options, ) if var.unit is not None: @@ -56,8 +59,8 @@ def _create_dataset_from_var( _create_compressed_dataset = partial( _create_dataset_from_var, - compression="gzip", - compression_opts=4, + compression=bitshuffle.h5.H5FILTER, + compression_opts=(0, bitshuffle.h5.H5_COMPRESS_LZ4), ) @@ -424,10 +427,11 @@ def _export_reduced_data_as_nxlauetof( # The actual application definition defines it as integer, # but we keep the original data type for now num_x, num_y = dg["detector_shape"].value # Probably better way to do this - data_dset = _create_dataset_from_var( + data_dset = _create_compressed_dataset( name="data", root_entry=nx_detector, var=sc.fold(dg['counts'].data, dim='id', 
sizes={'x': num_x, 'y': num_y}), + chunks=(num_x,num_y,1), dtype=np.uint, ) data_dset.attrs["signal"] = 1 From fa376672bfa6a1fdb1826c995aeb07e3ffaf906b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci-lite[bot]" <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com> Date: Sun, 16 Mar 2025 08:25:26 +0000 Subject: [PATCH 2/3] Apply automatic formatting --- src/ess/nmx/nexus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ess/nmx/nexus.py b/src/ess/nmx/nexus.py index ef2144f..58a3e9e 100644 --- a/src/ess/nmx/nexus.py +++ b/src/ess/nmx/nexus.py @@ -47,7 +47,7 @@ def _create_dataset_from_var( dataset = root_entry.create_dataset( name, data=var.values if dtype is None else var.values.astype(dtype, copy=False), - chunks = chunks, + chunks=chunks, **compression_options, ) if var.unit is not None: @@ -431,7 +431,7 @@ def _export_reduced_data_as_nxlauetof( name="data", root_entry=nx_detector, var=sc.fold(dg['counts'].data, dim='id', sizes={'x': num_x, 'y': num_y}), - chunks=(num_x,num_y,1), + chunks=(num_x, num_y, 1), dtype=np.uint, ) data_dset.attrs["signal"] = 1 From eca7f9b6a24848e4bb2d24ce9c80c3b8efae5c35 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Sun, 16 Mar 2025 18:37:22 +0100 Subject: [PATCH 3/3] Add docstring and option. --- src/ess/nmx/nexus.py | 50 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/src/ess/nmx/nexus.py b/src/ess/nmx/nexus.py index 58a3e9e..0e65eeb 100644 --- a/src/ess/nmx/nexus.py +++ b/src/ess/nmx/nexus.py @@ -35,7 +35,7 @@ def _create_dataset_from_var( long_name: str | None = None, compression: str | None = None, compression_opts: int | None = None, - chunks: tuple[int] | int | bool | None = None, + chunks: tuple[int, ...] 
| int | bool | None = None, dtype: Any = None, ) -> h5py.Dataset: compression_options = {} @@ -62,6 +62,17 @@ def _create_dataset_from_var( compression=bitshuffle.h5.H5FILTER, compression_opts=(0, bitshuffle.h5.H5_COMPRESS_LZ4), ) +"""Create dataset with compression options. + +[``Bitshuffle/LZ4``](https://github.com/kiyo-masui/bitshuffle) is used for convenience. +Since ``Dectris`` uses it for their Nexus file compression, it is compatible with DIALS. +``Bitshuffle/LZ4`` tends to give results similar to +GZIP and other compression algorithms, but with better performance. +A naive implementation of bitshuffle/LZ4 compression, +shown in [issue #124](https://github.com/scipp/essnmx/issues/124), +led to an 80% reduction in file size (365 MB vs 1.8 GB). + +""" def _create_root_data_entry(file_obj: h5py.File) -> h5py.Group: @@ -396,7 +407,9 @@ def _export_detector_metadata_as_nxlauetof( def _export_reduced_data_as_nxlauetof( dg: NMXReducedDataGroup, output_file: str | pathlib.Path | io.BytesIO, + *, append_mode: bool = True, + compress_counts: bool = True, ) -> None: """Export the reduced data to a NeXus file with the LAUE_TOF application definition. @@ -416,6 +429,9 @@ def _export_reduced_data_as_nxlauetof( If ``False``, the file is opened in None-append mode. > None-append mode is not supported for now. > Only append mode is supported for now. + compress_counts: + If ``True``, the detector counts are compressed using bitshuffle. + Only the counts are compressed because only they are expected to be large. 
""" if not append_mode: @@ -427,13 +443,25 @@ def _export_reduced_data_as_nxlauetof( # The actual application definition defines it as integer, # but we keep the original data type for now num_x, num_y = dg["detector_shape"].value # Probably better way to do this - data_dset = _create_compressed_dataset( - name="data", - root_entry=nx_detector, - var=sc.fold(dg['counts'].data, dim='id', sizes={'x': num_x, 'y': num_y}), - chunks=(num_x, num_y, 1), - dtype=np.uint, - ) + if compress_counts: + data_dset = _create_compressed_dataset( + name="data", + root_entry=nx_detector, + var=sc.fold( + dg['counts'].data, dim='id', sizes={'x': num_x, 'y': num_y} + ), + chunks=(num_x, num_y, 1), + dtype=np.uint, + ) + else: + data_dset = _create_dataset_from_var( + name="data", + root_entry=nx_detector, + var=sc.fold( + dg['counts'].data, dim='id', sizes={'x': num_x, 'y': num_y} + ), + dtype=np.uint, + ) data_dset.attrs["signal"] = 1 _create_dataset_from_var( name='time_of_flight', @@ -466,12 +494,14 @@ def __init__( chunk_generator: Callable[[FilePath, DetectorName], Generator[T, None, None]], chunk_insert_key: type[T], extra_meta: dict[str, sc.Variable] | None = None, + compress_counts: bool = True, overwrite: bool = False, ) -> None: from ess.reduce.streaming import EternalAccumulator, StreamProcessor from .types import FilePath, NMXReducedCounts + self.compress_counts = compress_counts self._chunk_generator = chunk_generator self._chunk_insert_key = chunk_insert_key self._workflow = workflow @@ -520,6 +550,8 @@ def add_panel(self, *, detector_id: DetectorIndex | DetectorName) -> None: results = processor.add_chunk({self._chunk_insert_key: da}) _export_reduced_data_as_nxlauetof( - results[NMXReducedDataGroup], self._output_filename + results[NMXReducedDataGroup], + self._output_filename, + compress_counts=self.compress_counts, ) return results[NMXReducedDataGroup]