Skip to content

Commit

Permalink
Support reading/writing compressed files (#44)
Browse files Browse the repository at this point in the history
pyiostream is added, which allows C++ code to interact with an iostream
that reads from and writes to a Python file object. The primary use-case is to
have transparent compression based on the gzip, bzip2, lzma modules in
Python's standard library.
  • Loading branch information
HDembinski committed Sep 11, 2022
1 parent 7560ffc commit dcf0031
Show file tree
Hide file tree
Showing 9 changed files with 415 additions and 53 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/test.yml
Expand Up @@ -37,4 +37,6 @@ jobs:
python-version: ${{ matrix.python-version }}
- run: python -m pip install --upgrade pip wheel
- run: python -m pip install --prefer-binary -v -e .[test]
- run: python -m pytest
env:
DEBUG: 1
- run: python -m pytest -vv
38 changes: 38 additions & 0 deletions bench/test_io.py
@@ -0,0 +1,38 @@
import pyhepmc
from pyhepmc.io import ReaderAscii
from pyhepmc._core import pyiostream
from pathlib import Path


# Input used as the template: tests/pythia6.dat, located relative to this file.
fn = str(Path(__file__).parent.parent.joinpath("tests", "pythia6.dat"))

# Read a single event from the template file.
with pyhepmc.open(fn) as f:
    evt = f.read()

# Create the benchmark input "bench.dat" by writing that same event 4000 times.
with pyhepmc.open("bench.dat", "w") as f:
    for _ in range(4000):
        f.write(evt)


def test_ReaderAscii(benchmark):
    """Benchmark reading "bench.dat" with a ReaderAscii opened by filename."""

    def run():
        with ReaderAscii("bench.dat") as reader:
            # Keep reading until read() returns None; the events are discarded.
            while reader.read() is not None:
                pass

    benchmark(run)


def test_ReaderAscii_pyiostream(benchmark):
    """Benchmark reading "bench.dat" through a pyiostream over a binary file."""

    def run():
        with open("bench.dat", "rb") as raw:
            # Wrap the Python file object; the second argument is the buffer size.
            stream = pyiostream(raw, 4096)
            with ReaderAscii(stream) as reader:
                # Keep reading until read() returns None; the events are discarded.
                while reader.read() is not None:
                    pass

    benchmark(run)
39 changes: 25 additions & 14 deletions src/io.cpp
@@ -1,5 +1,6 @@
#include "UnparsedAttribute.hpp"
#include "pybind.hpp"
#include "pyiostream.hpp"
#include "repr.hpp"
#include <HepMC3/GenRunInfo.h>
#include <HepMC3/Reader.h>
Expand All @@ -10,9 +11,10 @@
#include <HepMC3/WriterAscii.h>
#include <HepMC3/WriterAsciiHepMC2.h>
#include <HepMC3/WriterHEPEVT.h>
#include <iostream>
#include <map>
#include <memory>
#include <sstream>
#include <pybind11/attr.h>
#include <string>

using namespace HepMC3;
Expand All @@ -33,13 +35,21 @@ void register_io(py::module& m) {
py::module_ m_doc = py::module_::import("pyhepmc._doc");
auto doc = py::cast<std::map<std::string, std::string>>(m_doc.attr("doc"));

py::class_<std::iostream>(m, "iostream")
// clang-format off
METH(flush, pyiostream)
// clang-format on
;

py::class_<pyiostream, std::iostream>(m, "pyiostream")
.def(py::init<py::object, int>(), "file_object"_a, "buffer_size"_a = 4096);

// this class is here to simplify unit testing of Readers and Writers
py::class_<std::stringstream>(m, "stringstream")
py::class_<std::stringstream, std::iostream>(m, "stringstream")
.def(py::init<>())
.def(py::init<std::string>())
.def("__str__", (std::string(std::stringstream::*)() const) &
std::stringstream::str) METH(flush, std::stringstream)
METH(write, std::stringstream) METH(read, std::stringstream);
.def("__str__",
(std::string(std::stringstream::*)() const) & std::stringstream::str);

py::class_<Reader>(m, "Reader")
// clang-format off
Expand All @@ -52,19 +62,19 @@ void register_io(py::module& m) {

py::class_<ReaderAscii, Reader>(m, "ReaderAscii")
.def(py::init<const std::string>(), "filename"_a)
.def(py::init<std::stringstream&>());
.def(py::init<std::iostream&>(), "istream"_a, py::keep_alive<1, 2>());

py::class_<ReaderAsciiHepMC2, Reader>(m, "ReaderAsciiHepMC2")
.def(py::init<const std::string>(), "filename"_a)
.def(py::init<std::stringstream&>());
.def(py::init<std::iostream&>(), "istream"_a, py::keep_alive<1, 2>());

py::class_<ReaderLHEF, Reader>(m, "ReaderLHEF")
.def(py::init<const std::string>(), "filename"_a)
.def(py::init<std::stringstream&>());
.def(py::init<std::iostream&>(), "istream"_a, py::keep_alive<1, 2>());

py::class_<ReaderHEPEVT, Reader>(m, "ReaderHEPEVT")
.def(py::init<const std::string>(), "filename"_a)
.def(py::init<std::stringstream&>());
.def(py::init<std::iostream&>(), "istream"_a, py::keep_alive<1, 2>());

py::class_<Writer>(m, "Writer")
// clang-format off
Expand All @@ -78,8 +88,8 @@ void register_io(py::module& m) {
py::class_<WriterAscii, Writer>(m, "WriterAscii")
.def(py::init<const std::string&, GenRunInfoPtr>(), "filename"_a,
"run"_a = nullptr)
.def(py::init<std::stringstream&, GenRunInfoPtr>(), "ostringstream"_a,
"run"_a = nullptr, py::keep_alive<1, 2>())
.def(py::init<std::iostream&, GenRunInfoPtr>(), "ostream"_a, "run"_a = nullptr,
py::keep_alive<1, 2>())
// clang-format off
// not needed: METH(write_run_info, WriterAscii)
PROP(precision, WriterAscii)
Expand All @@ -89,16 +99,17 @@ void register_io(py::module& m) {
py::class_<WriterAsciiHepMC2, Writer>(m, "WriterAsciiHepMC2")
.def(py::init<const std::string&, GenRunInfoPtr>(), "filename"_a,
"run"_a = nullptr)
.def(py::init<std::stringstream&, GenRunInfoPtr>(), "ostringstream"_a,
"run"_a = nullptr)
.def(py::init<std::iostream&, GenRunInfoPtr>(), "ostream"_a, "run"_a = nullptr,
py::keep_alive<1, 2>())
// clang-format off
// not needed: METH(write_run_info, WriterAscii)
PROP(precision, WriterAsciiHepMC2)
// clang-format on
;

py::class_<WriterHEPEVT, Writer>(m, "WriterHEPEVT")
.def(py::init<const std::string&>(), "filename"_a);
.def(py::init<const std::string&>(), "filename"_a)
.def(py::init<std::iostream&>(), "ostream"_a, py::keep_alive<1, 2>());

py::class_<UnparsedAttribute>(m, "UnparsedAttribute", DOC(UnparsedAttribute))
.def("__str__", [](UnparsedAttribute& a) { return a.parent_->unparsed_string(); })
Expand Down
10 changes: 7 additions & 3 deletions src/pyhepmc/__init__.py
@@ -1,5 +1,5 @@
"""
pyhepmc: a pythonic and Jupyter-friendly Python API for HepMC3
pyhepmc is a pythonic and Jupyter-friendly Python API for HepMC3.
Differences between HepMC3 C++ and pyhepmc
------------------------------------------
Expand All @@ -9,7 +9,8 @@
- Sequences with matching types and lengths are implicitly
convertible to :class:`FourVector` and :class:`ToolInfo`.
- In addition to the C++ Reader/Writer classes, we offer an easy to use
:func:`open`. It can read any supported format and writes in HepMC3 format.
:func:`open`. It can read and write any supported HepMC3 format,
including compressed files (gzip, bzip2, lzma are supported).
- Attributes for :class:`GenRunInfo`, :class:`GenEvent`, :class:`GenParticle`,
:class:`GenVertex` can be accessed via a dict-like view returned by the
``attributes`` property. The view automatically converts between native C++
Expand All @@ -20,6 +21,8 @@
with two arguments.
- ``HEPEVT_Wrapper`` and friends are missing, use :meth:`GenEvent.from_hepevt`
instead.
- ``ReaderGZ`` and ``WriterGZ`` are missing, since :func:`open` offers this
functionality.
- API marked as deprecated in HepMC3 is not available in Python.
- pyhepmc offers event visualization and renders in Jupyter notebooks if all
required extra packages are installed, see :func:`pyhepmc.view.to_dot`.
Expand All @@ -28,7 +31,8 @@
---------------------
- Not yet implemented: ``GenParticleData``, ``GenVertexData``, ``ReaderMT``,
``ReaderGZ``, ``Setup``, ``WriterGZ``. These will be added in the future.
``Setup``. These will be added in the future.
"""
from sys import version_info
from ._core import ( # noqa: F401
Expand Down
96 changes: 70 additions & 26 deletions src/pyhepmc/io.py
Expand Up @@ -19,7 +19,9 @@
WriterAsciiHepMC2,
WriterHEPEVT,
UnparsedAttribute,
pyiostream,
)
import contextlib
from pathlib import PurePath
import typing as _tp

Expand Down Expand Up @@ -57,15 +59,20 @@ def __iter__(self) -> "_Iter":
next = __next__


def _enter(self: _Iter) -> _Iter:
def _enter(self: _tp.Any) -> _tp.Any:
return self


def _exit(self: _tp.Any, type: Exception, value: str, tb: _tp.Any) -> bool:
def _exit_close(self: _tp.Any, type: Exception, value: str, tb: _tp.Any) -> bool:
self.close()
return False


def _exit_flush(self: _tp.Any, type: Exception, value: str, tb: _tp.Any) -> bool:
self.flush()
return False


def _iter(self: _tp.Any) -> _Iter:
return _Iter(self)

Expand All @@ -87,52 +94,59 @@ def _read_event_lhef_patch(self: _tp.Any, evt: GenEvent) -> bool:
return not failed


# add pythonic interface to IO classes
# add contextmanager interface to IO classes
ReaderAscii.__enter__ = _enter
ReaderAscii.__exit__ = _exit
ReaderAscii.__exit__ = _exit_close
ReaderAscii.__iter__ = _iter
ReaderAscii.read = _read

ReaderAsciiHepMC2.__enter__ = _enter
ReaderAsciiHepMC2.__exit__ = _exit
ReaderAsciiHepMC2.__exit__ = _exit_close
ReaderAsciiHepMC2.__iter__ = _iter
ReaderAsciiHepMC2.read = _read

ReaderLHEF.__enter__ = _enter
ReaderLHEF.__exit__ = _exit
ReaderLHEF.__exit__ = _exit_close
ReaderLHEF.__iter__ = _iter
ReaderLHEF_read_event = ReaderLHEF.read_event
ReaderLHEF.read_event = _read_event_lhef_patch
ReaderLHEF.read = _read

ReaderHEPEVT.__enter__ = _enter
ReaderHEPEVT.__exit__ = _exit
ReaderHEPEVT.__exit__ = _exit_close
ReaderHEPEVT.__iter__ = _iter
ReaderHEPEVT.read = _read

WriterAscii.__enter__ = _enter
WriterAscii.__exit__ = _exit
WriterAscii.__exit__ = _exit_close
WriterAscii.write = WriterAscii.write_event

WriterAsciiHepMC2.__enter__ = _enter
WriterAsciiHepMC2.__exit__ = _exit
WriterAsciiHepMC2.__exit__ = _exit_close
WriterAsciiHepMC2.write = WriterAsciiHepMC2.write_event

WriterHEPEVT.__enter__ = _enter
WriterHEPEVT.__exit__ = _exit
WriterHEPEVT.__exit__ = _exit_close
WriterHEPEVT.write = WriterHEPEVT.write_event

pyiostream.__enter__ = _enter
pyiostream.__exit__ = _exit_flush


_Filename = _tp.Union[str, PurePath]


class _WrappedWriter:
# Wrapper for Writer, to be used by `open`

def __init__(
self, filename: _Filename, precision: _tp.Optional[int], Writer: _tp.Any
self,
iostream: _tp.Any,
precision: _tp.Optional[int],
Writer: _tp.Any,
):
self._writer: _tp.Any = None
self._init = (filename, precision, Writer)
self._init = (iostream, precision, Writer)

@staticmethod
def _maybe_convert(event: _tp.Any) -> GenEvent:
Expand All @@ -150,11 +164,11 @@ def write(self, event: _tp.Any) -> None:

if self._writer is None:
# first call
filename, precision, Writer = self._init
iostream, precision, Writer = self._init
if Writer is WriterHEPEVT:
self._writer = Writer(filename)
self._writer = Writer(iostream)
else:
self._writer = Writer(filename, evt.run_info)
self._writer = Writer(iostream, evt.run_info)
if precision is not None and hasattr(self._writer, "precision"):
self._writer.precision = precision

Expand All @@ -167,9 +181,10 @@ def close(self) -> None:
self._writer.close()

__enter__ = _enter
__exit__ = _exit
__exit__ = _exit_close


@contextlib.contextmanager
def open(
filename: _Filename,
mode: str = "r",
Expand All @@ -183,7 +198,8 @@ def open(
----------
filename : str or Path
Filename to open for reading or writing. When writing to existing files,
the contents are replaced.
the contents are replaced. When the filename ends with the suffixes ".gz",
".bz2", or ".xz", the contents are transparently compressed/decompressed.
mode : str, optional
Must be either "r" (default) or "w", to indicate whether to open for reading
or writing.
Expand All @@ -193,18 +209,39 @@ def open(
format : str or None, optional
Which format to use for reading or writing. If None (default), autodetect
format when reading (this is fast and thus safe to use), and use the latest
HepMC3 format when writing. Allowed values: "HepMC3", "HepMC2", "LHEF",
"HEPEVT". "LHEF" is not supported for writing.
HepMC3 format when writing. Allowed values (case-insensitive): "HepMC3",
"HepMC2", "LHEF", "HEPEVT". "LHEF" is not supported for writing.
Raises
------
IOError if reading or writing fails.
"""
if mode == "r":
fn = str(filename)

if fn.endswith(".gz"):
import gzip

op = gzip.open
elif fn.endswith(".bz2"):
import bz2

op = bz2.open # type:ignore

elif fn.endswith(".xz"):
import lzma

op = lzma.open # type:ignore
else:
op = _open # type:ignore
mode += "b"

if mode.startswith("r"):
if format is None:
# auto-detect
with _open(filename, "r") as f:
header = f.read(256)
with op(fn, mode) as f:
chunk = f.read(256)
assert isinstance(chunk, bytes)
header = chunk.decode()
if "HepMC::Asciiv3" in header:
Reader = ReaderAscii
elif "HepMC::IO_GenEvent" in header:
Expand All @@ -223,8 +260,11 @@ def open(
}.get(format.lower(), None)
if Reader is None:
raise ValueError(f"format {format} not recognized for reading")
return Reader(str(filename))
elif mode == "w":
with op(fn, mode) as f:
with pyiostream(f) as io:
with Reader(io) as r:
yield r
elif mode.startswith("w"):
if format is None:
Writer = WriterAscii
else:
Expand All @@ -235,5 +275,9 @@ def open(
}.get(format.lower(), None)
if Writer is None:
raise ValueError(f"format {format} not recognized for writing")
return _WrappedWriter(str(filename), precision, Writer)
raise ValueError("mode must be r or w")
with op(fn, mode) as f:
with pyiostream(f) as io:
with _WrappedWriter(io, precision, Writer) as w:
yield w
else:
raise ValueError(f"mode must be r or w, got {mode}")

0 comments on commit dcf0031

Please sign in to comment.