Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Set up interface between Uproot and Awkward so that Awkward can be used to optimize object-reading. #96

Merged
merged 7 commits into from Sep 12, 2020
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -57,3 +57,4 @@ The full list is
* `backports.lzma`: only if reading ROOT files that have been LZMA-compressed (in Python 2).
* `boost-histogram`: only if converting histograms to Boost with `.to_boost()`.
* `hist`: only if converting histograms to hist with `.to_hist()`.

12 changes: 8 additions & 4 deletions uproot4/behaviors/TBranch.py
Expand Up @@ -645,10 +645,15 @@ def real_filter_branch(branch):
names = []
for key in common_keys:
branch = obj[key]

interpretation = branchid_interpretation[branch.cache_key]
form = interpretation.awkward_form(obj.file, index_format="i64")
if isinstance(interpretation, uproot4.interpretation.objects.AsObjects):
form = uproot4._util.awkward_form_of_iter(awkward1, form)
if not uproot4.interpretation.objects.awkward_can_optimize(
interpretation, form
):
form = uproot4._util.awkward_form_of_iter(awkward1, form)

generator = awkward1.layout.ArrayGenerator(
branch.array,
(
Expand Down Expand Up @@ -825,9 +830,7 @@ def show(
if len(interp) > interpretation_width:
interp = interp[: interpretation_width - 3] + "..."

stream.write(
formatter.format(name, typename, interp).rstrip(" ") + "\n"
)
stream.write(formatter.format(name, typename, interp).rstrip(" ") + "\n")

def arrays(
self,
Expand Down Expand Up @@ -3059,6 +3062,7 @@ def basket_to_array(basket):
branch,
branch.context,
basket.member("fKeylen"),
library,
)
if basket.num_entries != len(basket_arrays[basket.basket_num]):
raise ValueError(
Expand Down
6 changes: 5 additions & 1 deletion uproot4/interpretation/__init__.py
Expand Up @@ -78,7 +78,9 @@ def awkward_form(self, file, index_format="i64", header=False, tobject_header=Tr
"""
raise AssertionError

def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offset):
def basket_array(
self, data, byte_offsets, basket, branch, context, cursor_offset, library
):
"""
Args:
data (array of ``numpy.uint8``): Raw but uncompressed data from the
Expand All @@ -94,6 +96,8 @@ def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offse
:doc:`uproot4.source.cursor.Cursor.refs` for objects
deserialized by reference
(:doc:`uproot4.deserialization.read_object_any`).
library (:doc:`uproot4.interpretation.library.Library`): The
requested library for output.

Performs the first step of interpretation, from uncompressed ``TBasket``
data to a temporary array.
Expand Down
4 changes: 3 additions & 1 deletion uproot4/interpretation/grouped.py
Expand Up @@ -95,7 +95,9 @@ def awkward_form(self, file, index_format="i64", header=False, tobject_header=Tr

return awkward1.forms.RecordForm(fields, names)

def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offset):
def basket_array(
self, data, byte_offsets, basket, branch, context, cursor_offset, library
):
raise ValueError(
"""grouping branches like {0} should not be read directly; instead read the subbranches:

Expand Down
10 changes: 7 additions & 3 deletions uproot4/interpretation/jagged.py
Expand Up @@ -134,14 +134,17 @@ def typename(self):
else:
return self._typename

def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offset):
def basket_array(
self, data, byte_offsets, basket, branch, context, cursor_offset, library
):
self.hook_before_basket_array(
data=data,
byte_offsets=byte_offsets,
basket=basket,
branch=branch,
context=context,
cursor_offset=cursor_offset,
library=library,
)

if byte_offsets is None:
Expand All @@ -164,7 +167,7 @@ def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offse
if self._header_bytes == 0:
offsets = fast_divide(byte_offsets, self._content.itemsize)
content = self._content.basket_array(
data, None, basket, branch, context, cursor_offset
data, None, basket, branch, context, cursor_offset, library
)
output = JaggedArray(offsets, content)

Expand All @@ -180,7 +183,7 @@ def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offse
data = data[mask]

content = self._content.basket_array(
data, None, basket, branch, context, cursor_offset
data, None, basket, branch, context, cursor_offset, library
)

byte_counts = byte_stops - byte_starts
Expand All @@ -199,6 +202,7 @@ def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offse
context=context,
output=output,
cursor_offset=cursor_offset,
library=library,
)

return output
Expand Down
5 changes: 4 additions & 1 deletion uproot4/interpretation/library.py
Expand Up @@ -397,7 +397,10 @@ def imported(self):
def finalize(self, array, branch, interpretation, entry_start, entry_stop):
awkward1 = self.imported

if isinstance(array, uproot4.interpretation.objects.StridedObjectArray):
if isinstance(array, awkward1.layout.Content):
return awkward1.Array(array)

elif isinstance(array, uproot4.interpretation.objects.StridedObjectArray):
return awkward1.Array(
_strided_to_awkward(awkward1, "", array.interpretation, array.array)
)
Expand Down
12 changes: 10 additions & 2 deletions uproot4/interpretation/numerical.py
Expand Up @@ -302,14 +302,17 @@ def form(dtype):
+ "}"
)

def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offset):
def basket_array(
self, data, byte_offsets, basket, branch, context, cursor_offset, library
):
self.hook_before_basket_array(
data=data,
byte_offsets=byte_offsets,
basket=basket,
branch=branch,
context=context,
cursor_offset=cursor_offset,
library=library,
)

dtype, shape = _dtype_shape(self._from_dtype)
Expand All @@ -336,6 +339,7 @@ def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offse
context=context,
output=output,
cursor_offset=cursor_offset,
library=library,
)

return output
Expand Down Expand Up @@ -459,14 +463,17 @@ def cache_key(self):
type(self).__name__, self._low, self._high, self._num_bits, self._to_dims
)

def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offset):
def basket_array(
self, data, byte_offsets, basket, branch, context, cursor_offset, library
):
self.hook_before_basket_array(
data=data,
byte_offsets=byte_offsets,
basket=basket,
branch=branch,
context=context,
cursor_offset=cursor_offset,
library=library,
)

try:
Expand Down Expand Up @@ -515,6 +522,7 @@ def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offse
branch=branch,
context=context,
cursor_offset=cursor_offset,
library=library,
raw=raw,
output=output,
)
Expand Down
95 changes: 71 additions & 24 deletions uproot4/interpretation/objects.py
Expand Up @@ -35,6 +35,23 @@
import uproot4._util


def awkward_can_optimize(interpretation, form):
    """
    Args:
        interpretation (:doc:`uproot4.interpretation.Interpretation`): The
            interpretation whose data would be converted.
        form (``ak.forms.Form``): The Awkward form describing that data.

    If True, the Awkward Array library can convert data of a given
    :doc:`uproot4.interpretation.Interpretation` and ``ak.forms.Form`` into
    arrays without resorting to ``ak.from_iter`` (i.e. rapidly).

    If ``awkward1._connect._uproot`` cannot be imported, this function always
    returns False; otherwise, the decision is delegated to
    ``awkward1._connect._uproot.can_optimize``.
    """
    # The import is deliberately local: the Awkward connector is optional,
    # and its absence just means "no fast path" rather than an error.
    try:
        import awkward1._connect._uproot
    except ImportError:
        return False

    return awkward1._connect._uproot.can_optimize(interpretation, form)


class AsObjects(uproot4.interpretation.Interpretation):
"""
Args:
Expand Down Expand Up @@ -107,21 +124,42 @@ def awkward_form(self, file, index_format="i64", header=False, tobject_header=Tr
self._branch.file, index_format, header, tobject_header
)

def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offset):
def basket_array(
self, data, byte_offsets, basket, branch, context, cursor_offset, library
):
self.hook_before_basket_array(
data=data,
byte_offsets=byte_offsets,
basket=basket,
branch=branch,
context=context,
cursor_offset=cursor_offset,
library=library,
)

assert basket.byte_offsets is not None

output = ObjectArray(
self._model, branch, context, byte_offsets, data, cursor_offset
)
output = None
if isinstance(library, uproot4.interpretation.library.Awkward):
form = self.awkward_form(branch.file, index_format="i64")

if awkward_can_optimize(self, form):
import awkward1._connect._uproot

extra = {
"interpretation": self,
"basket": basket,
"branch": branch,
"context": context,
"cursor_offset": cursor_offset,
}
output = awkward1._connect._uproot.basket_array(
form, data, byte_offsets, extra
)

if output is None:
output = ObjectArray(
self._model, branch, context, byte_offsets, data, cursor_offset
).to_numpy()

self.hook_after_basket_array(
data=data,
Expand All @@ -131,6 +169,7 @@ def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offse
context=context,
output=output,
cursor_offset=cursor_offset,
library=library,
)

return output
Expand All @@ -146,37 +185,36 @@ def final_array(
library=library,
branch=branch,
)

output = numpy.empty(entry_stop - entry_start, dtype=numpy.dtype(numpy.object))

trimmed = []
start = entry_offsets[0]
for basket_num, stop in enumerate(entry_offsets[1:]):
if start <= entry_start and entry_stop <= stop:
basket_array = basket_arrays[basket_num]
for global_i in uproot4._util.range(entry_start, entry_stop):
local_i = global_i - start
output[global_i - entry_start] = basket_array[local_i]
local_start = entry_start - start
local_stop = entry_stop - start
trimmed.append(basket_arrays[basket_num][local_start:local_stop])

elif start <= entry_start < stop:
basket_array = basket_arrays[basket_num]
for global_i in uproot4._util.range(entry_start, stop):
local_i = global_i - start
output[global_i - entry_start] = basket_array[local_i]
local_start = entry_start - start
local_stop = stop - start
trimmed.append(basket_arrays[basket_num][local_start:local_stop])

elif start <= entry_stop <= stop:
basket_array = basket_arrays[basket_num]
for global_i in uproot4._util.range(start, entry_stop):
local_i = global_i - start
output[global_i - entry_start] = basket_array[local_i]
local_start = 0
local_stop = entry_stop - start
trimmed.append(basket_arrays[basket_num][local_start:local_stop])

elif entry_start < stop and start <= entry_stop:
basket_array = basket_arrays[basket_num]
for global_i in uproot4._util.range(start, stop):
local_i = global_i - start
output[global_i - entry_start] = basket_array[local_i]
trimmed.append(basket_arrays[basket_num])

start = stop

if all(type(x).__module__.startswith("awkward1") for x in basket_arrays.values()):
assert isinstance(library, uproot4.interpretation.library.Awkward)
awkward1 = library.imported
output = awkward1.concatenate(trimmed, mergebool=False, highlevel=False)
else:
output = numpy.concatenate(trimmed)

self.hook_before_library_finalize(
basket_arrays=basket_arrays,
entry_start=entry_start,
Expand Down Expand Up @@ -514,6 +552,15 @@ def cursor_offset(self):
"""
return self._cursor_offset

def to_numpy(self):
    """
    Convert this ObjectArray into a NumPy ``dtype="O"`` (object) array.

    Returns a new one-dimensional array of length ``len(self)`` whose
    ``i``-th element is ``self[i]``.
    """
    # Use the builtin ``object`` rather than the ``numpy.object`` alias:
    # the alias was deprecated in NumPy 1.20 and removed in 1.24, whereas
    # ``numpy.dtype(object)`` is accepted by every NumPy version.
    output = numpy.empty(len(self), dtype=numpy.dtype(object))

    # Fill by index assignment so that each element is stored as-is, as a
    # single Python object, instead of letting NumPy try to broadcast or
    # unpack sequence-like items.
    for i in range(len(self)):
        output[i] = self[i]

    return output

def __len__(self):
return len(self._byte_offsets) - 1

Expand Down
6 changes: 5 additions & 1 deletion uproot4/interpretation/strings.py
Expand Up @@ -143,14 +143,17 @@ def cache_key(self):
type(self).__name__, self._header_bytes, repr(self._length_bytes)
)

def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offset):
def basket_array(
self, data, byte_offsets, basket, branch, context, cursor_offset, library
):
self.hook_before_basket_array(
data=data,
byte_offsets=byte_offsets,
basket=basket,
branch=branch,
context=context,
cursor_offset=cursor_offset,
library=library,
)

if byte_offsets is None:
Expand Down Expand Up @@ -233,6 +236,7 @@ def basket_array(self, data, byte_offsets, basket, branch, context, cursor_offse
context=context,
output=output,
cursor_offset=cursor_offset,
library=library,
)

return output
Expand Down