Skip to content

Commit

Permalink
refactor: from_rdataframe to use LayoutBuilder (#1620)
Browse files Browse the repository at this point in the history
* Extend LayoutBuilder API to support user-allocated char buffers
* Replace static_cast with reinterpret_cast in GrowableBuffer and LayoutBuilders
* Clean up some warnings
* Replace from_rdataframe data accumulation in std::vectors with LayoutBuilders
* Add tests
  • Loading branch information
ianna committed Aug 24, 2022
1 parent 2272fbe commit 9e8c841
Show file tree
Hide file tree
Showing 7 changed files with 570 additions and 212 deletions.
95 changes: 54 additions & 41 deletions src/awkward/_v2/_connect/rdataframe/from_rdataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@

done = compiler(
"""
#include "rdataframe_jagged_builders.h"
#include "rdataframe/jagged_builders.h"
"""
)
assert done is True
Expand Down Expand Up @@ -66,8 +66,12 @@ def _wrap_as_record_array(array):
if form_str.startswith("{"):
form = ak._v2.forms.from_json(form_str)
list_depth = form.purelist_depth
if list_depth > 3:
raise ak._v2._util.error(NotImplementedError)
if list_depth > 4:
raise ak._v2._util.error(
NotImplementedError(
"Retrieving arbitrary depth nested containers is not implemented yet."
)
)

def supported(form):
if form.purelist_depth == 1:
Expand All @@ -94,61 +98,70 @@ def form_dtype(form):
else:
return form_dtype(form.content)

dtype = form_dtype(form)
buffers = {}
depths = 0
offsets_length = 0
def empty_buffers(cpp_buffers_self, names_nbytes):
    """Allocate one output buffer per named entry and register it with C++.

    Args:
        cpp_buffers_self: object exposing ``append(name, pointer)``; every
            allocated buffer is registered with it under its name.
        names_nbytes: iterable of pair-like items whose ``first`` is the
            buffer name and ``second`` is the requested size (presumably a
            byte count, going by the name -- confirm against the C++
            ``names_nbytes`` implementation).

    Returns:
        dict mapping each buffer name to its newly allocated array.
    """
    buffers = {}
    for item in names_nbytes:
        # Request single-byte elements so that ``item.second`` bytes are
        # allocated. Without an explicit dtype, NumPy's ``empty`` defaults to
        # float64 and reserves eight bytes per requested element.
        # NOTE(review): assumes ``ak.nplike.numpy.empty`` forwards ``dtype``
        # to ``numpy.empty`` -- confirm.
        buffer = ak.nplike.numpy.empty(item.second, dtype="uint8")
        buffers[item.first] = buffer
        # Only a raw pointer is handed over; the dict entry holds the
        # reference to the array itself.
        cpp_buffers_self.append(
            item.first,
            buffer.ctypes.data_as(ctypes.POINTER(ctypes.c_ubyte)),
        )
    return buffers

data_type = cpp_type_of[form_dtype(form).name]

# pull in the CppBuffers (after which we can import from it)
CppBuffers = cppyy.gbl.awkward.CppBuffers[column_type, cpp_type_of[dtype.name]]
CppBuffers = cppyy.gbl.awkward.CppBuffers[column_type]
cpp_buffers_self = CppBuffers(result_ptrs)

if isinstance(form, ak._v2.forms.NumpyForm):
distance = CppBuffers.result_distance(cpp_buffers_self)
data = ak.nplike.numpy.empty(distance, dtype)
CppBuffers.fill_data_array(
cpp_buffers_self, data.ctypes.data_as(ctypes.c_void_p)
)
layout = ak._v2.contents.numpyarray.NumpyArray(
data,
parameters=form.parameters,
)

return _wrap_as_record_array(layout)
NumpyBuilder = cppyy.gbl.awkward.LayoutBuilder.Numpy[data_type]
builder = NumpyBuilder()
builder_type = type(builder).__cpp_name__

cpp_buffers_self.fill_from[builder_type](builder)

elif isinstance(form, ak._v2.forms.ListOffsetForm) and isinstance(
form.content, ak._v2.forms.NumpyForm
):
# list_depth == 2 or 1 if its the list of strings
# copy data from RDF and make nested offsets
depths, offsets_length = CppBuffers.offsets_and_flatten_2(cpp_buffers_self)
# NOTE: list_depth == 2, or 1 if it's a list of strings
ListOffsetBuilder = cppyy.gbl.awkward.LayoutBuilder.ListOffset[
"int64_t",
f"awkward::LayoutBuilder::Numpy<{data_type}",
]
builder = ListOffsetBuilder()
builder_type = type(builder).__cpp_name__

cpp_buffers_self.fill_offsets_and_flatten_2[builder_type](builder)

elif list_depth == 3:
depths, offsets_length = CppBuffers.offsets_and_flatten_3(cpp_buffers_self)
ListOffsetBuilder = cppyy.gbl.awkward.LayoutBuilder.ListOffset[
"int64_t",
f"awkward::LayoutBuilder::ListOffset<int64_t, awkward::LayoutBuilder::Numpy<{data_type}>",
]
builder = ListOffsetBuilder()
builder_type = type(builder).__cpp_name__

cpp_buffers_self.fill_offsets_and_flatten_3[builder_type](builder)

else:
depths, offsets_length = CppBuffers.offsets_and_flatten_4(cpp_buffers_self)

for depth in range(depths):
length = CppBuffers.offsets_length(cpp_buffers_self, depth)
offsets = ak.nplike.numpy.empty(length, np.int64)
CppBuffers.copy_offsets(
cpp_buffers_self,
offsets.ctypes.data_as(ctypes.c_void_p),
length,
depth,
)
buffers[f"node{depth}-offsets"] = offsets
ListOffsetBuilder = cppyy.gbl.awkward.LayoutBuilder.ListOffset[
"int64_t",
f"awkward::LayoutBuilder::ListOffset<int64_t, awkward::LayoutBuilder::ListOffset<int64_t, awkward::LayoutBuilder::Numpy<{data_type}>>",
]
builder = ListOffsetBuilder()
builder_type = type(builder).__cpp_name__

data_length = CppBuffers.data_length(cpp_buffers_self)
data = ak.nplike.numpy.empty(data_length, dtype=dtype)
CppBuffers.copy_data(
cpp_buffers_self, data.ctypes.data_as(ctypes.c_void_p), data_length
)
buffers[f"node{depths}-data"] = data
cpp_buffers_self.fill_offsets_and_flatten_4[builder_type](builder)

names_nbytes = cpp_buffers_self.names_nbytes[builder_type](builder)
buffers = empty_buffers(cpp_buffers_self, names_nbytes)
cpp_buffers_self.to_char_buffers[builder_type, data_type](builder)

array = ak._v2.from_buffers(
form,
offsets_length - 1,
builder.length(),
buffers,
)
return _wrap_as_record_array(array)
Expand Down
6 changes: 3 additions & 3 deletions src/awkward/_v2/cpp-headers/awkward/GrowableBuffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -339,9 +339,9 @@ namespace awkward {
/// Although the #length increments every time #append is called,
/// it is always less than or equal to #reserved because of
/// allocations of new panels.
int64_t
size_t
length() const {
return length_ + (int64_t)ptr_->current_length();
return length_ + ptr_->current_length();
}

/// @brief Return options of this GrowableBuffer.
Expand Down Expand Up @@ -465,7 +465,7 @@ namespace awkward {
const BuilderOptions options_;

/// @brief Filled panels data length.
int64_t length_;
size_t length_;

/// @brief The first panel.
std::unique_ptr<Panel<PRIMITIVE>> panel_;
Expand Down
Loading

0 comments on commit 9e8c841

Please sign in to comment.