Skip to content

Commit

Permalink
Fix big bug in size estimation of array buffers
Browse files Browse the repository at this point in the history
An off-by-one bug in `estimated_bytes_size` caused the size estimations
of Tensors to completely ignore the actual payload, causing the memory
consumption of images to be vastly underestimated.

I decided to rewrite the code to be more clear for future readers.
  • Loading branch information
emilk committed Aug 15, 2023
1 parent 9c7b8d2 commit b0f91ba
Showing 1 changed file with 34 additions and 27 deletions.
61 changes: 34 additions & 27 deletions crates/re_types/src/size_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ fn validity_size(validity: Option<&Bitmap>) -> usize {
}

/// Returns the total (heap) allocated size of the array in bytes.
///
/// # Implementation
/// This estimation is the sum of the size of its buffers, validity, including nested arrays.
/// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
Expand Down Expand Up @@ -347,36 +348,42 @@ fn estimated_bytes_size(array: &dyn Array) -> usize {
// The respective offsets for each child value array must be in
// order / increasing.

if offsets.is_empty() {
return 0;
/// The range of offsets for a given type id.
#[derive(Debug)]
struct Range {
/// Inclusive
min: i32,

/// Inclusive
max: i32,
}

let fields = array.fields();
let types: IntSet<_> = array.types().iter().copied().collect();
types
.into_iter()
.map(|cur_ty| {
let mut indices = array
.types()
.iter()
.enumerate()
.filter_map(|(idx, &ty)| (ty == cur_ty).then_some(idx));

let idx_start = indices.next().unwrap_or_default();
let mut idx_end = idx_start;
for idx in indices {
idx_end = idx;
}

let values_start = offsets[idx_start] as usize;
let values_end = offsets[idx_end] as usize;
fields.get(cur_ty as usize).map_or(0, |x| {
estimated_bytes_size(
x.sliced(values_start, values_end - values_start).as_ref(),
)
// The range of offsets for a given type id.
let mut type_ranges: BTreeMap<i8, Range> = Default::default();

for (&type_id, &offset) in array.types().iter().zip(offsets.iter()) {
// Offsets are monotonically increasing
type_ranges
.entry(type_id)
.and_modify(|range| {
range.max = offset;
})
})
.sum::<usize>()
.or_insert(Range {
min: offset,
max: offset,
});
}

let mut fields_size = 0;
for (type_id, range) in type_ranges {
if let Some(field) = array.fields().get(type_id as usize) {
let len = range.max - range.min + 1; // range is inclusive
fields_size += estimated_bytes_size(
field.sliced(range.min as usize, len as usize).as_ref(),
);
}
}
fields_size
} else {
// https://arrow.apache.org/docs/format/Columnar.html#sparse-union:
//
Expand Down

0 comments on commit b0f91ba

Please sign in to comment.