Skip to content

Commit

Permalink
fix explode edge cases (#4133)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jul 24, 2022
1 parent a006a14 commit 75486c0
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 19 deletions.
75 changes: 56 additions & 19 deletions polars/polars-core/src/chunked_array/ops/explode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@ unsafe fn unset_nulls(
validity_values: &Bitmap,
nulls: &mut Vec<usize>,
empty_row_idx: &[usize],
base_offset: usize,
) {
for i in start..last {
if !validity_values.get_bit_unchecked(i) {
nulls.push(i + empty_row_idx.len());
nulls.push(i + empty_row_idx.len() - base_offset);
}
}
}
Expand All @@ -32,15 +33,19 @@ where
fn explode_by_offsets(&self, offsets: &[i64]) -> Series {
debug_assert_eq!(self.chunks.len(), 1);
let arr = self.downcast_iter().next().unwrap();
let values = arr.values();

let mut new_values = Vec::with_capacity(((values.len() as f32) * 1.5) as usize);
// make sure that we don't look beyond the sliced array
let values = &arr.values().as_slice()[..offsets[offsets.len() - 1] as usize];

let mut empty_row_idx = vec![];
let mut nulls = vec![];

let mut start = offsets[0] as usize;
let base_offset = start;
let mut last = start;

let mut new_values = Vec::with_capacity(offsets[offsets.len() - 1] as usize - start + 1);

// we check all the offsets and in the case a consecutive offset is the same,
// e.g. 0, 1, 4, 4, 6
// the 4 4, means that that is an empty row.
Expand All @@ -60,15 +65,29 @@ where
let o = o as usize;
if o == last {
if start != last {
#[cfg(debug_assertions)]
new_values.extend_from_slice(&values[start..last]);

#[cfg(not(debug_assertions))]
unsafe {
new_values.extend_from_slice(values.get_unchecked(start..last))
};

// Safety:
// we are in bounds
unsafe {
unset_nulls(start, last, validity_values, &mut nulls, &empty_row_idx)
unset_nulls(
start,
last,
validity_values,
&mut nulls,
&empty_row_idx,
base_offset,
)
}
}

empty_row_idx.push(o + empty_row_idx.len());
empty_row_idx.push(o + empty_row_idx.len() - base_offset);
new_values.push(T::Native::default());
start = o;
}
Expand All @@ -78,23 +97,39 @@ where
// final null check
// Safety:
// we are in bounds
unsafe { unset_nulls(start, last, validity_values, &mut nulls, &empty_row_idx) }
unsafe {
unset_nulls(
start,
last,
validity_values,
&mut nulls,
&empty_row_idx,
base_offset,
)
}
} else {
for &o in &offsets[1..] {
let o = o as usize;
if o == last {
if start != last {
new_values.extend_from_slice(&values[start..last])
#[cfg(debug_assertions)]
new_values.extend_from_slice(&values[start..last]);

#[cfg(not(debug_assertions))]
unsafe {
new_values.extend_from_slice(values.get_unchecked(start..last))
};
}

empty_row_idx.push(o + empty_row_idx.len());
empty_row_idx.push(o + empty_row_idx.len() - base_offset);
new_values.push(T::Native::default());
start = o;
}
last = o;
}
}

// add remaining values
new_values.extend_from_slice(&values[start..]);

let mut validity = MutableBitmap::with_capacity(new_values.len());
Expand Down Expand Up @@ -310,17 +345,19 @@ impl ChunkExplode for ListChunked {
));
}

// ensure that the value array is sliced
// as a list only slices its offsets on a slice operation
if !offsets.is_empty() {
let start = offsets[0] as usize;
let len = offsets[offsets.len() - 1] as usize - start;
// safety:
// we are in bounds
values = unsafe { values.slice_unchecked(start, len) };
}

let mut s = if ca.can_fast_explode() {
// ensure that the value array is sliced
// as a list only slices its offsets on a slice operation

// we only do this in fast-explode as for the other
// branch the offsets must coincide with the values.
if !offsets.is_empty() {
let start = offsets[0] as usize;
let len = offsets[offsets.len() - 1] as usize - start;
// safety:
// we are in bounds
values = unsafe { values.slice_unchecked(start, len) };
}
Series::try_from((self.name(), values)).unwrap()
} else {
// during tests
Expand All @@ -335,7 +372,7 @@ impl ChunkExplode for ListChunked {
}
last = o;
}
if !has_empty {
if !has_empty && offsets[0] == 0 {
panic!("could have fast exploded")
}
}
Expand Down
20 changes: 20 additions & 0 deletions py-polars/tests/test_explode.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,23 @@ def explode_correct_for_slice() -> None:
}
)
assert df.slice(0, 10).explode(["b"]).frame_equal(expected)


def test_sliced_null_explode() -> None:
s = pl.Series("", [[1], [2], [3], [4], [], [6]])
assert s.slice(2, 4).explode().to_list() == [3, 4, None, 6]
assert s.slice(2, 2).explode().to_list() == [3, 4]
assert pl.Series("", [[1], [2], None, [4], [], [6]]).slice(
2, 4
).explode().to_list() == [None, 4, None, 6]

s = pl.Series("", [["a"], ["b"], ["c"], ["d"], [], ["e"]])
assert s.slice(2, 4).explode().to_list() == ["c", "d", None, "e"]
assert s.slice(2, 2).explode().to_list() == ["c", "d"]
assert pl.Series("", [["a"], ["b"], None, ["d"], [], ["e"]]).slice(
2, 4
).explode().to_list() == [None, "d", None, "e"]

s = pl.Series("", [[False], [False], [True], [False], [], [True]])
assert s.slice(2, 2).explode().to_list() == [True, False]
assert s.slice(2, 4).explode().to_list() == [True, False, None, True]

0 comments on commit 75486c0

Please sign in to comment.