Skip to content

Commit

Permalink
improve explode of empty lists (#3638)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jun 9, 2022
1 parent 7d3e040 commit f27b129
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 69 deletions.
15 changes: 12 additions & 3 deletions polars/polars-core/src/chunked_array/ops/explode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -208,12 +208,21 @@ pub(crate) fn offsets_to_indexes(offsets: &[i64], capacity: usize) -> Vec<IdxSiz

let mut count = 0;
let mut last_idx = 0;
for &offset in &offsets[1..] {
while count < offset {
for offset in &offsets[1..] {
while count < *offset {
count += 1;
idx.push(last_idx)
}
last_idx += 1;
// Safety:
// we started iterating from 1, so there is always a previous offset
// we take the pointer to the previous element and deref that to get
// the previous offset
unsafe {
let previous_offset = *(offset as *const i64).offset(-1);
if previous_offset != *offset {
last_idx += 1;
}
}
}
for _ in 0..(capacity - count as usize) {
idx.push(last_idx);
Expand Down
80 changes: 15 additions & 65 deletions polars/polars-core/src/frame/explode.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use crate::chunked_array::ops::explode::offsets_to_indexes;
use crate::prelude::*;
use crate::utils::get_supertype;
use arrow::bitmap::{Bitmap, MutableBitmap};
use arrow::buffer::Buffer;
use polars_arrow::kernels::concatenate::concatenate_owned_unchecked;
#[cfg(feature = "serde")]
Expand Down Expand Up @@ -43,66 +42,6 @@ impl DataFrame {
.expect("cmp usize -> Ordering")
});

let verbose = std::env::var("POLARS_VERBOSE").is_ok();

// TODO: optimize this.
// This is the slower easier option.
// instead of filtering the whole dataframe first
// drop empty list rows
for col in &mut columns {
if let Ok(ca) = col.list() {
if !ca.can_fast_explode() {
if verbose {
eprintln!(
"could not fast explode column {}, running slower path",
ca.name()
);
}
let (_, offsets) = get_exploded(col)?;
if offsets.is_empty() {
let mut out = self.slice(0, 0);
// still explode the columns to get the inner dtype
for col in &columns {
out.try_apply(col.name(), |s| s.explode())?;
}

return Ok(out);
}

let mut mask = MutableBitmap::from_len_set(offsets.len() - 1);

let mut latest = offsets[0];
for (i, &o) in (&offsets[1..]).iter().enumerate() {
if o == latest {
unsafe { mask.set_bit_unchecked(i, false) }
}
latest = o;
}

let mask = Bitmap::from(mask);
// all lists are empty we return an an empty dataframe with the same schema
if mask.null_count() == mask.len() {
return Ok(self.slice(0, 0));
}

let idx = self.check_name_to_idx(col.name())?;
df = df.filter(&BooleanChunked::from_chunks(
"",
vec![Arc::new(BooleanArray::from_data_default(mask, None))],
))?;
*col = df[idx].clone();
let ca = col
._get_inner_mut()
.as_any_mut()
.downcast_mut::<ListChunked>()
.unwrap();
ca.set_fast_explode();
} else if verbose {
eprintln!("could fast explode column {}", ca.name());
}
}
}

// first remove all the exploded columns
for s in &columns {
df = df.drop(s.name())?;
Expand All @@ -119,6 +58,7 @@ impl DataFrame {
let row_idx = offsets_to_indexes(&offsets, exploded.len());
let mut row_idx = IdxCa::from_vec("", row_idx);
row_idx.set_sorted(false);

// Safety
// We just created indices that are in bounds.
df = unsafe { df.take_unchecked(&row_idx) };
Expand Down Expand Up @@ -415,12 +355,22 @@ mod test {

let out = df.explode(["foo"])?;
let expected = df![
"foo" => [1, 2, 3, 1, 1, 1],
"B" => [1, 1, 1, 2, 2, 2],
"C" => [1, 1, 1, 1, 1, 1],
"foo" => [Some(1), Some(2), Some(3), Some(1), Some(1), Some(1), None],
"B" => [1, 1, 1, 2, 2, 2, 3],
"C" => [1, 1, 1, 1, 1, 1, 1],
]?;

assert!(out.frame_equal(&expected));
assert!(out.frame_equal_missing(&expected));

let list = Series::new("foo", &[s0.clone(), s1.slice(0, 0), s1.clone()]);
let df = DataFrame::new(vec![list, s0.clone(), s1.clone()])?;
let out = df.explode(["foo"])?;
let expected = df![
"foo" => [Some(1), Some(2), Some(3), None, Some(1), Some(1), Some(1)],
"B" => [1, 1, 1, 2, 2, 2, 3],
"C" => [1, 1, 1, 1, 1, 1, 1],
]?;
assert!(out.frame_equal_missing(&expected));
Ok(())
}

Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -2117,7 +2117,7 @@ def test_explode_empty() -> None:

df = pl.DataFrame(dict(x=["1", "2", "4"], y=[["a", "b", "c"], ["d"], []]))
assert df.explode("y").frame_equal(
pl.DataFrame({"x": ["1", "1", "1", "2"], "y": ["a", "b", "c", "d"]})
pl.DataFrame({"x": ["1", "1", "1", "2", "4"], "y": ["a", "b", "c", "d", None]})
)


Expand Down

0 comments on commit f27b129

Please sign in to comment.