Skip to content

Commit

Permalink
fix explode empty lists (#3083)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Apr 7, 2022
1 parent 12425dc commit 9e3b929
Show file tree
Hide file tree
Showing 10 changed files with 136 additions and 4 deletions.
1 change: 1 addition & 0 deletions polars/polars-arrow/src/bitmap/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod mutable;
33 changes: 33 additions & 0 deletions polars/polars-arrow/src/bitmap/mutable.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
use arrow::bitmap::MutableBitmap;

/// Extra helpers layered on top of arrow's [`MutableBitmap`].
pub trait MutableBitmapExtension {
    /// Builds a [`MutableBitmap`] of `length` bits in which every bit is set
    /// (i.e. every slot is valid / `true`).
    fn from_len_set(length: usize) -> MutableBitmap {
        // Eight bits fit in one byte; round up so a trailing partial byte is allocated too.
        let n_bytes = length.saturating_add(7) / 8;
        let all_set = vec![u8::MAX; n_bytes];
        MutableBitmap::from_vec(all_set, length)
    }

    /// Gives mutable access to the bytes that back the bitmap.
    fn as_slice_mut(&mut self) -> &mut [u8];

    /// Writes `value` into bit `i`.
    ///
    /// # Safety
    /// Caller must ensure `i` is in bounds.
    unsafe fn set_bit_unchecked(&mut self, i: usize, value: bool);
}

impl MutableBitmapExtension for MutableBitmap {
    /// Re-exposes the bytes returned by `as_slice` as a mutable slice.
    fn as_slice_mut(&mut self) -> &mut [u8] {
        let bytes = self.as_slice();
        let (ptr, len) = (bytes.as_ptr() as *mut u8, bytes.len());
        // SAFETY: `ptr` and `len` come from this bitmap's own byte slice, and the
        // returned slice is tied to the exclusive `&mut self` borrow.
        // NOTE(review): the pointer is derived from a shared borrow before being
        // made mutable — confirm this is sound, or switch to a native mutable
        // accessor if arrow provides one.
        unsafe { std::slice::from_raw_parts_mut(ptr, len) }
    }

    /// Writes `value` into bit `i` of the backing bytes.
    ///
    /// Builds with debug assertions go through `set_bit` (presumably the
    /// bounds-checked variant — verify against arrow); other builds call
    /// `set_bit_unchecked`.
    ///
    /// # Safety
    /// Caller must ensure `i` is in bounds.
    unsafe fn set_bit_unchecked(&mut self, i: usize, value: bool) {
        let bytes = self.as_slice_mut();
        if cfg!(debug_assertions) {
            arrow::bitmap::utils::set_bit(bytes, i, value)
        } else {
            arrow::bitmap::utils::set_bit_unchecked(bytes, i, value)
        }
    }
}
1 change: 1 addition & 0 deletions polars/polars-arrow/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pub mod array;
pub mod bit_util;
mod bitmap;
#[cfg(feature = "compute")]
pub mod compute;
pub mod conversion;
Expand Down
1 change: 1 addition & 0 deletions polars/polars-arrow/src/prelude.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
pub use crate::array::default_arrays::*;
pub use crate::bitmap::mutable::MutableBitmapExtension;
pub use crate::kernels::rolling::no_nulls::QuantileInterpolOptions;
pub use crate::{array::*, index::*};
use arrow::array::{ListArray, Utf8Array};
Expand Down
5 changes: 4 additions & 1 deletion polars/polars-core/src/chunked_array/ops/explode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,11 +201,14 @@ impl ExplodeByOffsets for Utf8Chunked {

/// Convert Arrow array offsets to indexes of the original list
pub(crate) fn offsets_to_indexes(offsets: &[i64], capacity: usize) -> Vec<IdxSize> {
if offsets.is_empty() {
return vec![];
}
let mut idx = Vec::with_capacity(capacity);

let mut count = 0;
let mut last_idx = 0;
for &offset in offsets.iter().skip(1) {
for &offset in &offsets[1..] {
while count < offset {
count += 1;
idx.push(last_idx)
Expand Down
76 changes: 74 additions & 2 deletions polars/polars-core/src/frame/explode.rs
Original file line number Diff line number Diff line change
@@ -1,28 +1,79 @@
use crate::chunked_array::ops::explode::offsets_to_indexes;
use crate::prelude::*;
use crate::utils::get_supertype;
use arrow::bitmap::{Bitmap, MutableBitmap};
use arrow::buffer::Buffer;
use polars_arrow::kernels::concatenate::concatenate_owned_unchecked;

/// Explodes `series` and returns the exploded values together with the
/// offsets buffer produced by `explode_and_offsets`.
///
/// # Errors
/// Only `List` and `Utf8` series are handled; any other dtype yields
/// `PolarsError::InvalidOperation` with a message naming that dtype.
fn get_exploded(series: &Series) -> Result<(Series, Buffer<i64>)> {
    match series.dtype() {
        // The dtype was matched just above, so the downcasts below are expected to succeed.
        DataType::List(_) => series.list().unwrap().explode_and_offsets(),
        DataType::Utf8 => series.utf8().unwrap().explode_and_offsets(),
        // A single fallback arm: a preceding catch-all with an empty message
        // would shadow this one and hide the dtype from the caller.
        dtype => Err(PolarsError::InvalidOperation(
            format!("cannot explode dtype: {:?}", dtype).into(),
        )),
    }
}

impl DataFrame {
pub fn explode_impl(&self, mut columns: Vec<Series>) -> Result<DataFrame> {
if self.height() == 0 {
return Ok(self.clone());
}
columns.sort_by(|sa, sb| {
self.check_name_to_idx(sa.name())
.expect("checked above")
.partial_cmp(&self.check_name_to_idx(sb.name()).expect("checked above"))
.expect("cmp usize -> Ordering")
});

// first remove all the exploded columns
let mut df = self.clone();

// TODO: optimize this.
// This is the slower easier option.
// instead of filtering the whole dataframe first
// drop empty list rows
for col in &mut columns {
if let Ok(ca) = col.list() {
if !ca.can_fast_explode() {
let (_, offsets) = get_exploded(col)?;
if offsets.is_empty() {
return Ok(self.slice(0, 0));
}

let mut mask = MutableBitmap::from_len_set(offsets.len() - 1);

let mut latest = offsets[0];
for (i, &o) in (&offsets[1..]).iter().enumerate() {
if o == latest {
unsafe { mask.set_bit_unchecked(i, false) }
}
latest = o;
}

let mask = Bitmap::from(mask);
// if all lists are empty, we return an empty dataframe with the same schema
if mask.null_count() == mask.len() {
return Ok(self.slice(0, 0));
}

let idx = self.check_name_to_idx(col.name())?;
df = df.filter(&BooleanChunked::from_chunks(
"",
vec![Arc::new(BooleanArray::from_data_default(mask, None))],
))?;
*col = df[idx].clone();
let ca = col
.get_inner_mut()
.as_any_mut()
.downcast_mut::<ListChunked>()
.unwrap();
ca.set_fast_explode();
}
}
}

// first remove all the exploded columns
for s in &columns {
df = df.drop(s.name())?;
}
Expand Down Expand Up @@ -306,6 +357,27 @@ mod test {
);
}

/// Exploding a list column whose last row is an empty list: the expected
/// frame contains only the rows coming from the two non-empty lists.
#[test]
#[cfg_attr(miri, ignore)]
fn test_explode_df_empty_list() -> Result<()> {
    let first = Series::new("a", &[1, 2, 3]);
    let second = Series::new("b", &[1, 1, 1]);
    // Third list entry is a zero-length slice, i.e. an empty list.
    let empty = second.slice(0, 0);
    let list = Series::new("foo", &[first, second, empty]);

    let col_b = Series::new("B", [1, 2, 3]);
    let col_c = Series::new("C", [1, 1, 1]);
    let df = DataFrame::new(vec![list, col_b, col_c])?;

    let exploded = df.explode(["foo"])?;
    let expected = df![
        "foo" => [1, 2, 3, 1, 1, 1],
        "B" => [1, 1, 1, 2, 2, 2],
        "C" => [1, 1, 1, 1, 1, 1],
    ]?;

    assert!(exploded.frame_equal(&expected));
    Ok(())
}

#[test]
#[cfg_attr(miri, ignore)]
fn test_explode_single_col() -> Result<()> {
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-core/src/prelude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ pub use crate::{
};
pub(crate) use arrow::array::*;
pub use arrow::datatypes::{Field as ArrowField, Schema as ArrowSchema};
pub use polars_arrow::prelude::{LargeListArray, LargeStringArray, QuantileInterpolOptions};
pub(crate) use polars_arrow::trusted_len::TrustedLen;
pub use std::sync::Arc;

Expand All @@ -56,6 +55,7 @@ pub use crate::chunked_array::ops::rolling_window::RollingOptions;
pub use polars_arrow::kernels::ewm::EWMOptions;

pub(crate) use polars_arrow::export::*;
pub use polars_arrow::prelude::*;

#[cfg(feature = "dtype-categorical")]
pub use crate::chunked_array::logical::categorical::*;
Expand Down
10 changes: 10 additions & 0 deletions polars/polars-core/src/series/implementations/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use crate::prelude::*;
use crate::series::implementations::SeriesWrap;
use arrow::array::ArrayRef;
use polars_arrow::prelude::QuantileInterpolOptions;
use std::any::Any;
use std::borrow::Cow;

impl IntoSeries for ListChunked {
Expand Down Expand Up @@ -240,4 +241,13 @@ impl SeriesTrait for SeriesWrap<ListChunked> {
/// Clones the wrapped `ListChunked` and returns it behind a fresh `Arc`.
fn clone_inner(&self) -> Arc<dyn SeriesTrait> {
    let inner = Clone::clone(&self.0);
    Arc::new(SeriesWrap(inner))
}
fn as_any(&self) -> &dyn Any {
&self.0
}

/// Get a hold to self as `Any` trait reference.
/// Only implemented for ObjectType
fn as_any_mut(&mut self) -> &mut dyn Any {
&mut self.0
}
}
6 changes: 6 additions & 0 deletions polars/polars-core/src/series/series_trait.rs
Original file line number Diff line number Diff line change
Expand Up @@ -825,6 +825,12 @@ pub trait SeriesTrait:
invalid_operation_panic!(self)
}

/// Get a hold of self as a mutable `Any` trait reference.
///
/// The default implementation invokes `invalid_operation_panic!`; series
/// implementations that support mutable downcasting (e.g. the list
/// implementation) override it.
fn as_any_mut(&mut self) -> &mut dyn Any {
invalid_operation_panic!(self)
}

/// Raise a numeric series to the power of exponent.
fn pow(&self, _exponent: f64) -> Result<Series> {
Err(PolarsError::InvalidOperation(
Expand Down
5 changes: 5 additions & 0 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -1871,6 +1871,11 @@ def test_explode_empty() -> None:
)
assert df.explode("y").shape == (0, 2)

df = pl.DataFrame(dict(x=["1", "2", "4"], y=[["a", "b", "c"], ["d"], []]))
assert df.explode("y").frame_equal(
pl.DataFrame({"x": ["1", "1", "1", "2"], "y": ["a", "b", "c", "d"]})
)


def test_asof_by_multiple_keys() -> None:
lhs = pl.DataFrame(
Expand Down

0 comments on commit 9e3b929

Please sign in to comment.