Skip to content

Commit

Permalink
explode operation
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Oct 3, 2020
1 parent 1cc156a commit 7355281
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 1 deletion.
4 changes: 3 additions & 1 deletion polars/src/doc/changelog/v0_7.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@
//! - quantile
//! - median
//! - last
//!
//! - group indexes
//! - agg (combined aggregations)
//! * explode operation
159 changes: 159 additions & 0 deletions polars/src/frame/explode.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
use crate::prelude::*;

impl LargeListChunked {
    /// Flatten the lists of this array into a single `Series` of the inner dtype.
    ///
    /// Returns the flattened values together with a vector that holds, for every
    /// flattened value, the index of the list row it originated from. Both have the
    /// same length, so the indices can be used to repeat the rows of other columns.
    ///
    /// A null list and an empty list each produce exactly one null value, so the row
    /// they belong to is never dropped from the result.
    ///
    /// # Errors
    ///
    /// Returns an error when a list cannot be unpacked as the inner dtype.
    pub fn explode(&self) -> Result<(Series, Vec<usize>)> {
        // Guess for the average number of elements per list; only used to size the
        // initial allocations, the buffers grow when the guess is too low.
        const AVG_LIST_LEN: usize = 10;

        macro_rules! impl_with_builder {
            ($self:expr, $builder:expr, $dtype:ty) => {{
                let mut row_idx = Vec::with_capacity($self.len() * AVG_LIST_LEN);

                for i in 0..$self.len() {
                    match $self.get(i) {
                        Some(series) => {
                            let ca = series.unpack::<$dtype>()?;
                            if ca.len() == 0 {
                                // An empty list would otherwise contribute no value and
                                // silently remove row `i`; keep the row as a null, the
                                // same way a null list is handled below.
                                $builder.append_null();
                                row_idx.push(i)
                            } else if ca.null_count() == 0 {
                                // Fast path: no validity checks needed.
                                ca.into_no_null_iter().for_each(|v| {
                                    $builder.append_value(v);
                                    row_idx.push(i)
                                })
                            } else {
                                ca.into_iter().for_each(|opt_v| {
                                    $builder.append_option(opt_v);
                                    row_idx.push(i)
                                })
                            }
                        }
                        None => {
                            $builder.append_null();
                            row_idx.push(i)
                        }
                    }
                }
                let exploded = $builder.finish().into_series();
                Ok((exploded, row_idx))
            }};
        }

        macro_rules! impl_primitive {
            ($dtype:ty, $self:expr) => {{
                let mut builder = PrimitiveChunkedBuilder::<$dtype>::new(
                    $self.name(),
                    $self.len() * AVG_LIST_LEN,
                );
                // Forward the macro argument instead of a hard-coded `self`.
                impl_with_builder!($self, builder, $dtype)
            }};
        }
        macro_rules! impl_utf8 {
            ($self:expr) => {{
                let mut builder =
                    Utf8ChunkedBuilder::new($self.name(), $self.len() * AVG_LIST_LEN);
                impl_with_builder!($self, builder, Utf8Type)
            }};
        }

        // Dispatch on the dtype of the list elements.
        match_arrow_data_type_apply_macro!(
            **self.get_inner_dtype(),
            impl_primitive,
            impl_utf8,
            self
        )
    }
}

impl DataFrame {
    /// Turn a list column into long format: every value of every list gets its own row.
    ///
    /// The values of the other columns are repeated for each value taken from the list
    /// in the same row, and the exploded column keeps its original position.
    /// If `column` is not a list column, an unchanged clone of the `DataFrame` is returned.
    ///
    /// # Errors
    ///
    /// Returns an error if `column` does not exist or if exploding the lists fails.
    ///
    /// # Example
    ///
    /// ```rust
    /// use polars::prelude::*;
    /// use polars::chunked_array::builder::get_large_list_builder;
    ///
    /// let mut builder = get_large_list_builder(&ArrowDataType::Int8, 3, "foo");
    /// builder.append_series(&Series::new("a", &[1i8, 2, 3]));
    /// builder.append_series(&Series::new("b", &[1i8, 1, 1]));
    /// builder.append_series(&Series::new("c", &[2i8, 2, 2]));
    /// let list = builder.finish().into_series();
    ///
    /// let s = Series::new("B", [1, 2, 3]);
    /// let s1 = Series::new("C", [1, 1, 1]);
    /// let df = DataFrame::new(vec![list, s, s1]).unwrap();
    /// let exploded = df.explode("foo").unwrap();
    ///
    /// println!("{:?}", df);
    /// println!("{:?}", exploded);
    /// ```
    /// Outputs:
    ///
    /// ```text
    ///  +-------------+-----+-----+
    ///  | foo         | B   | C   |
    ///  | ---         | --- | --- |
    ///  | list [i8]   | i32 | i32 |
    ///  +=============+=====+=====+
    ///  | "[1, 2, 3]" | 1   | 1   |
    ///  +-------------+-----+-----+
    ///  | "[1, 1, 1]" | 2   | 1   |
    ///  +-------------+-----+-----+
    ///  | "[2, 2, 2]" | 3   | 1   |
    ///  +-------------+-----+-----+
    ///
    ///  +-----+-----+-----+
    ///  | foo | B   | C   |
    ///  | --- | --- | --- |
    ///  | i8  | i32 | i32 |
    ///  +=====+=====+=====+
    ///  | 1   | 1   | 1   |
    ///  +-----+-----+-----+
    ///  | 2   | 1   | 1   |
    ///  +-----+-----+-----+
    ///  | 3   | 1   | 1   |
    ///  +-----+-----+-----+
    ///  | 1   | 2   | 1   |
    ///  +-----+-----+-----+
    ///  | 1   | 2   | 1   |
    ///  +-----+-----+-----+
    ///  | 1   | 2   | 1   |
    ///  +-----+-----+-----+
    ///  | 2   | 3   | 1   |
    ///  +-----+-----+-----+
    ///  | 2   | 3   | 1   |
    ///  +-----+-----+-----+
    ///  | 2   | 3   | 1   |
    ///  +-----+-----+-----+
    /// ```
    pub fn explode(&self, column: &str) -> Result<DataFrame> {
        // Anything that is not a list column is handed back untouched.
        let list_ca = match self.column(column)? {
            Series::LargeList(ca) => ca,
            _ => return Ok(self.clone()),
        };

        let (flattened, origin_rows) = list_ca.explode()?;
        // Remember where the column lived so the flattened values can go back there.
        let position = self.name_to_idx(column)?;
        let remaining = self.drop(column)?;

        // SAFETY: every index in `origin_rows` is a row index of the list column taken
        // from this frame. NOTE(review): this relies on all columns of a `DataFrame`
        // having the same length as that list column - confirm.
        let mut long = unsafe { remaining.take_iter_unchecked(origin_rows.into_iter(), None) };
        long.columns.insert(position, flattened);
        Ok(long)
    }
}

#[cfg(test)]
mod test {
    use crate::chunked_array::builder::get_large_list_builder;
    use crate::prelude::*;

    /// Exploding three lists of three values each must yield nine rows while the
    /// number of columns stays the same.
    #[test]
    fn test_explode() {
        let rows: [(&str, [i8; 3]); 3] = [("a", [1, 2, 3]), ("b", [1, 1, 1]), ("c", [2, 2, 2])];

        let mut list_builder = get_large_list_builder(&ArrowDataType::Int8, 3, "foo");
        for (name, values) in rows.iter() {
            list_builder.append_series(&Series::new(*name, values));
        }
        let foo = list_builder.finish().into_series();

        let b = Series::new("B", [1, 2, 3]);
        let c = Series::new("C", [1, 1, 1]);
        let df = DataFrame::new(vec![foo, b, c]).unwrap();
        let exploded = df.explode("foo").unwrap();

        println!("{:?}", df);
        println!("{:?}", exploded);
        assert_eq!(exploded.shape(), (9, 3));
    }
}
1 change: 1 addition & 0 deletions polars/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use std::marker::Sized;
use std::mem;
use std::sync::Arc;

pub mod explode;
pub mod group_by;
pub mod hash_join;
pub mod select;
Expand Down
15 changes: 15 additions & 0 deletions py-polars/pypolars/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,21 @@ def fill_none(self, strategy: str) -> DataFrame:
"""
return wrap_df(self._df.fill_none(strategy))

def explode(self, column: str) -> DataFrame:
    """
    Explode `DataFrame` to long format by exploding a column with Lists.

    Parameters
    ----------
    column
        Column of LargeList type

    Returns
    -------
    DataFrame
    """
    # Delegate to the wrapped Rust DataFrame, then re-wrap the result.
    exploded = self._df.explode(column)
    return wrap_df(exploded)


class GroupBy:
def __init__(self, df: DataFrame, by: List[str]):
Expand Down
6 changes: 6 additions & 0 deletions py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -366,4 +366,10 @@ impl PyDataFrame {
pub fn clone(&self) -> Self {
PyDataFrame::new(self.df.clone())
}

pub fn explode(&self, column: &str) -> PyResult<Self> {
let df = self.df.explode(column);
let df = df.map_err(PyPolarsEr::from)?;
Ok(PyDataFrame::new(df))
}
}

0 comments on commit 7355281

Please sign in to comment.