Skip to content

Commit

Permalink
explode accept expressions
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Aug 5, 2021
1 parent 371373e commit 2a31928
Show file tree
Hide file tree
Showing 7 changed files with 191 additions and 35 deletions.
20 changes: 4 additions & 16 deletions polars/polars-lazy/src/frame.rs
Original file line number Diff line number Diff line change
Expand Up @@ -648,8 +648,7 @@ impl LazyFrame {
/// /// This function selects all columns except "foo"
/// fn exclude_a_column(df: DataFrame) -> LazyFrame {
/// df.lazy()
/// .select(&[col("*"),
/// except("foo")])
/// .select(&[col("*").exclude("foo")])
/// }
/// ```
pub fn select<E: AsRef<[Expr]>>(self, exprs: E) -> Self {
Expand Down Expand Up @@ -867,18 +866,7 @@ impl LazyFrame {
}

/// Apply explode operation. [See eager explode](polars_core::frame::DataFrame::explode).
pub fn explode(self, columns: &[Expr]) -> LazyFrame {
let columns = columns
.iter()
.map(|e| {
if let Expr::Column(name) = e {
(**name).clone()
} else {
panic!("expected column expression")
}
})
.collect();
// Note: this operation affects multiple columns, so it is not implemented as an expression.
pub fn explode(self, columns: Vec<Expr>) -> LazyFrame {
let opt_state = self.get_opt_state();
let lp = self.get_plan_builder().explode(columns).build();
Self::from_logical_plan(lp, opt_state)
Expand Down Expand Up @@ -1441,7 +1429,7 @@ mod test {
)
.alias("diff_cases"),
])
.explode(&[col("day"), col("diff_cases")])
.explode(vec![col("day"), col("diff_cases")])
.join(
base_df,
vec![col("uid"), col("day")],
Expand Down Expand Up @@ -2296,7 +2284,7 @@ mod test {
col("b").list().alias("b_list"),
col("c").list().alias("c_list"),
])
.explode(&[col("c_list"), col("b_list")])
.explode(vec![col("c_list"), col("b_list")])
.collect()?;
assert_eq!(out.shape(), (5, 3));

Expand Down
14 changes: 13 additions & 1 deletion polars/polars-lazy/src/logical_plan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1215,7 +1215,19 @@ impl LogicalPlanBuilder {
.into()
}

pub fn explode(self, columns: Vec<String>) -> Self {
pub fn explode(self, columns: Vec<Expr>) -> Self {
let columns = rewrite_projections(columns, self.0.schema());
// columns to string
let columns = columns
.iter()
.map(|e| {
if let Expr::Column(name) = e {
(**name).clone()
} else {
panic!("expected column expression")
}
})
.collect();
LogicalPlan::Explode {
input: Box::new(self.0),
columns,
Expand Down
65 changes: 60 additions & 5 deletions py-polars/polars/eager/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1855,7 +1855,9 @@ def fill_none(self, strategy: Union[str, "pl.Expr"]) -> "DataFrame":
return self.fill_none(pl.lit(strategy))
return wrap_df(self._df.fill_none(strategy))

def explode(self, columns: Union[str, tp.List[str]]) -> "DataFrame":
def explode(
self, columns: Union[str, tp.List[str], "pl.Expr", tp.List["pl.Expr"]]
) -> "DataFrame":
"""
Explode `DataFrame` to long format by exploding a column with Lists.
Expand All @@ -1867,10 +1869,63 @@ def explode(self, columns: Union[str, tp.List[str]]) -> "DataFrame":
Returns
-------
DataFrame
"""
if isinstance(columns, str):
columns = [columns]
return wrap_df(self._df.explode(columns))
Examples
--------
>>> df = pl.DataFrame({
>>> "letters": ["c", "c", "a", "c", "a", "b"],
>>> "nrs": [[1, 2], [1, 3], [4, 3], [5, 5, 5], [6], [2, 1, 2]]
>>> })
>>> df
shape: (6, 2)
╭─────────┬────────────╮
│ letters ┆ nrs │
│ --- ┆ --- │
│ str ┆ list [i64] │
╞═════════╪════════════╡
│ "c" ┆ [1, 2] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "c" ┆ [1, 3] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "a" ┆ [4, 3] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "c" ┆ [5, 5, 5] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "a" ┆ [6] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "b" ┆ [2, 1, 2] │
╰─────────┴────────────╯
>>> df.explode("nrs")
shape: (13, 2)
╭─────────┬─────╮
│ letters ┆ nrs │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════════╪═════╡
│ "c" ┆ 1 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 2 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 1 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 3 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ ... ┆ ... │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 5 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "a" ┆ 6 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "b" ┆ 2 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "b" ┆ 1 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "b" ┆ 2 │
╰─────────┴─────╯
"""
return self.lazy().explode(columns).collect(no_optimization=True)

def melt(
self, id_vars: Union[tp.List[str], str], value_vars: Union[tp.List[str], str]
Expand Down
64 changes: 60 additions & 4 deletions py-polars/polars/lazy/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -676,12 +676,68 @@ def quantile(self, quantile: float) -> "LazyFrame":
"""
return wrap_ldf(self._ldf.quantile(quantile))

def explode(self, columns: Union[str, tp.List[str]]) -> "LazyFrame":
def explode(
self, columns: Union[str, tp.List[str], "Expr", tp.List["Expr"]]
) -> "LazyFrame":
"""
Explode lists to long format.
"""
if isinstance(columns, str):
columns = [columns]
Examples
--------
>>> df = pl.DataFrame({
>>> "letters": ["c", "c", "a", "c", "a", "b"],
>>> "nrs": [[1, 2], [1, 3], [4, 3], [5, 5, 5], [6], [2, 1, 2]]
>>> })
>>> df
shape: (6, 2)
╭─────────┬────────────╮
│ letters ┆ nrs │
│ --- ┆ --- │
│ str ┆ list [i64] │
╞═════════╪════════════╡
│ "c" ┆ [1, 2] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "c" ┆ [1, 3] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "a" ┆ [4, 3] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "c" ┆ [5, 5, 5] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "a" ┆ [6] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "b" ┆ [2, 1, 2] │
╰─────────┴────────────╯
>>> df.explode("nrs")
shape: (13, 2)
╭─────────┬─────╮
│ letters ┆ nrs │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════════╪═════╡
│ "c" ┆ 1 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 2 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 1 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 3 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ ... ┆ ... │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 5 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "a" ┆ 6 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "b" ┆ 2 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "b" ┆ 1 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "b" ┆ 2 │
╰─────────┴─────╯
"""
columns = _selection_to_pyexpr_list(columns)
return wrap_ldf(self._ldf.explode(columns))

def drop_duplicates(
Expand Down
6 changes: 0 additions & 6 deletions py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -729,12 +729,6 @@ impl PyDataFrame {
PyDataFrame::new(self.df.clone())
}

/// Explode the given columns of the wrapped `DataFrame`.
///
/// Forwards `columns` to `self.df.explode`. On success the resulting frame is
/// wrapped in a new `PyDataFrame`; on failure the error is converted with
/// `PyPolarsEr::from` and propagated to the caller through `?`.
pub fn explode(&self, columns: Vec<String>) -> PyResult<Self> {
    let exploded = self.df.explode(&columns).map_err(PyPolarsEr::from)?;
    Ok(PyDataFrame::new(exploded))
}

pub fn melt(&self, id_vars: Vec<&str>, value_vars: Vec<&str>) -> PyResult<Self> {
let df = self
.df
Expand Down
6 changes: 3 additions & 3 deletions py-polars/src/lazy/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -321,10 +321,10 @@ impl PyLazyFrame {
ldf.quantile(quantile).into()
}

pub fn explode(&self, column: Vec<String>) -> Self {
pub fn explode(&self, column: Vec<PyExpr>) -> Self {
let ldf = self.ldf.clone();
let column = column.into_iter().map(|s| col(&s)).collect::<Vec<_>>();
ldf.explode(&column).into()
let column = py_exprs_to_exprs(column);
ldf.explode(column).into()
}

pub fn drop_duplicates(&self, maintain_order: bool, subset: Option<Vec<String>>) -> Self {
Expand Down
51 changes: 51 additions & 0 deletions py-polars/tests/test_lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,3 +250,54 @@ def test_fold_filter():
)

assert out.shape == (3, 2)


def test_head_groupby():
    """Check `explode` with a wildcard-minus-keys expression after a groupby.

    The frame is sorted by price and grouped on (commodity, location). Every
    non-key column keeps its first two values per group as a list, and those
    list columns are then exploded back to long format.
    """
    group_keys = ["commodity", "location"]
    frame = pl.DataFrame(
        {
            "commodity": ["Wheat"] * 4 + ["Corn"] * 5,
            "location": ["StPaul"] * 3 + ["Chicago"] * 6,
            "seller": [
                "Bob",
                "Charlie",
                "Susan",
                "Paul",
                "Ed",
                "Mary",
                "Paul",
                "Charlie",
                "Norman",
            ],
            "price": [1.0, 0.7, 0.8, 0.55, 2.0, 3.0, 2.4, 1.8, 2.1],
        }
    )

    # The same wildcard exclusion is used twice: once to pick the columns to
    # aggregate and once to pick the list columns to explode.
    ordered = frame.sort(by="price")
    aggregated = ordered.groupby(group_keys).agg(
        [col("*").exclude(group_keys).head(2).list().keep_name()]
    )
    result = aggregated.explode(col("*").exclude(group_keys))

    # Groups hold 3, 1 and 5 rows; capped at two each gives 2 + 1 + 2 = 5 rows
    # across the 4 original columns.
    assert result.shape == (5, 4)

0 comments on commit 2a31928

Please sign in to comment.