Skip to content

Commit

Permalink
Lazy: groupby().head/tail
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Aug 5, 2021
1 parent 2a31928 commit 32a71e7
Show file tree
Hide file tree
Showing 7 changed files with 312 additions and 4 deletions.
28 changes: 27 additions & 1 deletion polars/polars-lazy/src/frame.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use crate::physical_plan::state::ExecutionState;
use crate::prelude::aggregate_scan_projections::agg_projection;
use crate::prelude::fast_projection::FastProjection;
use crate::prelude::simplify_expr::SimplifyBooleanRule;
use crate::utils::combine_predicates_expr;
use crate::utils::{combine_predicates_expr, expr_to_root_column_names};
use crate::{logical_plan::FETCH_ROWS, prelude::*};
use polars_io::csv::NullValues;

Expand Down Expand Up @@ -1001,6 +1001,32 @@ impl LazyGroupBy {
LazyFrame::from_logical_plan(lp, self.opt_state)
}

/// Return the first `n` rows of each group.
///
/// The root column names of the group-by keys are collected first so the
/// keys can be excluded from the `col("*")` wildcard. Every remaining column
/// is then aggregated with `head(n).list().keep_name()` and the aggregated
/// columns are passed to `explode`, using the same key-excluding wildcard.
///
/// `n` is forwarded unchanged to the expression-level `head`; what `None`
/// selects is decided there and is not visible from this method.
pub fn head(self, n: Option<usize>) -> LazyFrame {
    // `flat_map` accepts any `IntoIterator`, so a separate `into_iter()`
    // followed by `flatten()` is unnecessary.
    let keys = self
        .keys
        .iter()
        .flat_map(|key| expr_to_root_column_names(key))
        .collect::<Vec<_>>();

    self.agg(vec![col("*").exclude(&keys).head(n).list().keep_name()])
        .explode(vec![col("*").exclude(&keys)])
}

/// Return the last `n` rows of each group.
///
/// The root column names of the group-by keys are collected first so the
/// keys can be excluded from the `col("*")` wildcard. Every remaining column
/// is then aggregated with `tail(n).list().keep_name()` and the aggregated
/// columns are passed to `explode`, using the same key-excluding wildcard.
///
/// `n` is forwarded unchanged to the expression-level `tail`; what `None`
/// selects is decided there and is not visible from this method.
pub fn tail(self, n: Option<usize>) -> LazyFrame {
    // `flat_map` accepts any `IntoIterator`, so a separate `into_iter()`
    // followed by `flatten()` is unnecessary.
    let keys = self
        .keys
        .iter()
        .flat_map(|key| expr_to_root_column_names(key))
        .collect::<Vec<_>>();

    self.agg(vec![col("*").exclude(&keys).tail(n).list().keep_name()])
        .explode(vec![col("*").exclude(&keys)])
}

/// Apply a function over the groups as a new `DataFrame`. It is not recommended that you use
/// this as materializing the `DataFrame` is quite expensive.
pub fn apply<F>(self, f: F) -> LazyFrame
Expand Down
4 changes: 3 additions & 1 deletion py-polars/docs/source/reference/dataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,11 @@ This namespace comes available by calling `DataFrame.groupby(..)`.
:toctree: api/

GroupBy.agg
GroupBy.apply
GroupBy.head
GroupBy.tail
GroupBy.get_group
GroupBy.groups
GroupBy.apply
GroupBy.select
GroupBy.select_all
GroupBy.pivot
Expand Down
2 changes: 2 additions & 0 deletions py-polars/docs/source/reference/lazyframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,5 @@ This namespace comes available by calling `LazyFrame.groupby(..)`.

LazyGroupBy.agg
LazyGroupBy.apply
LazyGroupBy.head
LazyGroupBy.tail
141 changes: 139 additions & 2 deletions py-polars/polars/eager/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2429,11 +2429,11 @@ def agg(
>>> # use lazy API
>>> (df.groupby(["foo", "bar"])
.agg([pl.sum("ham"), col("spam").tail(4).sum()])
>>> .agg([pl.sum("ham"), col("spam").tail(4).sum()])
>>> # use a dict
>>> (df.groupby(["foo", "bar"])
.agg({"spam": ["sum", "min"})
>>> .agg({"spam": ["sum", "min"]})
"""
if isinstance(column_to_agg, pl.Expr):
Expand Down Expand Up @@ -2479,8 +2479,142 @@ def agg(

return wrap_df(self._df.groupby_agg(self.by, column_to_agg))

def head(self, n: int = 5) -> DataFrame:
    """
    Return the first `n` rows of each group.

    A group that holds fewer than `n` rows is returned in full (see group
    "b" in the example below).

    Parameters
    ----------
    n
        Number of rows to keep per group.

    Examples
    --------
    >>> df = pl.DataFrame({
    ...     "letters": ["c", "c", "a", "c", "a", "b"],
    ...     "nrs": [1, 2, 3, 4, 5, 6]
    ... })
    >>> df
    shape: (6, 2)
    ╭─────────┬─────╮
    │ letters ┆ nrs │
    │ ---     ┆ --- │
    │ str     ┆ i64 │
    ╞═════════╪═════╡
    │ "c"     ┆ 1   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 2   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "a"     ┆ 3   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 4   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "a"     ┆ 5   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "b"     ┆ 6   │
    ╰─────────┴─────╯
    >>> (df.groupby("letters")
    ...  .head(2)
    ...  .sort("letters")
    ... )
    shape: (5, 2)
    ╭─────────┬─────╮
    │ letters ┆ nrs │
    │ ---     ┆ --- │
    │ str     ┆ i64 │
    ╞═════════╪═════╡
    │ "a"     ┆ 3   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "a"     ┆ 5   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "b"     ┆ 6   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 1   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 2   │
    ╰─────────┴─────╯
    """
    # The eager method delegates to the lazy group-by and collects the
    # result immediately, with optimizations and the string cache disabled.
    lazy_groups = wrap_df(self._df).lazy().groupby(self.by)
    first_rows = lazy_groups.head(n)  # type: ignore[arg-type]
    return first_rows.collect(no_optimization=True, string_cache=False)

def tail(self, n: int = 5) -> DataFrame:
    """
    Return the last `n` rows of each group.

    A group that holds fewer than `n` rows is returned in full (see group
    "b" in the example below).

    Parameters
    ----------
    n
        Number of rows to keep per group.

    Examples
    --------
    >>> df = pl.DataFrame({
    ...     "letters": ["c", "c", "a", "c", "a", "b"],
    ...     "nrs": [1, 2, 3, 4, 5, 6]
    ... })
    >>> df
    shape: (6, 2)
    ╭─────────┬─────╮
    │ letters ┆ nrs │
    │ ---     ┆ --- │
    │ str     ┆ i64 │
    ╞═════════╪═════╡
    │ "c"     ┆ 1   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 2   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "a"     ┆ 3   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 4   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "a"     ┆ 5   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "b"     ┆ 6   │
    ╰─────────┴─────╯
    >>> (df.groupby("letters")
    ...  .tail(2)
    ...  .sort("letters")
    ... )
    shape: (5, 2)
    ╭─────────┬─────╮
    │ letters ┆ nrs │
    │ ---     ┆ --- │
    │ str     ┆ i64 │
    ╞═════════╪═════╡
    │ "a"     ┆ 3   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "a"     ┆ 5   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "b"     ┆ 6   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 2   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 4   │
    ╰─────────┴─────╯
    """
    # The eager method delegates to the lazy group-by and collects the
    # result immediately, with optimizations and the string cache disabled.
    lazy_groups = wrap_df(self._df).lazy().groupby(self.by)
    last_rows = lazy_groups.tail(n)  # type: ignore[arg-type]
    return last_rows.collect(no_optimization=True, string_cache=False)

def select(self, columns: Union[str, tp.List[str]]) -> "GBSelection":
"""
.. deprecated:: 0.8.16
Use `groupby.agg(col("selection"))` instead
Select the columns that will be aggregated.
Parameters
Expand All @@ -2496,6 +2630,9 @@ def select(self, columns: Union[str, tp.List[str]]) -> "GBSelection":

def select_all(self) -> "GBSelection":
"""
.. deprecated:: 0.8.16
Use `groupby.agg(col("*"))` instead
Select all columns for aggregation.
"""
return GBSelection(
Expand Down
118 changes: 118 additions & 0 deletions py-polars/polars/lazy/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,6 +840,124 @@ def agg(self, aggs: Union[tp.List["Expr"], "Expr"]) -> "LazyFrame":
aggs = _selection_to_pyexpr_list(aggs)
return wrap_ldf(self.lgb.agg(aggs))

def head(self, n: int = 5) -> "LazyFrame":
    """
    Return the first `n` rows of each group.

    A group that holds fewer than `n` rows is returned in full (see group
    "b" in the example below).

    Parameters
    ----------
    n
        Number of rows to keep per group.

    Examples
    --------
    >>> df = pl.DataFrame({
    ...     "letters": ["c", "c", "a", "c", "a", "b"],
    ...     "nrs": [1, 2, 3, 4, 5, 6]
    ... })
    >>> df
    shape: (6, 2)
    ╭─────────┬─────╮
    │ letters ┆ nrs │
    │ ---     ┆ --- │
    │ str     ┆ i64 │
    ╞═════════╪═════╡
    │ "c"     ┆ 1   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 2   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "a"     ┆ 3   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 4   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "a"     ┆ 5   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "b"     ┆ 6   │
    ╰─────────┴─────╯
    >>> (df.lazy()
    ...  .groupby("letters")
    ...  .head(2)
    ...  .collect()
    ...  .sort("letters")
    ... )
    shape: (5, 2)
    ╭─────────┬─────╮
    │ letters ┆ nrs │
    │ ---     ┆ --- │
    │ str     ┆ i64 │
    ╞═════════╪═════╡
    │ "a"     ┆ 3   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "a"     ┆ 5   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "b"     ┆ 6   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 1   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 2   │
    ╰─────────┴─────╯
    """
    # Delegate to the wrapped group-by object and re-wrap its result.
    head_plan = self.lgb.head(n)
    return wrap_ldf(head_plan)

def tail(self, n: int = 5) -> "LazyFrame":
    """
    Return the last `n` rows of each group.

    A group that holds fewer than `n` rows is returned in full (see group
    "b" in the example below).

    Parameters
    ----------
    n
        Number of rows to keep per group.

    Examples
    --------
    >>> df = pl.DataFrame({
    ...     "letters": ["c", "c", "a", "c", "a", "b"],
    ...     "nrs": [1, 2, 3, 4, 5, 6]
    ... })
    >>> df
    shape: (6, 2)
    ╭─────────┬─────╮
    │ letters ┆ nrs │
    │ ---     ┆ --- │
    │ str     ┆ i64 │
    ╞═════════╪═════╡
    │ "c"     ┆ 1   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 2   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "a"     ┆ 3   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 4   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "a"     ┆ 5   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "b"     ┆ 6   │
    ╰─────────┴─────╯
    >>> (df.lazy()
    ...  .groupby("letters")
    ...  .tail(2)
    ...  .collect()
    ...  .sort("letters")
    ... )
    shape: (5, 2)
    ╭─────────┬─────╮
    │ letters ┆ nrs │
    │ ---     ┆ --- │
    │ str     ┆ i64 │
    ╞═════════╪═════╡
    │ "a"     ┆ 3   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "a"     ┆ 5   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "b"     ┆ 6   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 2   │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
    │ "c"     ┆ 4   │
    ╰─────────┴─────╯
    """
    # Delegate to the wrapped group-by object and re-wrap its result.
    tail_plan = self.lgb.tail(n)
    return wrap_ldf(tail_plan)

def apply(self, f: Callable[["pl.DataFrame"], "pl.DataFrame"]) -> "LazyFrame":
"""
Apply a function over the groups as a new `DataFrame`. It is not recommended that you use
Expand Down
10 changes: 10 additions & 0 deletions py-polars/src/lazy/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,16 @@ impl PyLazyGroupBy {
lgb.agg(aggs).into()
}

/// Build a lazy frame holding the first `n` rows of each group.
///
/// The wrapped group-by is moved out of `self.lgb` with `take()` before it
/// is used; the `unwrap()` panics when there is no value to take
/// (presumably because an earlier call already consumed it — confirm
/// against the field's declaration).
pub fn head(&mut self, n: usize) -> PyLazyFrame {
    self.lgb.take().unwrap().head(Some(n)).into()
}

/// Build a lazy frame holding the last `n` rows of each group.
///
/// The wrapped group-by is moved out of `self.lgb` with `take()` before it
/// is used; the `unwrap()` panics when there is no value to take
/// (presumably because an earlier call already consumed it — confirm
/// against the field's declaration).
pub fn tail(&mut self, n: usize) -> PyLazyFrame {
    self.lgb.take().unwrap().tail(Some(n)).into()
}

pub fn apply(&mut self, lambda: PyObject) -> PyLazyFrame {
let lgb = self.lgb.take().unwrap();

Expand Down
13 changes: 13 additions & 0 deletions py-polars/tests/test_lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,3 +301,16 @@ def test_head_groupby():
)

assert out.shape == (5, 4)

df = pl.DataFrame(
{"letters": ["c", "c", "a", "c", "a", "b"], "nrs": [1, 2, 3, 4, 5, 6]}
)

out = df.groupby("letters").tail(2).sort("letters")
assert out.frame_equal(
pl.DataFrame({"str": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 2, 4]})
)
out = df.groupby("letters").head(2).sort("letters")
assert out.frame_equal(
pl.DataFrame({"str": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 1, 2]})
)

0 comments on commit 32a71e7

Please sign in to comment.