Skip to content

Commit

Permalink
feat[python]: unstack operation (#4777)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Sep 8, 2022
1 parent d07ee55 commit 6a578a5
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 0 deletions.
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/dataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ Manipulation/ selection
DataFrame.transpose
DataFrame.unique
DataFrame.unnest
DataFrame.unstack
DataFrame.upsample
DataFrame.vstack
DataFrame.with_column
Expand Down
2 changes: 2 additions & 0 deletions py-polars/polars/internals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
)
from polars.internals.lazy_functions import (
all,
arange,
arg_where,
argsort_by,
col,
Expand All @@ -47,6 +48,7 @@
"LazyFrame",
"Series",
"all",
"arange",
"arg_where",
"argsort_by",
"col",
Expand Down
141 changes: 141 additions & 0 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Module containing logic related to eager DataFrames."""
from __future__ import annotations

import math
import os
import sys
from io import BytesIO, IOBase, StringIO
Expand Down Expand Up @@ -121,12 +122,14 @@
PivotAgg,
SizeUnit,
UniqueKeepStrategy,
UnstackDirection,
)

# these aliases are used to annotate DataFrame.__getitem__()
# MultiRowSelector indexes into the vertical axis and
# MultiColSelector indexes into the horizontal axis
# NOTE: wrapping these as strings is necessary for Python <3.10

MultiRowSelector: TypeAlias = "slice | range | list[int] | pli.Series"
MultiColSelector: TypeAlias = (
"slice | range | list[int] | list[str] | list[bool] | pli.Series"
Expand Down Expand Up @@ -4733,6 +4736,144 @@ def melt(
self._df.melt(id_vars, value_vars, value_name, variable_name)
)

def unstack(
    self: DF,
    step: int,
    how: UnstackDirection = "vertical",
    columns: str | list[str] | None = None,
    fill_values: list[Any] | None = None,
) -> DF:
    """
    Unstack a long table to a wide form without doing an aggregation.

    This can be much faster than a pivot, because it can skip the grouping phase.

    Warnings
    --------
    This functionality is experimental and may be subject to changes
    without it being considered a breaking change.

    Parameters
    ----------
    step
        Size of the unstacked frame along the direction given by ``how``:
        the number of rows of the result when ``how="vertical"``, the number
        of output columns created per input column when ``how="horizontal"``.
        Must be a positive integer.
    how : { 'vertical', 'horizontal' }
        Direction of the unstack.
    columns
        Name(s) of the column(s) to include in the operation.
        If ``None``, all columns are used.
    fill_values
        Value(s) used to pad the columns when the number of rows is not a
        multiple of the new shape. A single value is applied to every
        column; a list supplies one value per column, in column order.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1, if ``how`` is not one of the supported
        directions, or if padding is required and ``fill_values`` is a list
        with fewer entries than there are columns.

    Examples
    --------
    >>> from string import ascii_uppercase
    >>> df = pl.DataFrame(
    ...     {
    ...         "col1": ascii_uppercase[0:9],
    ...         "col2": pl.arange(0, 9, eager=True),
    ...     }
    ... )
    >>> df
    shape: (9, 2)
    ┌──────┬──────┐
    │ col1 ┆ col2 │
    │ ---  ┆ ---  │
    │ str  ┆ i64  │
    ╞══════╪══════╡
    │ A    ┆ 0    │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    │ B    ┆ 1    │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    │ C    ┆ 2    │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    │ D    ┆ 3    │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    │ ...  ┆ ...  │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    │ F    ┆ 5    │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    │ G    ┆ 6    │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    │ H    ┆ 7    │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    │ I    ┆ 8    │
    └──────┴──────┘
    >>> df.unstack(step=3, how="vertical")
    shape: (3, 6)
    ┌────────┬────────┬────────┬────────┬────────┬────────┐
    │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
    │ ---    ┆ ---    ┆ ---    ┆ ---    ┆ ---    ┆ ---    │
    │ str    ┆ str    ┆ str    ┆ i64    ┆ i64    ┆ i64    │
    ╞════════╪════════╪════════╪════════╪════════╪════════╡
    │ A      ┆ D      ┆ G      ┆ 0      ┆ 3      ┆ 6      │
    ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ B      ┆ E      ┆ H      ┆ 1      ┆ 4      ┆ 7      │
    ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ C      ┆ F      ┆ I      ┆ 2      ┆ 5      ┆ 8      │
    └────────┴────────┴────────┴────────┴────────┴────────┘
    >>> df.unstack(step=3, how="horizontal")
    shape: (3, 6)
    ┌────────┬────────┬────────┬────────┬────────┬────────┐
    │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
    │ ---    ┆ ---    ┆ ---    ┆ ---    ┆ ---    ┆ ---    │
    │ str    ┆ str    ┆ str    ┆ i64    ┆ i64    ┆ i64    │
    ╞════════╪════════╪════════╪════════╪════════╪════════╡
    │ A      ┆ B      ┆ C      ┆ 0      ┆ 1      ┆ 2      │
    ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ D      ┆ E      ┆ F      ┆ 3      ┆ 4      ┆ 5      │
    ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ G      ┆ H      ┆ I      ┆ 6      ┆ 7      ┆ 8      │
    └────────┴────────┴────────┴────────┴────────┴────────┘

    """
    # Validate up front: step == 0 would otherwise surface as a bare
    # ZeroDivisionError and an unknown `how` would silently produce a frame
    # with "horizontal" dimensions but without the matching row reordering.
    if step < 1:
        raise ValueError(f"`step` must be a positive integer, got {step!r}")
    if how not in ("vertical", "horizontal"):
        raise ValueError(
            f"`how` must be one of {{'vertical', 'horizontal'}}, got {how!r}"
        )

    if columns is not None:
        df = self.select(columns)
    else:
        df = self

    height = df.height
    if how == "vertical":
        n_rows = step
        n_cols = math.ceil(height / n_rows)
    else:
        n_cols = step
        n_rows = math.ceil(height / n_cols)

    # number of padding rows needed to make every slice exactly n_rows long
    n_fill = n_cols * n_rows - height

    if n_fill:
        if not isinstance(fill_values, list):
            fill_values = [fill_values for _ in range(df.width)]
        elif len(fill_values) < df.width:
            # zip() below would silently drop the columns without a fill value
            raise ValueError(
                f"`fill_values` has {len(fill_values)} entries, "
                f"but {df.width} columns need to be padded"
            )

        df = df.select(
            [
                s.extend_constant(next_fill, n_fill)
                for s, next_fill in zip(df, fill_values)
            ]
        )

    if how == "horizontal":
        # Group the rows by their position modulo n_cols so that each
        # contiguous slice taken below holds every n_cols-th row.
        # NOTE(review): this relies on `sort` keeping the original order of
        # rows that share the same `__sort_order` value - confirm.
        df = (
            df.with_column(  # type: ignore[assignment]
                (pli.arange(0, n_cols * n_rows, eager=True) % n_cols).alias(
                    "__sort_order"
                ),
            )
            .sort("__sort_order")
            .drop("__sort_order")
        )

    # Zero-pad the slice number in the new column names to the number of
    # digits of n_cols. Counting digits on the integer gives the same width as
    # floor(log10(n_cols)) + 1 for n_cols >= 1, and does not raise when
    # n_cols is 0 (an empty frame unstacked vertically).
    zfill_val = len(str(n_cols))
    slices = [
        s.slice(slice_nbr * n_rows, n_rows).alias(
            s.name + "_" + str(slice_nbr).zfill(zfill_val)
        )
        for s in df
        for slice_nbr in range(n_cols)
    ]

    return self._from_pydf(DataFrame(slices)._df)

@overload
def partition_by(
self: DF,
Expand Down
1 change: 1 addition & 0 deletions py-polars/polars/internals/type_aliases.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
IpcCompression: TypeAlias = Literal["uncompressed", "lz4", "zstd"]
NullBehavior: TypeAlias = Literal["ignore", "drop"]
NullStrategy: TypeAlias = Literal["ignore", "propagate"]
# Layout direction accepted by the `how` parameter of `DataFrame.unstack`.
UnstackDirection: TypeAlias = Literal["vertical", "horizontal"]
ParallelStrategy: TypeAlias = Literal["auto", "columns", "row_groups", "none"]
ParquetCompression: TypeAlias = Literal[
"lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"
Expand Down

0 comments on commit 6a578a5

Please sign in to comment.