Skip to content

Commit

Permalink
add pl.cut utility (#4137)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jul 24, 2022
1 parent 75486c0 commit 3668df5
Show file tree
Hide file tree
Showing 9 changed files with 326 additions and 202 deletions.
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Eager/Lazy functions

arg_where
concat
cut
date_range
get_dummies
repeat
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def version() -> str:
)
from polars.internals.expr import Expr
from polars.internals.frame import DataFrame, wrap_df # TODO: remove need for wrap_df
from polars.internals.functions import concat, date_range, get_dummies
from polars.internals.functions import concat, cut, date_range, get_dummies
from polars.internals.io import read_ipc_schema, read_parquet_schema
from polars.internals.lazy_frame import LazyFrame
from polars.internals.lazy_functions import _date as date
Expand Down
1 change: 1 addition & 0 deletions py-polars/polars/internals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
col,
concat_list,
element,
format,
lit,
select,
)
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/internals/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
sequence_to_pydf,
series_to_pydf,
)
from polars.internals.functions import PolarsSlice
from polars.internals.slice import PolarsSlice
from polars.utils import (
_prepare_row_count_args,
_process_null_values,
Expand Down
287 changes: 89 additions & 198 deletions py-polars/polars/internals/functions.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from __future__ import annotations

from datetime import date, datetime, timedelta
from typing import Sequence, Union, overload
from typing import Optional, Sequence, overload

from polars import internals as pli
from polars.datatypes import Date
from polars.datatypes import Categorical, Date, Float64
from polars.utils import (
_datetime_to_pl_timestamp,
_timedelta_to_pl_duration,
Expand Down Expand Up @@ -264,208 +264,99 @@ def date_range(
return dt_range


FrameOrSeries = Union["pli.DataFrame", "pli.Series"]


class PolarsSlice:
"""
Apply python slice object to Polars DataFrame or Series,
with full support for negative indexing and/or stride.
def cut(
s: pli.Series,
bins: list[float],
labels: Optional[list[str]] = None,
break_point_label: str = "break_point",
category_label: str = "category",
) -> pli.DataFrame:
"""
Bin values into discrete values

stop: int
start: int
stride: int
slice_length: int
is_unbounded: bool
obj: FrameOrSeries

def __init__(self, obj: FrameOrSeries):
self.obj = obj

@staticmethod
def _as_original(lazy: "pli.LazyFrame", original: FrameOrSeries) -> FrameOrSeries:
"""
Return lazy variant back to its original type.
"""
frame = lazy.collect()
return frame if isinstance(original, pli.DataFrame) else frame.to_series()

@staticmethod
def _lazify(obj: FrameOrSeries) -> "pli.LazyFrame":
"""
Make lazy to ensure efficent/consistent handling.
"""
return obj.lazy() if isinstance(obj, pli.DataFrame) else obj.to_frame().lazy()

def _slice_positive(self, obj: "pli.LazyFrame") -> "pli.LazyFrame":
"""
Logic for slices with positive stride.
"""
# note: at this point stride is guaranteed to be > 1
return obj.slice(self.start, self.slice_length).take_every(self.stride)

def _slice_negative(self, obj: "pli.LazyFrame") -> "pli.LazyFrame":
"""
Logic for slices with negative stride.
"""
stride = abs(self.stride)
lazyslice = obj.slice(self.stop + 1, self.slice_length).reverse()
return lazyslice.take_every(stride) if (stride > 1) else lazyslice

def _slice_setup(self, s: slice) -> None:
"""
Normalise slice bounds, identify unbounded and/or zero-length slices.
"""
# can normalise slice indices as we know object size
obj_len = len(self.obj)
start, stop, stride = slice(s.start, s.stop, s.step).indices(obj_len)

# check if slice is actually unbounded
if stride >= 1:
self.is_unbounded = (start <= 0) and (stop >= obj_len)
else:
self.is_unbounded = (stop == -1) and (start >= obj_len - 1)

# determine slice length
if self.obj.is_empty():
self.slice_length = 0
elif self.is_unbounded:
self.slice_length = obj_len
else:
self.slice_length = (
0
if (
(start == stop)
or (stride > 0 and start > stop)
or (stride < 0 and start < stop)
)
else abs(stop - start)
)
self.start, self.stop, self.stride = start, stop, stride

def apply(self, s: slice) -> FrameOrSeries:
"""
Apply a slice operation, taking advantage of any potential fast paths.
"""
# normalise slice
self._slice_setup(s)
.. warning::
This function is experimental and might change without it being considered a breaking change.

# check for fast-paths / single-operation calls
if self.slice_length == 0:
return self.obj.cleared()

elif self.is_unbounded and self.stride in (-1, 1):
return self.obj.reverse() if (self.stride < 0) else self.obj.clone()

elif self.start >= 0 and self.stop >= 0 and self.stride == 1:
return self.obj.slice(self.start, self.slice_length)
Parameters
----------
s
Series to bin.
bins
Bins to create.
labels
Labels to assign to the bins. If given the length of labels must be len(bins) + 1.
break_point_label
Name given to the breakpoint column.
category_label
Name given to the category column.

elif self.stride < 0 and self.slice_length == 1:
return self.obj.slice(self.stop + 1, 1)
else:
# multi-operation calls; make lazy
lazyobj = self._lazify(self.obj)
sliced = (
self._slice_positive(lazyobj)
if self.stride > 0
else self._slice_negative(lazyobj)
)
return self._as_original(sliced, self.obj)
Returns
-------
DataFrame

Examples
--------
>>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)])
>>> pl.cut(a, bins=[-1, 1])
shape: (12, 3)
┌──────┬─────────────┬──────────────┐
│ a ┆ break_point ┆ category │
│ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ cat │
╞══════╪═════════════╪══════════════╡
│ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1.5 ┆ inf ┆ (1.0, inf] │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2.0 ┆ inf ┆ (1.0, inf] │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2.5 ┆ inf ┆ (1.0, inf] │
└──────┴─────────────┴──────────────┘

class LazyPolarsSlice:
"""
Apply python slice object to Polars LazyFrame. Only slices with efficient
computation paths mapping directly to existing lazy methods are supported.
"""
var_nm = s.name

cuts_df = pli.DataFrame(
[
pli.Series(
name=break_point_label, values=bins, dtype=Float64
).extend_constant(float("inf"), 1)
]
)

if labels:
if len(labels) != len(bins) + 1:
raise ValueError("expected more labels")
cuts_df = cuts_df.with_column(pli.Series(name=category_label, values=labels))
else:
cuts_df = cuts_df.with_column(
pli.format(
"({}, {}]",
pli.col(break_point_label).shift_and_fill(1, float("-inf")),
pli.col(break_point_label),
).alias(category_label)
)

cuts_df = cuts_df.with_column(pli.col(category_label).cast(Categorical))

obj: "pli.LazyFrame"

def __init__(self, obj: "pli.LazyFrame"):
self.obj = obj

def apply(self, s: slice) -> "pli.LazyFrame":
"""
Apply a slice operation. Note that LazyFrame is designed primarily for efficient
computation and does not know its own length so, unlike DataFrame, certain slice
patterns (such as those requiring negative stop/step) may not be supported.
"""
start = s.start or 0
step = s.step or 1

# fail on operations that require length to do efficiently
if s.stop and s.stop < 0:
raise ValueError("Negative stop is not supported for lazy slices")
if step < 0 and (start > 0 or s.stop is not None) and (start != s.stop):
if not (start > 0 > step and s.stop is None):
raise ValueError(
"Negative stride is not supported in conjunction with start+stop"
)

# ---------------------------------------
# empty slice patterns.
# ---------------------------------------
# [:0]
# [i:<=i]
# [i:>=i:-k]
if step > 0 and (s.stop is not None and start >= s.stop):
return self.obj.cleared()
elif step < 0 and (s.stop is not None and s.stop >= s.start >= 0):
return self.obj.cleared()

# ---------------------------------------
# straight-though mappings for "reverse"
# and/or "take_every"
# ---------------------------------------
# [:] => clone()
# [::k] => take_every(k),
# [::-1] => reverse(),
# [::-k] => reverse().take_every(abs(k))
elif start == 0 and s.stop is None:
if step == 1:
return self.obj.clone()
elif step > 1:
return self.obj.take_every(step)
elif step == -1:
return self.obj.reverse()
elif step < -1:
return self.obj.reverse().take_every(abs(step))

elif start > 0 > step and s.stop is None:
obj = self.obj.head(s.start + 1).reverse()
return obj if (abs(step) == 1) else obj.take_every(abs(step))

# ---------------------------------------
# straight-though mappings for "head"
# ---------------------------------------
# [:j] => head(j)
# [:j:k] => head(j).take_every(k)
elif start == 0 and (s.stop or 0) >= 1:
obj = self.obj.head(s.stop)
return obj if (step == 1) else obj.take_every(step)

# ---------------------------------------
# straight-though mappings for "tail"
# ---------------------------------------
# [-i:] => tail(abs(i))
# [-i::k] => tail(abs(i)).take_every(k)
elif start < 0 and s.stop is None:
obj = self.obj.tail(abs(start))
return obj if (step == 1) else obj.take_every(step)

# ---------------------------------------
# straight-though mappings for "slice"
# ---------------------------------------
# [i:] => slice(i)
# [i:j] => slice(i,j-i)
# [i:j:k] => slice(i,j-i).take_every(k)
elif start > 0 and (s.stop is None or s.stop >= 0):
slice_length = None if (s.stop is None) else (s.stop - start)
obj = self.obj.slice(start, slice_length)
return obj if (step == 1) else obj.take_every(step)

raise ValueError(
f"The given slice {s} is not supported by lazy computation; consider a "
f"more efficient approach, or construct explicitly with other methods"
result = (
s.sort()
.to_frame()
.join_asof(
cuts_df,
left_on=var_nm,
right_on=break_point_label,
strategy="forward",
)
)
return result
2 changes: 1 addition & 1 deletion py-polars/polars/internals/lazy_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from polars import internals as pli
from polars.cfg import Config
from polars.datatypes import DataType, py_type_to_dtype
from polars.internals.functions import LazyPolarsSlice
from polars.internals.slice import LazyPolarsSlice
from polars.utils import (
_in_notebook,
_prepare_row_count_args,
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
sequence_to_pyseries,
series_to_pyseries,
)
from polars.internals.functions import PolarsSlice
from polars.internals.slice import PolarsSlice
from polars.utils import (
_date_to_pl_date,
_datetime_to_pl_timestamp,
Expand Down

0 comments on commit 3668df5

Please sign in to comment.