Skip to content

Commit

Permalink
Improve fill_null usability (#4324)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego committed Aug 8, 2022
1 parent 796a5ab commit 4dd63dd
Show file tree
Hide file tree
Showing 12 changed files with 115 additions and 113 deletions.
1 change: 1 addition & 0 deletions py-polars/polars/internals/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@
IntoExpr = Union[int, float, str, "pli.Expr", "pli.Series"]

ClosedWindow = Literal["left", "right", "both", "none"]
FillStrategy = Literal["forward", "backward", "min", "max", "mean", "zero", "one"]
InterpolationMethod = Literal["nearest", "higher", "lower", "midpoint", "linear"]
52 changes: 29 additions & 23 deletions py-polars/polars/internals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,11 @@
from typing_extensions import Literal

if TYPE_CHECKING:
from polars.internals.datatypes import ClosedWindow, InterpolationMethod
from polars.internals.datatypes import (
ClosedWindow,
FillStrategy,
InterpolationMethod,
)


def selection_to_pyexpr_list(
Expand Down Expand Up @@ -1880,25 +1884,27 @@ def shift_and_fill(

def fill_null(
self,
fill_value: int | float | bool | str | Expr,
value: Any | None = None,
strategy: FillStrategy | None = None,
limit: int | None = None,
) -> Expr:
"""
Fill null values using a filling strategy, literal, or Expr.
Fill null values using the specified value or strategy.
Parameters
----------
fill_value
One of {"backward", "forward", "min", "max", "mean", "one", "zero"}
or an expression.
value
Value used to fill null values.
strategy : {None, 'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}
Strategy used to fill null values.
limit
The number of consecutive null values to forward/backward fill.
Only valid if ``fill_value`` is 'forward' or 'backward'.
Number of consecutive null values to fill when using the 'forward' or
'backward' strategy.
Examples
--------
>>> df = pl.DataFrame({"a": [1, 2, None], "b": [4, None, 6]})
>>> df.fill_null("zero")
>>> df.fill_null(strategy="zero")
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
Expand Down Expand Up @@ -1926,21 +1932,21 @@ def fill_null(
└─────┴─────┘
"""
# we first must check if it is not an expr, as expr does not implement __bool__
# and thus leads to a value error in the second comparison.
if not isinstance(fill_value, Expr) and fill_value in [
"backward",
"forward",
"min",
"max",
"mean",
"zero",
"one",
]:
return wrap_expr(self._pyexpr.fill_null_with_strategy(fill_value, limit))
if value is not None and strategy is not None:
raise ValueError("cannot specify both 'value' and 'strategy'.")
elif value is None and strategy is None:
raise ValueError("must specify either a fill 'value' or 'strategy'")
elif strategy not in ("forward", "backward") and limit is not None:
raise ValueError(
"can only specify 'limit' when strategy is set to"
" 'backward' or 'forward'"
)

fill_value = expr_to_lit_or_expr(fill_value, str_to_lit=True)
return wrap_expr(self._pyexpr.fill_null(fill_value._pyexpr))
if value is not None:
value = expr_to_lit_or_expr(value, str_to_lit=True)
return wrap_expr(self._pyexpr.fill_null(value._pyexpr))
else:
return wrap_expr(self._pyexpr.fill_null_with_strategy(strategy, limit))

def fill_nan(self, fill_value: str | int | float | bool | Expr) -> Expr:
"""
Expand Down
32 changes: 15 additions & 17 deletions py-polars/polars/internals/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,11 @@
DF = TypeVar("DF", bound="DataFrame")

if TYPE_CHECKING:
from polars.internals.datatypes import ClosedWindow, InterpolationMethod
from polars.internals.datatypes import (
ClosedWindow,
FillStrategy,
InterpolationMethod,
)

# these aliases are used to annotate DataFrame.__getitem__()
# MultiRowSelector indexes into the vertical axis and
Expand Down Expand Up @@ -4387,24 +4391,22 @@ def get_column(self, name: str) -> pli.Series:

def fill_null(
self,
strategy: (
Literal["backward", "forward", "min", "max", "mean", "zero", "one"]
| pli.Expr
| Any
),
value: Any | None = None,
strategy: FillStrategy | None = None,
limit: int | None = None,
) -> DataFrame:
"""
Fill null values using a filling strategy, literal, or Expr.
Fill null values using the specified value or strategy.
Parameters
----------
strategy
One of {'backward', 'forward', 'min', 'max', 'mean', 'zero', 'one'}
or an expression.
value
Value used to fill null values.
strategy : {None, 'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}
Strategy used to fill null values.
limit
The number of consecutive null values to forward/backward fill.
Only valid if ``strategy`` is 'forward' or 'backward'.
Number of consecutive null values to fill when using the 'forward' or
'backward' strategy.
Returns
-------
Expand Down Expand Up @@ -4439,11 +4441,7 @@ def fill_null(
└─────┴──────┘
"""
if isinstance(strategy, pli.Expr):
return self.lazy().fill_null(strategy).collect(no_optimization=True)
if not isinstance(strategy, str):
return self.fill_null(pli.lit(strategy))
return self._from_pydf(self._df.fill_null(strategy, limit))
return self.select(pli.all().fill_null(value, strategy, limit))

def fill_nan(self, fill_value: pli.Expr | int | float) -> DataFrame:
"""
Expand Down
28 changes: 20 additions & 8 deletions py-polars/polars/internals/lazy_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,11 @@
_PYARROW_AVAILABLE = False

if TYPE_CHECKING:
from polars.internals.datatypes import ClosedWindow, InterpolationMethod
from polars.internals.datatypes import (
ClosedWindow,
FillStrategy,
InterpolationMethod,
)


# Used to type any type or subclass of LazyFrame.
Expand Down Expand Up @@ -1981,19 +1985,27 @@ def take_every(self: LDF, n: int) -> LDF:
"""
return self.select(pli.col("*").take_every(n))

def fill_null(self: LDF, fill_value: int | str | pli.Expr) -> LDF:
def fill_null(
self: LDF,
value: Any | None = None,
strategy: FillStrategy | None = None,
limit: int | None = None,
) -> LDF:
"""
Fill missing values with a literal or Expr.
Fill null values using the specified value or strategy.
Parameters
----------
fill_value
Value to fill the missing values with.
value
Value used to fill null values.
strategy : {None, 'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}
Strategy used to fill null values.
limit
Number of consecutive null values to fill when using the 'forward' or
'backward' strategy.
"""
if not isinstance(fill_value, pli.Expr):
fill_value = pli.lit(fill_value)
return self._from_pyldf(self._ldf.fill_null(fill_value._pyexpr))
return self.select(pli.all().fill_null(value, strategy, limit))

def fill_nan(self: LDF, fill_value: int | str | float | pli.Expr) -> LDF:
"""
Expand Down
34 changes: 15 additions & 19 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
from typing_extensions import Literal

if TYPE_CHECKING:
from polars.internals.datatypes import InterpolationMethod
from polars.internals.datatypes import FillStrategy, InterpolationMethod


def get_ffi_func(
Expand Down Expand Up @@ -2599,29 +2599,27 @@ def fill_nan(self, fill_value: str | int | float | bool | pli.Expr) -> Series:

def fill_null(
self,
strategy: (
Literal["backward", "forward", "min", "max", "mean", "zero", "one"]
| pli.Expr
| Any
),
value: Any | None = None,
strategy: FillStrategy | None = None,
limit: int | None = None,
) -> Series:
"""
Fill null values using a filling strategy, literal, or Expr.
Fill null values using the specified value or strategy.
Parameters
----------
strategy
One of {'backward', 'forward', 'min', 'max', 'mean', 'zero', 'one'}
or an expression.
value
Value used to fill null values.
strategy : {None, 'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}
Strategy used to fill null values.
limit
The number of consecutive null values to forward/backward fill.
Only valid if ``strategy`` is 'forward' or 'backward'.
Number of consecutive null values to fill when using the 'forward' or
'backward' strategy.
Examples
--------
>>> s = pl.Series("a", [1, 2, 3, None])
>>> s.fill_null("forward")
>>> s.fill_null(strategy="forward")
shape: (4,)
Series: 'a' [i64]
[
Expand All @@ -2630,7 +2628,7 @@ def fill_null(
3
3
]
>>> s.fill_null("min")
>>> s.fill_null(strategy="min")
shape: (4,)
Series: 'a' [i64]
[
Expand All @@ -2650,11 +2648,9 @@ def fill_null(
]
"""
if not isinstance(strategy, str):
return self.to_frame().select(pli.col(self.name).fill_null(strategy))[
self.name
]
return wrap_s(self._s.fill_null(strategy, limit))
return self.to_frame().select(
pli.col(self.name).fill_null(value, strategy, limit)
)[self.name]

def floor(self) -> Series:
"""
Expand Down
40 changes: 17 additions & 23 deletions py-polars/src/conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -844,30 +844,24 @@ pub(crate) fn dicts_to_rows(records: &PyAny) -> PyResult<(Vec<Row>, Vec<String>)
}

pub(crate) fn parse_strategy(strat: &str, limit: FillNullLimit) -> PyResult<FillNullStrategy> {
if limit.is_some() && strat != "forward" && strat != "backward" {
Err(PyValueError::new_err(
"'limit' argument in 'fill_null' only allowed for {'forward', 'backward'} strategies",
))
} else {
let strat = match strat {
"backward" => FillNullStrategy::Backward(limit),
"forward" => FillNullStrategy::Forward(limit),
"min" => FillNullStrategy::Min,
"max" => FillNullStrategy::Max,
"mean" => FillNullStrategy::Mean,
"zero" => FillNullStrategy::Zero,
"one" => FillNullStrategy::One,
e => {
return Err(PyValueError::new_err(format!(
"strategy must be one of {{'backward', 'forward', 'min', 'max', 'mean', 'zero', 'one'}}, got {}",
e,
)))
}
};

Ok(strat)
}
let strat = match strat {
"forward" => FillNullStrategy::Forward(limit),
"backward" => FillNullStrategy::Backward(limit),
"min" => FillNullStrategy::Min,
"max" => FillNullStrategy::Max,
"mean" => FillNullStrategy::Mean,
"zero" => FillNullStrategy::Zero,
"one" => FillNullStrategy::One,
e => {
return Err(PyValueError::new_err(format!(
"strategy must be one of {{'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}}, got {}",
e,
)))
}
};
Ok(strat)
}

#[cfg(feature = "parquet")]
impl FromPyObject<'_> for Wrap<ParallelStrategy> {
fn extract(ob: &PyAny) -> PyResult<Self> {
Expand Down
8 changes: 1 addition & 7 deletions py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use crate::apply::dataframe::{
apply_lambda_unknown, apply_lambda_with_bool_out_type, apply_lambda_with_primitive_out_type,
apply_lambda_with_utf8_out_type,
};
use crate::conversion::{parse_strategy, ObjectValue, Wrap};
use crate::conversion::{ObjectValue, Wrap};
use crate::file::get_mmap_bytes_reader;
use crate::lazy::dataframe::PyLazyFrame;
use crate::prelude::{dicts_to_rows, str_to_null_strategy};
Expand Down Expand Up @@ -796,12 +796,6 @@ impl PyDataFrame {
format!("{:?}", self.df)
}

pub fn fill_null(&self, strategy: &str, limit: FillNullLimit) -> PyResult<Self> {
let strat = parse_strategy(strategy, limit)?;
let df = self.df.fill_null(strat).map_err(PyPolarsErr::from)?;
Ok(PyDataFrame::new(df))
}

pub fn join(
&self,
other: &PyDataFrame,
Expand Down
5 changes: 0 additions & 5 deletions py-polars/src/lazy/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -600,11 +600,6 @@ impl PyLazyFrame {
ldf.shift_and_fill(periods, fill_value.inner).into()
}

pub fn fill_null(&self, fill_value: PyExpr) -> Self {
let ldf = self.ldf.clone();
ldf.fill_null(fill_value.inner).into()
}

pub fn fill_nan(&self, fill_value: PyExpr) -> Self {
let ldf = self.ldf.clone();
ldf.fill_nan(fill_value.inner).into()
Expand Down
6 changes: 0 additions & 6 deletions py-polars/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -863,12 +863,6 @@ impl PySeries {
self.series.drop_nulls().into()
}

pub fn fill_null(&self, strategy: &str, limit: Option<IdxSize>) -> PyResult<Self> {
let strat = parse_strategy(strategy, limit)?;
let series = self.series.fill_null(strat).map_err(PyPolarsErr::from)?;
Ok(PySeries::new(series))
}

pub fn to_arrow(&mut self) -> PyResult<PyObject> {
self.rechunk(true);
let gil = Python::acquire_gil();
Expand Down
8 changes: 5 additions & 3 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -1548,7 +1548,9 @@ def test_rename_same_name() -> None:
def test_fill_null() -> None:
df = pl.DataFrame({"a": [1, 2], "b": [3, None]})
assert df.fill_null(4).frame_equal(pl.DataFrame({"a": [1, 2], "b": [3, 4]}))
assert df.fill_null("max").frame_equal(pl.DataFrame({"a": [1, 2], "b": [3, 3]}))
assert df.fill_null(strategy="max").frame_equal(
pl.DataFrame({"a": [1, 2], "b": [3, 3]})
)


def test_fill_nan() -> None:
Expand Down Expand Up @@ -1988,8 +1990,8 @@ def test_fill_null_limits() -> None:
}
).select(
[
pl.all().fill_null("forward", limit=2),
pl.all().fill_null("backward", limit=2).suffix("_backward"),
pl.all().fill_null(strategy="forward", limit=2),
pl.all().fill_null(strategy="backward", limit=2).suffix("_backward"),
]
).to_dict(
False
Expand Down

0 comments on commit 4dd63dd

Please sign in to comment.