Improve fill_null usability (#4324)

pola-rs · Aug 8, 2022 · 4dd63dd
1 parent 796a5ab
commit 4dd63dd
Show file tree

Hide file tree

Showing 12 changed files with 115 additions and 113 deletions.
diff --git a/py-polars/polars/internals/datatypes.py b/py-polars/polars/internals/datatypes.py
@@ -13,4 +13,5 @@
 IntoExpr = Union[int, float, str, "pli.Expr", "pli.Series"]
 
 ClosedWindow = Literal["left", "right", "both", "none"]
+FillStrategy = Literal["forward", "backward", "min", "max", "mean", "zero", "one"]
 InterpolationMethod = Literal["nearest", "higher", "lower", "midpoint", "linear"]
diff --git a/py-polars/polars/internals/expr.py b/py-polars/polars/internals/expr.py
@@ -42,7 +42,11 @@
     from typing_extensions import Literal
 
 if TYPE_CHECKING:
-    from polars.internals.datatypes import ClosedWindow, InterpolationMethod
+    from polars.internals.datatypes import (
+        ClosedWindow,
+        FillStrategy,
+        InterpolationMethod,
+    )
 
 
 def selection_to_pyexpr_list(
@@ -1880,25 +1884,27 @@ def shift_and_fill(
 
     def fill_null(
         self,
-        fill_value: int | float | bool | str | Expr,
+        value: Any | None = None,
+        strategy: FillStrategy | None = None,
         limit: int | None = None,
     ) -> Expr:
         """
-        Fill null values using a filling strategy, literal, or Expr.
+        Fill null values using the specified value or strategy.
 
         Parameters
         ----------
-        fill_value
-            One of {"backward", "forward", "min", "max", "mean", "one", "zero"}
-            or an expression.
+        value
+            Value used to fill null values.
+        strategy : {None, 'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}
+            Strategy used to fill null values.
         limit
-            The number of consecutive null values to forward/backward fill.
-            Only valid if ``fill_value`` is 'forward' or 'backward'.
+            Number of consecutive null values to fill when using the 'forward' or
+            'backward' strategy.
 
         Examples
         --------
         >>> df = pl.DataFrame({"a": [1, 2, None], "b": [4, None, 6]})
-        >>> df.fill_null("zero")
+        >>> df.fill_null(strategy="zero")
         shape: (3, 2)
         ┌─────┬─────┐
         │ a   ┆ b   │
@@ -1926,21 +1932,21 @@ def fill_null(
         └─────┴─────┘
 
         """
-        # we first must check if it is not an expr, as expr does not implement __bool__
-        # and thus leads to a value error in the second comparison.
-        if not isinstance(fill_value, Expr) and fill_value in [
-            "backward",
-            "forward",
-            "min",
-            "max",
-            "mean",
-            "zero",
-            "one",
-        ]:
-            return wrap_expr(self._pyexpr.fill_null_with_strategy(fill_value, limit))
+        if value is not None and strategy is not None:
+            raise ValueError("cannot specify both 'value' and 'strategy'.")
+        elif value is None and strategy is None:
+            raise ValueError("must specify either a fill 'value' or 'strategy'")
+        elif strategy not in ("forward", "backward") and limit is not None:
+            raise ValueError(
+                "can only specify 'limit' when strategy is set to"
+                " 'backward' or 'forward'"
+            )
 
-        fill_value = expr_to_lit_or_expr(fill_value, str_to_lit=True)
-        return wrap_expr(self._pyexpr.fill_null(fill_value._pyexpr))
+        if value is not None:
+            value = expr_to_lit_or_expr(value, str_to_lit=True)
+            return wrap_expr(self._pyexpr.fill_null(value._pyexpr))
+        else:
+            return wrap_expr(self._pyexpr.fill_null_with_strategy(strategy, limit))
 
     def fill_nan(self, fill_value: str | int | float | bool | Expr) -> Expr:
         """

diff --git a/py-polars/polars/internals/frame.py b/py-polars/polars/internals/frame.py
@@ -108,7 +108,11 @@
 DF = TypeVar("DF", bound="DataFrame")
 
 if TYPE_CHECKING:
-    from polars.internals.datatypes import ClosedWindow, InterpolationMethod
+    from polars.internals.datatypes import (
+        ClosedWindow,
+        FillStrategy,
+        InterpolationMethod,
+    )
 
     # these aliases are used to annotate DataFrame.__getitem__()
     # MultiRowSelector indexes into the vertical axis and
@@ -4387,24 +4391,22 @@ def get_column(self, name: str) -> pli.Series:
 
     def fill_null(
         self,
-        strategy: (
-            Literal["backward", "forward", "min", "max", "mean", "zero", "one"]
-            | pli.Expr
-            | Any
-        ),
+        value: Any | None = None,
+        strategy: FillStrategy | None = None,
         limit: int | None = None,
     ) -> DataFrame:
         """
-        Fill null values using a filling strategy, literal, or Expr.
+        Fill null values using the specified value or strategy.
 
         Parameters
         ----------
-        strategy
-            One of {'backward', 'forward', 'min', 'max', 'mean', 'zero', 'one'}
-            or an expression.
+        value
+            Value used to fill null values.
+        strategy : {None, 'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}
+            Strategy used to fill null values.
         limit
-            The number of consecutive null values to forward/backward fill.
-            Only valid if ``strategy`` is 'forward' or 'backward'.
+            Number of consecutive null values to fill when using the 'forward' or
+            'backward' strategy.
 
         Returns
         -------
@@ -4439,11 +4441,7 @@ def fill_null(
         └─────┴──────┘
 
         """
-        if isinstance(strategy, pli.Expr):
-            return self.lazy().fill_null(strategy).collect(no_optimization=True)
-        if not isinstance(strategy, str):
-            return self.fill_null(pli.lit(strategy))
-        return self._from_pydf(self._df.fill_null(strategy, limit))
+        return self.select(pli.all().fill_null(value, strategy, limit))
 
     def fill_nan(self, fill_value: pli.Expr | int | float) -> DataFrame:
         """

diff --git a/py-polars/polars/internals/lazy_frame.py b/py-polars/polars/internals/lazy_frame.py
@@ -50,7 +50,11 @@
     _PYARROW_AVAILABLE = False
 
 if TYPE_CHECKING:
-    from polars.internals.datatypes import ClosedWindow, InterpolationMethod
+    from polars.internals.datatypes import (
+        ClosedWindow,
+        FillStrategy,
+        InterpolationMethod,
+    )
 
 
 # Used to type any type or subclass of LazyFrame.
@@ -1981,19 +1985,27 @@ def take_every(self: LDF, n: int) -> LDF:
         """
         return self.select(pli.col("*").take_every(n))
 
-    def fill_null(self: LDF, fill_value: int | str | pli.Expr) -> LDF:
+    def fill_null(
+        self: LDF,
+        value: Any | None = None,
+        strategy: FillStrategy | None = None,
+        limit: int | None = None,
+    ) -> LDF:
         """
-        Fill missing values with a literal or Expr.
+        Fill null values using the specified value or strategy.
 
         Parameters
         ----------
-        fill_value
-            Value to fill the missing values with.
+        value
+            Value used to fill null values.
+        strategy : {None, 'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}
+            Strategy used to fill null values.
+        limit
+            Number of consecutive null values to fill when using the 'forward' or
+            'backward' strategy.
 
         """
-        if not isinstance(fill_value, pli.Expr):
-            fill_value = pli.lit(fill_value)
-        return self._from_pyldf(self._ldf.fill_null(fill_value._pyexpr))
+        return self.select(pli.all().fill_null(value, strategy, limit))
 
     def fill_nan(self: LDF, fill_value: int | str | float | pli.Expr) -> LDF:
         """

diff --git a/py-polars/polars/internals/series.py b/py-polars/polars/internals/series.py
@@ -87,7 +87,7 @@
     from typing_extensions import Literal
 
 if TYPE_CHECKING:
-    from polars.internals.datatypes import InterpolationMethod
+    from polars.internals.datatypes import FillStrategy, InterpolationMethod
 
 
 def get_ffi_func(
@@ -2599,29 +2599,27 @@ def fill_nan(self, fill_value: str | int | float | bool | pli.Expr) -> Series:
 
     def fill_null(
         self,
-        strategy: (
-            Literal["backward", "forward", "min", "max", "mean", "zero", "one"]
-            | pli.Expr
-            | Any
-        ),
+        value: Any | None = None,
+        strategy: FillStrategy | None = None,
         limit: int | None = None,
     ) -> Series:
         """
-        Fill null values using a filling strategy, literal, or Expr.
+        Fill null values using the specified value or strategy.
 
         Parameters
         ----------
-        strategy
-            One of {'backward', 'forward', 'min', 'max', 'mean', 'zero', 'one'}
-            or an expression.
+        value
+            Value used to fill null values.
+        strategy : {None, 'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}
+            Strategy used to fill null values.
         limit
-            The number of consecutive null values to forward/backward fill.
-            Only valid if ``strategy`` is 'forward' or 'backward'.
+            Number of consecutive null values to fill when using the 'forward' or
+            'backward' strategy.
 
         Examples
         --------
         >>> s = pl.Series("a", [1, 2, 3, None])
-        >>> s.fill_null("forward")
+        >>> s.fill_null(strategy="forward")
         shape: (4,)
         Series: 'a' [i64]
         [
@@ -2630,7 +2628,7 @@ def fill_null(
             3
             3
         ]
-        >>> s.fill_null("min")
+        >>> s.fill_null(strategy="min")
         shape: (4,)
         Series: 'a' [i64]
         [
@@ -2650,11 +2648,9 @@ def fill_null(
         ]
 
         """
-        if not isinstance(strategy, str):
-            return self.to_frame().select(pli.col(self.name).fill_null(strategy))[
-                self.name
-            ]
-        return wrap_s(self._s.fill_null(strategy, limit))
+        return self.to_frame().select(
+            pli.col(self.name).fill_null(value, strategy, limit)
+        )[self.name]
 
     def floor(self) -> Series:
         """

diff --git a/py-polars/src/conversion.rs b/py-polars/src/conversion.rs
@@ -844,30 +844,24 @@ pub(crate) fn dicts_to_rows(records: &PyAny) -> PyResult<(Vec<Row>, Vec<String>)
 }
 
 pub(crate) fn parse_strategy(strat: &str, limit: FillNullLimit) -> PyResult<FillNullStrategy> {
-    if limit.is_some() && strat != "forward" && strat != "backward" {
-        Err(PyValueError::new_err(
-            "'limit' argument in 'fill_null' only allowed for {'forward', 'backward'} strategies",
-        ))
-    } else {
-        let strat = match strat {
-            "backward" => FillNullStrategy::Backward(limit),
-            "forward" => FillNullStrategy::Forward(limit),
-            "min" => FillNullStrategy::Min,
-            "max" => FillNullStrategy::Max,
-            "mean" => FillNullStrategy::Mean,
-            "zero" => FillNullStrategy::Zero,
-            "one" => FillNullStrategy::One,
-            e => {
-                return Err(PyValueError::new_err(format!(
-                    "strategy must be one of {{'backward', 'forward', 'min', 'max', 'mean', 'zero', 'one'}}, got {}",
-                    e,
-                )))
-            }
-        };
-
-        Ok(strat)
-    }
+    let strat = match strat {
+        "forward" => FillNullStrategy::Forward(limit),
+        "backward" => FillNullStrategy::Backward(limit),
+        "min" => FillNullStrategy::Min,
+        "max" => FillNullStrategy::Max,
+        "mean" => FillNullStrategy::Mean,
+        "zero" => FillNullStrategy::Zero,
+        "one" => FillNullStrategy::One,
+        e => {
+            return Err(PyValueError::new_err(format!(
+                "strategy must be one of {{'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}}, got {}",
+                e,
+            )))
+        }
+    };
+    Ok(strat)
 }
+
 #[cfg(feature = "parquet")]
 impl FromPyObject<'_> for Wrap<ParallelStrategy> {
     fn extract(ob: &PyAny) -> PyResult<Self> {

diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs
@@ -12,7 +12,7 @@ use crate::apply::dataframe::{
     apply_lambda_unknown, apply_lambda_with_bool_out_type, apply_lambda_with_primitive_out_type,
     apply_lambda_with_utf8_out_type,
 };
-use crate::conversion::{parse_strategy, ObjectValue, Wrap};
+use crate::conversion::{ObjectValue, Wrap};
 use crate::file::get_mmap_bytes_reader;
 use crate::lazy::dataframe::PyLazyFrame;
 use crate::prelude::{dicts_to_rows, str_to_null_strategy};
@@ -796,12 +796,6 @@ impl PyDataFrame {
         format!("{:?}", self.df)
     }
 
-    pub fn fill_null(&self, strategy: &str, limit: FillNullLimit) -> PyResult<Self> {
-        let strat = parse_strategy(strategy, limit)?;
-        let df = self.df.fill_null(strat).map_err(PyPolarsErr::from)?;
-        Ok(PyDataFrame::new(df))
-    }
-
     pub fn join(
         &self,
         other: &PyDataFrame,

diff --git a/py-polars/src/lazy/dataframe.rs b/py-polars/src/lazy/dataframe.rs
@@ -600,11 +600,6 @@ impl PyLazyFrame {
         ldf.shift_and_fill(periods, fill_value.inner).into()
     }
 
-    pub fn fill_null(&self, fill_value: PyExpr) -> Self {
-        let ldf = self.ldf.clone();
-        ldf.fill_null(fill_value.inner).into()
-    }
-
     pub fn fill_nan(&self, fill_value: PyExpr) -> Self {
         let ldf = self.ldf.clone();
         ldf.fill_nan(fill_value.inner).into()

diff --git a/py-polars/src/series.rs b/py-polars/src/series.rs
@@ -863,12 +863,6 @@ impl PySeries {
         self.series.drop_nulls().into()
     }
 
-    pub fn fill_null(&self, strategy: &str, limit: Option<IdxSize>) -> PyResult<Self> {
-        let strat = parse_strategy(strategy, limit)?;
-        let series = self.series.fill_null(strat).map_err(PyPolarsErr::from)?;
-        Ok(PySeries::new(series))
-    }
-
     pub fn to_arrow(&mut self) -> PyResult<PyObject> {
         self.rechunk(true);
         let gil = Python::acquire_gil();

diff --git a/py-polars/tests/test_df.py b/py-polars/tests/test_df.py
@@ -1548,7 +1548,9 @@ def test_rename_same_name() -> None:
 def test_fill_null() -> None:
     df = pl.DataFrame({"a": [1, 2], "b": [3, None]})
     assert df.fill_null(4).frame_equal(pl.DataFrame({"a": [1, 2], "b": [3, 4]}))
-    assert df.fill_null("max").frame_equal(pl.DataFrame({"a": [1, 2], "b": [3, 3]}))
+    assert df.fill_null(strategy="max").frame_equal(
+        pl.DataFrame({"a": [1, 2], "b": [3, 3]})
+    )
 
 
 def test_fill_nan() -> None:
@@ -1988,8 +1990,8 @@ def test_fill_null_limits() -> None:
         }
     ).select(
         [
-            pl.all().fill_null("forward", limit=2),
-            pl.all().fill_null("backward", limit=2).suffix("_backward"),
+            pl.all().fill_null(strategy="forward", limit=2),
+            pl.all().fill_null(strategy="backward", limit=2).suffix("_backward"),
         ]
     ).to_dict(
         False