feat(python): Add "ignore_spaces" to alpha and alphanumeric selec…

…tors, add "ascii_only" to `digit` (#16362)
pola-rs · May 23, 2024 · ceb895e · ceb895e
1 parent 6b1e893
commit ceb895e
Show file tree

Hide file tree

Showing 2 changed files with 205 additions and 50 deletions.
diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py
@@ -3,7 +3,16 @@
 from datetime import timezone
 from functools import reduce
 from operator import or_
-from typing import TYPE_CHECKING, Any, Collection, Literal, Mapping, Sequence, overload
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Collection,
+    Literal,
+    Mapping,
+    NoReturn,
+    Sequence,
+    overload,
+)
 
 from polars import functions as F
 from polars._utils.deprecation import deprecate_nonkeyword_arguments
@@ -71,7 +80,7 @@ def is_selector(obj: Any) -> bool:
 
 def expand_selector(
     target: DataFrame | LazyFrame | Mapping[str, PolarsDataType],
-    selector: SelectorType,
+    selector: SelectorType | Expr,
 ) -> tuple[str, ...]:
     """
     Expand a selector to column names with respect to a specific frame or schema target.
@@ -267,9 +276,9 @@ def __hash__(self) -> int:
 
     def __invert__(self) -> Self:
         """Invert the selector."""
-        if hasattr(self, "_attrs"):
+        if is_selector(self):
             inverted = all() - self
-            inverted._repr_override = f"~{self!r}"  # type: ignore[attr-defined]
+            inverted._repr_override = f"~{self!r}"
         else:
             inverted = ~self.as_expr()
         return inverted  # type: ignore[return-value]
@@ -292,7 +301,13 @@ def __repr__(self) -> str:
                 ).rstrip(",")
                 return f"cs.{selector_name}({str_params})"
 
-    def __sub__(self, other: Any) -> SelectorType | Expr:  # type: ignore[override]
+    @overload  # type: ignore[override]
+    def __sub__(self, other: SelectorType) -> SelectorType: ...
+
+    @overload
+    def __sub__(self, other: Any) -> SelectorType | Expr: ...
+
+    def __sub__(self, other: Any) -> Expr:
         if is_selector(other):
             return _selector_proxy_(
                 self.meta._as_selector().meta._selector_sub(other),
@@ -302,11 +317,17 @@ def __sub__(self, other: Any) -> SelectorType | Expr:  # type: ignore[override]
         else:
             return self.as_expr().__sub__(other)
 
-    def __rsub__(self, other: Any) -> SelectorType | Expr:  # type: ignore[override]
+    def __rsub__(self, other: Any) -> NoReturn:
         msg = "unsupported operand type(s) for op: ('Expr' - 'Selector')"
         raise TypeError(msg)
 
-    def __and__(self, other: Any) -> SelectorType | Expr:  # type: ignore[override]
+    @overload  # type: ignore[override]
+    def __and__(self, other: SelectorType) -> SelectorType: ...
+
+    @overload
+    def __and__(self, other: Any) -> Expr: ...
+
+    def __and__(self, other: Any) -> SelectorType | Expr:
         if is_column(other):
             colname = other.meta.output_name()
             if self._attrs["name"] == "by_name" and (
@@ -323,7 +344,13 @@ def __and__(self, other: Any) -> SelectorType | Expr:  # type: ignore[override]
         else:
             return self.as_expr().__and__(other)
 
-    def __or__(self, other: Any) -> SelectorType | Expr:  # type: ignore[override]
+    @overload  # type: ignore[override]
+    def __or__(self, other: SelectorType) -> SelectorType: ...
+
+    @overload
+    def __or__(self, other: Any) -> Expr: ...
+
+    def __or__(self, other: Any) -> SelectorType | Expr:
         if is_column(other):
             other = by_name(other.meta.output_name())
         if is_selector(other):
@@ -335,7 +362,7 @@ def __or__(self, other: Any) -> SelectorType | Expr:  # type: ignore[override]
         else:
             return self.as_expr().__or__(other)
 
-    def __rand__(self, other: Any) -> SelectorType | Expr:  # type: ignore[override]
+    def __rand__(self, other: Any) -> Expr:  # type: ignore[override]
         if is_column(other):
             colname = other.meta.output_name()
             if self._attrs["name"] == "by_name" and (
@@ -345,7 +372,7 @@ def __rand__(self, other: Any) -> SelectorType | Expr:  # type: ignore[override]
             other = by_name(colname)
         return self.as_expr().__rand__(other)
 
-    def __ror__(self, other: Any) -> SelectorType | Expr:  # type: ignore[override]
+    def __ror__(self, other: Any) -> Expr:  # type: ignore[override]
         if is_column(other):
             other = by_name(other.meta.output_name())
         return self.as_expr().__ror__(other)
@@ -465,21 +492,25 @@ def all() -> SelectorType:
     return _selector_proxy_(F.all(), name="all")
 
 
-def alpha(ascii_only: bool = False) -> SelectorType:  # noqa: FBT001
-    """
+def alpha(ascii_only: bool = False, *, ignore_spaces: bool = False) -> SelectorType:  # noqa: FBT001
+    r"""
     Select all columns with alphabetic names (eg: only letters).
 
     Parameters
     ----------
     ascii_only
         Indicate whether to consider only ASCII alphabetic characters, or the full
         Unicode range of valid letters (accented, idiographic, etc).
+    ignore_spaces
+        Indicate whether to ignore the presence of spaces in column names; if so,
+        only the other (non-space) characters are considered.
 
     Notes
     -----
     Matching column names cannot contain *any* non-alphabetic characters. Note
     that the definition of "alphabetic" consists of all valid Unicode alphabetic
-    characters by default; this can be changed by setting `ascii_only=True`.
+    characters (`\p{Alphabetic}`) by default; this can be changed by setting
+    `ascii_only=True`.
 
     Examples
     --------
@@ -489,7 +520,7 @@ def alpha(ascii_only: bool = False) -> SelectorType:  # noqa: FBT001
     ...     {
     ...         "no1": [100, 200, 300],
     ...         "café": ["espresso", "latte", "mocha"],
-    ...         "t/f": [True, False, None],
+    ...         "t or f": [True, False, None],
     ...         "hmm": ["aaa", "bbb", "ccc"],
     ...         "都市": ["東京", "大阪", "京都"],
     ...     }
@@ -524,53 +555,88 @@ def alpha(ascii_only: bool = False) -> SelectorType:  # noqa: FBT001
     │ ccc │
     └─────┘
 
+    >>> df.select(cs.alpha(ascii_only=True, ignore_spaces=True))
+    shape: (3, 2)
+    ┌────────┬─────┐
+    │ t or f ┆ hmm │
+    │ ---    ┆ --- │
+    │ bool   ┆ str │
+    ╞════════╪═════╡
+    │ true   ┆ aaa │
+    │ false  ┆ bbb │
+    │ null   ┆ ccc │
+    └────────┴─────┘
+
     Select all columns *except* for those with alphabetic names:
 
     >>> df.select(~cs.alpha())
     shape: (3, 2)
-    ┌─────┬───────┐
-    │ no1 ┆ t/f   │
-    │ --- ┆ ---   │
-    │ i64 ┆ bool  │
-    ╞═════╪═══════╡
-    │ 100 ┆ true  │
-    │ 200 ┆ false │
-    │ 300 ┆ null  │
-    └─────┴───────┘
+    ┌─────┬────────┐
+    │ no1 ┆ t or f │
+    │ --- ┆ ---    │
+    │ i64 ┆ bool   │
+    ╞═════╪════════╡
+    │ 100 ┆ true   │
+    │ 200 ┆ false  │
+    │ 300 ┆ null   │
+    └─────┴────────┘
+
+    >>> df.select(~cs.alpha(ignore_spaces=True))
+    shape: (3, 1)
+    ┌─────┐
+    │ no1 │
+    │ --- │
+    │ i64 │
+    ╞═════╡
+    │ 100 │
+    │ 200 │
+    │ 300 │
+    └─────┘
     """
+    # note that we need to supply a pattern compatible with the *rust* regex crate
     re_alpha = r"a-zA-Z" if ascii_only else r"\p{Alphabetic}"
+    re_space = " " if ignore_spaces else ""
     return _selector_proxy_(
-        F.col(f"^[{re_alpha}]+$"),
+        F.col(f"^[{re_alpha}{re_space}]+$"),
         name="alpha",
-        parameters={"ascii_only": ascii_only},
+        parameters={"ascii_only": ascii_only, "ignore_spaces": ignore_spaces},
     )
 
 
-def alphanumeric(ascii_only: bool = False) -> SelectorType:  # noqa: FBT001
-    """
+def alphanumeric(
+    ascii_only: bool = False,  # noqa: FBT001
+    *,
+    ignore_spaces: bool = False,
+) -> SelectorType:
+    r"""
     Select all columns with alphanumeric names (eg: only letters and the digits 0-9).
 
     Parameters
     ----------
     ascii_only
         Indicate whether to consider only ASCII alphabetic characters, or the full
         Unicode range of valid letters (accented, idiographic, etc).
+    ignore_spaces
+        Indicate whether to ignore the presence of spaces in column names; if so,
+        only the other (non-space) characters are considered.
 
     Notes
     -----
     Matching column names cannot contain *any* non-alphabetic or integer characters.
     Note that the definition of "alphabetic" consists of all valid Unicode alphabetic
-    characters by default; this can be changed by setting `ascii_only=True`.
+    characters (`\p{Alphabetic}`) and digit characters (`\d`) by default; this
+    can be changed by setting `ascii_only=True`.
 
     Examples
     --------
     >>> import polars as pl
     >>> import polars.selectors as cs
     >>> df = pl.DataFrame(
     ...     {
-    ...         "1st_value": [100, 200, 300],
+    ...         "1st_col": [100, 200, 300],
     ...         "flagged": [True, False, True],
     ...         "00prefix": ["01:aa", "02:bb", "03:cc"],
+    ...         "last col": ["x", "y", "z"],
     ...     }
     ... )
 
@@ -588,25 +654,52 @@ def alphanumeric(ascii_only: bool = False) -> SelectorType:  # noqa: FBT001
     │ true    ┆ 03:cc    │
     └─────────┴──────────┘
 
+    >>> df.select(cs.alphanumeric(ignore_spaces=True))
+    shape: (3, 3)
+    ┌─────────┬──────────┬──────────┐
+    │ flagged ┆ 00prefix ┆ last col │
+    │ ---     ┆ ---      ┆ ---      │
+    │ bool    ┆ str      ┆ str      │
+    ╞═════════╪══════════╪══════════╡
+    │ true    ┆ 01:aa    ┆ x        │
+    │ false   ┆ 02:bb    ┆ y        │
+    │ true    ┆ 03:cc    ┆ z        │
+    └─────────┴──────────┴──────────┘
+
     Select all columns *except* for those with alphanumeric names:
 
     >>> df.select(~cs.alphanumeric())
+    shape: (3, 2)
+    ┌─────────┬──────────┐
+    │ 1st_col ┆ last col │
+    │ ---     ┆ ---      │
+    │ i64     ┆ str      │
+    ╞═════════╪══════════╡
+    │ 100     ┆ x        │
+    │ 200     ┆ y        │
+    │ 300     ┆ z        │
+    └─────────┴──────────┘
+
+    >>> df.select(~cs.alphanumeric(ignore_spaces=True))
     shape: (3, 1)
-    ┌───────────┐
-    │ 1st_value │
-    │ ---       │
-    │ i64       │
-    ╞═══════════╡
-    │ 100       │
-    │ 200       │
-    │ 300       │
-    └───────────┘
-    """
+    ┌─────────┐
+    │ 1st_col │
+    │ ---     │
+    │ i64     │
+    ╞═════════╡
+    │ 100     │
+    │ 200     │
+    │ 300     │
+    └─────────┘
+    """
+    # note that we need to supply patterns compatible with the *rust* regex crate
     re_alpha = r"a-zA-Z" if ascii_only else r"\p{Alphabetic}"
+    re_digit = "0-9" if ascii_only else r"\d"
+    re_space = " " if ignore_spaces else ""
     return _selector_proxy_(
-        F.col(f"^[{re_alpha}0-9]+$"),
+        F.col(f"^[{re_alpha}{re_digit}{re_space}]+$"),
         name="alphanumeric",
-        parameters={"ascii_only": ascii_only},
+        parameters={"ascii_only": ascii_only, "ignore_spaces": ignore_spaces},
     )
 
 
@@ -1363,9 +1456,15 @@ def decimal() -> SelectorType:
     return _selector_proxy_(F.col(Decimal), name="decimal")
 
 
-def digit() -> SelectorType:
-    """
-    Select all columns having names consisting only of digits (eg: integer 0-9).
+def digit(ascii_only: bool = False) -> SelectorType:  # noqa: FBT001
+    r"""
+    Select all columns having names consisting only of digits.
+
+    Notes
+    -----
+    Matching column names cannot contain *any* non-digit characters. Note that the
+    definition of "digit" consists of all valid Unicode digit characters (`\d`)
+    by default; this can be changed by setting `ascii_only=True`.
 
     Examples
     --------
@@ -1419,8 +1518,33 @@ def digit() -> SelectorType:
     │ aaa │
     │ bbb │
     └─────┘
+
+    Demonstrate use of `ascii_only` flag (by default all valid unicode digits
+    are considered, but this can be constrained to ascii 0-9):
+
+    >>> df = pl.DataFrame({"१९९९": [1999], "२०७७": [2077], "3000": [3000]})
+    >>> df.select(cs.digit())
+    shape: (1, 3)
+    ┌──────┬──────┬──────┐
+    │ १९९९ ┆ २०७७ ┆ 3000 │
+    │ ---  ┆ ---  ┆ ---  │
+    │ i64  ┆ i64  ┆ i64  │
+    ╞══════╪══════╪══════╡
+    │ 1999 ┆ 2077 ┆ 3000 │
+    └──────┴──────┴──────┘
+
+    >>> df.select(cs.digit(ascii_only=True))
+    shape: (1, 1)
+    ┌──────┐
+    │ 3000 │
+    │ ---  │
+    │ i64  │
+    ╞══════╡
+    │ 3000 │
+    └──────┘
     """
-    return _selector_proxy_(F.col(r"^\d+$"), name="digit")
+    re_digit = r"[0-9]" if ascii_only else r"\d"
+    return _selector_proxy_(F.col(rf"^{re_digit}+$"), name="digit")
 
 
 def duration(