Skip to content

Commit

Permalink
feat(python): Add "ignore_spaces" to alpha and alphanumeric selec…
Browse files Browse the repository at this point in the history
…tors, add "ascii_only" to `digit` (#16362)
  • Loading branch information
alexander-beedie committed May 23, 2024
1 parent 6b1e893 commit ceb895e
Show file tree
Hide file tree
Showing 2 changed files with 205 additions and 50 deletions.
214 changes: 169 additions & 45 deletions py-polars/polars/selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,16 @@
from datetime import timezone
from functools import reduce
from operator import or_
from typing import TYPE_CHECKING, Any, Collection, Literal, Mapping, Sequence, overload
from typing import (
TYPE_CHECKING,
Any,
Collection,
Literal,
Mapping,
NoReturn,
Sequence,
overload,
)

from polars import functions as F
from polars._utils.deprecation import deprecate_nonkeyword_arguments
Expand Down Expand Up @@ -71,7 +80,7 @@ def is_selector(obj: Any) -> bool:

def expand_selector(
target: DataFrame | LazyFrame | Mapping[str, PolarsDataType],
selector: SelectorType,
selector: SelectorType | Expr,
) -> tuple[str, ...]:
"""
Expand a selector to column names with respect to a specific frame or schema target.
Expand Down Expand Up @@ -267,9 +276,9 @@ def __hash__(self) -> int:

def __invert__(self) -> Self:
"""Invert the selector."""
if hasattr(self, "_attrs"):
if is_selector(self):
inverted = all() - self
inverted._repr_override = f"~{self!r}" # type: ignore[attr-defined]
inverted._repr_override = f"~{self!r}"
else:
inverted = ~self.as_expr()
return inverted # type: ignore[return-value]
Expand All @@ -292,7 +301,13 @@ def __repr__(self) -> str:
).rstrip(",")
return f"cs.{selector_name}({str_params})"

def __sub__(self, other: Any) -> SelectorType | Expr: # type: ignore[override]
@overload # type: ignore[override]
def __sub__(self, other: SelectorType) -> SelectorType: ...

@overload
def __sub__(self, other: Any) -> SelectorType | Expr: ...

def __sub__(self, other: Any) -> Expr:
if is_selector(other):
return _selector_proxy_(
self.meta._as_selector().meta._selector_sub(other),
Expand All @@ -302,11 +317,17 @@ def __sub__(self, other: Any) -> SelectorType | Expr: # type: ignore[override]
else:
return self.as_expr().__sub__(other)

def __rsub__(self, other: Any) -> SelectorType | Expr: # type: ignore[override]
def __rsub__(self, other: Any) -> NoReturn:
msg = "unsupported operand type(s) for op: ('Expr' - 'Selector')"
raise TypeError(msg)

def __and__(self, other: Any) -> SelectorType | Expr: # type: ignore[override]
@overload # type: ignore[override]
def __and__(self, other: SelectorType) -> SelectorType: ...

@overload
def __and__(self, other: Any) -> Expr: ...

def __and__(self, other: Any) -> SelectorType | Expr:
if is_column(other):
colname = other.meta.output_name()
if self._attrs["name"] == "by_name" and (
Expand All @@ -323,7 +344,13 @@ def __and__(self, other: Any) -> SelectorType | Expr: # type: ignore[override]
else:
return self.as_expr().__and__(other)

def __or__(self, other: Any) -> SelectorType | Expr: # type: ignore[override]
@overload # type: ignore[override]
def __or__(self, other: SelectorType) -> SelectorType: ...

@overload
def __or__(self, other: Any) -> Expr: ...

def __or__(self, other: Any) -> SelectorType | Expr:
if is_column(other):
other = by_name(other.meta.output_name())
if is_selector(other):
Expand All @@ -335,7 +362,7 @@ def __or__(self, other: Any) -> SelectorType | Expr: # type: ignore[override]
else:
return self.as_expr().__or__(other)

def __rand__(self, other: Any) -> SelectorType | Expr: # type: ignore[override]
def __rand__(self, other: Any) -> Expr: # type: ignore[override]
if is_column(other):
colname = other.meta.output_name()
if self._attrs["name"] == "by_name" and (
Expand All @@ -345,7 +372,7 @@ def __rand__(self, other: Any) -> SelectorType | Expr: # type: ignore[override]
other = by_name(colname)
return self.as_expr().__rand__(other)

def __ror__(self, other: Any) -> SelectorType | Expr: # type: ignore[override]
def __ror__(self, other: Any) -> Expr: # type: ignore[override]
if is_column(other):
other = by_name(other.meta.output_name())
return self.as_expr().__ror__(other)
Expand Down Expand Up @@ -465,21 +492,25 @@ def all() -> SelectorType:
return _selector_proxy_(F.all(), name="all")


def alpha(ascii_only: bool = False) -> SelectorType: # noqa: FBT001
"""
def alpha(ascii_only: bool = False, *, ignore_spaces: bool = False) -> SelectorType: # noqa: FBT001
r"""
Select all columns with alphabetic names (eg: only letters).
Parameters
----------
ascii_only
Indicate whether to consider only ASCII alphabetic characters, or the full
Unicode range of valid letters (accented, idiographic, etc).
ignore_spaces
Indicate whether to ignore the presence of spaces in column names; if so,
only the other (non-space) characters are considered.
Notes
-----
Matching column names cannot contain *any* non-alphabetic characters. Note
that the definition of "alphabetic" consists of all valid Unicode alphabetic
characters by default; this can be changed by setting `ascii_only=True`.
characters (`\p{Alphabetic}`) by default; this can be changed by setting
`ascii_only=True`.
Examples
--------
Expand All @@ -489,7 +520,7 @@ def alpha(ascii_only: bool = False) -> SelectorType: # noqa: FBT001
... {
... "no1": [100, 200, 300],
... "café": ["espresso", "latte", "mocha"],
... "t/f": [True, False, None],
... "t or f": [True, False, None],
... "hmm": ["aaa", "bbb", "ccc"],
... "都市": ["東京", "大阪", "京都"],
... }
Expand Down Expand Up @@ -524,53 +555,88 @@ def alpha(ascii_only: bool = False) -> SelectorType: # noqa: FBT001
│ ccc │
└─────┘
>>> df.select(cs.alpha(ascii_only=True, ignore_spaces=True))
shape: (3, 2)
┌────────┬─────┐
│ t or f ┆ hmm │
│ --- ┆ --- │
│ bool ┆ str │
╞════════╪═════╡
│ true ┆ aaa │
│ false ┆ bbb │
│ null ┆ ccc │
└────────┴─────┘
Select all columns *except* for those with alphabetic names:
>>> df.select(~cs.alpha())
shape: (3, 2)
┌─────┬───────┐
│ no1 ┆ t/f │
│ --- ┆ --- │
│ i64 ┆ bool │
╞═════╪═══════╡
│ 100 ┆ true │
│ 200 ┆ false │
│ 300 ┆ null │
└─────┴───────┘
┌─────┬────────┐
│ no1 ┆ t or f │
│ --- ┆ --- │
│ i64 ┆ bool │
╞═════╪════════╡
│ 100 ┆ true │
│ 200 ┆ false │
│ 300 ┆ null │
└─────┴────────┘
>>> df.select(~cs.alpha(ignore_spaces=True))
shape: (3, 1)
┌─────┐
│ no1 │
│ --- │
│ i64 │
╞═════╡
│ 100 │
│ 200 │
│ 300 │
└─────┘
"""
# note that we need to supply a pattern compatible with the *rust* regex crate
re_alpha = r"a-zA-Z" if ascii_only else r"\p{Alphabetic}"
re_space = " " if ignore_spaces else ""
return _selector_proxy_(
F.col(f"^[{re_alpha}]+$"),
F.col(f"^[{re_alpha}{re_space}]+$"),
name="alpha",
parameters={"ascii_only": ascii_only},
parameters={"ascii_only": ascii_only, "ignore_spaces": ignore_spaces},
)


def alphanumeric(ascii_only: bool = False) -> SelectorType: # noqa: FBT001
"""
def alphanumeric(
ascii_only: bool = False, # noqa: FBT001
*,
ignore_spaces: bool = False,
) -> SelectorType:
r"""
Select all columns with alphanumeric names (eg: only letters and the digits 0-9).
Parameters
----------
ascii_only
Indicate whether to consider only ASCII alphabetic characters, or the full
Unicode range of valid letters (accented, idiographic, etc).
ignore_spaces
Indicate whether to ignore the presence of spaces in column names; if so,
only the other (non-space) characters are considered.
Notes
-----
Matching column names cannot contain *any* non-alphabetic or integer characters.
Note that the definition of "alphabetic" consists of all valid Unicode alphabetic
characters by default; this can be changed by setting `ascii_only=True`.
characters (`\p{Alphabetic}`) and digit characters (`\d`) by default; this
can be changed by setting `ascii_only=True`.
Examples
--------
>>> import polars as pl
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "1st_value": [100, 200, 300],
... "1st_col": [100, 200, 300],
... "flagged": [True, False, True],
... "00prefix": ["01:aa", "02:bb", "03:cc"],
... "last col": ["x", "y", "z"],
... }
... )
Expand All @@ -588,25 +654,52 @@ def alphanumeric(ascii_only: bool = False) -> SelectorType: # noqa: FBT001
│ true ┆ 03:cc │
└─────────┴──────────┘
>>> df.select(cs.alphanumeric(ignore_spaces=True))
shape: (3, 3)
┌─────────┬──────────┬──────────┐
│ flagged ┆ 00prefix ┆ last col │
│ --- ┆ --- ┆ --- │
│ bool ┆ str ┆ str │
╞═════════╪══════════╪══════════╡
│ true ┆ 01:aa ┆ x │
│ false ┆ 02:bb ┆ y │
│ true ┆ 03:cc ┆ z │
└─────────┴──────────┴──────────┘
Select all columns *except* for those with alphanumeric names:
>>> df.select(~cs.alphanumeric())
shape: (3, 2)
┌─────────┬──────────┐
│ 1st_col ┆ last col │
│ --- ┆ --- │
│ i64 ┆ str │
╞═════════╪══════════╡
│ 100 ┆ x │
│ 200 ┆ y │
│ 300 ┆ z │
└─────────┴──────────┘
>>> df.select(~cs.alphanumeric(ignore_spaces=True))
shape: (3, 1)
┌───────────┐
│ 1st_value │
│ --- │
│ i64 │
╞═══════════╡
│ 100 │
│ 200 │
│ 300 │
└───────────┘
"""
┌─────────┐
│ 1st_col │
│ --- │
│ i64 │
╞═════════╡
│ 100 │
│ 200 │
│ 300 │
└─────────┘
"""
# note that we need to supply patterns compatible with the *rust* regex crate
re_alpha = r"a-zA-Z" if ascii_only else r"\p{Alphabetic}"
re_digit = "0-9" if ascii_only else r"\d"
re_space = " " if ignore_spaces else ""
return _selector_proxy_(
F.col(f"^[{re_alpha}0-9]+$"),
F.col(f"^[{re_alpha}{re_digit}{re_space}]+$"),
name="alphanumeric",
parameters={"ascii_only": ascii_only},
parameters={"ascii_only": ascii_only, "ignore_spaces": ignore_spaces},
)


Expand Down Expand Up @@ -1363,9 +1456,15 @@ def decimal() -> SelectorType:
return _selector_proxy_(F.col(Decimal), name="decimal")


def digit() -> SelectorType:
"""
Select all columns having names consisting only of digits (eg: integer 0-9).
def digit(ascii_only: bool = False) -> SelectorType: # noqa: FBT001
r"""
Select all columns having names consisting only of digits.
Notes
-----
Matching column names cannot contain *any* non-digit characters. Note that the
definition of "digit" consists of all valid Unicode digit characters (`\d`)
by default; this can be changed by setting `ascii_only=True`.
Examples
--------
Expand Down Expand Up @@ -1419,8 +1518,33 @@ def digit() -> SelectorType:
│ aaa │
│ bbb │
└─────┘
Demonstrate use of `ascii_only` flag (by default all valid unicode digits
are considered, but this can be constrained to ascii 0-9):
>>> df = pl.DataFrame({"१९९९": [1999], "२०७७": [2077], "3000": [3000]})
>>> df.select(cs.digit())
shape: (1, 3)
┌──────┬──────┬──────┐
│ १९९९ ┆ २०७७ ┆ 3000 │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╡
│ 1999 ┆ 2077 ┆ 3000 │
└──────┴──────┴──────┘
>>> df.select(cs.digit(ascii_only=True))
shape: (1, 1)
┌──────┐
│ 3000 │
│ --- │
│ i64 │
╞══════╡
│ 3000 │
└──────┘
"""
return _selector_proxy_(F.col(r"^\d+$"), name="digit")
re_digit = r"[0-9]" if ascii_only else r"\d"
return _selector_proxy_(F.col(rf"^{re_digit}+$"), name="digit")


def duration(
Expand Down
Loading

0 comments on commit ceb895e

Please sign in to comment.