py-polars/polars/expr/array.py

from __future__ import annotations

from typing import TYPE_CHECKING, Callable, Sequence

from polars._utils.parse_expr_input import parse_as_expression
from polars._utils.wrap import wrap_expr

if TYPE_CHECKING:
    from datetime import date, datetime, time

    from polars import Expr
    from polars.type_aliases import IntoExpr, IntoExprColumn


class ExprArrayNameSpace:
    """Namespace for array related expressions."""

    _accessor = "arr"

    def __init__(self, expr: Expr):
        self._pyexpr = expr._pyexpr

    def min(self) -> Expr:
        """
        Compute the min values of the sub-arrays.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     data={"a": [[1, 2], [4, 3]]},
        ...     schema={"a": pl.Array(pl.Int64, 2)},
        ... )
        >>> df.select(pl.col("a").arr.min())
        shape: (2, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 3   │
        └─────┘
        """
        return wrap_expr(self._pyexpr.arr_min())

    def max(self) -> Expr:
        """
        Compute the max values of the sub-arrays.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     data={"a": [[1, 2], [4, 3]]},
        ...     schema={"a": pl.Array(pl.Int64, 2)},
        ... )
        >>> df.select(pl.col("a").arr.max())
        shape: (2, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 2   │
        │ 4   │
        └─────┘
        """
        return wrap_expr(self._pyexpr.arr_max())

    def sum(self) -> Expr:
        """
        Compute the sum values of the sub-arrays.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     data={"a": [[1, 2], [4, 3]]},
        ...     schema={"a": pl.Array(pl.Int64, 2)},
        ... )
        >>> df.select(pl.col("a").arr.sum())
        shape: (2, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 3   │
        │ 7   │
        └─────┘
        """
        return wrap_expr(self._pyexpr.arr_sum())

    def std(self, ddof: int = 1) -> Expr:
        """
        Compute the std of the values of the sub-arrays.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     data={"a": [[1, 2], [4, 3]]},
        ...     schema={"a": pl.Array(pl.Int64, 2)},
        ... )
        >>> df.select(pl.col("a").arr.std())
        shape: (2, 1)
        ┌──────────┐
        │ a        │
        │ ---      │
        │ f64      │
        ╞══════════╡
        │ 0.707107 │
        │ 0.707107 │
        └──────────┘
        """
        return wrap_expr(self._pyexpr.arr_std(ddof))

    def var(self, ddof: int = 1) -> Expr:
        """
        Compute the var of the values of the sub-arrays.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     data={"a": [[1, 2], [4, 3]]},
        ...     schema={"a": pl.Array(pl.Int64, 2)},
        ... )
        >>> df.select(pl.col("a").arr.var())
        shape: (2, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ f64 │
        ╞═════╡
        │ 0.5 │
        │ 0.5 │
        └─────┘
        """
        return wrap_expr(self._pyexpr.arr_var(ddof))

    def median(self) -> Expr:
        """
        Compute the median of the values of the sub-arrays.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     data={"a": [[1, 2], [4, 3]]},
        ...     schema={"a": pl.Array(pl.Int64, 2)},
        ... )
        >>> df.select(pl.col("a").arr.median())
        shape: (2, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ f64 │
        ╞═════╡
        │ 1.5 │
        │ 3.5 │
        └─────┘
        """
        return wrap_expr(self._pyexpr.arr_median())

    def unique(self, *, maintain_order: bool = False) -> Expr:
        """
        Get the unique/distinct values in the array.

        Parameters
        ----------
        maintain_order
            Maintain order of data. This requires more work.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [[1, 1, 2]],
        ...     },
        ...     schema={"a": pl.Array(pl.Int64, 3)},
        ... )
        >>> df.select(pl.col("a").arr.unique())
        shape: (1, 1)
        ┌───────────┐
        │ a         │
        │ ---       │
        │ list[i64] │
        ╞═══════════╡
        │ [1, 2]    │
        └───────────┘
        """
        return wrap_expr(self._pyexpr.arr_unique(maintain_order))

    def n_unique(self) -> Expr:
        """
        Count the number of unique values in every sub-arrays.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [[1, 1, 2], [2, 3, 4]],
        ...     },
        ...     schema={"a": pl.Array(pl.Int64, 3)},
        ... )
        >>> df.with_columns(n_unique=pl.col("a").arr.n_unique())
        shape: (2, 2)
        ┌───────────────┬──────────┐
        │ a             ┆ n_unique │
        │ ---           ┆ ---      │
        │ array[i64, 3] ┆ u32      │
        ╞═══════════════╪══════════╡
        │ [1, 1, 2]     ┆ 2        │
        │ [2, 3, 4]     ┆ 3        │
        └───────────────┴──────────┘
        """
        return wrap_expr(self._pyexpr.arr_n_unique())

    def to_list(self) -> Expr:
        """
        Convert an Array column into a List column with the same inner data type.

        Returns
        -------
        Expr
            Expression of data type :class:`List`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     data={"a": [[1, 2], [3, 4]]},
        ...     schema={"a": pl.Array(pl.Int8, 2)},
        ... )
        >>> df.select(pl.col("a").arr.to_list())
        shape: (2, 1)
        ┌──────────┐
        │ a        │
        │ ---      │
        │ list[i8] │
        ╞══════════╡
        │ [1, 2]   │
        │ [3, 4]   │
        └──────────┘
        """
        return wrap_expr(self._pyexpr.arr_to_list())

    def any(self) -> Expr:
        """
        Evaluate whether any boolean value is true for every subarray.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     data={
        ...         "a": [
        ...             [True, True],
        ...             [False, True],
        ...             [False, False],
        ...             [None, None],
        ...             None,
        ...         ]
        ...     },
        ...     schema={"a": pl.Array(pl.Boolean, 2)},
        ... )
        >>> df.with_columns(any=pl.col("a").arr.any())
        shape: (5, 2)
        ┌────────────────┬───────┐
        │ a              ┆ any   │
        │ ---            ┆ ---   │
        │ array[bool, 2] ┆ bool  │
        ╞════════════════╪═══════╡
        │ [true, true]   ┆ true  │
        │ [false, true]  ┆ true  │
        │ [false, false] ┆ false │
        │ [null, null]   ┆ false │
        │ null           ┆ null  │
        └────────────────┴───────┘
        """
        return wrap_expr(self._pyexpr.arr_any())

    def all(self) -> Expr:
        """
        Evaluate whether all boolean values are true for every subarray.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     data={
        ...         "a": [
        ...             [True, True],
        ...             [False, True],
        ...             [False, False],
        ...             [None, None],
        ...             None,
        ...         ]
        ...     },
        ...     schema={"a": pl.Array(pl.Boolean, 2)},
        ... )
        >>> df.with_columns(all=pl.col("a").arr.all())
        shape: (5, 2)
        ┌────────────────┬───────┐
        │ a              ┆ all   │
        │ ---            ┆ ---   │
        │ array[bool, 2] ┆ bool  │
        ╞════════════════╪═══════╡
        │ [true, true]   ┆ true  │
        │ [false, true]  ┆ false │
        │ [false, false] ┆ false │
        │ [null, null]   ┆ true  │
        │ null           ┆ null  │
        └────────────────┴───────┘
        """
        return wrap_expr(self._pyexpr.arr_all())

    def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Expr:
        """
        Sort the arrays in this column.

        Parameters
        ----------
        descending
            Sort in descending order.
        nulls_last
            Place null values last.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [[3, 2, 1], [9, 1, 2]],
        ...     },
        ...     schema={"a": pl.Array(pl.Int64, 3)},
        ... )
        >>> df.with_columns(sort=pl.col("a").arr.sort())
        shape: (2, 2)
        ┌───────────────┬───────────────┐
        │ a             ┆ sort          │
        │ ---           ┆ ---           │
        │ array[i64, 3] ┆ array[i64, 3] │
        ╞═══════════════╪═══════════════╡
        │ [3, 2, 1]     ┆ [1, 2, 3]     │
        │ [9, 1, 2]     ┆ [1, 2, 9]     │
        └───────────────┴───────────────┘
        >>> df.with_columns(sort=pl.col("a").arr.sort(descending=True))
        shape: (2, 2)
        ┌───────────────┬───────────────┐
        │ a             ┆ sort          │
        │ ---           ┆ ---           │
        │ array[i64, 3] ┆ array[i64, 3] │
        ╞═══════════════╪═══════════════╡
        │ [3, 2, 1]     ┆ [3, 2, 1]     │
        │ [9, 1, 2]     ┆ [9, 2, 1]     │
        └───────────────┴───────────────┘

        """
        return wrap_expr(self._pyexpr.arr_sort(descending, nulls_last))

    def reverse(self) -> Expr:
        """
        Reverse the arrays in this column.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [[3, 2, 1], [9, 1, 2]],
        ...     },
        ...     schema={"a": pl.Array(pl.Int64, 3)},
        ... )
        >>> df.with_columns(reverse=pl.col("a").arr.reverse())
        shape: (2, 2)
        ┌───────────────┬───────────────┐
        │ a             ┆ reverse       │
        │ ---           ┆ ---           │
        │ array[i64, 3] ┆ array[i64, 3] │
        ╞═══════════════╪═══════════════╡
        │ [3, 2, 1]     ┆ [1, 2, 3]     │
        │ [9, 1, 2]     ┆ [2, 1, 9]     │
        └───────────────┴───────────────┘

        """
        return wrap_expr(self._pyexpr.arr_reverse())

    def arg_min(self) -> Expr:
        """
        Retrieve the index of the minimal value in every sub-array.

        Returns
        -------
        Expr
            Expression of data type :class:`UInt32` or :class:`UInt64`
            (depending on compilation).

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [[1, 2], [2, 1]],
        ...     },
        ...     schema={"a": pl.Array(pl.Int64, 2)},
        ... )
        >>> df.with_columns(arg_min=pl.col("a").arr.arg_min())
        shape: (2, 2)
        ┌───────────────┬─────────┐
        │ a             ┆ arg_min │
        │ ---           ┆ ---     │
        │ array[i64, 2] ┆ u32     │
        ╞═══════════════╪═════════╡
        │ [1, 2]        ┆ 0       │
        │ [2, 1]        ┆ 1       │
        └───────────────┴─────────┘

        """
        return wrap_expr(self._pyexpr.arr_arg_min())

    def arg_max(self) -> Expr:
        """
        Retrieve the index of the maximum value in every sub-array.

        Returns
        -------
        Expr
            Expression of data type :class:`UInt32` or :class:`UInt64`
            (depending on compilation).

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [[1, 2], [2, 1]],
        ...     },
        ...     schema={"a": pl.Array(pl.Int64, 2)},
        ... )
        >>> df.with_columns(arg_max=pl.col("a").arr.arg_max())
        shape: (2, 2)
        ┌───────────────┬─────────┐
        │ a             ┆ arg_max │
        │ ---           ┆ ---     │
        │ array[i64, 2] ┆ u32     │
        ╞═══════════════╪═════════╡
        │ [1, 2]        ┆ 1       │
        │ [2, 1]        ┆ 0       │
        └───────────────┴─────────┘

        """
        return wrap_expr(self._pyexpr.arr_arg_max())

    def get(self, index: int | IntoExprColumn, *, null_on_oob: bool = True) -> Expr:
        """
        Get the value by index in the sub-arrays.

        So index `0` would return the first item of every sublist
        and index `-1` would return the last item of every sublist
        if an index is out of bounds, it will return a `None`.

        Parameters
        ----------
        index
            Index to return per sub-array
        null_on_oob
            Behavior if an index is out of bounds:
            True -> set as null
            False -> raise an error

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"arr": [[1, 2, 3], [4, 5, 6], [7, 8, 9]], "idx": [1, -2, 0]},
        ...     schema={"arr": pl.Array(pl.Int32, 3), "idx": pl.Int32},
        ... )
        >>> df.with_columns(get=pl.col("arr").arr.get("idx", null_on_oob=True))
        shape: (3, 3)
        ┌───────────────┬─────┬─────┐
        │ arr           ┆ idx ┆ get │
        │ ---           ┆ --- ┆ --- │
        │ array[i32, 3] ┆ i32 ┆ i32 │
        ╞═══════════════╪═════╪═════╡
        │ [1, 2, 3]     ┆ 1   ┆ 2   │
        │ [4, 5, 6]     ┆ -2  ┆ 5   │
        │ [7, 8, 9]     ┆ 0   ┆ 7   │
        └───────────────┴─────┴─────┘

        """
        index = parse_as_expression(index)
        return wrap_expr(self._pyexpr.arr_get(index, null_on_oob))

    def first(self) -> Expr:
        """
        Get the first value of the sub-arrays.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"a": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]},
        ...     schema={"a": pl.Array(pl.Int32, 3)},
        ... )
        >>> df.with_columns(first=pl.col("a").arr.first())
        shape: (3, 2)
        ┌───────────────┬───────┐
        │ a             ┆ first │
        │ ---           ┆ ---   │
        │ array[i32, 3] ┆ i32   │
        ╞═══════════════╪═══════╡
        │ [1, 2, 3]     ┆ 1     │
        │ [4, 5, 6]     ┆ 4     │
        │ [7, 8, 9]     ┆ 7     │
        └───────────────┴───────┘

        """
        return self.get(0)

    def last(self) -> Expr:
        """
        Get the last value of the sub-arrays.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"a": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]},
        ...     schema={"a": pl.Array(pl.Int32, 3)},
        ... )
        >>> df.with_columns(last=pl.col("a").arr.last())
        shape: (3, 2)
        ┌───────────────┬──────┐
        │ a             ┆ last │
        │ ---           ┆ ---  │
        │ array[i32, 3] ┆ i32  │
        ╞═══════════════╪══════╡
        │ [1, 2, 3]     ┆ 3    │
        │ [4, 5, 6]     ┆ 6    │
        │ [7, 8, 9]     ┆ 9    │
        └───────────────┴──────┘

        """
        return self.get(-1)

    def join(self, separator: IntoExprColumn, *, ignore_nulls: bool = True) -> Expr:
        """
        Join all string items in a sub-array and place a separator between them.

        This errors if inner type of array `!= String`.

        Parameters
        ----------
        separator
            string to separate the items with
        ignore_nulls
            Ignore null values (default).

            If set to ``False``, null values will be propagated.
            If the sub-list contains any null values, the output is ``None``.

        Returns
        -------
        Expr
            Expression of data type :class:`String`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"s": [["a", "b"], ["x", "y"]], "separator": ["*", "_"]},
        ...     schema={
        ...         "s": pl.Array(pl.String, 2),
        ...         "separator": pl.String,
        ...     },
        ... )
        >>> df.with_columns(join=pl.col("s").arr.join(pl.col("separator")))
        shape: (2, 3)
        ┌───────────────┬───────────┬──────┐
        │ s             ┆ separator ┆ join │
        │ ---           ┆ ---       ┆ ---  │
        │ array[str, 2] ┆ str       ┆ str  │
        ╞═══════════════╪═══════════╪══════╡
        │ ["a", "b"]    ┆ *         ┆ a*b  │
        │ ["x", "y"]    ┆ _         ┆ x_y  │
        └───────────────┴───────────┴──────┘

        """
        separator = parse_as_expression(separator, str_as_lit=True)
        return wrap_expr(self._pyexpr.arr_join(separator, ignore_nulls))

    def explode(self) -> Expr:
        """
        Returns a column with a separate row for every array element.

        Returns
        -------
        Expr
            Expression with the data type of the array elements.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"a": [[1, 2, 3], [4, 5, 6]]}, schema={"a": pl.Array(pl.Int64, 3)}
        ... )
        >>> df.select(pl.col("a").arr.explode())
        shape: (6, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 2   │
        │ 3   │
        │ 4   │
        │ 5   │
        │ 6   │
        └─────┘
        """
        return wrap_expr(self._pyexpr.explode())

    def contains(
        self, item: float | str | bool | int | date | datetime | time | IntoExprColumn
    ) -> Expr:
        """
        Check if sub-arrays contain the given item.

        Parameters
        ----------
        item
            Item that will be checked for membership

        Returns
        -------
        Expr
            Expression of data type :class:`Boolean`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"a": [["a", "b"], ["x", "y"], ["a", "c"]]},
        ...     schema={"a": pl.Array(pl.String, 2)},
        ... )
        >>> df.with_columns(contains=pl.col("a").arr.contains("a"))
        shape: (3, 2)
        ┌───────────────┬──────────┐
        │ a             ┆ contains │
        │ ---           ┆ ---      │
        │ array[str, 2] ┆ bool     │
        ╞═══════════════╪══════════╡
        │ ["a", "b"]    ┆ true     │
        │ ["x", "y"]    ┆ false    │
        │ ["a", "c"]    ┆ true     │
        └───────────────┴──────────┘

        """
        item = parse_as_expression(item, str_as_lit=True)
        return wrap_expr(self._pyexpr.arr_contains(item))

    def count_matches(self, element: IntoExpr) -> Expr:
        """
        Count how often the value produced by `element` occurs.

        Parameters
        ----------
        element
            An expression that produces a single value

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"a": [[1, 2], [1, 1], [2, 2]]}, schema={"a": pl.Array(pl.Int64, 2)}
        ... )
        >>> df.with_columns(number_of_twos=pl.col("a").arr.count_matches(2))
        shape: (3, 2)
        ┌───────────────┬────────────────┐
        │ a             ┆ number_of_twos │
        │ ---           ┆ ---            │
        │ array[i64, 2] ┆ u32            │
        ╞═══════════════╪════════════════╡
        │ [1, 2]        ┆ 1              │
        │ [1, 1]        ┆ 0              │
        │ [2, 2]        ┆ 2              │
        └───────────────┴────────────────┘
        """
        element = parse_as_expression(element, str_as_lit=True)
        return wrap_expr(self._pyexpr.arr_count_matches(element))

    def to_struct(
        self, fields: Sequence[str] | Callable[[int], str] | None = None
    ) -> Expr:
        """
        Convert the Series of type `Array` to a Series of type `Struct`.

        Parameters
        ----------
        fields
            If the name and number of the desired fields is known in advance
            a list of field names can be given, which will be assigned by index.
            Otherwise, to dynamically assign field names, a custom function can be
            used; if neither are set, fields will be `field_0, field_1 .. field_n`.

        Examples
        --------
        Convert array to struct with default field name assignment:

        >>> df = pl.DataFrame(
        ...     {"n": [[0, 1, 2], [3, 4, 5]]}, schema={"n": pl.Array(pl.Int8, 3)}
        ... )
        >>> df.with_columns(struct=pl.col("n").arr.to_struct())
        shape: (2, 2)
        ┌──────────────┬───────────┐
        │ n            ┆ struct    │
        │ ---          ┆ ---       │
        │ array[i8, 3] ┆ struct[3] │
        ╞══════════════╪═══════════╡
        │ [0, 1, 2]    ┆ {0,1,2}   │
        │ [3, 4, 5]    ┆ {3,4,5}   │
        └──────────────┴───────────┘

        Convert array to struct with field name assignment by function/index:

        >>> df = pl.DataFrame(
        ...     {"n": [[0, 1, 2], [3, 4, 5]]}, schema={"n": pl.Array(pl.Int8, 3)}
        ... )
        >>> df.select(pl.col("n").arr.to_struct(fields=lambda idx: f"n{idx}")).rows(
        ...     named=True
        ... )
        [{'n': {'n0': 0, 'n1': 1, 'n2': 2}}, {'n': {'n0': 3, 'n1': 4, 'n2': 5}}]

        Convert array to struct with field name assignment by
        index from a list of names:

        >>> df.select(pl.col("n").arr.to_struct(fields=["c1", "c2", "c3"])).rows(
        ...     named=True
        ... )
        [{'n': {'c1': 0, 'c2': 1, 'c3': 2}}, {'n': {'c1': 3, 'c2': 4, 'c3': 5}}]
        """
        if isinstance(fields, Sequence):
            field_names = list(fields)
            pyexpr = self._pyexpr.arr_to_struct(None)
            return wrap_expr(pyexpr).struct.rename_fields(field_names)
        else:
            pyexpr = self._pyexpr.arr_to_struct(fields)
            return wrap_expr(pyexpr)

    def shift(self, n: int | IntoExprColumn = 1) -> Expr:
        """
        Shift array values by the given number of indices.

        Parameters
        ----------
        n
            Number of indices to shift forward. If a negative value is passed, values
            are shifted in the opposite direction instead.

        Notes
        -----
        This method is similar to the `LAG` operation in SQL when the value for `n`
        is positive. With a negative value for `n`, it is similar to `LEAD`.

        Examples
        --------
        By default, array values are shifted forward by one index.

        >>> df = pl.DataFrame(
        ...     {"a": [[1, 2, 3], [4, 5, 6]]}, schema={"a": pl.Array(pl.Int64, 3)}
        ... )
        >>> df.with_columns(shift=pl.col("a").arr.shift())
        shape: (2, 2)
        ┌───────────────┬───────────────┐
        │ a             ┆ shift         │
        │ ---           ┆ ---           │
        │ array[i64, 3] ┆ array[i64, 3] │
        ╞═══════════════╪═══════════════╡
        │ [1, 2, 3]     ┆ [null, 1, 2]  │
        │ [4, 5, 6]     ┆ [null, 4, 5]  │
        └───────────────┴───────────────┘

        Pass a negative value to shift in the opposite direction instead.

        >>> df.with_columns(shift=pl.col("a").arr.shift(-2))
        shape: (2, 2)
        ┌───────────────┬─────────────────┐
        │ a             ┆ shift           │
        │ ---           ┆ ---             │
        │ array[i64, 3] ┆ array[i64, 3]   │
        ╞═══════════════╪═════════════════╡
        │ [1, 2, 3]     ┆ [3, null, null] │
        │ [4, 5, 6]     ┆ [6, null, null] │
        └───────────────┴─────────────────┘
        """
        n = parse_as_expression(n)
        return wrap_expr(self._pyexpr.arr_shift(n))