In [None]:
import polars as pl
import math
import numpy as np
from numba import float64, guvectorize, int64


In [None]:
# Demo frame for the examples that follow: a string key column and an
# integer value column.
df = pl.DataFrame({"keys": ["a", "a", "b", "b"], "values": [10, 7, 1, 23]})
print(df)

In [None]:
# log() function on each individual value

def my_log(x):
    """Return the natural logarithm of a single scalar value ``x``."""
    natural_log = math.log(x)
    return natural_log

# Apply my_log to every value of the column. The function is handed to
# map_elements() directly; wrapping it in a lambda would only add one more
# Python call per element.
result = df.select(
    pl.col("values").map_elements(my_log, return_dtype=pl.Float64)
)
print(result)

In [None]:
# processing a whole series with map_batches()
def diff_from_mean(series):
    """Return each value's difference from the mean of ``series``.

    Implemented with plain Python loops, one element at a time.

    Args:
        series: The values to centre; iterated element by element and
            measured with ``len()``.

    Returns:
        A ``pl.Series`` holding ``value - mean`` for every input value, or an
        empty ``Float64`` series when ``series`` has no elements.
    """
    count = len(series)
    if count == 0:
        # The mean of nothing is undefined; return an empty result instead of
        # dividing by zero.
        return pl.Series(values=[], dtype=pl.Float64)
    total = 0
    for value in series:
        total += value
    mean = total / count
    return pl.Series([value - mean for value in series])

# Whole-column call: diff_from_mean is passed to map_batches() for the
# "values" column, with the output dtype declared.
result = df.select(
    pl.col("values").map_batches(diff_from_mean, return_dtype=pl.Float64)
)
print(result)

# Grouped call: the same UDF inside a group_by("keys") aggregation. The
# return dtype is declared here as well, matching the select() call above,
# and maintain_order=True keeps the printed group order stable between runs.
result = df.group_by("keys", maintain_order=True).agg(
    pl.col("values").map_batches(diff_from_mean, return_dtype=pl.Float64)
)
print(result)


Fast UDFs

In [None]:
# Using NumPy's ufuncs: np.log is handed to map_batches() as the UDF.
# The output dtype is declared explicitly, as in the other map_* examples.
result = df.select(
    pl.col("values").map_batches(np.log, return_dtype=pl.Float64)
)
print(result)

In [None]:
# using numba

@guvectorize([(int64[:], float64[:])], '(n)->(n)')
def diff_from_mean_numba(arr, result):
    """Write ``arr[i] - mean(arr)`` into ``result[i]`` for every index.

    ``arr`` is the int64 input and ``result`` the float64 output buffer of
    the same length, as declared in the ``guvectorize`` signature.
    """
    size = len(arr)
    running_sum = 0
    for idx in range(size):
        running_sum += arr[idx]
    mean = running_sum / size
    for idx in range(size):
        result[idx] = arr[idx] - mean

# Whole-column call with the numba-compiled UDF. Its signature declares a
# float64 output, so that dtype is stated explicitly.
result = df.select(
    pl.col("values").map_batches(diff_from_mean_numba, return_dtype=pl.Float64)
)
print(result)

# Grouped call with the same UDF; maintain_order=True keeps the printed
# group order stable between runs.
result = df.group_by("keys", maintain_order=True).agg(
    pl.col("values").map_batches(diff_from_mean_numba, return_dtype=pl.Float64)
)
print(result)

In [None]:
# passing multiple column values with structs
# Add two arrays together:
@guvectorize([(int64[:], int64[:], float64[:])], "(n),(n)->(n)")
def add(arr, arr2, result):
    """Store the element-wise sum of ``arr`` and ``arr2`` in ``result``."""
    size = len(arr)
    for idx in range(size):
        result[idx] = arr[idx] + arr2[idx]


df3 = pl.DataFrame({"values1": [1, 2, 3], "values2": [10, 20, 30]})

out = df3.select(
    # Create a struct that has two columns in it:
    pl.struct(["values1", "values2"])
    # Pass the struct to a lambda that then passes the individual columns to
    # the add() function:
    .map_batches(
        lambda combined: add(
            combined.struct.field("values1"), combined.struct.field("values2")
        ),
        # add() is declared to fill a float64 output, so state that dtype
        # here like the other map_* examples do.
        return_dtype=pl.Float64,
    )
    .alias("add_columns")
)
print(out)

Using numpy functions

In [None]:
# Two integer columns, "a" and "b".
df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# np.log is called on the Polars expression itself; every output column
# gets "_log" appended to its name.
result = df.select(np.log(pl.all()).name.suffix("_log"))
print(result)