# 16. Extending Polars

In [1]:
import polars as pl
# Show which Polars version this notebook is running against.
pl.__version__

'1.20.0'

## User-Defined Functions in Python

### Applying a Function to Elements

In [2]:
from textblob import TextBlob


def analyze_sentiment(review):
    """Score the sentiment of a single review text.

    Returns the ``sentiment.polarity`` value that TextBlob computes for
    ``review``.
    """
    blob = TextBlob(review)
    return blob.sentiment.polarity


# One free-text review per row.
reviews = pl.DataFrame(
    {
        "reviews": [
            "This product is great!",
            "Terrible service.",
            "Okay, but not what I expected.",
            "Excellent! I love it.",
        ]
    }
)

# map_elements hands the review strings to analyze_sentiment one value at a
# time; return_dtype declares the dtype of the resulting column.
reviews.with_columns(
    pl.col("reviews")
    .map_elements(analyze_sentiment, return_dtype=pl.Float64)
    .alias("sentiment_score")
)

reviews,sentiment_score
str,f64
"""This product is great!""",1.0
"""Terrible service.""",-1.0
"""Okay, but not what I expected.""",0.2
"""Excellent! I love it.""",0.75


In [3]:
# Single integer column used to demonstrate an element-wise Python function.
ints = pl.DataFrame({"x": [1, 2, 3, 4]})


# Plain Python function applied to one element at a time: returns x + 1.
def add_one(x):
    return x + 1


# Routing such a simple addition through map_elements makes Polars emit the
# warning recorded in the output of this cell, which recommends the native
# expression pl.col("x") + 1 instead.
ints.with_columns(
    pl.col("x")
    .map_elements(
        add_one,
        return_dtype=pl.Int64,
    )
    .alias("x + 1")
)

Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("x").map_elements(add_one)
with this one instead:
  + pl.col("x") + 1

  .map_elements(


x,x + 1
i64,i64
1,2
2,3
3,4
4,5


### Applying a Function to a Series

In [4]:
import polars.selectors as cs
from scipy.special import softmax

# Toy dataset: two feature columns plus a binary label.
ml_dataset = pl.DataFrame(
    {
        "feature1": [0.3, 0.2, 0.4, 0.1, 0.2, 0.3, 0.5],
        "feature2": [32, 50, 70, 65, 0, 10, 15],
        "label": [1, 0, 1, 0, 1, 0, 0],
    }
)

# Keep the label as-is and replace every column whose name starts with
# "feature" by its softmax. map_batches passes the column to the callable as
# one object, which is converted to NumPy before softmax is applied.
ml_dataset.select(
    pl.col("label"),
    cs.starts_with("feature").map_batches(
        lambda feature: softmax(feature.to_numpy())
    ),
)

label,feature1,feature2
i64,f64,f64
1,0.143782,3.1181e-17
0,0.130099,2.0474e-09
1,0.158904,0.993307
0,0.117719,0.006693
1,0.130099,3.9488e-31
0,0.143782,8.6979e-27
0,0.175616,1.2909e-24


### Applying a Function to Groups

In [5]:
from sklearn.preprocessing import StandardScaler


def scale_temperature(group):
    """Add a standardized copy of ``temperature`` to ``group``.

    A fresh ``StandardScaler`` is fitted on the ``temperature`` column of the
    given frame only, and the result is attached as ``scaled_feature``.
    """
    # fit_transform expects a 2-D array, hence the single-column frame.
    temperature_matrix = group.select("temperature").to_numpy()
    standardized = StandardScaler().fit_transform(temperature_matrix)
    # Flatten the (n, 1) result back into a plain column.
    scaled_column = pl.Series("scaled_feature", standardized.flatten())
    return group.with_columns(scaled_column)


# Four readings for the USA followed by three for NL.
temperatures = pl.DataFrame(
    {
        "country": ["USA"] * 4 + ["NL"] * 3,
        "temperature": [32, 50, 70, 65, 0, 10, 15],
    }
)

# Each country's rows are passed to scale_temperature as their own frame, so
# the scaler is fitted per country rather than on the whole column.
temperatures.group_by("country").map_groups(scale_temperature)

country,temperature,scaled_feature
str,i64,f64
"""NL""",0,-1.336306
"""NL""",10,0.267261
"""NL""",15,1.069045
"""USA""",32,-1.502872
"""USA""",50,-0.287066
"""USA""",70,1.063831
"""USA""",65,0.726107


In [6]:
# Same readings as before: four for the USA followed by three for NL.
temperatures = pl.DataFrame(
    {
        "country": ["USA"] * 4 + ["NL"] * 3,
        "temperature": [32, 50, 70, 65, 0, 10, 15],
    }
)

# Iterating a group_by yields (key, frame) pairs; the key is indexed with [0]
# to get the country name on its own.
grouped_by_country = temperatures.group_by("country")
for group, df in grouped_by_country:
    print(f"{group[0]}:\n{df}\n")

NL:
shape: (3, 2)
┌─────────┬─────────────┐
│ country ┆ temperature │
│ ---     ┆ ---         │
│ str     ┆ i64         │
╞═════════╪═════════════╡
│ NL      ┆ 0           │
│ NL      ┆ 10          │
│ NL      ┆ 15          │
└─────────┴─────────────┘

USA:
shape: (4, 2)
┌─────────┬─────────────┐
│ country ┆ temperature │
│ ---     ┆ ---         │
│ str     ┆ i64         │
╞═════════╪═════════════╡
│ USA     ┆ 32          │
│ USA     ┆ 50          │
│ USA     ┆ 70          │
│ USA     ┆ 65          │
└─────────┴─────────────┘



In [7]:
from functools import lru_cache

from textblob import TextBlob


@lru_cache(maxsize=256)
def analyze_sentiment(review):
    """Score the sentiment of a single review text, with memoization.

    Returns the ``sentiment.polarity`` value that TextBlob computes for
    ``review``. Results for up to 256 distinct inputs are kept by
    ``lru_cache``, so a repeated review is not analyzed again.
    """
    blob = TextBlob(review)
    return blob.sentiment.polarity


# One free-text review per row.
reviews = pl.DataFrame(
    {
        "reviews": [
            "This product is great!",
            "Terrible service.",
            "Okay, but not what I expected.",
            "Excellent! I love it.",
        ]
    }
)

# Same call as in the uncached example; only analyze_sentiment changed.
reviews.with_columns(
    pl.col("reviews")
    .map_elements(analyze_sentiment, return_dtype=pl.Float64)
    .alias("sentiment_score")
)

reviews,sentiment_score
str,f64
"""This product is great!""",1.0
"""Terrible service.""",-1.0
"""Okay, but not what I expected.""",0.2
"""Excellent! I love it.""",0.75


### Applying a Function to an Expression

In [8]:
# Street addresses that end in a house number.
addresses = pl.DataFrame(
    {
        "address": [
            "Nieuwezijds Voorburgwal 147",
            "Museumstraat 1",
            "Oosterdok 2",
        ]
    }
)


def extract_house_number(input_expr: pl.Expr) -> pl.Expr:
    """Build an expression that pulls the house number out of an address.

    The first run of digits matched in each string is extracted and cast to
    ``Int64``.
    """
    first_digit_run = input_expr.str.extract(r"\d+", 0)
    return first_digit_run.cast(pl.Int64)


# Expr.pipe passes the column expression to the function and returns the
# expression that the function builds.
addresses.with_columns(
    pl.col("address").pipe(extract_house_number).alias("house_numbers")
)

address,house_numbers
str,i64
"""Nieuwezijds Voorburgwal 147""",147
"""Museumstraat 1""",1
"""Oosterdok 2""",2


### Applying a Function to a DataFrame or LazyFrame

In [9]:
small_numbers = pl.DataFrame({"ints": [2, 4, 6], "floats": [10.0, 20.0, 30.0]})


def scale_the_input(
    df: pl.DataFrame | pl.LazyFrame, scale_factor: int
) -> pl.DataFrame | pl.LazyFrame:
    """Scale every column of the input frame by ``scale_factor``.

    Args:
        df: Eager DataFrame or LazyFrame to scale.
        scale_factor: Value that every column is multiplied by.

    Returns:
        A frame of the same kind as ``df``: a DataFrame for eager input, a
        LazyFrame (still to be collected) for lazy input.
    """
    if isinstance(df, pl.LazyFrame):
        # LazyFrame does not implement the ``*`` operator, so the scaling is
        # expressed as a column expression that runs when the query is
        # collected.
        # NOTE(review): this multiplies each column on its own, whereas the
        # eager path below returned ``ints`` as f64 in the recorded output;
        # confirm whether callers need the dtypes to match.
        return df.select(pl.all() * scale_factor)
    return df * scale_factor


# DataFrame.pipe forwards the frame as the first argument and 5 as scale_factor.
small_numbers.pipe(scale_the_input, 5)

ints,floats
f64,f64
10.0,50.0
20.0,100.0
30.0,150.0


## Registering Your Own Namespace

In [10]:
@pl.api.register_expr_namespace("celsius")
class Celsius:
    """Temperature conversions exposed on expressions as ``.celsius``.

    The wrapped expression is treated as holding degrees Celsius.
    """

    def __init__(self, expr: pl.Expr):
        # Expression this namespace instance was accessed on.
        self._expr = expr

    def to_fahrenheit(self) -> pl.Expr:
        """Return the expression converted to Fahrenheit: C * 9 / 5 + 32."""
        nine_fifths = self._expr * 9 / 5
        return nine_fifths + 32

    def to_kelvin(self) -> pl.Expr:
        """Return the expression converted to Kelvin: C + 273.15."""
        celsius = self._expr
        return celsius + 273.15

In [11]:
# Celsius readings from 0 to 40 in steps of 10.
temperatures = pl.DataFrame({"celsius": list(range(0, 50, 10))})

# The custom namespace is reached through the name it was registered under.
temperatures.with_columns(
    pl.col("celsius").celsius.to_fahrenheit().alias("fahrenheit")
)

celsius,fahrenheit
i64,f64
0,32.0
10,50.0
20,68.0
30,86.0
40,104.0


## Polars Plugins in Rust

### Prerequisites

In [12]:
# Check that the Rust compiler is available and show its version.
! rustc --version

rustc 1.88.0 (6b00bc388 2025-06-23) (Homebrew)


### The Anatomy of a Plugin Project

### The Plugin

### Compiling the Plugin

In [13]:
# Build the hello_world plugin with maturin (release profile) from its own directory.
! cd plugins/hello_world_plugin && uv run maturin develop --release

🍹 Building a mixed python/rust project
🔗 Found pyo3 bindings
🐍 Found CPython 3.12 at /home/prashant/Desktop/samosa/extra/books/code/python-polars-the-definitive-guide/python-polars-the-definitive-guide-main/.venv/bin/python
[1m[32m   Compiling[0m proc-macro2 v1.0.92
[1m[32m   Compiling[0m unicode-ident v1.0.14
[1m[32m   Compiling[0m libc v0.2.169
[1m[32m   Compiling[0m version_check v0.9.5
[1m[32m   Compiling[0m autocfg v1.4.0
[1m[32m   Compiling[0m libm v0.2.11
[1m[32m   Compiling[0m crossbeam-utils v0.8.21
[1m[32m   Compiling[0m cfg-if v1.0.0
[1m[32m   Compiling[0m byteorder v1.5.0
[1m[32m   Compiling[0m shlex v1.3.0
[1m[32m   Compiling[0m rayon-core v1.12.1
[1m[32m   Compiling[0m rustversion v1.0.19
[K[1m[32m   Compiling[0m memchr v2.7.4            ] 0/316: libm(build.rs), proc-macr...
[K[1m[32m   Compiling[0m thiserror v2.0.9            ] 1/316: libm(build.rs), proc-macr...
[K[1m[32m   Compiling[0m cc v1.2.6                ] 2/316: li

In [None]:
# Restart the kernel so that the newly built plugin becomes available.

# When this cell is run in IPython, the call below triggers the restart automatically.
get_ipython().kernel.do_shutdown(restart=True)

{'status': 'ok', 'restart': True}

: 

### Performance Benchmark

In [1]:
import polars as pl
from hello_world_func import hello_world
import time

# 400,000 short strings to run both implementations against.
lots_of_strings = pl.DataFrame(
    {
        "a": ["1", "2", "3", "4"] * 100_000,
    }
)

# Timing uses time.perf_counter() rather than time.time(): it is a monotonic,
# high-resolution clock meant for measuring short intervals, whereas the wall
# clock can be adjusted while the benchmark runs and is coarse on some
# platforms.

# Native Polars: regex-replace every value, averaged over 10 runs.
times = []
for i in range(10):
    t0 = time.perf_counter()
    out = lots_of_strings.with_columns(
        pl.col("a").str.replace_all(r".*", "Hello, world!")
    )
    t1 = time.perf_counter()
    times.append(t1 - t0)
print(
    f"Polars native string replace:        {sum(times) / len(times):.5f}"
)


# Custom plugin: same frame, same number of runs.
times = []
for i in range(10):
    t0 = time.perf_counter()
    out = lots_of_strings.with_columns(hello_world("a"))
    t1 = time.perf_counter()
    times.append(t1 - t0)
print(f"Our custom made Hello world replace: {sum(times) / len(times):.5f}")

Polars native string replace:        0.14572
Our custom made Hello world replace: 0.03959


### Register Arguments

#### Working with multiple arguments as input

#### Other register arguments

### Using a Rust Crate

### Use Case: geo

#### Adding the geo crate

#### The Rust code

In [2]:
# Build the polars_geo plugin with maturin (release profile) and install it into the environment.
! cd plugins/polars_geo && uv run maturin develop --release

🍹 Building a mixed python/rust project
🔗 Found pyo3 bindings with abi3 support for Python ≥ 3.8
🐍 Not using a specific python interpreter
[1m[32m    Finished[0m `release` profile [optimized] target(s) in 0.58s
📦 Built wheel for abi3 Python ≥ 3.8 to /tmp/.tmpgzfIYM/polars_geo-1.0.0-cp38-abi3-linux_x86_64.whl
✏️ Setting installed package as editable
🛠 Installed polars_geo-1.0.0


In [None]:
# Restart the kernel so that the newly built plugin becomes available.

# When this cell is run in IPython, the call below triggers the restart automatically.
get_ipython().kernel.do_shutdown(restart=True)

{'status': 'ok', 'restart': True}

: 

#### The Python code

#### Making the custom namespace

In [3]:
# Three rows, each pairing one point with one polygon. A point is a list of
# two floats and a polygon is a list of such pairs (presumably [x, y]
# coordinates; confirm against the plugin).
points_and_polygons = pl.DataFrame(
    {
        "point": [[5.0, 5.0], [20.0, 20.0], [20.0, 20.0]],
        "polygon": [
            # Four vertices.
            [[0.0, 0.0], [10.0, 0.0], [10.0, 10.0], [0.0, 10.0]],
            # Only three vertices.
            [
                [0.0, 0.0],
                [10.0, 0.0],
                [10.0, 10.0],
            ],
            # First vertex has a missing (None) coordinate.
            [[0.0, None], [10.0, 0.0], [10.0, 10.0], [0.0, 10.0], [0.0, 0.0]],
        ],
    }
)

In [4]:
from plugins.polars_geo import polars_geo

# Test each point against the polygon in the same row and store the boolean
# result in a new column.
points_and_polygons.with_columns(
    in_polygon=pl.col("point").geo.point_in_polygon(pl.col("polygon"))
)

point,polygon,in_polygon
list[f64],list[list[f64]],bool
"[5.0, 5.0]","[[0.0, 0.0], [10.0, 0.0], … [0.0, 10.0]]",True
"[20.0, 20.0]","[[0.0, 0.0], [10.0, 0.0], [10.0, 10.0]]",False
"[20.0, 20.0]","[[0.0, null], [10.0, 0.0], … [0.0, 0.0]]",
