Goals:

- Introduce Polars expressions
- How to write SQL like queries
- Advanced topics - window functions, join, pivot, exploding lists, conditional logic and custom UDFs


In [0]:
!pip install polars

Collecting polars
  Downloading polars-1.34.0-py3-none-any.whl.metadata (10 kB)
Collecting polars-runtime-32==1.34.0 (from polars)
  Downloading polars_runtime_32-1.34.0-cp39-abi3-manylinux_2_24_aarch64.whl.metadata (1.5 kB)
Downloading polars-1.34.0-py3-none-any.whl (772 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.7/772.7 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading polars_runtime_32-1.34.0-cp39-abi3-manylinux_2_24_aarch64.whl (37.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/37.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.9/37.0 MB[0m [31m79.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m37.0/37.0 MB[0m [31m95.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━

In [0]:
import polars as pl
import numpy as np
import datetime as dt

In [0]:
# Create a sample dataset to use 

np.random.seed(42)

n = 20
cats = ["A", "B", "C"]

df = pl.DataFrame({
    "id": np.arange(1,n+1),
    "category": np.random.choice(cats, size=n),
    "value": np.random.normal(loc=50, scale=15, size=n).round(2),
    "timestamp": [dt.datetime(2025, 1, 1) + dt.timedelta(days=int(x)) for x in np.random.randint(0, 30, size=n)]
})

# add list column for explode demo

df = df.with_columns([
    pl.int_ranges(1, 1 + (pl.col("id") % 4)).alias("small_list")
])

In [0]:
print(df)

shape: (20, 5)
┌─────┬──────────┬───────┬─────────────────────┬────────────┐
│ id  ┆ category ┆ value ┆ timestamp           ┆ small_list │
│ --- ┆ ---      ┆ ---   ┆ ---                 ┆ ---        │
│ i64 ┆ str      ┆ f64   ┆ datetime[μs]        ┆ list[i64]  │
╞═════╪══════════╪═══════╪═════════════════════╪════════════╡
│ 1   ┆ C        ┆ 43.05 ┆ 2025-01-20 00:00:00 ┆ [1]        │
│ 2   ┆ A        ┆ 43.01 ┆ 2025-01-28 00:00:00 ┆ [1, 2]     │
│ 3   ┆ C        ┆ 53.63 ┆ 2025-01-15 00:00:00 ┆ [1, 2, 3]  │
│ 4   ┆ C        ┆ 21.3  ┆ 2025-01-28 00:00:00 ┆ []         │
│ 5   ┆ A        ┆ 24.13 ┆ 2025-01-07 00:00:00 ┆ [1]        │
│ …   ┆ …        ┆ …     ┆ …                   ┆ …          │
│ 16  ┆ A        ┆ 51.66 ┆ 2025-01-04 00:00:00 ┆ []         │
│ 17  ┆ B        ┆ 32.74 ┆ 2025-01-02 00:00:00 ┆ [1]        │
│ 18  ┆ B        ┆ 55.64 ┆ 2025-01-30 00:00:00 ┆ [1, 2]     │
│ 19  ┆ B        ┆ 40.99 ┆ 2025-01-06 00:00:00 ┆ [1, 2, 3]  │
│ 20  ┆ B        ┆ 45.62 ┆ 2025-01-22 00:00:00 ┆ []    

1. Basic expressions

- pl.col("colname") - references a column
- pl.lit(value) - literal value
- expr.alias("new_name") - rename expression
- expr1 + expr2, expr * 2 etc - arithmetic expressions


In [0]:
(
    df.select([
        pl.col("id"),
        pl.col("value"),
        (pl.col("value") * 1.1).alias("value_plus_10pct"),
        pl.col("category").alias("cat")
    ])
)

id,value,value_plus_10pct,cat
i64,f64,f64,str
1,43.05,47.355,"""C"""
2,43.01,47.311,"""A"""
3,53.63,58.993,"""C"""
4,21.3,23.43,"""C"""
5,24.13,26.543,"""A"""
…,…,…,…
16,51.66,56.826,"""A"""
17,32.74,36.014,"""B"""
18,55.64,61.204,"""B"""
19,40.99,45.089,"""B"""


In [0]:
# Using .round() and .floor etc.

(
    df.select([
        pl.col("value").mean().alias("mean_value"),
        pl.col("value").median().alias("median_value"),
        pl.col("value").std().alias("std_value")
    ])
)

mean_value,median_value,std_value
f64,f64,f64
42.406,42.42,12.268675


2. select, with_columns, with_column

- select - creates a new DataFrame with only the provided expressions
- with_columns - adds or replaces multiple columns
- with_column - adds a single column

In [0]:
# Add computed columns using expressions

(
    df.with_columns([
        (pl.col("value") / pl.col("value").sum()).alias("value_share"),
        (pl.col("value").rank("dense")).alias("dense_rank")
    ])
    .select(["id", "category", "value", "value_share", "dense_rank"])
)

id,category,value,value_share,dense_rank
i64,str,f64,f64,u32
1,"""C""",43.05,0.050759,12
2,"""A""",43.01,0.050712,11
3,"""C""",53.63,0.063234,17
4,"""C""",21.3,0.025114,1
5,"""A""",24.13,0.028451,2
…,…,…,…,…
16,"""A""",51.66,0.060911,16
17,"""B""",32.74,0.038603,5
18,"""B""",55.64,0.065604,19
19,"""B""",40.99,0.04833,8


3. Filtering(filter) and boolean logic expressions can be used inside filter

In [0]:
# Filter rows where value > mean

mean_val = df.select(pl.col("value").mean()).item()

high = df.filter(pl.col("value") > mean_val)

print("mean value:", mean_val)
print(high)

mean value: 42.406
shape: (10, 5)
┌─────┬──────────┬───────┬─────────────────────┬────────────┐
│ id  ┆ category ┆ value ┆ timestamp           ┆ small_list │
│ --- ┆ ---      ┆ ---   ┆ ---                 ┆ ---        │
│ i64 ┆ str      ┆ f64   ┆ datetime[μs]        ┆ list[i64]  │
╞═════╪══════════╪═══════╪═════════════════════╪════════════╡
│ 1   ┆ C        ┆ 43.05 ┆ 2025-01-20 00:00:00 ┆ [1]        │
│ 2   ┆ A        ┆ 43.01 ┆ 2025-01-28 00:00:00 ┆ [1, 2]     │
│ 3   ┆ C        ┆ 53.63 ┆ 2025-01-15 00:00:00 ┆ [1, 2, 3]  │
│ 8   ┆ B        ┆ 54.71 ┆ 2025-01-08 00:00:00 ┆ []         │
│ 11  ┆ C        ┆ 71.98 ┆ 2025-01-14 00:00:00 ┆ [1, 2, 3]  │
│ 12  ┆ C        ┆ 46.61 ┆ 2025-01-17 00:00:00 ┆ []         │
│ 13  ┆ A        ┆ 51.01 ┆ 2025-01-04 00:00:00 ┆ [1]        │
│ 16  ┆ A        ┆ 51.66 ┆ 2025-01-04 00:00:00 ┆ []         │
│ 18  ┆ B        ┆ 55.64 ┆ 2025-01-30 00:00:00 ┆ [1, 2]     │
│ 20  ┆ B        ┆ 45.62 ┆ 2025-01-22 00:00:00 ┆ []         │
└─────┴──────────┴───────┴──────────

In [0]:
df.filter((pl.col("category") == "C") & (pl.col("value") > 60))

id,category,value,timestamp,small_list
i64,str,f64,datetime[μs],list[i64]
11,"""C""",71.98,2025-01-14 00:00:00,"[1, 2, 3]"


4. Groupby & Aggregations 

- groupby accepts columns and then use .agg() with expressions


In [0]:
(
    df.group_by("category")
    .agg([
        pl.len().alias("n"),  # calculate number of rows in each category
        pl.col("value").mean().alias("avg_value"), # average value for each category
        pl.col("value").median().alias("median_value"), # median value for each category
        pl.col("value").std().alias("std_value"), # std value for each category
        pl.col("id").max().alias("max_id") # max id for each category
    ])
)

category,n,avg_value,median_value,std_value,max_id
str,u32,f64,f64,f64,i64
"""B""",6,45.255,43.725,8.7627,20
"""A""",5,42.276,43.01,11.12044,16
"""C""",9,40.578889,36.38,15.421072,14


In [0]:
# more than one column in groupby

# avg value and number of rows on each weekday for each category

(
    df.with_columns((pl.col("timestamp").dt.weekday().alias("weekday")))
      .group_by(["category", "weekday"])
      .agg([
          pl.len().alias("n"),
          pl.col("value").mean().alias("avg_value")
      ])
      .sort(["category", "weekday"])

)

category,weekday,n,avg_value
str,i8,u32,f64
"""A""",2,2,33.57
"""A""",6,2,51.335
"""A""",7,1,41.57
"""B""",1,1,40.99
"""B""",3,3,47.386667
…,…,…,…
"""C""",1,1,43.05
"""C""",2,2,46.64
"""C""",3,3,41.606667
"""C""",5,2,37.715


For next part:

1. Window functions
2. Joins
3. Pivot
4. List and explode
5. Conditional logic - when/otherwise
6. Lazy dataframe best practices