# Chapter 9: Combining Expressions

In [1]:
import polars as pl
pl.show_versions()

--------Version info---------
Polars:               0.20.31
Index type:           UInt32
Platform:             macOS-12.5-arm64-arm-64bit
Python:               3.11.9 (main, Apr  2 2024, 16:11:47) [Clang 14.0.0 (clang-1400.0.29.202)]

----Optional dependencies----
adbc_driver_manager:  0.8.0
cloudpickle:          3.0.0
connectorx:           0.3.2
deltalake:            0.15.0
fastexcel:            0.9.1
fsspec:               2023.12.2
gevent:               23.9.1
hvplot:               0.9.2
matplotlib:           3.8.4
nest_asyncio:         1.6.0
numpy:                1.26.4
openpyxl:             3.1.2
pandas:               2.2.2
pyarrow:              14.0.2
pydantic:             2.5.3
pyiceberg:            0.5.1
pyxlsb:               <not installed>
sqlalchemy:           2.0.25
torch:                <not installed>
xlsx2csv:             0.8.2
xlsxwriter:           3.2.0


In [2]:
fruit = pl.read_csv("data/fruit.csv")
fruit.filter(pl.col("is_round") & (pl.col("weight") > 1000))

name,weight,color,is_round,origin
str,i64,str,bool,str
"""Cantaloupe""",2500,"""orange""",True,"""Africa"""
"""Watermelon""",5000,"""green""",True,"""Africa"""


## Inline Operators Versus Methods

In [4]:
(
    pl.DataFrame({
        "i": [6, 0, 2, 2.5],
        "j": [7, 1, 2, 3]
    })
    .with_columns(
        (pl.col("i") * pl.col("j")).alias("*"),
        pl.col("i").mul(pl.col("j")).alias("Expr.mul()")
    )
)

i,j,*,Expr.mul()
f64,i64,f64,f64
6.0,7,42.0,42.0
0.0,1,0.0,0.0
2.0,2,4.0,4.0
2.5,3,7.5,7.5


## Arithmetic Operations

In [6]:
fruit.select(
    pl.col("name"),
    (pl.col("weight") / 1000)
)

name,weight
str,f64
"""Avocado""",0.2
"""Banana""",0.12
"""Blueberry""",0.001
"""Cantaloupe""",2.5
"""Cranberry""",0.002
"""Elderberry""",0.001
"""Orange""",0.13
"""Papaya""",1.0
"""Peach""",0.15
"""Watermelon""",5.0


In [7]:
pl.Config(float_precision=2, tbl_cell_numeric_alignment="RIGHT") # <1>

(
    pl.DataFrame({
        "i": [0, 2, 2, -2, -2],
        "j": [1, 2, 3, 4, -5]
    })
    .with_columns(
        (pl.col("i") + pl.col("j")).alias("i + j"),
        (pl.col("i") - pl.col("j")).alias("i - j"),
        (pl.col("i") * pl.col("j")).alias("i * j"),
        (pl.col("i") / pl.col("j")).alias("i / j"),
        (pl.col("i") // pl.col("j")).alias("i // j"),
        (pl.col("i") ** pl.col("j")).alias("i ** j"),
        (pl.col("j") % 2).alias("j % 2"), # <2>
        pl.col("i").dot(pl.col("j")).alias("i ⋅ j"), # <3>
    )
)

ComputeError: conversion from `i64` to `u32` failed in column 'j' for 1 out of 5 values: [-5]

In [8]:
pl.Config.set_float_precision()
pl.Config.set_tbl_cell_numeric_alignment(None)

polars.config.Config

## Comparison Operations

In [10]:
pl.select(pl.lit("a") > pl.lit("b"))

literal
bool
False


In [11]:
(
    fruit.select(
        pl.col("name"),
        pl.col("weight"),
    )
    .filter(pl.col("weight") >= 1000)
)

name,weight
str,i64
"""Cantaloupe""",2500
"""Papaya""",1000
"""Watermelon""",5000


In [12]:
x = 4
3 < x < 5

True

In [13]:
pl.select(pl.lit(3) < pl.lit(x) < pl.lit(5))

TypeError: the truth value of an Expr is ambiguous

You probably got here by using a Python standard library function instead of the native expressions API.
Here are some things you might want to try:
- instead of `pl.col('a') and pl.col('b')`, use `pl.col('a') & pl.col('b')`
- instead of `pl.col('a') in [y, z]`, use `pl.col('a').is_in([y, z])`
- instead of `max(pl.col('a'), pl.col('b'))`, use `pl.max_horizontal(pl.col('a'), pl.col('b'))`


In [14]:
pl.select((pl.lit(3) < pl.lit(x)) & (pl.lit(x) < pl.lit(5))).item()

True

In [15]:
pl.select(pl.lit(x).is_between(3, 5)).item()

True

In [16]:
(
    pl.DataFrame({
        "a": [-273.15, 0, 42, 100],
        "b": [1.4142, 2.7183, 42, 3.1415]
    })
    .with_columns(
        (pl.col("a") == pl.col("b")).alias("a == b"),
        (pl.col("a") <= pl.col("b")).alias("a <= b"),
        (pl.all() > 0).name.suffix(" > 0"),
        ((pl.col("b") - pl.lit(2).sqrt()).abs() < 1e-3).alias("b ≈ √2"), # <1>
        ((1 < pl.col("b")) & (pl.col("b") < 3)).alias("1 < b < 3")
    )
)

a,b,a == b,a <= b,a > 0,b > 0,b ≈ √2,1 < b < 3
f64,f64,bool,bool,bool,bool,bool,bool
-273.15,1.4142,False,True,False,True,True,True
0.0,2.7183,False,True,False,True,False,True
42.0,42.0,True,True,True,True,False,False
100.0,3.1415,False,False,True,True,False,False


In [17]:
pl.select(
    bool_num=pl.lit(True) > 0,
    time_time=pl.time(23, 58) > pl.time(0, 0),
    datetime_date=pl.datetime(1969, 7, 21, 2, 56) < pl.date(1976, 7, 20),
    str_num=pl.lit("5") < pl.lit(3).cast(pl.String), # <1>
    datetime_time=pl.datetime(1999, 1, 1).dt.time() != pl.time(0, 0), # <2>
).transpose(include_header=True,
            header_name="comparison",
            column_names=["allowed"])

comparison,allowed
str,bool
"""bool_num""",True
"""time_time""",True
"""datetime_date""",True
"""str_num""",False
"""datetime_time""",False


## Boolean Algebra Operations

In [19]:
x = 7
p = pl.lit(3) < pl.lit(x)  # True
q = pl.lit(x) < pl.lit(5)  # False
pl.select(p & q).item()

False

In [20]:
(
    pl.DataFrame({
        "p": [True, True, False, False],
        "q": [True, False, True, False]
    })
    .with_columns(
        (pl.col("p") & pl.col("q")).alias("p & q"),
        (pl.col("p") | pl.col("q")).alias("p | q"),
        (~pl.col("p")).alias("~p"),
        (pl.col("p") ^ pl.col("q")).alias("p ^ q"),
        (~(pl.col("p") & pl.col("q"))).alias("p ↑ q"),  # <1>
        ((pl.col("p").or_(pl.col("q"))).not_()).alias("p ↓ q")  # <2>
    )
)

p,q,p & q,p | q,~p,p ^ q,p ↑ q,p ↓ q
bool,bool,bool,bool,bool,bool,bool,bool
True,True,True,True,False,False,False,False
True,False,False,True,False,True,True,False
False,True,False,True,True,True,True,False
False,False,False,False,True,False,True,True


## Bitwise Operations

In [22]:
pl.select(pl.lit(10) | pl.lit(34)).item()

42

In [23]:
bits = (
    pl.DataFrame({
        "x": [1, 1, 0, 0, 7, 10],
        "y": [1, 0, 1, 0, 2, 34]
    }, schema={"x": pl.UInt8, "y": pl.UInt8})  # <1>
    .with_columns(
        (pl.col("x") & pl.col("y")).alias("x & y"),
        (pl.col("x") | pl.col("y")).alias("x | y"),
        (~pl.col("x")).alias("~x"),
        (pl.col("x") ^ pl.col("y")).alias("x ^ y"),
    )
)
bits

x,y,x & y,x | y,~x,x ^ y
u8,u8,u8,u8,u8,u8
1,1,1,1,254,0
1,0,0,1,254,1
0,1,0,1,255,1
0,0,0,0,255,0
7,2,2,7,248,5
10,34,2,42,245,40


In [24]:
bits.select(pl.all().map_elements("{0:08b}".format))



x,y,x & y,x | y,~x,x ^ y
str,str,str,str,str,str
"""00000001""","""00000001""","""00000001""","""00000001""","""11111110""","""00000000"""
"""00000001""","""00000000""","""00000000""","""00000001""","""11111110""","""00000001"""
"""00000000""","""00000001""","""00000000""","""00000001""","""11111111""","""00000001"""
"""00000000""","""00000000""","""00000000""","""00000000""","""11111111""","""00000000"""
"""00000111""","""00000010""","""00000010""","""00000111""","""11111000""","""00000101"""
"""00001010""","""00100010""","""00000010""","""00101010""","""11110101""","""00101000"""


## Using Functions

In [26]:
scientists = pl.DataFrame({
    'first_name': ['George', 'Grace', 'John', 'Kurt', 'Ada'],
    'last_name': ['Boole', 'Hopper', 'Tukey', 'Gödel', 'Lovelace'],
    'country': ['England', 'United States', 'United States',
    'Austria-Hungary', 'England']
})
scientists

first_name,last_name,country
str,str,str
"""George""","""Boole""","""England"""
"""Grace""","""Hopper""","""United States"""
"""John""","""Tukey""","""United States"""
"""Kurt""","""Gödel""","""Austria-Hungary"""
"""Ada""","""Lovelace""","""England"""


In [27]:
scientists.select(
    pl.concat_list(pl.col("^*_name$")).alias("concat_list"),
    pl.struct(pl.all()).alias("struct")
)

concat_list,struct
list[str],struct[3]
"[""George"", ""Boole""]","{""George"",""Boole"",""England""}"
"[""Grace"", ""Hopper""]","{""Grace"",""Hopper"",""United States""}"
"[""John"", ""Tukey""]","{""John"",""Tukey"",""United States""}"
"[""Kurt"", ""Gödel""]","{""Kurt"",""Gödel"",""Austria-Hungary""}"
"[""Ada"", ""Lovelace""]","{""Ada"",""Lovelace"",""England""}"


In [28]:
scientists.select(
    pl.concat_str(pl.all(), separator=" ").alias("concat_str"),
    pl.format("{}, {} from {}",
    "last_name", "first_name", "country").alias("format")
    )

concat_str,format
str,str
"""George Boole England""","""Boole, George from England"""
"""Grace Hopper United States""","""Hopper, Grace from United Stat…"
"""John Tukey United States""","""Tukey, John from United States"""
"""Kurt Gödel Austria-Hungary""","""Gödel, Kurt from Austria-Hunga…"
"""Ada Lovelace England""","""Lovelace, Ada from England"""


In [29]:
prefs = pl.DataFrame({
    "id": [1, 7, 42, 101, 999],
    "has_pet": [True, False, True, False, True],
    "likes_travel": [False, False, False, False, True],
    "likes_movies": [True, False, True, False, True],
    "likes_books": [False, False, True, True, True]
}).with_columns(
    pl.all_horizontal(pl.exclude("id")).alias("all"),
    pl.any_horizontal(pl.exclude("id")).alias("any"),
)
prefs

id,has_pet,likes_travel,likes_movies,likes_books,all,any
i64,bool,bool,bool,bool,bool,bool
1,True,False,True,False,False,True
7,False,False,False,False,False,False
42,True,False,True,True,False,True
101,False,False,False,True,False,True
999,True,True,True,True,True,True


In [30]:
prefs.select(
    pl.sum_horizontal(pl.all()).alias("sum"),
    pl.max_horizontal(pl.all()).alias("max"),
    pl.min_horizontal(pl.all()).alias("min"),
)

sum,max,min
i64,i64,i64
4,1,0
7,7,0
46,42,0
103,101,0
1005,999,1


In [31]:
prefs.select(
    pl.col("id"),
    pl.when(pl.all_horizontal(pl.col("^likes_.*$")))
    .then(pl.lit("Likes everything"))
    .when(pl.any_horizontal(pl.col("^likes_.*$")))
    .then(pl.lit("Likes something"))
    .otherwise(pl.lit("Likes nothing"))
    .alias("likes_what")
)

id,likes_what
i64,str
1,"""Likes something"""
7,"""Likes nothing"""
42,"""Likes something"""
101,"""Likes something"""
999,"""Likes everything"""


## Conclusion