In [1]:
!pip install numpy

Collecting numpy
  Downloading numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 2.3 MB/s eta 0:00:01
[?25hInstalling collected packages: numpy
Successfully installed numpy-2.0.2
You should consider upgrading via the '/Users/sharath/projects/working-with-polars/.venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [6]:
import polars as pl
import numpy as np

np.random.seed(42)

df= pl.DataFrame(
    {
        "nrs": [1,2,3,None, 5],
        "names": ["foo","ham","spam","egg","spam"],
        "random": np.random.rand(5),
        "groups": ["A","A","B","A","B"]
    }
)
df

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.37454,"""A"""
2.0,"""ham""",0.950714,"""A"""
3.0,"""spam""",0.731994,"""B"""
,"""egg""",0.598658,"""A"""
5.0,"""spam""",0.156019,"""B"""


## Basic Arithmetic 

In [13]:
result = df.select(
    (pl.col("nrs") +5).alias("nrs + 5"),
    (pl.col("nrs") - 5).alias("nrs - 5"),
    (pl.col("nrs") * pl.col("random")).alias("nrs * random"),
    (pl.col("nrs") / pl.col("random")).alias("nrs / random"),
    (pl.col("nrs") **2).alias("nrs ** 5"),
    (pl.col("nrs") % 3).alias("nrs % 5"),
)
result

nrs + 5,nrs - 5,nrs * random,nrs / random,nrs ** 5,nrs % 5
i64,i64,f64,f64,i64,i64
6.0,-4.0,0.37454,2.669941,1.0,1.0
7.0,-3.0,1.901429,2.103681,4.0,2.0
8.0,-2.0,2.195982,4.098395,9.0,0.0
,,,,,
10.0,0.0,0.780093,32.047453,25.0,2.0


## comparisons

In [14]:
result = df.select(
    (pl.col("nrs") > 1).alias("nrs > 1"),  # .gt
    (pl.col("nrs") >= 3).alias("nrs >= 3"),  # ge
    (pl.col("random") < 0.2).alias("random < .2"),  # .lt
    (pl.col("random") <= 0.5).alias("random <= .5"),  # .le
    (pl.col("nrs") != 1).alias("nrs != 1"),  # .ne
    (pl.col("nrs") == 1).alias("nrs == 1"),  # .eq
)
result

nrs > 1,nrs >= 3,random < .2,random <= .5,nrs != 1,nrs == 1
bool,bool,bool,bool,bool,bool
False,False,False,True,False,True
True,False,False,False,True,False
True,True,False,False,True,False
,,False,False,,
True,True,True,True,True,False


## Boolean Operators

In [15]:
# Corresponding named functions `and_`, `or_`, and `not_`.
result2 = df.select(
    (pl.col("nrs").is_null().not_().and_(pl.col("groups") == "A")).alias(
        "number not null and group A"
    ),
    ((pl.col("random") < 0.5).or_(pl.col("groups") == "B")).alias(
        "random < 0.5 or group B"
    ),
)
result2

number not null and group A,random < 0.5 or group B
bool,bool
True,True
True,False
False,True
False,False
False,True


## Count unique values

In [20]:
long_df = pl.DataFrame({"numbers": np.random.randint(0, 100_000, 100_000)})
result = long_df.select(
    pl.col("numbers").n_unique().alias("n_unique"),
    pl.col("numbers").approx_n_unique().alias("approx_n_unique"),
)

result

n_unique,approx_n_unique
u32,u32
63152,62918


In [21]:
# approx estimation is off by a margin of 0.9

In [23]:
result = df.select(
    pl.col("names").value_counts().alias("value_counts"),
)
result

value_counts
struct[2]
"{""spam"",2}"
"{""ham"",1}"
"{""egg"",1}"
"{""foo"",1}"


In [26]:
result = df.select(
    pl.col("names").unique(maintain_order=True).alias("unique"),
    pl.col("names").unique_counts().alias("unique_counts"),
)

print(result)

shape: (4, 2)
┌────────┬───────────────┐
│ unique ┆ unique_counts │
│ ---    ┆ ---           │
│ str    ┆ u32           │
╞════════╪═══════════════╡
│ foo    ┆ 1             │
│ ham    ┆ 1             │
│ spam   ┆ 2             │
│ egg    ┆ 1             │
└────────┴───────────────┘


## conditionals

In [27]:
result = df.select(
    pl.col("nrs"),
    pl.when(pl.col("nrs") % 2 == 1)  # Is the number odd?
    .then(3 * pl.col("nrs") + 1)  # If so, multiply by 3 and add 1.
    .otherwise(pl.col("nrs") // 2)  # If not, divide by 2.
    .alias("Collatz"),
)

result

nrs,Collatz
i64,i64
1.0,4.0
2.0,1.0
3.0,10.0
,
5.0,16.0
