In [1]:
# Report the interpreter that is actually running this kernel.
# Shelling out with `!python --version` / `!which python` inspects whichever
# `python` is first on the shell PATH, which is not guaranteed to be the
# kernel's interpreter, and `which` is unavailable on Windows.
import platform
import sys

print(f"Python {platform.python_version()}")  # same "Python X.Y.Z" format as the CLI
print(sys.executable)  # absolute path of the running interpreter

Python 3.12.4
/Users/wakala/venvs/versions/3.12.4/scanpy/bin/python


In [2]:
import polars as pl
# Display the installed polars version as the cell result.
pl.__version__

'1.21.0'

In [3]:
import datetime as dt

# Small four-person example frame reused by the cells below.
# Columns: name (str), birthdate (date), weight in kg, height in m.
df = pl.DataFrame(
    {
        "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        "birthdate": [
            dt.date.fromisoformat(iso_day)
            for iso_day in ("1997-01-10", "1985-02-15", "1983-03-22", "1981-04-30")
        ],
        "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
        "height": [1.56, 1.77, 1.65, 1.75],  # (m)
    }
)
df

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [4]:
# Disabled CSV round trip: write `df` to disk, read it back with date parsing,
# and preview the result.
# NOTE(review): the write target (an absolute, machine-specific path) and the
# read path ("docs/assets/data/output.csv", relative to the working directory)
# are different files, so the read fails unless that second file already
# exists. This is consistent with the FileNotFoundError recorded below. Use one
# shared, relative path for both calls before re-enabling.
# df.write_csv('/Users/wakala/IdeaProjects/Projects/documents/polarsdoc/data/output.csv')
# df_csv = pl.read_csv("docs/assets/data/output.csv", try_parse_dates=True)
# df_csv.head()

FileNotFoundError: No such file or directory (os error 2): docs/assets/data/output.csv

In [8]:
# Append a body-mass-index column (weight / height squared); `df` itself is not modified.
df.with_columns(
    (pl.col("weight") / pl.col("height").pow(2)).alias("BMI")
)

name,birthdate,weight,height,BMI
str,date,f64,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56,23.791913
"""Ben Brown""",1985-02-15,72.5,1.77,23.141498
"""Chloe Cooper""",1983-03-22,53.6,1.65,19.687787
"""Daniel Donovan""",1981-04-30,83.1,1.75,27.134694


In [9]:
# Project three columns: the name as-is, the year extracted from the
# birthdate, and BMI computed from weight and height.
df.select(
    "name",
    birth_year=pl.col("birthdate").dt.year(),
    bmi=pl.col("weight") / pl.col("height").pow(2),
)

name,birth_year,bmi
str,i32,f64
"""Alice Archer""",1997,23.791913
"""Ben Brown""",1985,23.141498
"""Chloe Cooper""",1983,19.687787
"""Daniel Donovan""",1981,27.134694


In [11]:
# Expression expansion: one expression is applied to both numeric columns.
# Each is scaled to 95%, rounded to two decimals, and renamed with a "-5%" suffix.
df.select(
    "name",
    pl.col(["weight", "height"]).mul(0.95).round(2).name.suffix("-5%"),
)

name,weight-5%,height-5%
str,f64,f64
"""Alice Archer""",55.01,1.48
"""Ben Brown""",68.88,1.68
"""Chloe Cooper""",50.92,1.57
"""Daniel Donovan""",78.94,1.66


In [12]:
# Keyword-argument form: each keyword becomes the name of the new column.
df.with_columns(
    birth_year=pl.col("birthdate").dt.year(),
    bmi=pl.col("weight") / pl.col("height").pow(2),
)

name,birthdate,weight,height,birth_year,bmi
str,date,f64,f64,i32,f64
"""Alice Archer""",1997-01-10,57.9,1.56,1997,23.791913
"""Ben Brown""",1985-02-15,72.5,1.77,1985,23.141498
"""Chloe Cooper""",1983-03-22,53.6,1.65,1983,19.687787
"""Daniel Donovan""",1981-04-30,83.1,1.75,1981,27.134694


In [13]:
# Positional form: each derived column is named explicitly through `.alias()`.
df.with_columns(
    pl.col("birthdate").dt.year().alias("birth_year"),
    pl.col("weight").truediv(pl.col("height").pow(2)).alias("bmi"),
)

name,birthdate,weight,height,birth_year,bmi
str,date,f64,f64,i32,f64
"""Alice Archer""",1997-01-10,57.9,1.56,1997,23.791913
"""Ben Brown""",1985-02-15,72.5,1.77,1985,23.141498
"""Chloe Cooper""",1983-03-22,53.6,1.65,1983,19.687787
"""Daniel Donovan""",1981-04-30,83.1,1.75,1981,27.134694


In [15]:
# Keep only the rows whose birth year is strictly before 1990.
df.filter(
    pl.col("birthdate").dt.year().lt(1990)
)

name,birthdate,weight,height
str,date,f64,f64
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [None]:
# Rows must satisfy both conditions: birthdate within the inclusive range
# 1982-12-31 .. 1996-01-01, and height above 1.7.
df.filter(
    pl.col("birthdate").is_between(
        lower_bound=dt.date(1982, 12, 31),
        upper_bound=dt.date(1996, 1, 1),
        closed="both",  # both endpoints included (the default)
    )
    & pl.col("height").gt(1.7)
)

name,birthdate,weight,height
str,date,f64,f64
"""Ben Brown""",1985-02-15,72.5,1.77


In [20]:
# Count rows per birth decade. Integer-dividing the year by 10 and multiplying
# back by 10 floors it to the decade (e.g. 1985 -> 1980).
# `maintain_order=True` keeps groups in order of first appearance.
df.group_by(
    decade=pl.col("birthdate").dt.year() // 10 * 10,
    maintain_order=True,
).agg(pl.len())

decade,len
i32,u32
1990,1
1980,3


In [21]:
# Per birth decade: number of rows plus the mean weight and mean height.
df.group_by(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
    maintain_order=True,
).agg(
    sample_size=pl.len(),
    avg_weight=pl.col("weight").mean(),
    avg_height=pl.col("height").mean(),
)

decade,sample_size,avg_weight,avg_height
i32,u32,f64,f64
1990,1,57.9,1.56
1980,3,69.733333,1.723333


In [33]:
# Chained pipeline:
#   1. add a `decade` column and replace `name` with its first space-separated token,
#   2. drop `birthdate`,
#   3. group by decade (in order of first appearance),
#   4. collect the names per group and compute rounded mean weight/height
#      under an "avg_" prefix.
(
    df.with_columns(
        decade=pl.col("birthdate").dt.year() // 10 * 10,
        name=pl.col("name").str.split(by=" ").list.first(),
    )
    .select(pl.exclude("birthdate"))
    .group_by("decade", maintain_order=True)
    .agg(
        pl.col("name"),
        pl.col("weight", "height").mean().round(2).name.prefix("avg_"),
    )
)

decade,name,avg_weight,avg_height
i32,list[str],f64,f64
1990,"[""Alice""]",57.9,1.56
1980,"[""Ben"", ""Chloe"", ""Daniel""]",69.73,1.72


In [34]:
# Second frame keyed by the same names as `df` (rows in a different order),
# carrying a `parent` flag and a `siblings` count.
df2 = pl.DataFrame(
    dict(
        name=["Ben Brown", "Daniel Donovan", "Alice Archer", "Chloe Cooper"],
        parent=[True, False, False, False],
        siblings=[1, 2, 3, 4],
    )
)

# Left join: every row of `df` is kept and matched to `df2` on `name`.
df.join(other=df2, on="name", how="left")

name,birthdate,weight,height,parent,siblings
str,date,f64,f64,bool,i64
"""Alice Archer""",1997-01-10,57.9,1.56,False,3
"""Ben Brown""",1985-02-15,72.5,1.77,True,1
"""Chloe Cooper""",1983-03-22,53.6,1.65,False,4
"""Daniel Donovan""",1981-04-30,83.1,1.75,False,2


In [38]:
# NOTE(review): this cell repeats the `df2` definition and left join of the
# preceding cell with the same data; it looks like a leftover copy. Confirm
# whether it was meant to demonstrate something else.
df2 = pl.DataFrame(
    {
        "name": ["Ben Brown", "Daniel Donovan", "Alice Archer", "Chloe Cooper"],
        "parent": [True, False, False, False],
        "siblings": [1, 2, 3, 4],
    }
)

# Left join on `name`: all rows of `df` are kept, with `parent` and `siblings` attached.
df.join(df2, on="name", how="left")

name,birthdate,weight,height,parent,siblings
str,date,f64,f64,bool,i64
"""Alice Archer""",1997-01-10,57.9,1.56,False,3
"""Ben Brown""",1985-02-15,72.5,1.77,True,1
"""Chloe Cooper""",1983-03-22,53.6,1.65,False,4
"""Daniel Donovan""",1981-04-30,83.1,1.75,False,2
