This Jupyter Notebook contains the code used in the [Real Python](https://www.realpython.com) tutorial [**Aggregating and Grouping Data in Polars**](https://realpython.com/aggregating-and-grouping-data-in-polars-groupby/).

# Aggregating Data

In [1]:
import polars as pl

In [2]:
# Load the maths data and display the largest value in "absences".
math_students = pl.read_parquet("math.parquet")

highest_absences = pl.col("absences").max()
math_students.select(highest_absences)

absences
i64
75


In [3]:
# Show the student_id/absences row(s) whose absence count equals the
# column-wide maximum.
math_students = pl.read_parquet("math.parquet")

absences = pl.col("absences")
id_and_absences = math_students.select("student_id", "absences")
id_and_absences.filter(absences == absences.max())

student_id,absences
i64,i64
10277,75


In [3]:
# Summarise "absences" for the maths data as one row holding the
# minimum, maximum, and mean.
math_students = pl.read_parquet("math.parquet")

absences = pl.col("absences")
math_students.select(
    absences.min().alias("min"),
    absences.max().alias("max"),
    absences.mean().alias("mean"),
)

min,max,mean
i64,i64,f64
0,75,5.708861


In [4]:
# Row-wise aggregation: for every student, add up and average the three
# grade columns G1, G2, and G3.
math_students = pl.read_parquet("math.parquet")

grade_columns = ["G1", "G2", "G3"]
math_students.select(
    pl.col("student_id", *grade_columns),
    pl.sum_horizontal(*grade_columns).alias("total"),
    pl.mean_horizontal(*grade_columns).alias("mean"),
)

student_id,G1,G2,G3,total,mean
i64,i64,i64,i64,i64,f64
10001,5,6,6,17,5.666667
10002,5,5,6,16,5.333333
10003,7,8,10,25,8.333333
10004,15,14,15,44,14.666667
10005,6,10,10,26,8.666667
…,…,…,…,…,…
10391,9,9,9,27,9.0
10392,14,16,16,46,15.333333
10393,10,8,7,25,8.333333
10394,11,12,10,33,11.0


# Grouping Data for Aggregation With .group_by()
## Grouping Data - Basic Principles

In [8]:
import polars as pl

# Baseline for the grouping examples: min/max/mean of "absences" computed
# from the maths-only file.
math_students = pl.read_parquet("math.parquet")

absence_summary = {
    "min": pl.col("absences").min(),
    "max": pl.col("absences").max(),
    "mean": pl.col("absences").mean(),
}
math_students.select(**absence_summary)

min,max,mean
i64,i64,f64
0,75,5.708861


In [9]:
# Same min/max/mean summary of "absences", this time computed from the
# Portuguese-only file.
portuguese_students = pl.read_parquet("portuguese.parquet")

absences = pl.col("absences")
portuguese_students.select(
    [
        absences.min().alias("min"),
        absences.max().alias("max"),
        absences.mean().alias("mean"),
    ]
)

min,max,mean
i64,i64,f64
0,32,3.659476


In [10]:
# One pass over the combined file: group rows by "subject" and compute the
# min/max/mean of "absences" within each group.
all_students = pl.read_parquet("course.parquet")

absences = pl.col("absences")
by_subject = all_students.group_by("subject")
by_subject.agg(
    absences.min().alias("min"),
    absences.max().alias("max"),
    absences.mean().alias("mean"),
)

subject,min,max,mean
str,i64,i64,f64
"""M""",0,75,5.708861
"""P""",0,32,3.659476


## Grouping Data - The Power of Expressions

In [13]:
# Aggregating a bare column (no reducing function) collects each age
# group's G3 values into a list column, named "passes" here.
math_students = pl.read_parquet("math.parquet")

by_age = math_students.group_by("age")
by_age.agg(pl.col("G3").alias("passes"))

age,passes
i64,list[i64]
18,"[6, 0, … 10]"
20,"[18, 15, 9]"
15,"[10, 15, … 7]"
17,"[6, 6, … 16]"
16,"[10, 15, … 8]"
19,"[9, 0, … 9]"
22,[8]
21,[7]


In [6]:
# Per age group, keep only the G3 values of rows that satisfy both
# conditions: absences above the mean and G3 of at least 13.
math_students = pl.read_parquet("math.parquet")

above_mean_absences = pl.col("absences") > pl.col("absences").mean()
grade_at_least_13 = pl.col("G3") >= 13

math_students.group_by("age").agg(
    pl.col("G3").filter(above_mean_absences, grade_at_least_13).alias("passes")
)

age,passes
i64,list[i64]
16,"[15, 14, … 18]"
19,"[13, 13, … 13]"
21,[]
18,"[14, 18, … 13]"
20,[]
22,[]
15,"[15, 13, … 15]"
17,"[13, 18, … 15]"


In [5]:
# For each age group, count the students with above-mean absences
# ("poor_attenders"), count how many of those have G3 >= 13 ("passes"),
# and express passes as a percentage of poor attenders.
math_students = pl.read_parquet("math.parquet")

# The predicate is used inside .agg(), so it is evaluated within each age
# group. Defined once so both counts share the same definition.
poor_attendance = pl.col("absences") > pl.col("absences").mean()

(
    math_students.group_by("age")
    .agg(
        passes=pl.col("G3")
        .filter(poor_attendance, pl.col("G3") >= 13)
        .count(),
        poor_attenders=pl.col("G3").filter(poor_attendance).count(),
    )
    .select(
        pl.col("age", "passes", "poor_attenders"),
        # Groups with no poor attenders produce 0 / 0 = NaN; fill_nan() is
        # the dedicated way to turn those into 0 rather than matching NaN
        # by value with replace().
        percentage=(
            pl.col("passes") / pl.col("poor_attenders") * 100
        ).fill_nan(0),
    )
    .sort("age")
)

age,passes,poor_attenders,percentage
i64,u32,u32,f64
15,15,32,46.875
16,11,39,28.205128
17,8,29,27.586207
18,11,31,35.483871
19,4,10,40.0
20,0,1,0.0
21,0,0,0.0
22,0,0,0.0


# Grouping and Aggregating by Multiple Columns

In [17]:
import polars as pl

# Group by the (subject, reason) pair, summarise "absences", and order the
# result by subject only.
all_students = pl.read_parquet("course.parquet")

absences = pl.col("absences")
summary = all_students.group_by(["subject", "reason"]).agg(
    absences.min().alias("min"),
    absences.max().alias("max"),
    absences.mean().alias("mean"),
)
summary.sort("subject")

subject,reason,min,max,mean
str,str,i64,i64,f64
"""M""","""reputation""",0,56,6.647619
"""M""","""course""",0,23,3.972414
"""M""","""other""",0,20,5.611111
"""M""","""home""",0,75,7.146789
"""P""","""home""",0,30,4.456376
"""P""","""other""",0,16,2.777778
"""P""","""reputation""",0,32,3.811189
"""P""","""course""",0,26,3.389474


In [16]:
import polars as pl

# Same (subject, reason) summary, now ordered by subject and then by
# reason within each subject.
all_students = pl.read_parquet("course.parquet")

absence_stats = {
    "min": pl.col("absences").min(),
    "max": pl.col("absences").max(),
    "mean": pl.col("absences").mean(),
}
grouped = all_students.group_by(["subject", "reason"])
grouped.agg(**absence_stats).sort(["subject", "reason"])

subject,reason,min,max,mean
str,str,i64,i64,f64
"""M""","""course""",0,23,3.972414
"""M""","""home""",0,75,7.146789
"""M""","""other""",0,20,5.611111
"""M""","""reputation""",0,56,6.647619
"""P""","""course""",0,26,3.389474
"""P""","""home""",0,30,4.456376
"""P""","""other""",0,16,2.777778
"""P""","""reputation""",0,32,3.811189


In [15]:
import polars as pl

# (subject, reason) summary ordered by subject ascending and, within each
# subject, by reason descending.
all_students = pl.read_parquet("course.parquet")

(
    all_students.group_by(["subject", "reason"])
    .agg(
        min=pl.col("absences").min(),
        max=pl.col("absences").max(),
        mean=pl.col("absences").mean(),
    )
    # A single multi-key sort with per-column directions. Chaining
    # .sort("reason", descending=True).sort("subject") only gives this
    # order if the second sort keeps tied rows in place, which sort() does
    # not promise unless maintain_order=True is passed.
    .sort("subject", "reason", descending=[False, True])
)

subject,reason,min,max,mean
str,str,i64,i64,f64
"""M""","""reputation""",0,56,6.647619
"""M""","""other""",0,20,5.611111
"""M""","""home""",0,75,7.146789
"""M""","""course""",0,23,3.972414
"""P""","""reputation""",0,32,3.811189
"""P""","""other""",0,16,2.777778
"""P""","""home""",0,30,4.456376
"""P""","""course""",0,26,3.389474


# Grouping and Aggregating Time Series With .group_by_dynamic()

In [19]:
import polars as pl

# Load the per-class attendance records and preview the first five rows.
math_attendance = pl.read_parquet("math_classes.parquet")
math_attendance.head(5)

class_start,class_subject,absences,lecturer_initials
datetime[μs],str,i64,str
2024-01-03 09:00:00,"""algebra""",3,"""DH"""
2024-01-04 13:30:00,"""geometry""",4,"""PS"""
2024-01-05 10:00:00,"""calculus""",3,"""LM"""
2024-01-10 09:00:00,"""algebra""",2,"""DH"""
2024-01-11 13:30:00,"""geometry""",7,"""PS"""


In [20]:
# Bucket classes into one-week windows keyed on "class_start" and report
# the total and mean absences per window.
# NOTE(review): closed="both" includes both window edges, so a class that
# starts exactly on a boundary would fall in two windows - confirm that is
# intended.
math_attendance = pl.read_parquet("math_classes.parquet")

absences = pl.col("absences")
weekly = math_attendance.group_by_dynamic(
    index_column="class_start",
    every="1w",
    closed="both",
)
weekly.agg(
    absences.sum().alias("total_absences"),
    absences.mean().alias("mean_absences"),
)

class_start,total_absences,mean_absences
datetime[μs],i64,f64
2024-01-01 00:00:00,10,3.333333
2024-01-08 00:00:00,10,3.333333
2024-01-15 00:00:00,11,3.666667
2024-01-22 00:00:00,10,3.333333
2024-01-29 00:00:00,4,1.333333
…,…,…
2024-05-27 00:00:00,15,5.0
2024-06-03 00:00:00,12,4.0
2024-06-10 00:00:00,17,5.666667
2024-06-17 00:00:00,6,2.0


In [21]:
# Quarterly windows on "class_start", computed separately for each
# "class_subject"; the aggregate is the sum of "absences" per window.
math_attendance = pl.read_parquet("math_classes.parquet")

quarterly_by_subject = math_attendance.group_by_dynamic(
    index_column="class_start",
    every="1q",
    closed="both",
    group_by="class_subject",
)
total_absences = pl.col("absences").sum()
quarterly_by_subject.agg(total_absences)

class_subject,class_start,absences
str,datetime[μs],i64
"""algebra""",2024-01-01 00:00:00,56
"""algebra""",2024-04-01 00:00:00,44
"""geometry""",2024-01-01 00:00:00,35
"""geometry""",2024-04-01 00:00:00,40
"""calculus""",2024-01-01 00:00:00,41
"""calculus""",2024-04-01 00:00:00,39


# Grouping and Aggregating Using Window Functions

In [6]:
# Window function: keep every student row and attach the mean "absences"
# of that row's (subject, school, reason) partition.
all_students = pl.read_parquet("course.parquet")

partition_keys = ("subject", "school", "reason")
partition_mean = pl.col("absences").mean().over(*partition_keys)

all_students.select(
    pl.col("subject", "school", "student_id", "reason", "absences"),
    partition_mean.alias("mean_absences"),
)

subject,school,student_id,reason,absences,mean_absences
str,str,i64,str,i64,f64
"""M""","""GP""",10001,"""course""",6,4.104839
"""M""","""GP""",10002,"""course""",4,4.104839
"""M""","""GP""",10003,"""other""",10,6.518519
"""M""","""GP""",10004,"""home""",2,7.397959
"""M""","""GP""",10005,"""home""",4,7.397959
…,…,…,…,…,…
"""P""","""MS""",10645,"""course""",4,2.627119
"""P""","""MS""",10646,"""course""",4,2.627119
"""P""","""MS""",10647,"""course""",6,2.627119
"""P""","""MS""",10648,"""course""",6,2.627119


In [47]:
# Attach the per-(subject, school, reason) mean of "absences" to each row,
# then keep only the students whose own absences exceed that mean.
all_students = pl.read_parquet("course.parquet")

with_partition_mean = all_students.select(
    pl.col("subject", "school", "student_id", "reason", "absences"),
    pl.col("absences")
    .mean()
    .over("subject", "school", "reason")
    .alias("mean_absences"),
)
exceeds_mean = pl.col("absences") > pl.col("mean_absences")
with_partition_mean.filter(exceeds_mean)

subject,school,student_id,reason,absences,mean_absences
str,str,i64,str,i64,f64
"""M""","""GP""",10001,"""course""",6,4.104839
"""M""","""GP""",10003,"""other""",10,6.518519
"""M""","""GP""",10006,"""reputation""",10,6.72
"""M""","""GP""",10019,"""course""",16,4.104839
"""M""","""GP""",10026,"""home""",14,7.397959
…,…,…,…,…,…
"""P""","""MS""",10645,"""course""",4,2.627119
"""P""","""MS""",10646,"""course""",4,2.627119
"""P""","""MS""",10647,"""course""",6,2.627119
"""P""","""MS""",10648,"""course""",6,2.627119


# Grouping and Aggregating Using Pivot Tables

In [25]:
import polars as pl

# Long-format baseline for the pivot examples: mean "absences" and mean
# "failures" for every (subject, school) pair.
all_students = pl.read_parquet("course.parquet")

by_subject_school = all_students.group_by(["subject", "school"])
by_subject_school.agg(
    pl.col("absences").mean().alias("mean_absence"),
    pl.col("failures").mean().alias("mean_failure"),
)

subject,school,mean_absence,mean_failure
str,str,f64,f64
"""P""","""GP""",4.21513,0.172577
"""P""","""MS""",2.619469,0.314159
"""M""","""MS""",3.76087,0.456522
"""M""","""GP""",5.965616,0.318052


In [17]:
# Wide-format view of the same means: one row per subject and one column
# per (value, school) combination.
all_students = pl.read_parquet("course.parquet")

pivot_options = {
    "on": "school",
    "index": "subject",
    "values": ["absences", "failures"],
    "aggregate_function": "mean",
}
all_students.pivot(**pivot_options)

subject,absences_GP,absences_MS,failures_GP,failures_MS
str,f64,f64,f64,f64
"""M""",5.965616,3.76087,0.318052,0.456522
"""P""",4.21513,2.619469,0.172577,0.314159


In [18]:
# Pivot as before, then reorder the columns so each school's absences and
# failures sit next to each other.
all_students = pl.read_parquet("course.parquet")

means_by_school = all_students.pivot(
    on="school",
    index="subject",
    values=["absences", "failures"],
    aggregate_function="mean",
)
means_by_school.select(
    "subject",
    "absences_GP",
    "failures_GP",
    "absences_MS",
    "failures_MS",
)

subject,absences_GP,failures_GP,absences_MS,failures_MS
str,f64,f64,f64,f64
"""M""",5.965616,0.318052,3.76087,0.456522
"""P""",4.21513,0.172577,2.619469,0.314159


In [16]:
# Pivot on "school" with a two-column index (subject, reason), then put
# the columns in school-by-school order.
all_students = pl.read_parquet("course.parquet")

column_order = [
    "subject",
    "reason",
    "absences_GP",
    "failures_GP",
    "absences_MS",
    "failures_MS",
]
means_by_school = all_students.pivot(
    on="school",
    index=["subject", "reason"],
    values=["absences", "failures"],
    aggregate_function="mean",
)
means_by_school.select(column_order)

subject,reason,absences_GP,failures_GP,absences_MS,failures_MS
str,str,f64,f64,f64,f64
"""M""","""course""",4.104839,0.33871,3.190476,0.52381
"""M""","""other""",6.518519,0.333333,2.888889,0.222222
"""M""","""home""",7.397959,0.357143,4.909091,0.636364
"""M""","""reputation""",6.72,0.25,5.2,0.2
"""P""","""course""",3.928144,0.245509,2.627119,0.449153
"""P""","""other""",3.407407,0.185185,2.4,0.244444
"""P""","""home""",4.869565,0.130435,3.058824,0.147059
"""P""","""reputation""",4.166667,0.105263,2.413793,0.068966
